In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
%run scripts/helper.py

In [3]:
otto_train = load_csv('./train.csv/train.csv', 'id')

In [4]:
columns = get_columns(otto_train)

In [5]:
X = get_features(otto_train, columns[:-1])

In [6]:
response = get_response(otto_train, columns[-1])
Y = get_encoded_labels(response)

In [7]:
# lets print out the correlation matrix to find out
# correlation between different variables

corr_matrix = X.corr()

In [8]:
# lets find out the variable which has high correlation with 
# many other variables
most_corr_variable = max_corr_variable(corr_matrix, threshold=0.2)

In [9]:
most_corr_variable

'feat_22'

In [10]:
# lets include all the features
X_feat = X

In [11]:
train_idx, test_idx = get_stratified_shuffle_splits(Y, n_iter=3, train_size=5000)

In [12]:
X_train = X_feat.ix[train_idx]
y_train = Y[train_idx]
X_test = X_feat.ix[test_idx]
y_test = Y[test_idx]

In [13]:
print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((5000, 93), (5000,), (56879, 93), (56879,))


In [14]:
# lets check the class distribution before taking stratified shuffle split
get_class_distribution(response.value_counts(), response.shape[0])

{'Class_1': 0.031174246097158926,
 'Class_2': 0.26054494327547756,
 'Class_3': 0.1293513041791913,
 'Class_4': 0.043488800543003976,
 'Class_5': 0.04426452050809658,
 'Class_6': 0.22843336888716506,
 'Class_7': 0.045880603768706167,
 'Class_8': 0.1367852871779954,
 'Class_9': 0.080076925563205018}

In [15]:
# lets check the class distribution after taking stratified shuffle split
get_class_distribution(response.ix[train_idx].value_counts(), len(train_idx))

{'Class_1': 0.031399999999999997,
 'Class_2': 0.2606,
 'Class_3': 0.12939999999999999,
 'Class_4': 0.043400000000000001,
 'Class_5': 0.044400000000000002,
 'Class_6': 0.22819999999999999,
 'Class_7': 0.0458,
 'Class_8': 0.13700000000000001,
 'Class_9': 0.079799999999999996}

In [16]:
from sklearn.cross_validation import train_test_split
Xt, Xv, yt, yv = train_test_split(X_train, y_train, test_size=0.2, random_state=1728)

In [17]:
# lets try to model this using a single variable using logistic regression
from sklearn.linear_model import LogisticRegression

In [18]:
classLogit = LogisticRegression(C=.1, class_weight=None, fit_intercept=True, multi_class='ovr', solver='lbfgs')

In [19]:
classLogit.fit(Xt, yt)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0)

In [20]:
predicted = classLogit.predict_proba(Xt)

In [21]:
print ('score on training set is %f' %(classLogit.score(Xt, yt)))

score on training set is 0.772500


In [22]:
from sklearn.metrics import log_loss
print (log_loss(yt, predicted))

0.627888860636


In [75]:
print ('score on testing set is %f' %(classLogit.score(Xv, yv)))

score on testing set is 0.736000


In [23]:
predictTest = classLogit.predict_proba(Xv)
print ('Log loss on the test set is {0}'.format(log_loss(yv, predictTest)))

Log loss on the test set is 0.728727854769


### Baseline Model
<p>This model with all the features produces a log loss of <b><em>0.627888</em></b> on the training set. We will compare each and every model with this log loss score.</p>