In [2]:
# Hide deprecation warnings
import warnings
warnings.filterwarnings('ignore')

# Common imports
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn import preprocessing
import pickle

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# To format floats
from IPython.display import display
pd.set_option('display.float_format', lambda x: '%.5f' % x)



In [2]:
models = {
    "SGD Classifier": SGDClassifier(),
    "Random Forests": RandomForestClassifier(),
    "k-Nearest Neighbors": KNeighborsClassifier(),
    "Softmax Regression": LogisticRegression(),
    #"SVM": SVC(),
    "Decission Trees": DecisionTreeClassifier(),
    "AdaBoost":AdaBoostClassifier(),
    "Gradient Boost":GradientBoostingClassifier()
}

#### Model times for n_rows = 3889

In [3]:
#Load all the csv files into Pandas dataframes

X_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", \
                     usecols=(0,1,2,3,4,7,8,9,10,11), skiprows=184900000)
y_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", usecols=(6), skiprows=184900000)

In [4]:
results = []
names = []

for k, v in models.items():
    %time cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=10, n_jobs=1, scoring='roc_auc')
    
    results.append(cv_scores)
    names.append(k)

    print(k)
    print('CV AUC ROC: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

CPU times: user 33.9 ms, sys: 4.25 ms, total: 38.1 ms
Wall time: 47.3 ms
SGD Classifier
CV AUC ROC: 0.687 +/- 0.134
----------------
CPU times: user 295 ms, sys: 1.62 ms, total: 297 ms
Wall time: 296 ms
Random Forests
CV AUC ROC: 0.892 +/- 0.112
----------------
CPU times: user 124 ms, sys: 0 ns, total: 124 ms
Wall time: 123 ms
k-Nearest Neighbors
CV AUC ROC: 0.746 +/- 0.075
----------------
CPU times: user 229 ms, sys: 0 ns, total: 229 ms
Wall time: 229 ms
Softmax Regression
CV AUC ROC: 0.862 +/- 0.077
----------------
CPU times: user 117 ms, sys: 0 ns, total: 117 ms
Wall time: 117 ms
Decission Trees
CV AUC ROC: 0.740 +/- 0.098
----------------
CPU times: user 1.5 s, sys: 0 ns, total: 1.5 s
Wall time: 1.5 s
AdaBoost
CV AUC ROC: 0.932 +/- 0.076
----------------
CPU times: user 1.63 s, sys: 0 ns, total: 1.63 s
Wall time: 1.63 s
Gradient Boost
CV AUC ROC: 0.915 +/- 0.052
----------------


#### Model times for n_rows = 103889

In [5]:
#Load all the csv files into Pandas dataframes

X_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", \
                     usecols=(0,1,2,3,4,7,8,9,10,11), skiprows=184800000)
y_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", usecols=(6), skiprows=184800000)

In [6]:
results = []
names = []

for k, v in models.items():
    %time cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=10, n_jobs=1, scoring='roc_auc')
    
    results.append(cv_scores)
    names.append(k)

    print(k)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

CPU times: user 1.87 s, sys: 3.63 s, total: 5.49 s
Wall time: 774 ms
SGD Classifier
CV accuracy: 0.872 +/- 0.044
----------------
CPU times: user 8.6 s, sys: 380 ms, total: 8.98 s
Wall time: 8.51 s
Random Forests
CV accuracy: 0.925 +/- 0.024
----------------
CPU times: user 10.9 s, sys: 0 ns, total: 10.9 s
Wall time: 10.9 s
k-Nearest Neighbors
CV accuracy: 0.625 +/- 0.101
----------------
CPU times: user 8.42 s, sys: 4.34 s, total: 12.8 s
Wall time: 6.93 s
Softmax Regression
CV accuracy: 0.930 +/- 0.022
----------------
CPU times: user 3.72 s, sys: 413 ms, total: 4.13 s
Wall time: 3.59 s
Decission Trees
CV accuracy: 0.769 +/- 0.112
----------------
CPU times: user 22.7 s, sys: 0 ns, total: 22.7 s
Wall time: 22.7 s
AdaBoost
CV accuracy: 0.974 +/- 0.009
----------------
CPU times: user 46.1 s, sys: 0 ns, total: 46.1 s
Wall time: 46.1 s
Gradient Boost
CV accuracy: 0.968 +/- 0.027
----------------


#### Model times for n_rows = 203889

In [7]:
#Load all the csv files into Pandas dataframes

X_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", \
                     usecols=(0,1,2,3,4,7,8,9,10,11), skiprows=184700000)
y_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", usecols=(6), skiprows=184700000)

In [8]:
results = []
names = []

for k, v in models.items():
    %time cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=10, n_jobs=1, scoring='roc_auc')
    
    results.append(cv_scores)
    names.append(k)

    print(k)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

CPU times: user 3.23 s, sys: 4.29 s, total: 7.52 s
Wall time: 1.86 s
SGD Classifier
CV accuracy: 0.876 +/- 0.031
----------------
CPU times: user 21 s, sys: 354 ms, total: 21.4 s
Wall time: 20.8 s
Random Forests
CV accuracy: 0.896 +/- 0.131
----------------
CPU times: user 27.4 s, sys: 0 ns, total: 27.4 s
Wall time: 27.3 s
k-Nearest Neighbors
CV accuracy: 0.661 +/- 0.077
----------------
CPU times: user 15 s, sys: 4.23 s, total: 19.3 s
Wall time: 13.4 s
Softmax Regression
CV accuracy: 0.932 +/- 0.014
----------------
CPU times: user 8.48 s, sys: 421 ms, total: 8.91 s
Wall time: 8.31 s
Decission Trees
CV accuracy: 0.775 +/- 0.121
----------------
CPU times: user 49.8 s, sys: 0 ns, total: 49.8 s
Wall time: 49.8 s
AdaBoost
CV accuracy: 0.979 +/- 0.002
----------------
CPU times: user 1min 48s, sys: 0 ns, total: 1min 48s
Wall time: 1min 48s
Gradient Boost
CV accuracy: 0.947 +/- 0.065
----------------


#### Model times for n_rows = 1003889

In [12]:
#Load all the csv files into Pandas dataframes

X_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", \
                     usecols=(0,1,2,3,4,7,8,9,10,11), skiprows=183900000)
y_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", usecols=(6), skiprows=183900000)

In [13]:
models = {
    "SGD Classifier": SGDClassifier(),
    "Softmax Regression": LogisticRegression(),
    "Random Forests": RandomForestClassifier()    
}

In [14]:
results = []
names = []

for k, v in models.items():
    %time cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=10, n_jobs=1, scoring='roc_auc')
    
    results.append(cv_scores)
    names.append(k)

    print(k)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

CPU times: user 9 s, sys: 4.81 s, total: 13.8 s
Wall time: 7.74 s
SGD Classifier
CV accuracy: 0.890 +/- 0.016
----------------
CPU times: user 1min 3s, sys: 4.97 s, total: 1min 8s
Wall time: 1min 1s
Softmax Regression
CV accuracy: 0.929 +/- 0.003
----------------
CPU times: user 2min 48s, sys: 305 ms, total: 2min 49s
Wall time: 2min 48s
Random Forests
CV accuracy: 0.951 +/- 0.005
----------------


## SGD & Random forest optimization

In [15]:
models = {
    "SGD Classifier": SGDClassifier(),
    "Random Forests": RandomForestClassifier()    
}

SGD

In [16]:
param_grid = [{'penalty': ['none', 'l2','l1','elasticnet'], 
               'alpha': [ 0.00001,  0.0001,  0.001,  0.01],
               'loss': ['log'],
               'n_jobs': [-1]}]

sgd_clf = SGDClassifier()
grid_search_sgd = GridSearchCV(sgd_clf, param_grid, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)
grid_search_sgd.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=none ..................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=none ..................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=none ..................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=l2 ....................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=l2 ....................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=l2 ....................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=l1 ....................
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=l1 ....................
[CV] ... alpha=1e-05, loss=log, n_jobs=-1, penalty=none, total=   1.8s
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=l1 ....................
[CV] ... alpha=1e-05, loss=log, n_jobs=-1, penalty=none, total=   2.2s
[CV] alpha=1e-05, loss=log, n_jobs=-1, penalty=elasticnet ............
[CV] ..... alpha=1e-05, loss=log, n_jobs=-1, penalty=l2, total=   2.4s
[CV] alpha=1e-05

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.1s


[CV] ... alpha=0.001, loss=log, n_jobs=-1, penalty=none, total=   2.8s
[CV] alpha=0.001, loss=log, n_jobs=-1, penalty=elasticnet ............
[CV] ... alpha=0.001, loss=log, n_jobs=-1, penalty=none, total=   2.8s
[CV] alpha=0.001, loss=log, n_jobs=-1, penalty=elasticnet ............
[CV] ..... alpha=0.001, loss=log, n_jobs=-1, penalty=l2, total=   2.9s
[CV] alpha=0.001, loss=log, n_jobs=-1, penalty=elasticnet ............
[CV] ..... alpha=0.001, loss=log, n_jobs=-1, penalty=l2, total=   2.9s
[CV] alpha=0.01, loss=log, n_jobs=-1, penalty=none ...................
[CV] ..... alpha=0.001, loss=log, n_jobs=-1, penalty=l2, total=   2.9s
[CV] alpha=0.01, loss=log, n_jobs=-1, penalty=none ...................
[CV] ..... alpha=0.001, loss=log, n_jobs=-1, penalty=l1, total=   3.2s
[CV] alpha=0.01, loss=log, n_jobs=-1, penalty=none ...................
[CV] ..... alpha=0.001, loss=log, n_jobs=-1, penalty=l1, total=   3.4s
[CV] alpha=0.01, loss=log, n_jobs=-1, penalty=l2 .....................
[CV] .

[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   22.6s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'penalty': ['none', 'l2', 'l1', 'elasticnet'], 'alpha': [1e-05, 0.0001, 0.001, 0.01], 'loss': ['log'], 'n_jobs': [-1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [17]:
cvres = grid_search_sgd.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.922998920127 {'alpha': 1e-05, 'loss': 'log', 'n_jobs': -1, 'penalty': 'none'}
0.89013691457 {'alpha': 1e-05, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l2'}
0.921149939531 {'alpha': 1e-05, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l1'}
0.900864166738 {'alpha': 1e-05, 'loss': 'log', 'n_jobs': -1, 'penalty': 'elasticnet'}
0.922197391353 {'alpha': 0.0001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'none'}
0.883628155985 {'alpha': 0.0001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l2'}
0.921346750125 {'alpha': 0.0001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l1'}
0.879222497879 {'alpha': 0.0001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'elasticnet'}
0.923848077595 {'alpha': 0.001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'none'}
0.883744793507 {'alpha': 0.001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l2'}
0.925640623594 {'alpha': 0.001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l1'}
0.884447235103 {'alpha': 0.001, 'loss': 'log', 'n_jobs': -1, 'penalty': 'elasticnet'}
0.924282745751 {'alpha': 0.01, 

In [18]:
grid_search_sgd.best_params_

{'alpha': 0.01, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l1'}

RF

In [19]:
param_grid = [{'max_depth': [ 30, 60], 
               'n_estimators': [ 80, 300],
               'max_features': [5, 10],
               'min_samples_leaf': [1, 10],
               'n_jobs': [-1]}]

forest_clf = RandomForestClassifier()
grid_search_rf = GridSearchCV(forest_clf, param_grid, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=80, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=80, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=80, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=300, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=300, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=300, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1 
[CV] max_depth=30, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1 
[CV]  max_depth=30, max_features=5, min_samples_leaf=1, n_estimators=80, n_jobs=-1, total= 9.5min
[CV] max_depth=30, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1 
[CV]  max_depth=30, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1, tot

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 100.0min


[CV] max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1 
[CV]  max_depth=30, max_features=10, min_samples_leaf=10, n_estimators=300, n_jobs=-1, total=57.6min
[CV] max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=300, n_jobs=-1 
[CV]  max_depth=60, max_features=5, min_samples_leaf=1, n_estimators=300, n_jobs=-1, total=33.0min
[CV] max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=300, n_jobs=-1 
[CV]  max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1, total= 8.5min
[CV] max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=300, n_jobs=-1 
[CV]  max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1, total= 8.3min
[CV] max_depth=60, max_features=10, min_samples_leaf=1, n_estimators=80, n_jobs=-1 
[CV]  max_depth=60, max_features=5, min_samples_leaf=10, n_estimators=80, n_jobs=-1, total= 8.6min
[CV] max_depth=60, max_features=10, min_samples_leaf=1, n_estimators=80, n_jobs=

[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 174.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'max_depth': [30, 60], 'n_estimators': [80, 300], 'max_features': [5, 10], 'min_samples_leaf': [1, 10], 'n_jobs': [-1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [21]:
cvres = grid_search_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.708002393665 {'max_depth': 30, 'max_features': 5, 'min_samples_leaf': 1, 'n_estimators': 80, 'n_jobs': -1}
0.714765088057 {'max_depth': 30, 'max_features': 5, 'min_samples_leaf': 1, 'n_estimators': 300, 'n_jobs': -1}
0.937627623837 {'max_depth': 30, 'max_features': 5, 'min_samples_leaf': 10, 'n_estimators': 80, 'n_jobs': -1}
0.945788915445 {'max_depth': 30, 'max_features': 5, 'min_samples_leaf': 10, 'n_estimators': 300, 'n_jobs': -1}
0.530207133027 {'max_depth': 30, 'max_features': 10, 'min_samples_leaf': 1, 'n_estimators': 80, 'n_jobs': -1}
0.515573709739 {'max_depth': 30, 'max_features': 10, 'min_samples_leaf': 1, 'n_estimators': 300, 'n_jobs': -1}
0.922575571532 {'max_depth': 30, 'max_features': 10, 'min_samples_leaf': 10, 'n_estimators': 80, 'n_jobs': -1}
0.923008420873 {'max_depth': 30, 'max_features': 10, 'min_samples_leaf': 10, 'n_estimators': 300, 'n_jobs': -1}
0.708607392248 {'max_depth': 60, 'max_features': 5, 'min_samples_leaf': 1, 'n_estimators': 80, 'n_jobs': -1}
0.71781

In [22]:
grid_search_rf.best_params_

{'max_depth': 60,
 'max_features': 5,
 'min_samples_leaf': 10,
 'n_estimators': 80,
 'n_jobs': -1}

### CV 3 vs CV 10

In [29]:
models = {
    "Random Forests": RandomForestClassifier(max_depth=60,max_features= 5,\
                                             min_samples_leaf= 10,n_estimators= 80,n_jobs= -1)    
}

In [30]:
results = []
names = []

for k, v in models.items():
    %time cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=10, n_jobs=1, scoring='roc_auc')
    
    results.append(cv_scores)
    names.append(k)

    print(k)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

CPU times: user 1h 58min 11s, sys: 8.34 s, total: 1h 58min 19s
Wall time: 17min 19s
Random Forests
CV accuracy: 0.981 +/- 0.003
----------------


In [31]:
results = []
names = []

for k, v in models.items():
    %time cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=3, n_jobs=1, scoring='roc_auc')
    
    results.append(cv_scores)
    names.append(k)

    print(k)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores)))
    print('----------------')

CPU times: user 24min 55s, sys: 1.64 s, total: 24min 56s
Wall time: 3min 41s
Random Forests
CV accuracy: 0.946 +/- 0.039
----------------


## Train

In [3]:
%time X_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", \
                     usecols=(0,1,2,3,4,7,8,9,10,11), skiprows=100000000)
%time y_train = np.loadtxt(open("./data/processed/train.csv", "rb"), delimiter=",", usecols=(6), skiprows=100000000)

CPU times: user 21min 20s, sys: 5min 31s, total: 26min 51s
Wall time: 1h 10min 2s
CPU times: user 5min 3s, sys: 5.62 s, total: 5min 9s
Wall time: 5min 28s


In [4]:
rf = RandomForestClassifier(max_depth=60,max_features= 5,min_samples_leaf= 10,n_estimators= 80,n_jobs= -1)
%time rf.fit(X_train, y_train)

KeyboardInterrupt: 

## Test

In [None]:
X_test = np.loadtxt(open("./data/processed/test.csv", "rb"), delimiter=",", \
                     usecols=(1,2,3,4,5,7,8,9,10,11), skiprows=1)

In [None]:
SubmissionFormat = pd.read_csv('../data/raw/sample_submission.csv')

In [None]:
SubmissionFormat.head()

In [None]:
preds = rf.predict(X_test)

In [None]:
submission = pd.DataFrame({'click_id': SubmissionFormat["click_id"],'is_attributed': preds})
submission.to_csv('../data/predictions/RF_submission.csv', index=False)