# Import libraries
import pandas as pd

# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb 
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# cross validation and grid libraries 
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import RandomizedSearchCV

# plotting libraries in case we need it 
from matplotlib import pyplot as plt 
%matplotlib inline


# Intro 
This notebook is about to create Blender models, which includes: 
- **Layer1** model selection with grid search 
    - ~~Holdout % optimization with grid~~ <- done previously (60-40)
- **Layer2** model selection with grid search 
    - Holdout % optimization with grid
    - for dataset including layer1 predictions only 
    - for dataset including layer1 predictions and original dataset as well
- **Test for layer3**, which is using predictions from layer2 as train set and a portion of the holdout set for test

** Models: ** 
- Logistic regression
- Nearest neighbours
- Random forest 
- Xgboost 
- Naive Bayes with bernoulli 
- Adaboost
- Extra trees classifier
- Support vector Classifier 
- Quadratic discriminance analysis



----------
# Layer1 grid search 

With layer-holdout 60-40% split

In [2]:
# Load and split data
train = pd.read_csv('Data/FE_v1_train.csv')
test = pd.read_csv('Data/FE_v1_test.csv')
train_y = pd.read_csv('Data/train_y.csv')

splitter = StratifiedShuffleSplit(y=train_y, n_iter=1, train_size=0.6, 
                                      test_size=0.4, random_state=42)

for train_index, holdout_index in splitter: 
    train_layer1 = train.iloc[train_index, :]
    train_layer1.reset_index(drop=True, inplace=True)
    train_y_layer1 = train_y.iloc[train_index, :]
    train_y_layer1.reset_index(drop=True, inplace=True)
    train_l1_holdout = train.iloc[holdout_index, :]
    train_l1_holdout.reset_index(drop=True, inplace=True)
    train_y_l1_holdout = train_y.iloc[holdout_index, :]
    train_y_l1_holdout.reset_index(drop=True, inplace=True)

In [9]:
# Determine model parameters: 
params_logreg = {'n_jobs':[4],
                 'C':[0.0001,0.1,0.5,1.5,2,5,10], 
                 'fit_intercept':[False,True], 
                 'max_iter':[10,100,500,1000,2000],
                 'solver':['newton-cg','lbfgs','liblinear','sag'],
                 'tol':[0.00001,0.0001,0.001,0.1,0.5],
                 'verbose':[0],
                 'random_state':[42]}

params_knn = {'n_jobs':[4],
              'n_neighbors':[5,50,100,500,1000,1500,2000,3000,4000,5000], 
              'p':[1,2,3,4,5,10,11,12,15,20],
              'leaf_size':[10,20,30,40,50,60],
              'algorithm':['auto','ball_tree','kd_tree','brute']}

params_rf = {'n_jobs':[4], 
             'criterion':['gini', 'entropy'],
             'n_estimators':[250,500,1000,1500,2000,3000,4000,5000], 
             'max_features':[1,5,10,15,20,25], 
             'max_depth':[1,5,10,15,20,25,50,100], 
             'min_samples_split':[1,5,10,25,50], 
             'min_samples_leaf':[1,5,10,25,50], 
             'oob_score':[True,False], 
             'verbose':[0], 
             'random_state':[42]}

params_xgb = {'silent':[1],
              'nthread':[4], 
              'seed':[42], 
              'max_depth':[1,5,10,15,20,25],
              'subsample':[0.2,0.5,0.7,1],
              'reg_lambda':[1,2,5,10],
              'learning_rate':[0.0001,0.001,0.01,0.1,0.3], 
              'gamma':[0,0.0001,0.001,0.01,0.1],
              'n_estimators':[250,500,1000,1500,2000,3000,4000,5000]
             }

params_naive = {'alpha':[0.1,0.5,0.75,1,1.5,2,5,10,15,25,30,40,50,60,70,80,90,100,200,500], 
                'binarize':[0.1,0.5,0.75,1,1.5,2,5,10,15,25,30,40,50,60,70,80,90,100,200,500],
                'fit_prior':[True, False], 
               }

params_ada = {'n_estimators':[250,500,1000,1500,2000,3000,4000,5000],
              'learning_rate':[0.0001,0.001,0.01,0.1,0.5,0.75,1.5,2,5,10,25,50], 
              'random_state':[42] 
             }

# params_extra = {'n_jobs':[4],
#                 'criterion':['gini', 'entropy'],
#                 'n_estimators':[250,500,1000,1500,2000,3000,4000,5000], 
#                 'max_features':[1,5,10,15,20,25], 
#                 'max_depth':[1,5,10,15,20,25,50,100], 
#                 'min_samples_split':[1,5,10,25,50], 
#                 'min_samples_leaf':[1,5,10,25,50], 
#                 'oob_score':[True,False], 
#                 'bootstrap':[True],
#                 'verbose':[0], 
#                 'random_state':[42]}

params_svc = {'C':[0.0001,0.1,0.5,1.5,2,5,10], 
              'kernel':['linear','poly','rbf','sigmoid'], 
              'degree':[1,3,5,10,15,20,25,50], 
              'gamma':[0.0001,0.001,0.01,0.1,0.5,0.75,1,1.5,1.75,2,5,10],
              'coef0':[0.0001,0.001,0.01,0.1,0.5,0.75,1,1.5,1.75,2,5,10],
              'probability':[True],
              'shrinking':[True, False],
              'tol':[0.0001,0.00001],
              'random_state':[42]
             }

In [4]:
# Determine models
model_logreg = LogisticRegression()
model_knn = KNeighborsClassifier()
model_rf = RandomForestClassifier()
model_xgb = xgb.XGBClassifier()
model_naive = BernoulliNB()
model_ada = AdaBoostClassifier()
model_extra = ExtraTreesClassifier()
model_svc = SVC()

In [5]:
#Logistic regression
grid_logreg = RandomizedSearchCV(n_iter = 500, estimator=model_logreg, param_distributions=params_logreg, 
                                 n_jobs=2, cv=5, refit=True, verbose=1, scoring='log_loss')

grid_logreg.fit(train_layer1, train_y_layer1.target)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   21.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  6.7min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed: 10.5min
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed: 14.8min
[Parallel(n_jobs=2)]: Done 2446 tasks      | elapsed: 20.1min
[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed: 20.6min finished
  for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
  for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=500, n_jobs=2,
          param_distributions={'C': [0.0001, 0.1, 0.5, 1.5, 2, 5, 10], 'n_jobs': [4], 'verbose': [0], 'tol': [1e-05, 0.0001, 0.001, 0.1, 0.5], 'fit_intercept': [False, True], 'random_state': [42], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'max_iter': [10, 100, 500, 1000, 2000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [6]:
# Knn
grid_knn = RandomizedSearchCV(n_iter=10, estimator=model_knn, param_distributions=params_knn, n_jobs=2, cv=5, 
                             refit=True, verbose=1, scoring='log_loss')

grid_knn.fit(train_layer1, train_y_layer1.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 89.0min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 117.5min finished
  for s in gen_even_slices(Y.shape[0], n_jobs))
  for s in gen_even_slices(Y.shape[0], n_jobs))


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_neighbors': [5, 50, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'n_jobs': [4], 'leaf_size': [10, 20, 30, 40, 50, 60], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p': [1, 2, 3, 4, 5, 10, 11, 12, 15, 20]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [7]:
# Random Forest
grid_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, n_jobs=2, cv=5, refit=True, 
                             verbose=1, scoring='log_loss')

grid_rf.fit(train_layer1, train_y_layer1.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 275.5min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 291.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_estimators': [250, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'min_samples_split': [1, 5, 10, 25, 50], 'oob_score': [True, False], 'n_jobs': [4], 'criterion': ['gini', 'entropy'], 'verbose': [0], 'max_features': [1, 5, 10, 15, 20, 25], 'random_state': [42], 'max_depth': [1, 5, 10, 15, 20, 25, 50, 100], 'min_samples_leaf': [1, 5, 10, 25, 50]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [7]:
# Bernoulli Bayes
grid_naive = RandomizedSearchCV(n_iter=40, estimator=model_naive, param_distributions=params_naive, n_jobs=8, 
                                cv=5, refit=True, verbose=1, scoring='log_loss')
grid_naive.fit(train_layer1, train_y_layer1.target)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    3.7s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
          fit_params={}, iid=True, n_iter=40, n_jobs=8,
          param_distributions={'binarize': [0.1, 0.5, 0.75, 1, 1.5, 2, 5, 10, 15, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500], 'alpha': [0.1, 0.5, 0.75, 1, 1.5, 2, 5, 10, 15, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500], 'fit_prior': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [None]:
# Adaboost
grid_ada = RandomizedSearchCV(n_iter=15, estimator=model_ada, param_distributions=params_ada, n_jobs=8, 
                              cv=5, refit=True, verbose=1, scoring='log_loss')
grid_ada.fit(train_layer1, train_y_layer1.target)

In [None]:
# XGB
splitter = StratifiedShuffleSplit(y=train_y_layer1, n_iter=1, train_size=0.7, 
                                      test_size=0.3, random_state=42)

for train_index, test_index in splitter: 
    train_l1_1 = train_layer1.iloc[train_index,:]
    train_l1_1.reset_index(drop=True, inplace=True)
    train_y_l1_1 = train_y_layer1.iloc[train_index,:]
    train_y_l1_1.reset_index(drop=True, inplace=True)
    train_l1_2 = train_layer1.iloc[test_index,:]
    train_l1_2.reset_index(drop=True, inplace=True)
    train_y_l1_2 = train_y_layer1.iloc[test_index,:]
    train_l1_2.reset_index(drop=True, inplace=True)


xgb_fit_params = {'eval_set':[(train_l1_1, train_y_l1_1),(train_l1_2, train_y_l1_2)], 
                  'early_stopping_rounds':25, 'verbose':0}

grid_xgb = RandomizedSearchCV(n_iter=20, estimator=model_xgb, param_distributions=params_xgb, 
                              n_jobs=2, cv=5, refit=True, verbose=1, scoring='log_loss', 
                              fit_params=xgb_fit_params
                             )

grid_xgb.fit(train_layer1, train_y_layer1.target)

In [12]:
# these are results from bigdell rerun
print 'Logreg: ', grid_logreg.best_params_, grid_logreg.best_score_
print 'KNN: ', grid_knn.best_params_, grid_knn.best_score_
# print 'RandomForest: ', grid_rf.best_params_, grid_rf.best_score_
print 'Bernoulli bayes: ', grid_naive.best_params_, grid_naive.best_score_
# print 'Adaboost: ', grid_ada.best_params_, grid_ada.best_score_
# print 'Xgboost: ', grid_xgb.best_params_, grid_xgb.best_score_

 Logreg:  {'fit_intercept': False, 'C': 0.1, 'n_jobs': 4, 'verbose': 0, 'solver': 'sag', 'max_iter': 1000, 'random_state': 42, 'tol': 0.001} -0.691717645023
KNN:  {'p': 2, 'n_jobs': 4, 'leaf_size': 20, 'algorithm': 'ball_tree', 'n_neighbors': 1000} -0.692166108114
Bernoulli bayes:  {'binarize': 10, 'alpha': 1.5, 'fit_prior': True} -0.692666541942


In [13]:
# these are results from littleDell
print 'Logreg: ', grid_logreg.best_params_, grid_logreg.best_score_
print 'KNN: ', grid_knn.best_params_, grid_knn.best_score_
print 'RandomForest: ', grid_rf.best_params_, grid_rf.best_score_
print 'Bernoulli bayes: ', grid_naive.best_params_, grid_naive.best_score_
print 'Adaboost: ', grid_ada.best_params_, grid_ada.best_score_
print 'Xgboost: ', grid_xgb.best_params_, grid_xgb.best_score_

 Logreg:  {'fit_intercept': False, 'C': 0.1, 'n_jobs': 4, 'verbose': 0, 'solver': 'sag', 'max_iter': 100, 'random_state': 42, 'tol': 0.001} -0.691717645023
KNN:  {'p': 1, 'n_jobs': 4, 'leaf_size': 20, 'algorithm': 'ball_tree', 'n_neighbors': 4000} -0.692126133875
RandomForest:  {'oob_score': False, 'n_jobs': 4, 'verbose': 0, 'min_samples_leaf': 25, 'n_estimators': 1500, 'max_features': 15, 'random_state': 42, 'criterion': 'entropy', 'min_samples_split': 50, 'max_depth': 10} -0.692124787781
Bernoulli bayes:  {'binarize': 15, 'alpha': 0.5, 'fit_prior': True} -0.693057262966
Adaboost:  {'n_estimators': 5000, 'learning_rate': 0.0001, 'random_state': 42} -0.692654655439
Xgboost:  {'silent': 1, 'learning_rate': 0.1, 'nthread': 4, 'n_estimators': 5000, 'subsample': 0.7, 'reg_lambda': 5, 'seed': 42, 'max_depth': 1, 'gamma': 0.0001} -0.691804040637


In [None]:
# print out results into GridResults/

In [None]:
# I had some problems with the following algos: 

# Extra trees - parameter error maybe?
grid_extra = RandomizedSearchCV(estimator=model_extra, param_distributions=params_extra, n_jobs=1, cv=5,
                               refit=True, verbose=1, scoring='log_loss')
grid_extra.fit(train_layer1, train_y_layer1.target)

In [None]:
# support vector classifier - takes too much time, on bigdell one night is not enough
grid_svc = RandomizedSearchCV(estimator=model_svc, param_distributions=params_svc, n_jobs=8, cv=5, 
                             refit=True, verbose=1, scoring='log_loss')

grid_svc.fit(train_layer1, train_y_layer1.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


# Blend_v1
## Create predictions with layer1 models on holdout set and test set

In [22]:
# Set model parameters
params_logreg = {'n_jobs':4,
                 'C':0.1, 
                 'fit_intercept':False, 
                 'max_iter':100,
                 'solver':'sag',
                 'tol':0.001,
                 'verbose':0,
                 'random_state':42
                }

params_knn = {'n_jobs':4,
              'n_neighbors':4000, 
              'p':1,
              'leaf_size':20,
              'algorithm':'ball_tree'
             }

params_rf = {'n_jobs':4, 
             'criterion':'entropy',
             'n_estimators':1500, 
             'max_features':15, 
             'max_depth':10,
             'min_samples_split':50, 
             'min_samples_leaf':25, 
             'oob_score':False, 
             'verbose':0,
             'random_state':42
            }

params_xgb = {'silent':1,
              'nthread':4, 
              'seed':42, 
              'max_depth':1,
              'subsample':0.7,
              'reg_lambda':5,
              'learning_rate':0.1, 
              'gamma':0.0001,
              'n_estimators':5000
             }

params_naive = {'alpha':0.5,
                'binarize':15,
                'fit_prior':True 
               }

params_ada = {'n_estimators':5000,
              'learning_rate':0.0001, 
              'random_state':42 
             }

# Set models
model_logreg = LogisticRegression(**params_logreg)
model_knn = KNeighborsClassifier(**params_knn)
model_rf = RandomForestClassifier(**params_rf)
model_xgb = xgb.XGBClassifier(**params_xgb)
model_naive = BernoulliNB(**params_naive)
model_ada = AdaBoostClassifier(**params_ada)
# model_extra = ExtraTreesClassifier()
# model_svc = SVC() -> takes too much time for randomGrid

In [25]:
# create predictions for holdout set
model_logreg.fit(train_layer1, train_y_layer1.target)
print 'logreg done...'
model_knn.fit(train_layer1, train_y_layer1.target)
print 'knn done...'
model_rf.fit(train_layer1, train_y_layer1.target)
print 'rf done...'
model_naive.fit(train_layer1, train_y_layer1.target)
print 'naive done...'
model_ada.fit(train_layer1, train_y_layer1.target)
print 'ada done...'

logreg done...
knn done...
rf done...
naive done...
ada done...


In [26]:
#xgboost
# XGB
splitter = StratifiedShuffleSplit(y=train_y_layer1, n_iter=1, train_size=0.7, 
                                      test_size=0.3, random_state=42)

for train_index, test_index in splitter: 
    train_l1_1 = train_layer1.iloc[train_index,:]
    train_l1_1.reset_index(drop=True, inplace=True)
    train_y_l1_1 = train_y_layer1.iloc[train_index,:]
    train_y_l1_1.reset_index(drop=True, inplace=True)
    train_l1_2 = train_layer1.iloc[test_index,:]
    train_l1_2.reset_index(drop=True, inplace=True)
    train_y_l1_2 = train_y_layer1.iloc[test_index,:]
    train_l1_2.reset_index(drop=True, inplace=True)

model_xgb.fit(train_layer1, train_y_layer1.target,
            eval_set=[(train_l1_1, train_y_l1_1),(train_l1_2, train_y_l1_2)], 
            early_stopping_rounds=25, verbose=0)
print 'xgb done...'

xgb done...


In [27]:
# Predict on holdout set
holdout_predict_logreg = model_logreg.predict_proba(train_l1_holdout)
holdout_predict_knn = model_knn.predict_proba(train_l1_holdout)
holdout_predict_rf = model_rf.predict_proba(train_l1_holdout)
holdout_predict_naive = model_naive.predict_proba(train_l1_holdout)
holdout_predict_ada = model_ada.predict_proba(train_l1_holdout)
holdout_predict_xgb = model_xgb.predict_proba(train_l1_holdout)

In [30]:
# Calculate error on holdout set
print 'logloss', metrics.log_loss(y_true=train_y_l1_holdout, y_pred=holdout_predict_logreg[:,1])
print 'knn', metrics.log_loss(y_true=train_y_l1_holdout, y_pred=holdout_predict_knn[:,1])
print 'rf', metrics.log_loss(y_true=train_y_l1_holdout, y_pred=holdout_predict_rf[:,1])
print 'naive', metrics.log_loss(y_true=train_y_l1_holdout, y_pred=holdout_predict_naive[:,1])
print 'ada', metrics.log_loss(y_true=train_y_l1_holdout, y_pred=holdout_predict_ada[:,1])
print 'xgb', metrics.log_loss(y_true=train_y_l1_holdout, y_pred=holdout_predict_xgb[:,1])

logloss 0.691567060898
knn 0.691726372791
rf 0.691702025755
naive 0.693022286855
ada 0.692435168301
xgb 0.691709935212


Naive bayes and adaboost have a poor performance, the others are ok

In [34]:
# Predict on test set
test_predict_logreg = model_logreg.predict_proba(test.iloc[:,1])
test_predict_knn = model_knn.predict_proba(test.iloc[:,1])
test_predict_rf = model_rf.predict_proba(test.iloc[:,1])
test_predict_naive = model_naive.predict_proba(test.iloc[:,1])
test_predict_ada = model_ada.predict_proba(test.iloc[:,1])
test_predict_xgb = model_xgb.predict_proba(test.iloc[:,1])

In [58]:
# concat holdout with predictions 
temp = pd.concat([pd.DataFrame(holdout_predict_logreg[:,1]),
                 pd.DataFrame(holdout_predict_knn[:,1]), 
                 pd.DataFrame(holdout_predict_rf[:,1]),
                 pd.DataFrame(holdout_predict_naive[:,1]),
                 pd.DataFrame(holdout_predict_ada[:,1]), 
                 pd.DataFrame(holdout_predict_xgb[:,1])], axis=1)
temp.columns = ['pred_logreg', 'pred_knn', 'pred_rf', 'pred_naive', 'pred_ada', 'pred_xgb']

train_l1_holdout_blend_v1 = pd.concat([train_l1_holdout, temp], axis=1)


# concat test with predictions
temp = pd.concat([pd.DataFrame(test_predict_logreg[:,1]),
                 pd.DataFrame(test_predict_knn[:,1]), 
                 pd.DataFrame(test_predict_rf[:,1]),
                 pd.DataFrame(test_predict_naive[:,1]),
                 pd.DataFrame(test_predict_ada[:,1]), 
                 pd.DataFrame(test_predict_xgb[:,1])], axis=1)
temp.columns = ['pred_logreg', 'pred_knn', 'pred_rf', 'pred_naive', 'pred_ada', 'pred_xgb']

test_blend_v1 = pd.concat([test, temp], axis=1)

In [60]:
# save tables in csv files
pd.DataFrame.to_csv(train_layer1, 'Blend_data/Blend_v1_train_layer1.csv', index=False)
pd.DataFrame.to_csv(train_y_layer1, 'Blend_data/Blend_v1_train_y_layer1.csv', index=False)
pd.DataFrame.to_csv(train_l1_holdout_blend_v1, 'Blend_data/Blend_v1_train_l1_holdout.csv', index=False)
pd.DataFrame.to_csv(train_y_l1_holdout, 'Blend_data/Blend_v1_train_y_holdout.csv', index=False)

pd.DataFrame.to_csv(test_blend_v1, 'Blend_data/Blend_v1_test.csv', index=False)

## Grid search for blender model WITH original data and predictions

In [3]:
train_l1_holdout_blend_v1 = pd.read_csv('Blend_data/Blend_v1_train_l1_holdout.csv')
train_y_l1_holdout = pd.read_csv('Blend_data/Blend_v1_train_y_holdout.csv')

test_blend_v1 = pd.read_csv('Blend_data/Blend_v1_test.csv')

In [4]:
# Determine model parameters: 
params_logreg = {'n_jobs':[4],
                 'C':[0.0001,0.1,0.5,1.5,2,5,10], 
                 'fit_intercept':[False,True], 
                 'max_iter':[10,100,500,1000,2000],
                 'solver':['newton-cg','lbfgs','liblinear','sag'],
                 'tol':[0.00001,0.0001,0.001,0.1,0.5],
                 'verbose':[0],
                 'random_state':[42]}

params_knn = {'n_jobs':[4],
              'n_neighbors':[5,50,100,500,1000,1500,2000,3000,4000,5000], 
              'p':[1,2,3,4,5,10,11,12,15,20],
              'leaf_size':[10,20,30,40,50,60],
              'algorithm':['auto','ball_tree','kd_tree','brute']}

params_rf = {'n_jobs':[4], 
             'criterion':['gini', 'entropy'],
             'n_estimators':[250,500,1000,1500,2000,3000,4000,5000], 
             'max_features':[1,5,10,15,20,25], 
             'max_depth':[1,5,10,15,20,25,50,100], 
             'min_samples_split':[1,5,10,25,50], 
             'min_samples_leaf':[1,5,10,25,50], 
             'oob_score':[True,False], 
             'verbose':[0], 
             'random_state':[42]}

params_xgb = {'silent':[1],
              'nthread':[4], 
              'seed':[42], 
              'max_depth':[1,5,10,15,20,25],
              'subsample':[0.2,0.5,0.7,1],
              'reg_lambda':[1,2,5,10],
              'learning_rate':[0.0001,0.001,0.01,0.1,0.3], 
              'gamma':[0,0.0001,0.001,0.01,0.1],
              'n_estimators':[250,500,1000,1500,2000,3000,4000,5000]
             }

params_naive = {'alpha':[0.1,0.5,0.75,1,1.5,2,5,10,15,25,30,40,50,60,70,80,90,100,200,500], 
                'binarize':[0.1,0.5,0.75,1,1.5,2,5,10,15,25,30,40,50,60,70,80,90,100,200,500],
                'fit_prior':[True, False], 
               }

params_ada = {'n_estimators':[250,500,1000,1500,2000,3000,4000,5000],
              'learning_rate':[0.0001,0.001,0.01,0.1,0.5,0.75,1.5,2,5,10,25,50], 
              'random_state':[42] 
             }

In [5]:
# Determine models
model_logreg = LogisticRegression()
model_knn = KNeighborsClassifier()
model_rf = RandomForestClassifier()
model_xgb = xgb.XGBClassifier()
model_naive = BernoulliNB()
model_ada = AdaBoostClassifier()

In [8]:
#Logistic regression
grid_logreg = RandomizedSearchCV(n_iter = 500, estimator=model_logreg, param_distributions=params_logreg, 
                                 n_jobs=2, cv=5, refit=True, verbose=1, scoring='log_loss')

grid_logreg.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   30.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  5.2min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed: 10.8min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed: 17.0min
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed: 23.3min
[Parallel(n_jobs=2)]: Done 2446 tasks      | elapsed: 32.5min
[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed: 33.8min finished
  for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
  for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=500, n_jobs=2,
          param_distributions={'C': [0.0001, 0.1, 0.5, 1.5, 2, 5, 10], 'n_jobs': [4], 'verbose': [0], 'tol': [1e-05, 0.0001, 0.001, 0.1, 0.5], 'fit_intercept': [False, True], 'random_state': [42], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'max_iter': [10, 100, 500, 1000, 2000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [9]:
# Knn
grid_knn = RandomizedSearchCV(n_iter=10, estimator=model_knn, param_distributions=params_knn, n_jobs=2, cv=5, 
                             refit=True, verbose=1, scoring='log_loss')

grid_knn.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 35.1min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 38.2min finished
  for s in gen_even_slices(Y.shape[0], n_jobs))
  for s in gen_even_slices(Y.shape[0], n_jobs))


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_neighbors': [5, 50, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'n_jobs': [4], 'leaf_size': [10, 20, 30, 40, 50, 60], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p': [1, 2, 3, 4, 5, 10, 11, 12, 15, 20]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [10]:
# Random Forest
grid_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, n_jobs=2, cv=5, refit=True, 
                             verbose=1, scoring='log_loss')

grid_rf.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 115.9min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 123.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_estimators': [250, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'min_samples_split': [1, 5, 10, 25, 50], 'oob_score': [True, False], 'n_jobs': [4], 'criterion': ['gini', 'entropy'], 'verbose': [0], 'max_features': [1, 5, 10, 15, 20, 25], 'random_state': [42], 'max_depth': [1, 5, 10, 15, 20, 25, 50, 100], 'min_samples_leaf': [1, 5, 10, 25, 50]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [11]:
# Bernoulli Bayes
grid_naive = RandomizedSearchCV(n_iter=40, estimator=model_naive, param_distributions=params_naive, n_jobs=8, 
                                cv=5, refit=True, verbose=1, scoring='log_loss')
grid_naive.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=8)]: Done  88 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    2.5s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
          fit_params={}, iid=True, n_iter=40, n_jobs=8,
          param_distributions={'binarize': [0.1, 0.5, 0.75, 1, 1.5, 2, 5, 10, 15, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500], 'alpha': [0.1, 0.5, 0.75, 1, 1.5, 2, 5, 10, 15, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500], 'fit_prior': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [12]:
# Adaboost - gives back some weird value error, about nans/infinite values in df X?!
grid_ada = RandomizedSearchCV(n_iter=10, estimator=model_ada, param_distributions=params_ada, n_jobs=8, 
                              cv=5, refit=True, verbose=1, scoring='log_loss')
grid_ada.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 31.9min


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x7f34d7be8eb0, file "/...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/home/ngergo...python2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x7f34d7be8eb0, file "/...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/home/ngergo...python2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    587         
    588         If a global instance already exists, this reinitializes and starts it
    589         """
    590         app = cls.instance(**kwargs)
    591         app.initialize(argv)
--> 592         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    593 
    594 #-----------------------------------------------------------------------------
    595 # utility functions, for convenience
    596 #-----------------------------------------------------------------------------

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    398         
    399         if self.poller is not None:
    400             self.poller.start()
    401         self.kernel.start()
    402         try:
--> 403             ioloop.IOLoop.instance().start()
    404         except KeyboardInterrupt:
    405             pass
    406 
    407 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    146             PollIOLoop.configure(ZMQIOLoop)
    147         return PollIOLoop.instance()
    148     
    149     def start(self):
    150         try:
--> 151             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    152         except ZMQError as e:
    153             if e.errno == ETERM:
    154                 # quietly return on ETERM
    155                 pass

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 5
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 5), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 5)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=5)
    428             # dispatch events:
    429             if events & IOLoop.ERROR:
    430                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    431                 return
    432             if events & IOLoop.READ:
--> 433                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    434                 if not self.socket:
    435                     return
    436             if events & IOLoop.WRITE:
    437                 self._handle_send()

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    460                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    461         else:
    462             if self._recv_callback:
    463                 callback = self._recv_callback
    464                 # self._recv_callback = None
--> 465                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    466                 
    467         # self.update_state()
    468         
    469 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    402         close our socket."""
    403         try:
    404             # Use a NullContext to ensure that all StackContexts are run
    405             # inside our blanket exception handler rather than outside.
    406             with stack_context.NullContext():
--> 407                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    408         except:
    409             gen_log.error("Uncaught exception, closing connection.",
    410                           exc_info=True)
    411             # Close the socket on an uncaught exception from a user callback

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    255         if self.control_stream:
    256             self.control_stream.on_recv(self.dispatch_control, copy=False)
    257 
    258         def make_dispatcher(stream):
    259             def dispatcher(msg):
--> 260                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    261             return dispatcher
    262 
    263         for s in self.shell_streams:
    264             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-09-23T22:06:50.089407', u'msg_id': u'C42EFFEC2ADD4235956D9533428BA7C0', u'msg_type': u'execute_request', u'session': u'1422CF0855EF4AA286CC064870292C16', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'C42EFFEC2ADD4235956D9533428BA7C0', 'msg_type': u'execute_request', 'parent_header': {}})
    207             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    208         else:
    209             self.log.debug("%s: %s", msg_type, msg)
    210             self.pre_handler_hook()
    211             try:
--> 212                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['1422CF0855EF4AA286CC064870292C16']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-09-23T22:06:50.089407', u'msg_id': u'C42EFFEC2ADD4235956D9533428BA7C0', u'msg_type': u'execute_request', u'session': u'1422CF0855EF4AA286CC064870292C16', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'C42EFFEC2ADD4235956D9533428BA7C0', 'msg_type': u'execute_request', 'parent_header': {}}
    213             except Exception:
    214                 self.log.error("Exception in message handler:", exc_info=True)
    215             finally:
    216                 self.post_handler_hook()

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['1422CF0855EF4AA286CC064870292C16'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-09-23T22:06:50.089407', u'msg_id': u'C42EFFEC2ADD4235956D9533428BA7C0', u'msg_type': u'execute_request', u'session': u'1422CF0855EF4AA286CC064870292C16', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'C42EFFEC2ADD4235956D9533428BA7C0', 'msg_type': u'execute_request', 'parent_header': {}})
    365         if not silent:
    366             self.execution_count += 1
    367             self._publish_execute_input(code, parent, self.execution_count)
    368 
    369         reply_content = self.do_execute(code, silent, store_history,
--> 370                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    371 
    372         # Flush output before sending the reply.
    373         sys.stdout.flush()
    374         sys.stderr.flush()

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)"
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)", store_history=True, silent=False, shell_futures=True)
   2897                 self.displayhook.exec_result = result
   2898 
   2899                 # Execute the user code
   2900                 interactivity = "none" if silent else self.ast_node_interactivity
   2901                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2902                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2903 
   2904                 # Reset this so later displayed values do not modify the
   2905                 # ExecutionResult
   2906                 self.displayhook.exec_result = None

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-12-48cae6f0dd09>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3007                     return True
   3008 
   3009             for i, node in enumerate(to_run_interactive):
   3010                 mod = ast.Interactive([node])
   3011                 code = compiler(mod, cell_name, "single")
-> 3012                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f348bfd29b0, file "<ipython-input-12-48cae6f0dd09>", line 4>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   3013                     return True
   3014 
   3015             # Flush softspace
   3016             if softspace(sys.stdout, 0):

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f348bfd29b0, file "<ipython-input-12-48cae6f0dd09>", line 4>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3061         outflag = 1  # happens in more places, so it's easier as default
   3062         try:
   3063             try:
   3064                 self.hooks.pre_run_code_hook()
   3065                 #rprint('Running code', repr(code_obj)) # dbg
-> 3066                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f348bfd29b0, file "<ipython-input-12-48cae6f0dd09>", line 4>
        self.user_global_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'In': ['', u"train_l1_holdout_blend_v1 = pd.read_csv('Blend...v1 = pd.read_csv('Blend_data/Blend_v1_test.csv')", u"# Import libraries\nimport pandas as pd\n\n# m... plt \nget_ipython().magic(u'matplotlib inline')", u"train_l1_holdout_blend_v1 = pd.read_csv('Blend...v1 = pd.read_csv('Blend_data/Blend_v1_test.csv')", u"# Determine model parameters: \nparams_logreg ...            'random_state':[42] \n             }", u'# Determine models\nmodel_logreg = LogisticReg... BernoulliNB()\nmodel_ada = AdaBoostClassifier()', u"#Logistic regression\ngrid_logreg = Randomized..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# XGB\nsplitter = StratifiedShuffleSplit(y=tra..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"#Logistic regression\ngrid_logreg = Randomized..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Knn\ngrid_knn = RandomizedSearchCV(n_iter=10..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Random Forest\ngrid_rf = RandomizedSearchCV(..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Bernoulli Bayes\ngrid_naive = RandomizedSear..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)"], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {7: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 8: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 9: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 10: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 11: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1)}, 'QuadraticDiscriminantAnalysis': <class 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'RandomizedSearchCV': <class 'sklearn.grid_search.RandomizedSearchCV'>, ...}
        self.user_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'In': ['', u"train_l1_holdout_blend_v1 = pd.read_csv('Blend...v1 = pd.read_csv('Blend_data/Blend_v1_test.csv')", u"# Import libraries\nimport pandas as pd\n\n# m... plt \nget_ipython().magic(u'matplotlib inline')", u"train_l1_holdout_blend_v1 = pd.read_csv('Blend...v1 = pd.read_csv('Blend_data/Blend_v1_test.csv')", u"# Determine model parameters: \nparams_logreg ...            'random_state':[42] \n             }", u'# Determine models\nmodel_logreg = LogisticReg... BernoulliNB()\nmodel_ada = AdaBoostClassifier()', u"#Logistic regression\ngrid_logreg = Randomized..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# XGB\nsplitter = StratifiedShuffleSplit(y=tra..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"#Logistic regression\ngrid_logreg = Randomized..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Knn\ngrid_knn = RandomizedSearchCV(n_iter=10..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Random Forest\ngrid_rf = RandomizedSearchCV(..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Bernoulli Bayes\ngrid_naive = RandomizedSear..._l1_holdout_blend_v1, train_y_l1_holdout.target)", u"# Adaboost\ngrid_ada = RandomizedSearchCV(n_it..._l1_holdout_blend_v1, train_y_l1_holdout.target)"], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {7: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 8: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 9: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 10: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), 11: RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1)}, 'QuadraticDiscriminantAnalysis': <class 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'RandomizedSearchCV': <class 'sklearn.grid_search.RandomizedSearchCV'>, ...}
   3067             finally:
   3068                 # Reset our crash handler in place
   3069                 sys.excepthook = old_excepthook
   3070         except SystemExit as e:

...........................................................................
/home/ngergoo/Documents/DataMining/Numerai/ROUND_20160922/<ipython-input-12-48cae6f0dd09> in <module>()
      1 
      2 
      3 # Adaboost
----> 4 grid_ada = RandomizedSearchCV(n_iter=10, estimator=model_ada, param_distributions=params_ada, n_jobs=8, 
      5                               cv=5, refit=True, verbose=1, scoring='log_loss')
      6 grid_ada.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)
      7 
      8 
      9 
     10 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), X=       feature1  feature2  feature3  feature4  f...  0.506965  0.499143  

[38528 rows x 33 columns], y=0        0
1        1
2        1
3        1
4   ...
38526    0
38527    0
Name: target, dtype: int64)
    991 
    992         """
    993         sampled_params = ParameterSampler(self.param_distributions,
    994                                           self.n_iter,
    995                                           random_state=self.random_state)
--> 996         return self._fit(X, y, sampled_params)
        self._fit = <bound method RandomizedSearchCV._fit of Randomi...t=True,
          scoring='log_loss', verbose=1)>
        X =        feature1  feature2  feature3  feature4  f...  0.506965  0.499143  

[38528 rows x 33 columns]
        y = 0        0
1        1
2        1
3        1
4   ...
38526    0
38527    0
Name: target, dtype: int64
        sampled_params = <sklearn.grid_search.ParameterSampler object>
    997 
    998 
    999 
   1000 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=RandomizedSearchCV(cv=5, error_score='raise',
  ...it=True,
          scoring='log_loss', verbose=1), X=       feature1  feature2  feature3  feature4  f...  0.506965  0.499143  

[38528 rows x 33 columns], y=0        0
1        1
2        1
3        1
4   ...
38526    0
38527    0
Name: target, dtype: int64, parameter_iterable=<sklearn.grid_search.ParameterSampler object>)
    548         )(
    549             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    550                                     train, test, self.verbose, parameters,
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterSampler object>
    554                 for train, test in cv)
    555 
    556         # Out is a list of triplet: score, estimator, n_test_samples
    557         n_fits = len(out)

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=8), iterable=<generator object <genexpr>>)
    807             if pre_dispatch == "all" or n_jobs == 1:
    808                 # The iterable was consumed all at once by the above for loop.
    809                 # No need to wait for async callbacks to trigger to
    810                 # consumption.
    811                 self._iterating = False
--> 812             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=8)>
    813             # Make sure that we get a last message telling us we are done
    814             elapsed_time = time.time() - self._start_time
    815             self._print('Done %3i out of %3i | elapsed: %s finished',
    816                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Fri Sep 23 22:39:04 2016
PID: 23643                Python 2.7.11: /home/ngergoo/anaconda2/bin/python
...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_est...=50,
          n_estimators=250, random_state=42), X=       feature1  feature2  feature3  feature4  f...  0.506965  0.499143  

[38528 rows x 33 columns], y=0        0
1        1
2        1
3        1
4   ...
38526    0
38527    0
Name: target, dtype: int64, scorer=make_scorer(log_loss, greater_is_better=False, needs_proba=True), train=array([ 7634,  7640,  7642, ..., 38525, 38526, 38527]), test=array([   0,    1,    2, ..., 7768, 7769, 7771]), verbose=1, parameters={'learning_rate': 50, 'n_estimators': 250, 'random_state': 42}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1545                              " numeric value. (Hint: if using 'raise', please"
   1546                              " make sure that it has been spelled correctly.)"
   1547                              )
   1548 
   1549     else:
-> 1550         test_score = _score(estimator, X_test, y_test, scorer)
   1551         if return_train_score:
   1552             train_score = _score(estimator, X_train, y_train, scorer)
   1553 
   1554     scoring_time = time.time() - start_time

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_est...=50,
          n_estimators=250, random_state=42), X_test=      feature1  feature2  feature3  feature4  fe...1  0.503546  0.480754  

[7706 rows x 33 columns], y_test=0       0
1       1
2       1
3       1
4       ... 1
7769    1
7771    1
Name: target, dtype: int64, scorer=make_scorer(log_loss, greater_is_better=False, needs_proba=True))
   1601 def _score(estimator, X_test, y_test, scorer):
   1602     """Compute the score of an estimator on a given test set."""
   1603     if y_test is None:
   1604         score = scorer(estimator, X_test)
   1605     else:
-> 1606         score = scorer(estimator, X_test, y_test)
   1607     if not isinstance(score, numbers.Number):
   1608         raise ValueError("scoring must return a number, got %s (%s) instead."
   1609                          % (str(score), type(score)))
   1610     return score

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self=make_scorer(log_loss, greater_is_better=False, needs_proba=True), clf=AdaBoostClassifier(algorithm='SAMME.R', base_est...=50,
          n_estimators=250, random_state=42), X=      feature1  feature2  feature3  feature4  fe...1  0.503546  0.480754  

[7706 rows x 33 columns], y=0       0
1       1
2       1
3       1
4       ... 1
7769    1
7771    1
Name: target, dtype: int64, sample_weight=None)
    119         if sample_weight is not None:
    120             return self._sign * self._score_func(y, y_pred,
    121                                                  sample_weight=sample_weight,
    122                                                  **self._kwargs)
    123         else:
--> 124             return self._sign * self._score_func(y, y_pred, **self._kwargs)
    125 
    126     def _factory_args(self):
    127         return ", needs_proba=True"
    128 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in log_loss(y_true=0       0
1       1
2       1
3       1
4       ... 1
7769    1
7771    1
Name: target, dtype: int64, y_pred=array([[ nan,  nan],
       [ nan,  nan],
      ... nan],
       [ nan,  nan],
       [ nan,  nan]]), eps=1e-15, normalize=True, sample_weight=None)
   1557         Y = np.append(1 - Y, Y, axis=1)
   1558 
   1559     # Check if dimensions are consistent.
   1560     check_consistent_length(T, Y)
   1561     T = check_array(T)
-> 1562     Y = check_array(Y)
   1563     if T.shape[1] != Y.shape[1]:
   1564         raise ValueError("y_true and y_pred have different number of classes "
   1565                          "%d, %d" % (T.shape[1], Y.shape[1]))
   1566 

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array=array([[ nan,  nan],
       [ nan,  nan],
      ... nan],
       [ nan,  nan],
       [ nan,  nan]]), accept_sparse=None, dtype=None, order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, warn_on_dtype=False, estimator=None)
    393             array = array.astype(np.float64)
    394         if not allow_nd and array.ndim >= 3:
    395             raise ValueError("Found array with dim %d. %s expected <= 2."
    396                              % (array.ndim, estimator_name))
    397         if force_all_finite:
--> 398             _assert_all_finite(array)
    399 
    400     shape_repr = _shape_repr(array.shape)
    401     if ensure_min_samples > 0:
    402         n_samples = _num_samples(array)

...........................................................................
/home/ngergoo/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X=array([[ nan,  nan],
       [ nan,  nan],
      ... nan],
       [ nan,  nan],
       [ nan,  nan]]))
     49     # everything is finite; fall back to O(n) space np.isfinite to prevent
     50     # false positives from overflow in sum method.
     51     if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
     52             and not np.isfinite(X).all()):
     53         raise ValueError("Input contains NaN, infinity"
---> 54                          " or a value too large for %r." % X.dtype)
     55 
     56 
     57 def assert_all_finite(X):
     58     """Throw a ValueError if X contains NaN or infinity.

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
___________________________________________________________________________

In [7]:
# XGB
splitter = StratifiedShuffleSplit(y=train_y_l1_holdout, n_iter=1, train_size=0.7, 
                                      test_size=0.3, random_state=42)

for train_index, test_index in splitter: 
    train_1 = train_l1_holdout_blend_v1.iloc[train_index,:]
    train_1.reset_index(drop=True, inplace=True)
    train_y_1 = train_y_l1_holdout.iloc[train_index,:]
    train_y_1.reset_index(drop=True, inplace=True)
    train_2 = train_l1_holdout_blend_v1.iloc[test_index,:]
    train_2.reset_index(drop=True, inplace=True)
    train_y_2 = train_y_l1_holdout.iloc[test_index,:]
    train_y_2.reset_index(drop=True, inplace=True)


xgb_fit_params = {'eval_set':[(train_1, train_y_1.target),(train_2, train_y_2.target)], 
                  'early_stopping_rounds':25, 'verbose':0}

grid_xgb = RandomizedSearchCV(n_iter=20, estimator=model_xgb, param_distributions=params_xgb, 
                              n_jobs=2, cv=5, refit=True, verbose=1, scoring='log_loss', 
                              fit_params=xgb_fit_params
                             )

grid_xgb.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 10.1min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed: 19.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={'eval_set': [(       feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0      0.695981  0.596473  0.515205  0.884092  0.320731  0.546269  0.095469
1      0.676988  0.644239  0.444708  0.558028  0.221347  0.092506  0.229707
2      0.387228  0.662142  0.639957  ...    1
11557    1
11558    0
Name: target, dtype: int64)], 'early_stopping_rounds': 25, 'verbose': 0},
          iid=True, n_iter=20, n_jobs=2,
          param_distributions={'n_estimators': [250, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'subsample': [0.2, 0.5, 0.7, 1], 'reg_lambda': [1, 2, 5,

In [13]:
# results
print 'Logreg: ', grid_logreg.best_params_, grid_logreg.best_score_
print 'KNN: ', grid_knn.best_params_, grid_knn.best_score_
print 'RandomForest: ', grid_rf.best_params_, grid_rf.best_score_
print 'Bernoulli bayes: ', grid_naive.best_params_, grid_naive.best_score_
# print 'Adaboost: ', grid_ada.best_params_, grid_ada.best_score_
print 'Xgboost: ', grid_xgb.best_params_, grid_xgb.best_score_

Logreg:  {'fit_intercept': True, 'C': 0.1, 'n_jobs': 4, 'verbose': 0, 'solver': 'lbfgs', 'max_iter': 500, 'random_state': 42, 'tol': 0.001} -0.69152967219
KNN:  {'p': 1, 'n_jobs': 4, 'leaf_size': 40, 'algorithm': 'brute', 'n_neighbors': 3000} -0.69168230852
RandomForest:  {'oob_score': True, 'n_jobs': 4, 'verbose': 0, 'min_samples_leaf': 25, 'n_estimators': 4000, 'max_features': 5, 'random_state': 42, 'criterion': 'gini', 'min_samples_split': 1, 'max_depth': 10} -0.691457346439
Bernoulli bayes:  {'binarize': 10, 'alpha': 80, 'fit_prior': False} -0.692724946458
Xgboost:  {'silent': 1, 'learning_rate': 0.1, 'nthread': 4, 'n_estimators': 2000, 'subsample': 1, 'reg_lambda': 10, 'seed': 42, 'max_depth': 1, 'gamma': 0} -0.691545945181


In [20]:
pd.DataFrame.to_csv(pd.DataFrame(grid_logreg.grid_scores_), 'GridResults/blender_logreg.csv', index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_knn.grid_scores_), 'GridResults/blender_knn.csv', index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_rf.grid_scores_), 'GridResults/blender_rf.csv', index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_naive.grid_scores_), 'GridResults/blender_naive.csv', index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_xgb.grid_scores_), 'GridResults/blender_xgb.csv', index=False)

TODO: 
- Make submission by using each individual layer1 model
    - Try to add more non-correlated predictions for layer1
- Make submission by using single blender model
- Make submission by using average of multiple blender model
    - Try to add more non-correlated blender models
- Make prediction by using layer1 predictions only. 

- Grid search for blender by using predictions only
- Grid search for blender by using predictions and basic stats only


Although these results looks promising, I bet, that there is a trick with feature engineering. 

As the next step, try to do similarity thing, and go through my notes to get some ideas. 


## Grid search for blender model with predictions only

In [2]:
train_l1_holdout_blend_v1 = pd.read_csv('Blend_data/Blend_v1_train_l1_holdout.csv')
train_y_l1_holdout = pd.read_csv('Blend_data/Blend_v1_train_y_holdout.csv')

test_blend_v1 = pd.read_csv('Blend_data/Blend_v1_test.csv')

In [9]:
train_l1_holdout_blend_v1 = train_l1_holdout_blend_v1.iloc[:,-6:]

In [12]:
# Determine model parameters: 
params_logreg = {'n_jobs':[4],
                 'C':[0.0001,0.1,0.5,1.5,2,5,10], 
                 'fit_intercept':[False,True], 
                 'max_iter':[10,100,500,1000,2000],
                 'solver':['newton-cg','lbfgs','liblinear','sag'],
                 'tol':[0.00001,0.0001,0.001,0.1,0.5],
                 'verbose':[0],
                 'random_state':[42]}

params_knn = {'n_jobs':[4],
              'n_neighbors':[5,50,100,500,1000,1500,2000,3000,4000,5000], 
              'p':[1,2,3,4,5,10,11,12,15,20],
              'leaf_size':[10,20,30,40,50,60],
              'algorithm':['auto','ball_tree','kd_tree','brute']}

# care for max_features
params_rf = {'n_jobs':[4], 
             'criterion':['gini', 'entropy'],
             'n_estimators':[250,500,1000,1500,2000,3000,4000,5000], 
             'max_features':[1,2,3,4,5,6], 
             'max_depth':[1,5,10,15,20,25,50,100], 
             'min_samples_split':[1,5,10,25,50], 
             'min_samples_leaf':[1,5,10,25,50], 
             'oob_score':[True,False], 
             'verbose':[0], 
             'random_state':[42]}

params_xgb = {'silent':[1],
              'nthread':[4], 
              'seed':[42], 
              'max_depth':[1,5,10,15,20,25],
              'subsample':[0.2,0.5,0.7,1],
              'reg_lambda':[1,2,5,10],
              'learning_rate':[0.0001,0.001,0.01,0.1,0.3], 
              'gamma':[0,0.0001,0.001,0.01,0.1],
              'n_estimators':[250,500,1000,1500,2000,3000,4000,5000]
             }

params_naive = {'alpha':[0.1,0.5,0.75,1,1.5,2,5,10,15,25,30,40,50,60,70,80,90,100,200,500], 
                'binarize':[0.1,0.5,0.75,1,1.5,2,5,10,15,25,30,40,50,60,70,80,90,100,200,500],
                'fit_prior':[True, False], 
               }

params_ada = {'n_estimators':[250,500,1000,1500,2000,3000,4000,5000],
              'learning_rate':[0.0001,0.001,0.01,0.1,0.5,0.75,1.5,2,5,10,25,50], 
              'random_state':[42] 
             }

# Determine models
model_logreg = LogisticRegression()
model_knn = KNeighborsClassifier()
model_rf = RandomForestClassifier()
model_xgb = xgb.XGBClassifier()
model_naive = BernoulliNB()
model_ada = AdaBoostClassifier()

In [13]:
#Logistic regression
grid_logreg = RandomizedSearchCV(n_iter = 500, estimator=model_logreg, param_distributions=params_logreg, 
                                 n_jobs=2, cv=5, refit=True, verbose=1, scoring='log_loss')

grid_logreg.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   22.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   49.3s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:  2.3min
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done 2446 tasks      | elapsed:  4.8min
[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed:  4.9min finished
  for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
  for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=500, n_jobs=2,
          param_distributions={'C': [0.0001, 0.1, 0.5, 1.5, 2, 5, 10], 'n_jobs': [4], 'verbose': [0], 'tol': [1e-05, 0.0001, 0.001, 0.1, 0.5], 'fit_intercept': [False, True], 'random_state': [42], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'max_iter': [10, 100, 500, 1000, 2000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [14]:
# Knn
grid_knn = RandomizedSearchCV(n_iter=10, estimator=model_knn, param_distributions=params_knn, n_jobs=2, cv=5, 
                             refit=True, verbose=1, scoring='log_loss')

grid_knn.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  5.6min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:  6.5min finished
  for s in gen_even_slices(Y.shape[0], n_jobs))
  for s in gen_even_slices(Y.shape[0], n_jobs))


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_neighbors': [5, 50, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'n_jobs': [4], 'leaf_size': [10, 20, 30, 40, 50, 60], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p': [1, 2, 3, 4, 5, 10, 11, 12, 15, 20]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [15]:
# Random Forest
grid_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=params_rf, n_jobs=2, cv=5, refit=True, 
                             verbose=1, scoring='log_loss')

grid_rf.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 16.8min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 17.0min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_estimators': [250, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'min_samples_split': [1, 5, 10, 25, 50], 'oob_score': [True, False], 'n_jobs': [4], 'criterion': ['gini', 'entropy'], 'verbose': [0], 'max_features': [1, 2, 3, 4, 5, 6], 'random_state': [42], 'max_depth': [1, 5, 10, 15, 20, 25, 50, 100], 'min_samples_leaf': [1, 5, 10, 25, 50]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [16]:
# Bernoulli Bayes
grid_naive = RandomizedSearchCV(n_iter=40, estimator=model_naive, param_distributions=params_naive, n_jobs=8, 
                                cv=5, refit=True, verbose=1, scoring='log_loss')
grid_naive.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=8)]: Done  88 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.7s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
          fit_params={}, iid=True, n_iter=40, n_jobs=8,
          param_distributions={'binarize': [0.1, 0.5, 0.75, 1, 1.5, 2, 5, 10, 15, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500], 'alpha': [0.1, 0.5, 0.75, 1, 1.5, 2, 5, 10, 15, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500], 'fit_prior': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='log_loss', verbose=1)

In [17]:
# XGB
splitter = StratifiedShuffleSplit(y=train_y_l1_holdout, n_iter=1, train_size=0.7, 
                                      test_size=0.3, random_state=42)

for train_index, test_index in splitter: 
    train_1 = train_l1_holdout_blend_v1.iloc[train_index,:]
    train_1.reset_index(drop=True, inplace=True)
    train_y_1 = train_y_l1_holdout.iloc[train_index,:]
    train_y_1.reset_index(drop=True, inplace=True)
    train_2 = train_l1_holdout_blend_v1.iloc[test_index,:]
    train_2.reset_index(drop=True, inplace=True)
    train_y_2 = train_y_l1_holdout.iloc[test_index,:]
    train_y_2.reset_index(drop=True, inplace=True)


xgb_fit_params = {'eval_set':[(train_1, train_y_1.target),(train_2, train_y_2.target)], 
                  'early_stopping_rounds':25, 'verbose':0}

grid_xgb = RandomizedSearchCV(n_iter=20, estimator=model_xgb, param_distributions=params_xgb, 
                              n_jobs=2, cv=5, refit=True, verbose=1, scoring='log_loss', 
                              fit_params=xgb_fit_params
                             )

grid_xgb.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  4.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={'eval_set': [(       pred_logreg  pred_knn   pred_rf  pred_naive  pred_ada  pred_xgb
0         0.520858   0.52000  0.535538    0.504673  0.505963  0.524817
1         0.478932   0.51325  0.504731    0.504673  0.512173  0.490121
2         0.539671   0.52250  0.521396    0.504673  0.512173 ...    1
11557    1
11558    0
Name: target, dtype: int64)], 'early_stopping_rounds': 25, 'verbose': 0},
          iid=True, n_iter=20, n_jobs=2,
          param_distributions={'n_estimators': [250, 500, 1000, 1500, 2000, 3000, 4000, 5000], 'subsample': [0.2, 0.5, 0.7, 1], 'reg_lambda': [1,

In [None]:
# Adaboost - gives back some weird value error, about nans/infinite values in df X?!
grid_ada = RandomizedSearchCV(n_iter=10, estimator=model_ada, param_distributions=params_ada, n_jobs=8, 
                              cv=5, refit=True, verbose=1, scoring='log_loss')
grid_ada.fit(train_l1_holdout_blend_v1, train_y_l1_holdout.target)

In [19]:
# results
print 'Logreg: ', grid_logreg.best_params_, grid_logreg.best_score_
print 'KNN: ', grid_knn.best_params_, grid_knn.best_score_
print 'RandomForest: ', grid_rf.best_params_, grid_rf.best_score_
print 'Bernoulli bayes: ', grid_naive.best_params_, grid_naive.best_score_
# print 'Adaboost: ', grid_ada.best_params_, grid_ada.best_score_
print 'Xgboost: ', grid_xgb.best_params_, grid_xgb.best_score_

Logreg:  {'fit_intercept': True, 'C': 1.5, 'n_jobs': 4, 'verbose': 0, 'solver': 'lbfgs', 'max_iter': 500, 'random_state': 42, 'tol': 0.0001} -0.691378634761
KNN:  {'p': 2, 'n_jobs': 4, 'leaf_size': 50, 'algorithm': 'ball_tree', 'n_neighbors': 4000} -0.69173965538
RandomForest:  {'oob_score': False, 'n_jobs': 4, 'verbose': 0, 'min_samples_leaf': 1, 'n_estimators': 3000, 'max_features': 1, 'random_state': 42, 'criterion': 'gini', 'min_samples_split': 1, 'max_depth': 5} -0.691632767865
Bernoulli bayes:  {'binarize': 0.75, 'alpha': 0.5, 'fit_prior': True} -0.693093824732
Xgboost:  {'silent': 1, 'learning_rate': 0.01, 'nthread': 4, 'n_estimators': 2000, 'subsample': 0.7, 'reg_lambda': 10, 'seed': 42, 'max_depth': 5, 'gamma': 0} -0.691921694662


In [20]:
pd.DataFrame.to_csv(pd.DataFrame(grid_logreg.grid_scores_), 'GridResults/blender_predonly_logreg.csv', 
                    index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_knn.grid_scores_), 'GridResults/blender_predonly_knn.csv', index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_rf.grid_scores_), 'GridResults/blender_predonly_rf.csv', index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_naive.grid_scores_), 'GridResults/blender_predonly_naive.csv', 
                    index=False)
pd.DataFrame.to_csv(pd.DataFrame(grid_xgb.grid_scores_), 'GridResults/blender_predonly_xgb.csv', index=False)