In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data = pd.read_csv('data/train.csv')
target = pd.read_csv('data/trainLabels.csv')

x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=42)

clf = GradientBoostingClassifier(random_state=7)

In [3]:
n_estimators = range(10, 500, 5)
max_depth = range(1, 100, 2)
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

grid_search = GridSearchCV(clf, param_grid, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, cv=5)

grid_result = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 4900 candidates, totalling 24500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 254 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 627 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 977 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 1427 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2627 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3377 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4227 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 5177 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 6227 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 7377 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 8627 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 9977 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 11427 tasks      

In [4]:
print('Best Accuracy: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: -0.114820 using {'max_depth': 5, 'n_estimators': 115}


In [5]:
grid_result.best_params_

{'max_depth': 5, 'n_estimators': 115}

In [6]:
clf_bestparam = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'], 
                                          n_estimators=grid_result.best_params_['n_estimators'])

clf_bestparam.fit(x_train, y_train)
clf_bestparam.score(x_test, y_test)

  y = column_or_1d(y, warn=True)


0.852

In [7]:
test = pd.read_csv('data-science-london-scikit-learn/test.csv', header=None)

y_pred = clf_bestparam.predict(test)
index_column = np.arange(1, y_pred.shape[0] + 1)
data_submission = pd.DataFrame(y_pred, columns=['Solution'], dtype=int)
data_submission['Id'] = index_column
data_submission.index.name = 'Id'
data_submission.set_index(index_column)

Unnamed: 0,Solution,Id
1,1,1
2,0,2
3,1,3
4,0,4
5,0,5
6,0,6
7,0,7
8,1,8
9,0,9
10,0,10


In [8]:
data_submission.to_csv('Day_048_submission.csv', index=False)