In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

datasets_train = pd.read_csv("./input/train.csv")
datasets_test = pd.read_csv("./input/test.csv")

def get_X_y(datasets):
    return datasets_train.drop("label", axis=1), datasets_train["label"].copy()

X_train, y_train = get_X_y(datasets_train)
X_test = datasets_test

def save_submission(model, X_test, try_times=0):
    y_pred = model.predict(X_test)
    data = np.c_[np.array([idx + 1 for idx in range(len(y_pred))]), y_pred]
    y_pred_csv = pd.DataFrame(data=data, columns=["ImageId", "Label"])
    y_pred_csv.to_csv("./output/submission_%(try_times)02d.csv" % {"try_times": try_times}, index=False)
        

In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svm_clf = SVC()
score = cross_val_score(
    svm_clf, 
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
)
score.mean()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=28.9min
[CV] ................................................. , total=28.9min
[CV] ................................................. , total=28.9min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 29.0min finished


0.11152380727340205

In [5]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(
    solver="lbfgs",
    alpha=1e-5,
    hidden_layer_sizes=(5,4,3,2)
)

score = cross_val_score(
    mlp_clf, 
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
)
score.mean()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 1.8min
[CV] ................................................. , total= 1.8min
[CV] ................................................. , total= 1.8min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished


0.27773992604076664

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
score = cross_val_score(
    rf_clf, 
    X_train, 
    y_train, 
    cv=3, 
    scoring="accuracy", 
    n_jobs=-1,
    verbose=2
)
score.mean()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   2.2s
[CV] ................................................. , total=   2.2s
[CV] ................................................. , total=   2.2s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.2s finished


0.934476408858996

In [13]:
from sklearn.ensemble import AdaBoostClassifier

ab_clf = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=500))
score = cross_val_score(
    ab_clf, 
    X_train, 
    y_train, 
    cv=3, 
    scoring="accuracy", 
    n_jobs=-1,
    verbose=2
)
score.mean()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 1.8min
[CV] ................................................. , total= 1.8min
[CV] ................................................. , total= 1.8min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.9min finished


0.9642383930556203

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "base_estimator__n_estimators": [100,200,300],
    "base_estimator__max_depth": range(1, 10),
#     "base_estimator__max_features": ,
}

ab_clf = AdaBoostClassifier(base_estimator=RandomForestClassifier())

grid_search = GridSearchCV(
    ab_clf, 
    param_grid=param_grid,
    cv=3, 
    scoring="accuracy", 
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
final_model = grid_search.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=100 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=100 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=100 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=200 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=200 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=200 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=300 ...
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=300 ...
[CV]  base_estimator__max_depth=1, base_estimator__n_estimators=100, total= 2.6min
[CV] base_estimator__max_depth=1, base_estimator__n_estimators=300 ...
[CV]  base_estimator__max_depth=1, base_estimator__n_estimators=100, total= 2.6min
[CV] base_estimator__max_depth=2, base_estimator__n_estimators=100 ...
[CV]  base_estimator__max_depth=1, base_estimator__n_estimators

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 29.5min


[CV]  base_estimator__max_depth=4, base_estimator__n_estimators=100, total= 6.7min
[CV] base_estimator__max_depth=4, base_estimator__n_estimators=300 ...
[CV]  base_estimator__max_depth=4, base_estimator__n_estimators=100, total= 6.7min
[CV] base_estimator__max_depth=4, base_estimator__n_estimators=300 ...
[CV]  base_estimator__max_depth=3, base_estimator__n_estimators=300, total=15.5min
[CV] base_estimator__max_depth=4, base_estimator__n_estimators=300 ...
[CV]  base_estimator__max_depth=3, base_estimator__n_estimators=300, total=15.4min
[CV] base_estimator__max_depth=5, base_estimator__n_estimators=100 ...
[CV]  base_estimator__max_depth=3, base_estimator__n_estimators=300, total=15.4min
[CV] base_estimator__max_depth=5, base_estimator__n_estimators=100 ...
[CV]  base_estimator__max_depth=4, base_estimator__n_estimators=200, total=13.3min
[CV] base_estimator__max_depth=5, base_estimator__n_estimators=100 ...
[CV]  base_estimator__max_depth=4, base_estimator__n_estimators=200, total=1

[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 202.3min finished


{'base_estimator__max_depth': 9, 'base_estimator__n_estimators': 200}
0.9686428571428571


In [15]:
save_submission(final_model, X_test, 4)