In [1]:
import numpy as np
import scipy
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

In [2]:
def read_data(filename, has_y):
    with open(filename, "r") as f:
        data = f.readlines()

    # Slice off first line which is the list of words
    data = data[1:]
    # Split the string on each line into ints and convert to numpy array
    data = np.array([[int(x) for x in line.split(" ")] for line in data], dtype="float64")
    
    if has_y:
        # y is the first column while X is everything else
        X = data[:, 1:]
        y = data[:, 0]

        return X, y
    else:
        return data
    
def grid_summary(grid):
    print("Best parameters set found on development set:\n")
    print(grid.best_params_)
    print("Best CV score: ", grid.best_score_)
    print()
    print("Grid scores on development set:\n")

    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

X_train, y_train = read_data("training_data.txt", has_y=True)
X_test = read_data("test_data.txt", has_y=False)

# Shuffle the order of the training data just in case
X_train, y_train = sklearn.utils.shuffle(X_train, y_train)
# Make a smaller dataset with only 10% the size
X_train10 = X_train[:len(X_train)//10]
y_train10 = y_train[:len(y_train)//10]
X_train20 = X_train[:len(X_train)//5]
y_train20 = y_train[:len(y_train)//5]

# X_train = scipy.sparse.csr_matrix(X_train)
# X_test  = scipy.sparse.csr_matrix(X_test)

In [173]:
kf = StratifiedKFold(n_splits=4)
SVCTop = SVC(C=4.0, gamma=0.0002)
norm = preprocessing.StandardScaler()
# pca = PCA(n_components=200)
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=1)

cv = []

for train_index, test_index in kf.split(X_train10, y_train10):
    X_train2, X_cv = X_train10[train_index], X_train10[test_index]
    y_train2, y_cv = y_train10[train_index], y_train10[test_index]
    
    # Preprocessing - Norm, PCA, fit SVC and subtract its predictions
    X_train2 = norm.fit_transform(X_train2)
#     X_train2 = pca.fit_transform(X_train2)
#     SVCTop.fit(X_train2, y_train2)
#     SVCTop_pred = SVCTop.predict(X_train2)
    y_train2_res = y_train2 #- SVCTop_pred
    
    # Fit the residues to a Adaboost
    ada.fit(X_train2, y_train2_res)
    
    # CV set. Same preprocessing
    # Norm, PCA, use the fitted SVC as a baseline prediction
    X_cv = norm.transform(X_cv)
#     X_cv = pca.transform(X_cv)
    y_pred = 0#SVCTop.predict(X_cv)
    
    # Add the predictions from the adaboost to y_pred
    y_pred += ada.predict(X_cv)
    
    cv.append(accuracy_score(y_pred, y_cv))
    
print(np.mean(cv))

0.78925


In [172]:
pipe = make_pipeline(preprocessing.StandardScaler(), 
                     AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), 
                                        learning_rate=1))

grid = GridSearchCV(pipe, param_grid={"adaboostclassifier__n_estimators":[1, 10, 30, 100, 300, 1000]}, n_jobs=4, cv=4, verbose=10)
grid.fit(X_train10, y_train10)
grid_summary(grid)

# Using 20%!!

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    8.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   52.6s
[Parallel(n_jobs=4)]: Done  20 out of  24 | elapsed:   52.9s remaining:   10.5s
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:  2.6min finished


Best parameters set found on development set:

{'adaboostclassifier__n_estimators': 300}
Best CV score:  0.7915

Grid scores on development set:

0.599 (+/-0.009) for {'adaboostclassifier__n_estimators': 1}
0.699 (+/-0.024) for {'adaboostclassifier__n_estimators': 10}
0.753 (+/-0.017) for {'adaboostclassifier__n_estimators': 30}
0.789 (+/-0.030) for {'adaboostclassifier__n_estimators': 100}
0.791 (+/-0.012) for {'adaboostclassifier__n_estimators': 300}
0.780 (+/-0.007) for {'adaboostclassifier__n_estimators': 1000}


In [175]:
pipe = make_pipeline(preprocessing.StandardScaler(), 
                     AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), 
                                        learning_rate=1))

grid = GridSearchCV(pipe, param_grid={"adaboostclassifier__n_estimators":[1, 10, 30, 100, 300, 1000]}, n_jobs=4, cv=4, verbose=10)
grid.fit(X_train10, y_train10)
grid_summary(grid)

# Using 10%!!

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   24.4s
[Parallel(n_jobs=4)]: Done  20 out of  24 | elapsed:   24.9s remaining:    4.9s
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:  1.2min finished


Best parameters set found on development set:

{'adaboostclassifier__n_estimators': 1000}
Best CV score:  0.7755

Grid scores on development set:

0.568 (+/-0.029) for {'adaboostclassifier__n_estimators': 1}
0.685 (+/-0.061) for {'adaboostclassifier__n_estimators': 10}
0.736 (+/-0.024) for {'adaboostclassifier__n_estimators': 30}
0.764 (+/-0.049) for {'adaboostclassifier__n_estimators': 100}
0.763 (+/-0.035) for {'adaboostclassifier__n_estimators': 300}
0.775 (+/-0.032) for {'adaboostclassifier__n_estimators': 1000}


In [4]:
pipe = make_pipeline(preprocessing.StandardScaler(), 
                     AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), learning_rate=1))

grid = GridSearchCV(pipe, param_grid={"adaboostclassifier__n_estimators":[1, 10, 30]},
                    n_jobs=4, cv=4, verbose=10)

grid.fit(X_train, y_train)
grid_summary(grid)

Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   11.5s
[Parallel(n_jobs=4)]: Done   7 out of  12 | elapsed:   12.6s remaining:    9.0s
[Parallel(n_jobs=4)]: Done   9 out of  12 | elapsed:   30.8s remaining:   10.2s
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:   31.7s finished


Best parameters set found on development set:

{'adaboostclassifier__n_estimators': 30}
Best CV score:  0.774

Grid scores on development set:

0.600 (+/-0.007) for {'adaboostclassifier__n_estimators': 1}
0.706 (+/-0.013) for {'adaboostclassifier__n_estimators': 10}
0.774 (+/-0.011) for {'adaboostclassifier__n_estimators': 30}


In [5]:
pipe = make_pipeline(preprocessing.StandardScaler(), 
                     AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), learning_rate=1))

grid = GridSearchCV(pipe, param_grid={"adaboostclassifier__n_estimators":[100, 300, 1000]},
                    n_jobs=4, cv=4, verbose=10)

grid.fit(X_train, y_train)
grid_summary(grid)

Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  3.8min
[Parallel(n_jobs=4)]: Done   7 out of  12 | elapsed:  3.8min remaining:  2.7min
[Parallel(n_jobs=4)]: Done   9 out of  12 | elapsed: 13.6min remaining:  4.5min
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed: 13.6min finished


Best parameters set found on development set:

{'adaboostclassifier__n_estimators': 300}
Best CV score:  0.83055

Grid scores on development set:

0.816 (+/-0.012) for {'adaboostclassifier__n_estimators': 100}
0.831 (+/-0.009) for {'adaboostclassifier__n_estimators': 300}
0.830 (+/-0.013) for {'adaboostclassifier__n_estimators': 1000}


In [9]:
pipe = make_pipeline(preprocessing.StandardScaler(),
                     AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), 
                                        learning_rate=1))

grid = GridSearchCV(pipe, param_grid={"adaboostclassifier__n_estimators":[1, 10, 30, 100, 300, 1000]}, n_jobs=4, cv=4, verbose=10)
grid.fit(X_train, y_train)
grid_summary(grid)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   19.4s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   56.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  7.8min
[Parallel(n_jobs=4)]: Done  20 out of  24 | elapsed:  7.8min remaining:  1.6min
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed: 24.9min finished


Best parameters set found on development set:

{'adaboostclassifier__n_estimators': 300}
Best CV score:  0.8247

Grid scores on development set:

0.604 (+/-0.008) for {'adaboostclassifier__n_estimators': 1}
0.741 (+/-0.024) for {'adaboostclassifier__n_estimators': 10}
0.797 (+/-0.006) for {'adaboostclassifier__n_estimators': 30}
0.822 (+/-0.015) for {'adaboostclassifier__n_estimators': 100}
0.825 (+/-0.011) for {'adaboostclassifier__n_estimators': 300}
0.808 (+/-0.008) for {'adaboostclassifier__n_estimators': 1000}


In [3]:
# Use the best adaboost classifier above to fit full data
pipe = make_pipeline(preprocessing.StandardScaler(), 
                     AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), 
                                        n_estimators=300, learning_rate=1))
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
  ...one,
            splitter='best'),
          learning_rate=1, n_estimators=300, random_state=None))])

In [4]:
# Check the training score
print(pipe.score(X_train, y_train))

0.85355


In [5]:
pred = pipe.predict(X_test)

In [6]:
with open("out6.txt", "w") as f:
    f.write("Id,Prediction\n")
    for i in range(len(pred)):
        f.write("{0},{1}\n".format(i+1, int(pred[i])))