In [1]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn import ensemble

In [6]:
# Read the training CSV as a plain array: every column but the last goes
# to X, the last column goes to y.
train_data = np.array(pd.read_csv('./data/train_data.csv'))
X_train_val, y_train_val = train_data[:, :-1], train_data[:, -1]

# Same column layout for the test CSV.
test_data = np.array(pd.read_csv('./data/test_data.csv'))
X_test, y_test = test_data[:, :-1], test_data[:, -1]

# The last `number_of_games_in_16_17` rows form the validation split
# (presumably the 2016-17 season, going by the constant's name -- confirm
# that the CSV rows are in chronological order).
number_of_games_in_16_17 = 1074

x_train, x_validation = (X_train_val[:-number_of_games_in_16_17],
                         X_train_val[-number_of_games_in_16_17:])
y_train, y_validation = (y_train_val[:-number_of_games_in_16_17],
                         y_train_val[-number_of_games_in_16_17:])

In [8]:
# Scaler for the train/validation stage: fitted on the training rows only,
# then applied to both the training and the validation rows.
scaler_tv = preprocessing.StandardScaler().fit(x_train)
x_train = scaler_tv.transform(x_train)
x_validation = scaler_tv.transform(x_validation)

# Scaler for the final train/test stage: fitted on train + validation rows,
# then applied to those rows and to the test rows.
scaler_tt = preprocessing.StandardScaler().fit(X_train_val)
X_train_val = scaler_tt.transform(X_train_val)
X_test = scaler_tt.transform(X_test)

### AdaBoost

In [9]:
# Exhaustive search over (n_estimators, max_depth): each candidate is fitted
# on the training rows and scored by accuracy on the validation rows.
estimators = [10, 15, 20, 25, 30, 40, 50]
depths = [2, 3, 5]
best_score = 0
best_params = dict.fromkeys(('n_estimators', 'max_depth'), 0)
for n_estimators in estimators:
    for depth in depths:
        # fit() returns the estimator itself, so `clf` is the fitted model.
        clf = ensemble.AdaBoostClassifier(
            tree.DecisionTreeClassifier(max_depth=depth),
            algorithm="SAMME",
            n_estimators=n_estimators,
            random_state=7,
        ).fit(x_train, y_train)
        y_predicted = clf.predict(x_validation)
        score = metrics.accuracy_score(y_validation, y_predicted)
        # Strict comparison: on a tie the earlier candidate is kept.
        if score > best_score:
            best_score = score
            best_params.update(n_estimators=n_estimators, max_depth=depth)

In [10]:
# Display the hyperparameter pair with the highest validation accuracy.
best_params

{'n_estimators': 25, 'max_depth': 2}

In [11]:
# Build the final model from the selected hyperparameters.
bdt = ensemble.AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=best_params['max_depth']),
    algorithm="SAMME",
    n_estimators=best_params['n_estimators'],
    random_state=7,
)

# Fit on train + validation rows; as the cell's last expression this also
# displays the fitted estimator.
bdt.fit(X_train_val, y_train_val)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                               

In [12]:
# Predict on the test rows with the refit model (this rebinds the
# `y_predicted` name used earlier in the grid-search loop).
y_predicted = bdt.predict(X_test)

In [13]:
# Accuracy of the refit model on the test rows.
metrics.accuracy_score(y_test, y_predicted)

0.6623255813953488

In [14]:
# Predict on the same rows the final model was fitted on (train + validation).
y_pred_train = bdt.predict(X_train_val)

In [15]:
# Accuracy on the fitting data (train + validation rows).
metrics.accuracy_score(y_train_val, y_pred_train)

0.6890959925442685