In [1]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn.decomposition import PCA

In [2]:
# Load the train/validation table. In each row the last column is the label
# and every column before it is a feature.
train_data = np.array(pd.read_csv('./data/train_data.csv'))
X_train_val, y_train_val = train_data[:, :-1], train_data[:, -1]

# The held-out test table uses the same layout.
test_data = np.array(pd.read_csv('./data/test_data.csv'))
X_test, y_test = test_data[:, :-1], test_data[:, -1]

# Size of the validation fold: the final rows of the training table
# (presumably the 2016-17 season's games -- confirm against the data source).
number_of_games_in_16_17 = 1074

# Positional split: everything before the last `number_of_games_in_16_17`
# rows is used for training, the remainder for validation.
x_train, x_validation = (
    X_train_val[:-number_of_games_in_16_17],
    X_train_val[-number_of_games_in_16_17:],
)
y_train, y_validation = (
    y_train_val[:-number_of_games_in_16_17],
    y_train_val[-number_of_games_in_16_17:],
)

In [3]:
# Scaler for model selection: statistics are estimated on the training rows
# only and then applied to both the training and the validation rows.
scaler_tv = preprocessing.StandardScaler().fit(x_train)
x_train, x_validation = (
    scaler_tv.transform(x_train),
    scaler_tv.transform(x_validation),
)

# Scaler for the final model: statistics are estimated on train+validation
# and then applied to that matrix and to the test rows.
scaler_tt = preprocessing.StandardScaler().fit(X_train_val)
X_train_val, X_test = (
    scaler_tt.transform(X_train_val),
    scaler_tt.transform(X_test),
)

In [4]:
# Search grid: C takes the powers of ten 1e-4 .. 1e3, and `variance` takes
# 0.5 .. 0.9, passed to PCA as n_components (a float in (0, 1) selects the
# number of components by explained-variance fraction).
Cs = np.array([10**i for i in range(-4, 4)])
variance = np.arange(5, 10) / 10

# Best validation accuracy seen so far and the parameters that produced it.
best_score = 0
best_params = dict(C=0, var=0)

for var in variance:
    # The projection depends only on `var`, so it is fitted once per variance
    # level (on the training rows) and shared by every C value below.
    pca = PCA(n_components=var).fit(x_train)
    x_tr = pca.transform(x_train)
    x_val = pca.transform(x_validation)

    for C in Cs:
        model = linear_model.LogisticRegression(C=C, solver='lbfgs')
        model.fit(x_tr, y_train)

        y_predicted = model.predict(x_val)
        score = metrics.accuracy_score(y_validation, y_predicted)

        # Strict comparison: on ties the earliest combination is kept.
        if score > best_score:
            best_score = score
            best_params.update(C=C, var=var)

In [5]:
# Display the (C, var) combination with the highest validation accuracy.
best_params

{'C': 0.1, 'var': 0.6}

In [6]:
# Refit PCA on the full train+validation matrix with the selected variance
# level, then project both that matrix and the test rows.
# NOTE: this cell overwrites X_train_val and X_test with their projections,
# so re-running it without first re-running the scaling cell would apply PCA
# to already-projected data.
pca = PCA(n_components=best_params['var']).fit(X_train_val)
X_train_val, X_test = pca.transform(X_train_val), pca.transform(X_test)

In [7]:
# Final classifier, using the regularisation strength selected by the search.
clf = linear_model.LogisticRegression(
    C=best_params['C'],
    solver='lbfgs',
)

In [9]:
# Train on all train+validation rows; the fitted estimator is the cell output.
clf.fit(X=X_train_val, y=y_train_val)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Predict the held-out test rows and report accuracy as the cell output.
y_predicted = clf.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_predicted)

0.653953488372093