In [1]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

In [2]:
# Load the training table as a plain array. Every column except the last is
# used as a feature; the last column is used as the label.
train_frame = pd.read_csv('./data/train_data.csv')
train_data = np.array(train_frame)
X_train, y_train = train_data[:, :-1], train_data[:, -1]

In [3]:
# Load the test table with the same layout as the training table:
# feature columns first, label in the last column.
test_frame = pd.read_csv('./data/test_data.csv')
test_data = np.array(test_frame)
X_test, y_test = test_data[:, :-1], test_data[:, -1]

In [4]:
# Standardise the features using statistics estimated on the training rows only.
#
# Both matrices are rebuilt from the raw arrays (`train_data`, `test_data`)
# instead of rebinding X_train / X_test to transformed versions of themselves.
# That keeps the cell idempotent: re-running it no longer refits the scaler on
# already-scaled data, so `scaler` always holds the raw-feature statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(train_data[:, :-1])
X_train = scaler.transform(train_data[:, :-1])
X_test = scaler.transform(test_data[:, :-1])

In [5]:
# Baseline classifier: logistic regression with the library's default
# settings, trained on the standardised training features.
clf = linear_model.LogisticRegression()
clf.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
# Baseline model's predicted labels for the test rows.
y_predicted = clf.predict(X=X_test)

In [7]:
# Confusion matrix of the baseline model on the test set
# (rows: true label, columns: predicted label).
metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)

array([[267, 182],
       [198, 428]])

In [8]:
# Fraction of test rows the baseline model labels correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_predicted)

0.6465116279069767

In [9]:
# Accuracy of the baseline model on the rows it was trained on, for comparison
# with the test accuracy.
y_train_predicted = clf.predict(X=X_train)
metrics.accuracy_score(y_true=y_train, y_pred=y_train_predicted)

0.6814538676607642

### Testing different hyperparameter values on a validation set

In [10]:
# Number of trailing training rows held out as the validation set.
# NOTE(review): going by the name, presumably the count of games in the
# 2016-17 season, which would require the rows to be in chronological
# order — confirm against the data.
number_of_games_in_16_17 = 1074

In [11]:
# Start again from the raw (unscaled) training array: features are all columns
# but the last, labels are the last column.
X_train_val, y_train_val = train_data[:, :-1], train_data[:, -1]

In [12]:
# Feature split: the final `number_of_games_in_16_17` rows become the
# validation set, everything before them stays in the training set.
x_train, x_validation = (
    X_train_val[:-number_of_games_in_16_17],
    X_train_val[-number_of_games_in_16_17:],
)

In [13]:
# Label split, using the same row boundary as the feature split.
# Note: this rebinds `y_train`, which until now held the labels of the full
# training set.
y_train, y_validation = (
    y_train_val[:-number_of_games_in_16_17],
    y_train_val[-number_of_games_in_16_17:],
)

In [14]:
# Standardise the train/validation split with statistics estimated on the
# training part only, then apply the same transform to the validation part.
scaler_tv = preprocessing.StandardScaler().fit(X=x_train)
x_train = scaler_tv.transform(X=x_train)
x_validation = scaler_tv.transform(X=x_validation)

In [15]:
# Search over the inverse regularisation strength C of logistic regression,
# scoring each candidate by accuracy on the validation set.
# Candidates are the ten powers of ten from 1e-5 to 1e4.
Cs = np.array([10**i for i in range(-5, 5)])
best_score = 0
# `None` rather than 0 as the "nothing selected yet" marker: 0 is not a valid C,
# so it must never leak into the final model.
best_C = None
for C in Cs:
    model = linear_model.LogisticRegression(C=C)
    model.fit(x_train, y_train)
    # Dedicated name for the validation predictions, so the baseline test
    # predictions stored in `y_predicted` are not overwritten by this loop.
    y_validation_predicted = model.predict(x_validation)
    score = metrics.accuracy_score(y_validation, y_validation_predicted)
    # Strict comparison: on ties the smallest C (strongest regularisation) wins.
    if best_C is None or score > best_score:
        best_score = score
        best_C = C



In [16]:
# Highest validation accuracy found by the search over C.
best_score

0.6433891992551211

In [17]:
# Value of C that achieved the highest validation accuracy.
best_C

0.001

In [18]:
# Refit the scaler on the full training set (training + validation rows) for
# the final model.
#
# Both feature matrices are rebuilt from the raw arrays with this one scaler:
#   * the cell is idempotent, because nothing is transformed in terms of its
#     own previous value;
#   * the test features are guaranteed to be scaled with the same statistics as
#     the data the final model is trained on, rather than relying on a scaler
#     fitted in an earlier cell.
scaler = preprocessing.StandardScaler()
scaler.fit(train_data[:, :-1])
X_train_val = scaler.transform(train_data[:, :-1])
X_test = scaler.transform(test_data[:, :-1])

In [19]:
# Final model: logistic regression with the selected C, trained on the full
# standardised training set (training + validation rows).
clf2 = linear_model.LogisticRegression(C=best_C)
clf2.fit(X=X_train_val, y=y_train_val)



LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Tuned model's predicted labels for the test rows.
y_pred = clf2.predict(X=X_test)

In [24]:
# Confusion matrix of the tuned model on the test set
# (rows: true label, columns: predicted label).
tuned_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(tuned_confusion)

[[265 184]
 [178 448]]


In [22]:
# Fraction of test rows the tuned model labels correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.6632558139534884

In [23]:
# Accuracy of the tuned model on the full training set it was fitted on, for
# comparison with its test accuracy.
y_train_pred = clf2.predict(X=X_train_val)
metrics.accuracy_score(y_true=y_train_val, y_pred=y_train_pred)

0.6771668219944083