In [10]:
import math

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

random_seed = 42

In [11]:
data_ws = pd.read_csv("../data/featured/winter_spring.csv", sep=';')
data_s = pd.read_csv("../data/featured/spring.csv", sep=';')

Xdata_ws = data_ws.drop('teamA_win', axis=1)
ydata_ws = data_ws.teamA_win

Xdata_s = data_s.drop('teamA_win', axis=1)
ydata_s = data_s.teamA_win

In [12]:
Xtrain_ws, Xval_ws, ytrain_ws, yval_ws = train_test_split(Xdata_ws, ydata_ws, test_size=0.4, random_state=random_seed)
Xtrain_s, Xval_s, ytrain_s, yval_s = train_test_split(Xdata_s, ydata_s, test_size=0.4, random_state=random_seed)

test_data = pd.read_csv("../data/featured/summer.csv", sep=';')
Xtest = test_data.drop('teamA_win', axis=1)
ytest = test_data.teamA_win

In [13]:
clfRF = RandomForestClassifier(n_estimators = 20, max_depth = 8, random_state = 42)
clfRF.fit(Xtrain_ws, ytrain_ws)

print('Winter & Spring Accuracy score (train): {0:.6f}'.format(metrics.accuracy_score(ytrain_ws, clfRF.predict(Xtrain_ws))))
print('Winter & Spring Accuracy score (val): {0:.6f}'.format(metrics.accuracy_score(yval_ws, clfRF.predict(Xval_ws))))

clfRF = RandomForestClassifier(n_estimators = 20, max_depth = 8, random_state = 42)
clfRF.fit(Xtrain_s, ytrain_s)

print('\nSpring Accuracy score (train): {0:.6f}'.format(metrics.accuracy_score(ytrain_s, clfRF.predict(Xtrain_s))))
print('Spring Accuracy score (val): {0:.6f}'.format(metrics.accuracy_score(yval_s, clfRF.predict(Xval_s))))

Winter & Spring Accuracy score (train): 0.831958
Winter & Spring Accuracy score (val): 0.717660

Spring Accuracy score (train): 0.882554
Spring Accuracy score (val): 0.733333


In [14]:
param_grid = {
    'n_estimators': range(3,50),
    'max_depth': range(3, 30)
}

param_comb = ParameterGrid(param_grid)

val_metric = []
for params in param_comb:
    clf = RandomForestClassifier(**params, random_state = 42).fit(Xtrain_ws, ytrain_ws)
    val_metric.append(metrics.accuracy_score(yval_ws, clf.predict(Xval_ws)))
    
best_params = param_comb[np.argmax(val_metric)]
print(f"We found the best params {best_params} with validation acuraccy {max(val_metric):.4f}.")

clfRFb = RandomForestClassifier(**best_params, random_state = 42).fit(Xtrain_ws, ytrain_ws)
print('Winter & Spring accuracy score (test): {0:.6f}'.format(metrics.accuracy_score(ytest, clf.predict(Xtest))))

We found the best params {'n_estimators': 35, 'max_depth': 17} with validation acuraccy 0.7627.
Winter & Spring accuracy score (test): 0.685792


In [16]:
param_grid = {
    'n_estimators': range(3,50),
    'max_depth': range(3, 30)
}

param_comb = ParameterGrid(param_grid)

val_metric = []
for params in param_comb:
    clf = RandomForestClassifier(**params, random_state = 42).fit(Xtrain_s, ytrain_s)
    val_metric.append(metrics.accuracy_score(yval_s, clf.predict(Xval_s)))
    
best_params = param_comb[np.argmax(val_metric)]
print(f"We found the best params {best_params} with validation acuraccy {max(val_metric):.4f}.")

clfRFb = RandomForestClassifier(**best_params, random_state = 42).fit(Xtrain_s, ytrain_s)
print('Spring accuracy score (test): {0:.6f}'.format(metrics.accuracy_score(ytest, clf.predict(Xtest))))

We found the best params {'n_estimators': 29, 'max_depth': 15} with validation acuraccy 0.7607.
Spring accuracy score (test): 0.636612
