In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold 
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import joblib

In [2]:
df = pd.read_json('clean_data_1.json')

In [3]:
df.head()

Unnamed: 0,Season,Capacity,Elo_home,Elo_away,Outcome,Home_Streak,Away_Streak,Home_Win,Home_Attack,Home_Defence,...,Home_Goals_avg_3,Home_Goals_avg_10,Elo_away_avg_3,Elo_away_avg_10,Away_Goals_avg_3,Away_Goals_avg_10,Home_Team_Outcome_sum_3,Home_Team_Outcome_sum_10,Away_Team_Outcome_sum_3,Away_Team_Outcome_sum_10
0,1990,59168,72,81,-1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,1990,35624,74,79,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
2,1990,32950,74,73,1,0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
3,1990,35472,77,71,1,0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
4,1990,26661,76,83,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0


In [4]:
X = df.loc[:, ~df.columns.isin(['Outcome', 'Home_Win'])]
y = df['Home_Win']

X = X.drop([
    'Season',
    'Elo_home_avg_3',
    'Elo_home_avg_10',
    'Elo_away_avg_3',
    'Elo_away_avg_10',
    'Home_Goals_avg_3',
    'Away_Goals_avg_3',
    'Home_Defence',
    'Away_Defence',
    'Home_Streak',
    'Away_Streak',
    'Home_Team_Outcome_sum_3',
    'Away_Team_Outcome_sum_3',
    'Home_Goals_avg_10',
    'Away_Goals_avg_10'   
    ], axis=1)

In [5]:

def split_datasets(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = split_datasets(X, y)

In [7]:
def compare_models(models, X_train, y_train, y_test):
    for model in models:
        model[1].fit(X_train, y_train)
        y_pred = model[1].predict(X_test)
        accu = accuracy_score(y_test, y_pred) * 100
        print(
            f"{model[0]}: "
            f"Accuracy: {accu:.2f}"
            )
    return

In [8]:
np.random.seed(2)

models = [
    ('lgr', make_pipeline(StandardScaler(), LogisticRegression())),
    ('rfc', RandomForestClassifier(max_depth=2)),
    ('knn', make_pipeline(StandardScaler(), KNeighborsClassifier())),
    ('dtc', DecisionTreeClassifier()),
    ('abc', AdaBoostClassifier()),
    ('gbc', GradientBoostingClassifier())
    ]

X_train, X_test, y_train, y_test = split_datasets(X, y)
compare_models(models, X_train, y_train, y_test)

lgr: Accuracy: 60.50
rfc: Accuracy: 58.46
knn: Accuracy: 56.42
dtc: Accuracy: 54.05
abc: Accuracy: 60.25
gbc: Accuracy: 60.55


Hypertuning 



In [9]:
model_1 = LogisticRegression()
params = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]}

cv = KFold(n_splits=10, random_state=1, shuffle=True)
grid_search = GridSearchCV(model_1, params, scoring='accuracy', n_jobs=-1, cv=cv)
grid_result = grid_search.fit(X_train, y_train)
print(f'Best Score: {grid_result.best_score_ * 100:.2f}%')
print(f'Best Hyperparameters: {grid_result.best_params_}')

480 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/home/zain/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/zain/miniconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/zain/miniconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' i

Best Score: 60.86%
Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


In [12]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num = 10)]
min_split = [int(x) for x in np.linspace(start = 100, stop = 500, num = 50)]
max_depth = [int(x) for x in np.linspace(2, 10, num = 2)]
min_leaf = [int(x) for x in np.linspace(start = 1, stop = 30, num = 10)]
max_features = [int(x) for x in np.linspace(start = 7, stop = 10, num = 7)]

grid = {'n_estimators': n_estimators,
        'min_samples_split': min_split,
        'max_depth': max_depth,
        'min_samples_leaf': min_leaf,
        'max_features': max_features,
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9,1.0],
        }

model_2 = GradientBoostingClassifier(n_estimators= 130, learning_rate=0.1, max_depth=2, min_samples_split=100, max_features=7, subsample = 0.9)
grid_search = GridSearchCV(model_2, grid, cv = 3, verbose=2, n_jobs = -1)
grid_result = grid_search.fit(X_train, y_train)
print(f'Best Score: {grid_result.best_score_ * 100:.2f}%')
print(f'Best Hyperparameters: {grid_result.best_params_}')

Fitting 3 folds for each of 490000 candidates, totalling 1470000 fits
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.6; total time=   0.8s
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.6; total time=   0.7s
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.7; total time=   0.8s
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.7; total time=   0.8s
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.6; total time=   0.8s
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.75; total time=   0.9s
[CV] END max_depth=2, max_features=7, min_samples_leaf=1, min_samples_split=100, n_estimators=10, subsample=0.75; total time=   0.9s
[CV]

KeyboardInterrupt: 