In [2]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import (
    auc,
    confusion_matrix,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")
from IPython.display import clear_output
import time

#### Import, Filter, Adjust Data

In [3]:
df = pd.read_csv('competition_table.csv', index_col = 0)

In [4]:
df = df[(df.odds_home_team_win != 0) & (df.odds_draw != 0) & (df.odds_away_team_win != 0)]

In [5]:
df['home_team_relative_point_roll1_sum'] = df.home_team_point_roll1_sum - df.home_team_expected_point_roll1_sum
df['home_team_relative_point_roll2_sum'] = df.home_team_point_roll2_sum - df.home_team_expected_point_roll2_sum
df['home_team_relative_point_roll3_sum'] = df.home_team_point_roll3_sum - df.home_team_expected_point_roll3_sum
df['home_team_relative_point_roll4_sum'] = df.home_team_point_roll4_sum - df.home_team_expected_point_roll4_sum

In [6]:
df['away_team_relative_point_roll1_sum'] = df.away_team_point_roll1_sum - df.away_team_expected_point_roll1_sum
df['away_team_relative_point_roll2_sum'] = df.away_team_point_roll2_sum - df.away_team_expected_point_roll2_sum
df['away_team_relative_point_roll3_sum'] = df.away_team_point_roll3_sum - df.away_team_expected_point_roll3_sum
df['away_team_relative_point_roll4_sum'] = df.away_team_point_roll4_sum - df.away_team_expected_point_roll4_sum

In [7]:
df['prob_home_win'] = 1 / df.odds_home_team_win
df['prob_draw'] = 1 / df.odds_draw
df['prob_away_win'] = 1 / df.odds_away_team_win

In [8]:
x_variables = set(df.columns)

In [9]:
remove = set(['home_win_flag', 'draw_flag', 'away_win_flag', 'match_id', 'season', 'div'])

In [10]:
x_variables = list(x_variables - remove)

#### Build Model

In [11]:
df_train, df_holdout = train_test_split(df, train_size=0.8, random_state=2023)

In [12]:
tune_grid = {
    "max_iter": [50, 100, 200],
    "max_depth": [1, 5, 10],
    "learning_rate": [0.1, 0.15, 0.2],
    "min_samples_leaf": [5, 10, 20, 30],
}

**Home_win_flag**

In [13]:
gbm_home = HistGradientBoostingClassifier()

In [14]:
gbm_home_cv = GridSearchCV(
    gbm_home,
    tune_grid,
    cv=5,
    scoring="roc_auc",
    verbose=10,
)

In [15]:
start = time.time()
gbm_home_cv.fit(df_train[x_variables], df_train.home_win_flag)
stop = time.time()
clear_output()
process_time = str(round(stop - start, 0))
print(f'Process time: {process_time} seconds.')

Process time: 1041.0 seconds.


In [17]:
dump(gbm_home_cv, 'gbm_home_cv.joblib')

['gbm_home_cv.joblib']

In [21]:
model = load('gbm_pipe_home.joblib')

In [25]:
model.predict_proba(df_holdout[x_variables])

array([[0.20222524, 0.79777476],
       [0.59231895, 0.40768105],
       [0.76864796, 0.23135204],
       ...,
       [0.57344831, 0.42655169],
       [0.8727495 , 0.1272505 ],
       [0.47306186, 0.52693814]])