In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    auc,
    roc_auc_score
)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")

#### Import, Filter, Adjust Data

In [27]:
df = pd.read_csv('competition_table.csv', index_col = 0)

In [28]:
df.head()

Unnamed: 0,match_id,season,div,odds_home_team_win,odds_draw,odds_away_team_win,home_team_match_nr,home_team_goal_roll1_sum,home_team_goal_roll2_sum,home_team_goal_roll3_sum,...,odds_away_team_draw_roll4_mean,odds_away_team_defeat_roll1_mean,odds_away_team_defeat_roll2_mean,odds_away_team_defeat_roll3_mean,odds_away_team_defeat_roll4_mean,away_team_point_mean,away_team_expected_point_mean,home_win_flag,draw_flag,away_win_flag
0,0,2008-2009,div0,1.73,3.5,5.0,5,3,7,8,...,3.825,11.0,7.665,6.377,5.908,0.5,3.0,0,0,1
1,1,2008-2009,div0,1.25,5.5,12.0,7,0,2,5,...,3.582,2.1,2.25,2.833,2.475,0.333,0.0,0,1,0
2,2,2008-2009,div0,1.57,3.8,5.75,9,1,4,4,...,3.525,6.75,4.275,3.883,3.638,0.75,1.5,1,0,0
3,3,2008-2009,div0,1.17,7.0,15.0,11,2,6,7,...,3.975,4.2,2.75,2.333,2.55,0.6,0.0,1,0,0
4,4,2008-2009,div0,1.17,7.0,15.0,14,2,4,7,...,3.925,3.1,2.175,2.05,2.015,0.833,0.0,1,0,0


In [29]:
df = df[(df.odds_home_team_win != 0) & (df.odds_draw != 0) & (df.odds_away_team_win != 0)]

In [30]:
df['home_team_relative_point_roll1_sum'] = df.home_team_point_roll1_sum - df.home_team_expected_point_roll1_sum
df['home_team_relative_point_roll2_sum'] = df.home_team_point_roll2_sum - df.home_team_expected_point_roll2_sum
df['home_team_relative_point_roll3_sum'] = df.home_team_point_roll3_sum - df.home_team_expected_point_roll3_sum
df['home_team_relative_point_roll4_sum'] = df.home_team_point_roll4_sum - df.home_team_expected_point_roll4_sum

In [31]:
df['away_team_relative_point_roll1_sum'] = df.away_team_point_roll1_sum - df.away_team_expected_point_roll1_sum
df['away_team_relative_point_roll2_sum'] = df.away_team_point_roll2_sum - df.away_team_expected_point_roll2_sum
df['away_team_relative_point_roll3_sum'] = df.away_team_point_roll3_sum - df.away_team_expected_point_roll3_sum
df['away_team_relative_point_roll4_sum'] = df.away_team_point_roll4_sum - df.away_team_expected_point_roll4_sum

In [32]:
df['prob_home_win'] = 1 / df.odds_home_team_win
df['prob_draw'] = 1 / df.odds_draw
df['prob_away_win'] = 1 / df.odds_away_team_win

In [33]:
x_variables = set(df.columns)

In [34]:
remove = set(['home_win_flag', 'draw_flag', 'away_win_flag', 'match_id', 'season'])

In [35]:
x_variables = list(x_variables - remove)

In [24]:
x_variables

['odds_away_team_defeat_roll1_mean',
 'away_team_opponents_corner_roll2_sum',
 'home_team_corner_roll3_sum',
 'away_team_point_roll2_sum',
 'odds_away_team_win_roll2_mean',
 'home_team_point_roll4_sum',
 'odds_away_team_draw_roll1_mean',
 'away_team_relative_point_roll3_sum',
 'away_team_relative_point_roll2_sum',
 'away_team_shot_on_target_roll2_sum',
 'home_team_expected_point_roll1_sum',
 'odds_home_team_draw_roll1_mean',
 'home_team_opponents_yellow_card_roll3_sum',
 'away_team_corner_roll1_sum',
 'odds_away_team_win_roll3_mean',
 'away_team_opponents_yellow_card_roll3_sum',
 'odds_away_team_draw_roll3_mean',
 'away_team_opponents_shot_roll4_sum',
 'away_team_opponents_shot_on_target_roll3_sum',
 'home_team_shot_roll3_sum',
 'home_team_shot_roll1_sum',
 'odds_home_team_win_roll1_mean',
 'odds_home_team_defeat_roll1_mean',
 'home_team_yellow_card_roll3_sum',
 'home_team_yellow_card_roll4_sum',
 'odds_home_team_win_roll4_mean',
 'away_team_expected_point_mean',
 'away_team_expected_p

#### Import model

In [36]:
gbm_pipe_home = load('gbm_pipe_home.joblib')

In [25]:
gbm_home = load('gbm_home.joblib')

In [38]:
categorical_columns =  ['div']
numerical_columns = [col for col in x_variables if col != 'div']

In [39]:
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns),
        ("num", "passthrough", numerical_columns),
    ]
)

gbm_pipe_home = Pipeline(
    [("preprocess", preprocessing), ("classifier", gbm_home)], verbose=True
)

In [41]:
y_pred_home = gbm_pipe_home.fit().predict_proba(df[x_variables])

TypeError: fit() missing 1 required positional argument: 'X'