In [23]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import sklearn.metrics as metrics
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
)
from sklearn.metrics import (
    auc,
    brier_score_loss,
    confusion_matrix,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.eval_measures import rmse
import warnings
warnings.filterwarnings("ignore")

In [6]:
%%time
df = pd.read_excel('http://dataevents.hu/202303/filedataevents/competition_table.xlsx', engine='openpyxl')

Wall time: 2min 44s


In [11]:
df['home_team_relative_point_roll1_sum'] = df.home_team_point_roll1_sum - df.home_team_expected_point_roll1_sum
df['home_team_relative_point_roll2_sum'] = df.home_team_point_roll2_sum - df.home_team_expected_point_roll2_sum
df['home_team_relative_point_roll3_sum'] = df.home_team_point_roll3_sum - df.home_team_expected_point_roll3_sum
df['home_team_relative_point_roll4_sum'] = df.home_team_point_roll4_sum - df.home_team_expected_point_roll4_sum

In [12]:
df['away_team_relative_point_roll1_sum'] = df.away_team_point_roll1_sum - df.away_team_expected_point_roll1_sum
df['away_team_relative_point_roll2_sum'] = df.away_team_point_roll2_sum - df.away_team_expected_point_roll2_sum
df['away_team_relative_point_roll3_sum'] = df.away_team_point_roll3_sum - df.away_team_expected_point_roll3_sum
df['away_team_relative_point_roll4_sum'] = df.away_team_point_roll4_sum - df.away_team_expected_point_roll4_sum

In [13]:
x_variables = set(df.columns)

In [14]:
remove = set(['home_win_flag', 'draw_flag', 'away_win_flag', 'match_id'])

In [15]:
x_variables = list(x_variables - remove)

In [16]:
df_train, df_holdout = train_test_split(df, train_size=0.8, random_state=2023)

In [19]:
y, X = dmatrices("home_win_flag ~ " + " + ".join(x_variables), df_train)

### home win flag

In [21]:
x_train_normalized = pd.DataFrame(
    StandardScaler().fit_transform(X)X.design_info.column_names,
    columns=,
)

In [24]:
lambdas = list(10 ** np.arange(-1, -4.01, -1 / 3))

In [26]:
n_obs = x_train_normalized.shape[0] * 4 / 5
C_values = [
    1 / (l * n_obs) for l in lambdas
] 

In [28]:
lr = LogisticRegressionCV(
    Cs=C_values,
    penalty="l1",
    cv=5,
    refit=True,
    scoring="roc_auc",
    solver="liblinear",
    random_state=42,
)

In [29]:
lr_model = lr.fit(x_train_normalized, y)

In [49]:
pd.DataFrame({
        "lambdas": lambdas,
        "C_values": C_values,
        "mean_cv_auc": lr_model.scores_[1].mean(axis=0),
    })

Unnamed: 0,lambdas,C_values,mean_cv_auc
0,0.1,0.000244,0.678558
1,0.046416,0.000525,0.679082
2,0.021544,0.00113,0.679222
3,0.01,0.002435,0.680236
4,0.004642,0.005246,0.68018
5,0.002154,0.011303,0.68012
6,0.001,0.024351,0.679439
7,0.000464,0.052463,0.678367
8,0.000215,0.113029,0.677377
9,0.0001,0.243513,0.676714


In [43]:
df_lasso_coeffs = pd.DataFrame(lr_model.coef_[0], columns = ['coefficient'], index = X.design_info.column_names)
df_lasso_coeffs[df_lasso_coeffs.coefficient > 0]

Unnamed: 0,coefficient
away_team_opponents_shot_roll1_sum,0.008952
home_team_shot_roll2_sum,0.009868
home_team_corner_roll4_sum,0.022608
away_team_opponents_shot_roll3_sum,0.009682
away_team_opponents_corner_roll4_sum,0.001044
odds_away_team_win,0.435164
home_team_expected_point_roll4_sum,0.01958
home_team_corner_roll1_sum,0.008308
away_team_opponents_shot_roll4_sum,0.028923
home_team_expected_point_roll2_sum,0.003651


In [50]:
y_holdout, X_holdout = dmatrices("home_win_flag ~ " + " + ".join(x_variables), df_holdout)

In [51]:
x_holdout_normalized = pd.DataFrame(
    StandardScaler().fit_transform(X_holdout),
    columns=X_holdout.design_info.column_names,
)

In [52]:
lr_model.score(x_holdout_normalized, y_holdout)

0.6867140317877192

In [58]:
df.iloc[0:10, 3:6]

Unnamed: 0,odds_home_team_win,odds_draw,odds_away_team_win
0,1.73,3.5,5.0
1,1.25,5.5,12.0
2,1.57,3.8,5.75
3,1.17,7.0,15.0
4,1.17,7.0,15.0
5,1.62,3.8,5.5
6,1.44,4.0,8.0
7,1.25,5.5,12.0
8,1.33,5.0,9.0
9,1.14,7.5,17.0


In [60]:
df[['odds_home_team_win', 'odds_draw', 'odds_away_team_win', 'home_win_flag', 'draw_flag', 'away_win_flag']].iloc[0:40]

Unnamed: 0,odds_home_team_win,odds_draw,odds_away_team_win,home_win_flag,draw_flag,away_win_flag
0,1.73,3.5,5.0,0,0,1
1,1.25,5.5,12.0,0,1,0
2,1.57,3.8,5.75,1,0,0
3,1.17,7.0,15.0,1,0,0
4,1.17,7.0,15.0,1,0,0
5,1.62,3.8,5.5,1,0,0
6,1.44,4.0,8.0,1,0,0
7,1.25,5.5,12.0,0,0,1
8,1.33,5.0,9.0,1,0,0
9,1.14,7.5,17.0,1,0,0
