## Import packages and set working directory

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Get the current working directory
os.getcwd()

'c:\\Users\\marcu\\Documents\\git repository\\game_prediction_with_FPL_data'

## Importing and checking data

In [18]:
df = pd.read_csv("train_score.csv", sep = ";", decimal=",")

In [19]:
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 10)

Checking columns data types

In [20]:
df.dtypes #.to_dict()

Unnamed: 0                                 int64
id                                         int64
team_h                                     int64
team_a                                     int64
result                                    object
                                           ...  
best_assists_m_player_playing_player_a     int64
best_assists_f_player_playing_player_a     int64
team_name_home                            object
team_name_away                            object
season                                    object
Length: 103, dtype: object

Checking number of NA is each column

In [21]:
print(df.isnull().sum())

Unnamed: 0                                0
id                                        0
team_h                                    0
team_a                                    0
result                                    9
                                         ..
best_assists_m_player_playing_player_a    0
best_assists_f_player_playing_player_a    0
team_name_home                            0
team_name_away                            0
season                                    0
Length: 103, dtype: int64


Following features has NA:
- win_rate_h_team_h
- win_rate_h_last_5_team_h
- draw_rate_h_team_h
- draw_rate_h_last_5_team_h
- loss_rate_h_team_h
- loss_rate_h_last_5_team_h
- avg_scored_goals_h_team_h
- avg_scored_goals_h_last_5_team_h
- avg_conceded_goals_h_team_h
- avg_conceded_goals_h_last_5_team_h
- win_rate_a_team_a
- win_rate_a_last_5_team_a
- draw_rate_a_team_a
- draw_rate_a_last_5_team_a
- loss_rate_a_team_a
- loss_rate_a_last_5_team_a
- avg_scored_goals_a_team_a
- avg_scored_goals_a_last_5_team_a
- avg_conceded_goals_a_team_a
- avg_conceded_goals_a_last_5_team_a


In [22]:
pd.set_option('display.max_rows', 10)

In [23]:
# pd.set_option('display.max_columns', 101)

## Fixing targets and creating train- and score-datasets

In [24]:
X = df.loc[df.result.notna()].copy()
score = df.loc[df.result.isna()].copy()

In [25]:
X["result_home_win"] = np.where(X.result == "1", 1, 0)
X["result_draw"] = np.where(X.result == "X", 1, 0)
X["result_away_win"] = np.where(X.result == "2", 1, 0)

## ML

In [26]:
# pd.options.display.precision = 4
# pd.options.mode.chained_assignment = None  

# Machine learning pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

set_config(display = "diagram")

seed = 134680

In [27]:
features = X.columns.drop(['team_h', 'team_a', 'result', 'match_day', 'result_draw', 'result_away_win', 'result_home_win', 'team_name_home', 'team_name_away'])

In [28]:
numerical = X[features].select_dtypes('number').columns
print(f"Numerical features: {', '.join(numerical)}")

Numerical features: Unnamed: 0, id, scored_goals_team_h, conceded_goals_team_h, win_team_h, draw_team_h, loss_team_h, win_rate_team_h, win_rate_last_5_team_h, win_rate_h_team_h, win_rate_h_last_5_team_h, draw_rate_team_h, draw_rate_last_5_team_h, draw_rate_h_team_h, draw_rate_h_last_5_team_h, loss_rate_team_h, loss_rate_last_5_team_h, loss_rate_h_team_h, loss_rate_h_last_5_team_h, avg_scored_goals_team_h, avg_scored_goals_last_5_team_h, avg_scored_goals_h_team_h, avg_scored_goals_h_last_5_team_h, avg_conceded_goals_team_h, avg_conceded_goals_last_5_team_h, avg_conceded_goals_h_team_h, avg_conceded_goals_h_last_5_team_h, scored_goals_team_a, conceded_goals_team_a, win_team_a, draw_team_a, loss_team_a, win_rate_team_a, win_rate_last_5_team_a, win_rate_a_team_a, win_rate_a_last_5_team_a, draw_rate_team_a, draw_rate_last_5_team_a, draw_rate_a_team_a, draw_rate_a_last_5_team_a, loss_rate_team_a, loss_rate_last_5_team_a, loss_rate_a_team_a, loss_rate_a_last_5_team_a, avg_scored_goals_team_a,

In [29]:
categorical = pd.Index(np.setdiff1d(features, numerical))
print(f"Categorical features: {', '.join(categorical)}")

Categorical features: season


In [30]:
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('encoder', OneHotEncoder(drop = 'first', handle_unknown = 'error', sparse = False))
])

preprocessors = ColumnTransformer(transformers = [
    ('num', numerical_pipe, numerical),
    ('cat', categorical_pipe, categorical)
])

pipe_to_cv = Pipeline([
    ('preprocessors', preprocessors)
])

In [None]:
def calculate_roc_auc(model_pipe, X, y):
    """Calculate roc auc score. 
    
    Parameters:
    ===========
    model_pipe: sklearn model or pipeline
    X: features
    y: true target
    """
    y_proba = model_pipe.predict_proba(X)[:,1]
    return roc_auc_score(y, y_proba)

### Predicting home-win

In [31]:
target = 'result_home_win'

#### CV

In [32]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [33]:
X_to_csv = pipe_to_cv.transform(X[features])

In [34]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [35]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [36]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 60,
 'max_features': 20,
 'min_samples_leaf': 20,
 'n_estimators': 50}

#### Final model

In [37]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    #('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
    ('model', RandomForestClassifier(**grid_search.best_params_))
])

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [39]:
pipe_rf.fit(X_train, y_train)

In [40]:
print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 0.8666
Test ROC-AUC: 0.7076


In [41]:
pipe_rf.predict_proba(score[features])[:, 1]

array([0.42556502, 0.33452922, 0.13201417, 0.44875921, 0.62579495,
       0.4089446 , 0.37026545, 0.66412683, 0.30840728])

In [42]:
result = score[["id", "team_h", "team_name_home", "team_a", "team_name_away", "match_day"]].copy()
result["home_win_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [43]:
result

Unnamed: 0,id,team_h,team_name_home,team_a,team_name_away,match_day,home_win_probability
787,71,2,AVL,17,SOU,2022-09-16,0.425565
788,78,16,NFO,9,FUL,2022-09-16,0.334529
789,80,20,WOL,13,MCI,2022-09-17,0.132014
790,77,15,NEW,3,BOU,2022-09-17,0.448759
791,79,18,TOT,10,LEI,2022-09-17,0.625795
792,72,4,BRE,1,ARS,2022-09-18,0.408945
793,75,8,EVE,19,WHU,2022-09-18,0.370265
794,76,14,MUN,11,LEE,2022-09-18,0.664127
795,74,6,CHE,12,LIV,2022-09-18,0.308407


### Predicting draw


In [44]:
target = 'result_draw'

#### CV

In [45]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [46]:
X_to_csv = pipe_to_cv.transform(X[features])

In [47]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [48]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [49]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 60,
 'max_features': 10,
 'min_samples_leaf': 4,
 'n_estimators': 50}

#### Final model

In [50]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    #('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
    ('model', RandomForestClassifier(**grid_search.best_params_))])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [52]:
pipe_rf.fit(X_train, y_train)

In [53]:
print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 1.0000
Test ROC-AUC: 0.4993


In [54]:
pipe_rf.predict_proba(score[features])[:, 1]

array([0.21095815, 0.32002115, 0.48749661, 0.35536991, 0.24518831,
       0.32200505, 0.30042408, 0.31369264, 0.30606061])

In [55]:
result["draw_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [56]:
result

Unnamed: 0,id,team_h,team_name_home,team_a,team_name_away,match_day,home_win_probability,draw_probability
787,71,2,AVL,17,SOU,2022-09-16,0.425565,0.210958
788,78,16,NFO,9,FUL,2022-09-16,0.334529,0.320021
789,80,20,WOL,13,MCI,2022-09-17,0.132014,0.487497
790,77,15,NEW,3,BOU,2022-09-17,0.448759,0.35537
791,79,18,TOT,10,LEI,2022-09-17,0.625795,0.245188
792,72,4,BRE,1,ARS,2022-09-18,0.408945,0.322005
793,75,8,EVE,19,WHU,2022-09-18,0.370265,0.300424
794,76,14,MUN,11,LEE,2022-09-18,0.664127,0.313693
795,74,6,CHE,12,LIV,2022-09-18,0.308407,0.306061


### Predicting away-win

In [57]:
target = 'result_away_win'

#### CV

In [58]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [59]:
X_to_csv = pipe_to_cv.transform(X[features])

In [60]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [61]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [62]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 10,
 'min_samples_leaf': 20,
 'n_estimators': 100}

#### Final model

In [63]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    #('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
    ('model', RandomForestClassifier(**grid_search.best_params_))])

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [65]:
pipe_rf.fit(X_train, y_train)

In [66]:
print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 0.8702
Test ROC-AUC: 0.6269


In [67]:
pipe_rf.predict_proba(score[features])[:, 1]

array([0.31140663, 0.39799478, 0.52974422, 0.26014915, 0.15560718,
       0.30894774, 0.39296814, 0.1631963 , 0.40562718])

In [68]:
result["away_win_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [69]:
result

Unnamed: 0,id,team_h,team_name_home,team_a,team_name_away,match_day,home_win_probability,draw_probability,away_win_probability
787,71,2,AVL,17,SOU,2022-09-16,0.425565,0.210958,0.311407
788,78,16,NFO,9,FUL,2022-09-16,0.334529,0.320021,0.397995
789,80,20,WOL,13,MCI,2022-09-17,0.132014,0.487497,0.529744
790,77,15,NEW,3,BOU,2022-09-17,0.448759,0.35537,0.260149
791,79,18,TOT,10,LEI,2022-09-17,0.625795,0.245188,0.155607
792,72,4,BRE,1,ARS,2022-09-18,0.408945,0.322005,0.308948
793,75,8,EVE,19,WHU,2022-09-18,0.370265,0.300424,0.392968
794,76,14,MUN,11,LEE,2022-09-18,0.664127,0.313693,0.163196
795,74,6,CHE,12,LIV,2022-09-18,0.308407,0.306061,0.405627


## Adjust score

In [70]:
result["home_win_probability_adj"] = result["home_win_probability"] / (result["home_win_probability"] + result["draw_probability"] + result["away_win_probability"])
result["draw_probability_adj"] = result["draw_probability"] / (result["home_win_probability"] + result["draw_probability"] + result["away_win_probability"])
result["away_win_probability_adj"] = result["away_win_probability"] / (result["home_win_probability"] + result["draw_probability"] + result["away_win_probability"])

In [71]:
result[["match_day", "team_name_home", "team_name_away", "home_win_probability_adj", "draw_probability_adj", "away_win_probability_adj"]]

Unnamed: 0,match_day,team_name_home,team_name_away,home_win_probability_adj,draw_probability_adj,away_win_probability_adj
787,2022-09-16,AVL,SOU,0.448941,0.222546,0.328512
788,2022-09-16,NFO,FUL,0.317829,0.304045,0.378126
789,2022-09-17,WOL,MCI,0.114869,0.424185,0.460946
790,2022-09-17,NEW,BOU,0.421656,0.333907,0.244437
791,2022-09-17,TOT,LEI,0.609586,0.238838,0.151577
792,2022-09-18,BRE,ARS,0.393255,0.309651,0.297094
793,2022-09-18,EVE,WHU,0.348106,0.282444,0.36945
794,2022-09-18,MUN,LEE,0.582049,0.274924,0.143027
795,2022-09-18,CHE,LIV,0.302332,0.300031,0.397637


In [72]:
X.result_home_win.mean()

0.4078780177890724

In [73]:
X.result_draw.mean()

0.23761118170266837

In [74]:
X.result_away_win.mean()

0.3545108005082592

Play on games when probability * odds > 1