## Import packages and set working directory

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Get the current working directory
os.getcwd()

'C:\\Users\\mbe158'

In [3]:
# Change the current working directory
os.chdir('C:\\Users\\mbe158\\Marcus tutorial\\FPL')

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))

Current working directory: C:\Users\mbe158\Marcus tutorial\FPL


## Importing and checking data

In [4]:
df = pd.read_csv("train_score.csv", sep = ";", decimal=",")

In [5]:
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 10)

Checking columns data types

In [6]:
df.dtypes #.to_dict()

Unnamed: 0                                 int64
id                                         int64
team_h                                     int64
team_a                                     int64
result                                    object
                                           ...  
best_assists_m_player_playing_player_a     int64
best_assists_f_player_playing_player_a     int64
team_name_home                            object
team_name_away                            object
season                                    object
Length: 103, dtype: object

Checking number of NA is each column

In [7]:
print(df.isnull().sum())

Unnamed: 0                                0
id                                        0
team_h                                    0
team_a                                    0
result                                    9
                                         ..
best_assists_m_player_playing_player_a    0
best_assists_f_player_playing_player_a    0
team_name_home                            0
team_name_away                            0
season                                    0
Length: 103, dtype: int64


Following features has NA:
- win_rate_h_team_h
- win_rate_h_last_5_team_h
- draw_rate_h_team_h
- draw_rate_h_last_5_team_h
- loss_rate_h_team_h
- loss_rate_h_last_5_team_h
- avg_scored_goals_h_team_h
- avg_scored_goals_h_last_5_team_h
- avg_conceded_goals_h_team_h
- avg_conceded_goals_h_last_5_team_h
- win_rate_a_team_a
- win_rate_a_last_5_team_a
- draw_rate_a_team_a
- draw_rate_a_last_5_team_a
- loss_rate_a_team_a
- loss_rate_a_last_5_team_a
- avg_scored_goals_a_team_a
- avg_scored_goals_a_last_5_team_a
- avg_conceded_goals_a_team_a
- avg_conceded_goals_a_last_5_team_a


In [8]:
pd.set_option('display.max_rows', 10)

In [9]:
# pd.set_option('display.max_columns', 101)

## Fixing targets and creating train- and score-datasets

In [10]:
X = df.loc[df.result.notna()].copy()
score = df.loc[df.result.isna()].copy()

In [11]:
X["result_home_win"] = np.where(X.result == "1", 1, 0)
X["result_draw"] = np.where(X.result == "X", 1, 0)
X["result_away_win"] = np.where(X.result == "2", 1, 0)

## ML

In [12]:
# Data manipulation
import numpy as np
import pandas as pd
# pd.options.display.precision = 4
# pd.options.mode.chained_assignment = None  

# Machine learning pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score

set_config(display = "diagram")

seed = 134680

In [13]:
features = X.columns.drop(['team_h', 'team_a', 'result', 'match_day', 'result_draw', 'result_away_win', 'result_home_win', 'team_name_home', 'team_name_away'])

In [14]:
numerical = X[features].select_dtypes('number').columns
print(f"Numerical features: {', '.join(numerical)}")

Numerical features: Unnamed: 0, id, scored_goals_team_h, conceded_goals_team_h, win_team_h, draw_team_h, loss_team_h, win_rate_team_h, win_rate_last_5_team_h, win_rate_h_team_h, win_rate_h_last_5_team_h, draw_rate_team_h, draw_rate_last_5_team_h, draw_rate_h_team_h, draw_rate_h_last_5_team_h, loss_rate_team_h, loss_rate_last_5_team_h, loss_rate_h_team_h, loss_rate_h_last_5_team_h, avg_scored_goals_team_h, avg_scored_goals_last_5_team_h, avg_scored_goals_h_team_h, avg_scored_goals_h_last_5_team_h, avg_conceded_goals_team_h, avg_conceded_goals_last_5_team_h, avg_conceded_goals_h_team_h, avg_conceded_goals_h_last_5_team_h, scored_goals_team_a, conceded_goals_team_a, win_team_a, draw_team_a, loss_team_a, win_rate_team_a, win_rate_last_5_team_a, win_rate_a_team_a, win_rate_a_last_5_team_a, draw_rate_team_a, draw_rate_last_5_team_a, draw_rate_a_team_a, draw_rate_a_last_5_team_a, loss_rate_team_a, loss_rate_last_5_team_a, loss_rate_a_team_a, loss_rate_a_last_5_team_a, avg_scored_goals_team_a,

In [15]:
categorical = pd.Index(np.setdiff1d(features, numerical))
print(f"Categorical features: {', '.join(categorical)}")

Categorical features: season


In [16]:
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('encoder', OneHotEncoder(drop = 'first', handle_unknown = 'error', sparse = False))
])

preprocessors = ColumnTransformer(transformers = [
    ('num', numerical_pipe, numerical),
    ('cat', categorical_pipe, categorical)
])

pipe_to_cv = Pipeline([
    ('preprocessors', preprocessors)
])

### Predicting home-win

In [17]:
target = 'result_home_win'

#### CV

In [18]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [19]:
X_to_csv = pipe_to_cv.transform(X[features])

In [20]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [21]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [22]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 40,
 'max_features': 10,
 'min_samples_leaf': 10,
 'n_estimators': 100}

#### Final model

In [23]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    #('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
    ('model', RandomForestClassifier(**grid_search.best_params_))
])

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [25]:
pipe_rf.fit(X_train, y_train)

In [26]:
def calculate_roc_auc(model_pipe, X, y):
    """Calculate roc auc score. 
    
    Parameters:
    ===========
    model_pipe: sklearn model or pipeline
    X: features
    y: true target
    """
    y_proba = model_pipe.predict_proba(X)[:,1]
    return roc_auc_score(y, y_proba)

print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 0.9413
Test ROC-AUC: 0.7365


In [27]:
pipe_rf.predict_proba(score[features])[:, 1]

array([0.40424471, 0.17730758, 0.42963332, 0.48810978, 0.51818202,
       0.30887908, 0.67932637, 0.37462727, 0.11063665])

In [28]:
result = score[["id", "team_h", "team_name_home", "team_a", "team_name_away", "match_day"]].copy()
result["home_win_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [29]:
result

Unnamed: 0,id,team_h,team_name_home,team_a,team_name_away,match_day,home_win_probability
776,46,10,LEI,14,MUN,2022-09-01,0.404245
777,55,8,EVE,12,LIV,2022-09-03,0.177308
778,52,4,BRE,11,LEE,2022-09-03,0.429633
779,54,6,CHE,19,WHU,2022-09-03,0.48811
780,57,15,NEW,7,CRY,2022-09-03,0.518182
781,58,16,NFO,3,BOU,2022-09-03,0.308879
782,59,18,TOT,9,FUL,2022-09-03,0.679326
783,60,20,WOL,17,SOU,2022-09-03,0.374627
784,51,2,AVL,13,MCI,2022-09-03,0.110637


### Predicting draw


In [30]:
target = 'result_draw'

#### CV

In [31]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [32]:
X_to_csv = pipe_to_cv.transform(X[features])

In [33]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [34]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [35]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 2,
 'min_samples_leaf': 3,
 'n_estimators': 50}

#### Final model

In [36]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    #('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
    ('model', RandomForestClassifier(**grid_search.best_params_))])

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [38]:
pipe_rf.fit(X_train, y_train)

In [39]:
def calculate_roc_auc(model_pipe, X, y):
    """Calculate roc auc score. 
    
    Parameters:
    ===========
    model_pipe: sklearn model or pipeline
    X: features
    y: true target
    """
    y_proba = model_pipe.predict_proba(X)[:,1]
    return roc_auc_score(y, y_proba)

print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 0.9997
Test ROC-AUC: 0.5360


In [40]:
pipe_rf.predict_proba(score[features])[:, 1]

array([0.26754618, 0.34035481, 0.30989105, 0.22915079, 0.30988312,
       0.41458034, 0.13077074, 0.32311111, 0.45454798])

In [41]:
result["draw_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [42]:
result

Unnamed: 0,id,team_h,team_name_home,team_a,team_name_away,match_day,home_win_probability,draw_probability
776,46,10,LEI,14,MUN,2022-09-01,0.404245,0.267546
777,55,8,EVE,12,LIV,2022-09-03,0.177308,0.340355
778,52,4,BRE,11,LEE,2022-09-03,0.429633,0.309891
779,54,6,CHE,19,WHU,2022-09-03,0.48811,0.229151
780,57,15,NEW,7,CRY,2022-09-03,0.518182,0.309883
781,58,16,NFO,3,BOU,2022-09-03,0.308879,0.41458
782,59,18,TOT,9,FUL,2022-09-03,0.679326,0.130771
783,60,20,WOL,17,SOU,2022-09-03,0.374627,0.323111
784,51,2,AVL,13,MCI,2022-09-03,0.110637,0.454548


### Predicting away-win

In [43]:
target = 'result_away_win'

#### CV

In [44]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [45]:
X_to_csv = pipe_to_cv.transform(X[features])

In [46]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [47]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [48]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 5,
 'min_samples_leaf': 10,
 'n_estimators': 10}

#### Final model

In [49]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    #('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
    ('model', RandomForestClassifier(**grid_search.best_params_))])

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [51]:
pipe_rf.fit(X_train, y_train)

In [52]:
def calculate_roc_auc(model_pipe, X, y):
    """Calculate roc auc score. 
    
    Parameters:
    ===========
    model_pipe: sklearn model or pipeline
    X: features
    y: true target
    """
    y_proba = model_pipe.predict_proba(X)[:,1]
    return roc_auc_score(y, y_proba)

print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 0.9019
Test ROC-AUC: 0.5563


In [53]:
pipe_rf.predict_proba(score[features])[:, 1]

array([0.30818918, 0.46733983, 0.12041928, 0.4064611 , 0.18060893,
       0.15746184, 0.10695198, 0.37593995, 0.63877116])

In [54]:
result["away_win_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [55]:
result

Unnamed: 0,id,team_h,team_name_home,team_a,team_name_away,match_day,home_win_probability,draw_probability,away_win_probability
776,46,10,LEI,14,MUN,2022-09-01,0.404245,0.267546,0.308189
777,55,8,EVE,12,LIV,2022-09-03,0.177308,0.340355,0.46734
778,52,4,BRE,11,LEE,2022-09-03,0.429633,0.309891,0.120419
779,54,6,CHE,19,WHU,2022-09-03,0.48811,0.229151,0.406461
780,57,15,NEW,7,CRY,2022-09-03,0.518182,0.309883,0.180609
781,58,16,NFO,3,BOU,2022-09-03,0.308879,0.41458,0.157462
782,59,18,TOT,9,FUL,2022-09-03,0.679326,0.130771,0.106952
783,60,20,WOL,17,SOU,2022-09-03,0.374627,0.323111,0.37594
784,51,2,AVL,13,MCI,2022-09-03,0.110637,0.454548,0.638771


## Adjust score

In [56]:
result["home_win_probability_adj"] = result["home_win_probability"] / (result["home_win_probability"] + result["draw_probability"] + result["away_win_probability"])
result["draw_probability_adj"] = result["draw_probability"] / (result["home_win_probability"] + result["draw_probability"] + result["away_win_probability"])
result["away_win_probability_adj"] = result["away_win_probability"] / (result["home_win_probability"] + result["draw_probability"] + result["away_win_probability"])

In [59]:
result[["match_day", "team_name_home", "team_name_away", "home_win_probability_adj", "draw_probability_adj", "away_win_probability_adj"]]

Unnamed: 0,match_day,team_name_home,team_name_away,home_win_probability_adj,draw_probability_adj,away_win_probability_adj
776,2022-09-01,LEI,MUN,0.412503,0.273012,0.314485
777,2022-09-03,EVE,LIV,0.180007,0.345537,0.474456
778,2022-09-03,BRE,LEE,0.499606,0.360362,0.140032
779,2022-09-03,CHE,WHU,0.434369,0.203921,0.36171
780,2022-09-03,NEW,CRY,0.513726,0.307218,0.179056
781,2022-09-03,NFO,BOU,0.350632,0.470621,0.178747
782,2022-09-03,TOT,FUL,0.740774,0.1426,0.116626
783,2022-09-03,WOL,SOU,0.348919,0.300938,0.350142
784,2022-09-03,AVL,MCI,0.091894,0.377545,0.53056


In [63]:
X.result_home_win.mean()

0.4059278350515464

In [64]:
X.result_draw.mean()

0.23711340206185566

In [65]:
X.result_away_win.mean()

0.35695876288659795

Play on games when probability * odds > 1