## Import packages and set working directory

In [278]:
import os
import pandas as pd
import numpy as np

In [279]:
# Get the current working directory
os.getcwd()

'C:\\Users\\mbe158\\Marcus tutorial\\FPL'

In [280]:
# Change the current working directory
os.chdir('C:\\Users\\mbe158\\Marcus tutorial\\FPL')

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))

Current working directory: C:\Users\mbe158\Marcus tutorial\FPL


## Importing and checking data

In [281]:
df = pd.read_csv("train_score.csv", sep = ";", decimal=",")

In [282]:
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 10)

Checking columns data types

In [283]:
df.dtypes #.to_dict()

Unnamed: 0                                 int64
id                                         int64
team_h                                     int64
team_a                                     int64
result                                    object
                                           ...  
best_assists_m_player_playing_player_a     int64
best_assists_f_player_playing_player_a     int64
team_name_home                            object
team_name_away                            object
season                                    object
Length: 103, dtype: object

Checking number of NA is each column

In [284]:
print(df.isnull().sum())

Unnamed: 0                                 0
id                                         0
team_h                                     0
team_a                                     0
result                                    11
                                          ..
best_assists_m_player_playing_player_a     0
best_assists_f_player_playing_player_a     0
team_name_home                             0
team_name_away                             0
season                                     0
Length: 103, dtype: int64


Following features has NA:
- win_rate_h_team_h
- win_rate_h_last_5_team_h
- draw_rate_h_team_h
- draw_rate_h_last_5_team_h
- loss_rate_h_team_h
- loss_rate_h_last_5_team_h
- avg_scored_goals_h_team_h
- avg_scored_goals_h_last_5_team_h
- avg_conceded_goals_h_team_h
- avg_conceded_goals_h_last_5_team_h
- win_rate_a_team_a
- win_rate_a_last_5_team_a
- draw_rate_a_team_a
- draw_rate_a_last_5_team_a
- loss_rate_a_team_a
- loss_rate_a_last_5_team_a
- avg_scored_goals_a_team_a
- avg_scored_goals_a_last_5_team_a
- avg_conceded_goals_a_team_a
- avg_conceded_goals_a_last_5_team_a


In [285]:
pd.set_option('display.max_rows', 10)

In [286]:
# pd.set_option('display.max_columns', 101)

## Fixing targets and creating train- and score-datasets

In [287]:
X = df.loc[df.result.notna()].copy()
score = df.loc[df.result.isna()].copy()

In [288]:
X["result_home_win"] = np.where(X.result == "1", 1, 0)
X["result_draw"] = np.where(X.result == "X", 1, 0)
X["result_away_win"] = np.where(X.result == "2", 1, 0)

## ML

In [289]:
# Data manipulation
import numpy as np
import pandas as pd
# pd.options.display.precision = 4
# pd.options.mode.chained_assignment = None  

# Machine learning pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score

set_config(display = "diagram")

seed = 134680

In [290]:
features = X.columns.drop(['team_h', 'team_a', 'result', 'match_day', 'result_draw', 'result_away_win', 'result_home_win', 'team_name_home', 'team_name_away'])

In [291]:
numerical = X[features].select_dtypes('number').columns
print(f"Numerical features: {', '.join(numerical)}")

Numerical features: Unnamed: 0, id, scored_goals_team_h, conceded_goals_team_h, win_team_h, draw_team_h, loss_team_h, win_rate_team_h, win_rate_last_5_team_h, win_rate_h_team_h, win_rate_h_last_5_team_h, draw_rate_team_h, draw_rate_last_5_team_h, draw_rate_h_team_h, draw_rate_h_last_5_team_h, loss_rate_team_h, loss_rate_last_5_team_h, loss_rate_h_team_h, loss_rate_h_last_5_team_h, avg_scored_goals_team_h, avg_scored_goals_last_5_team_h, avg_scored_goals_h_team_h, avg_scored_goals_h_last_5_team_h, avg_conceded_goals_team_h, avg_conceded_goals_last_5_team_h, avg_conceded_goals_h_team_h, avg_conceded_goals_h_last_5_team_h, scored_goals_team_a, conceded_goals_team_a, win_team_a, draw_team_a, loss_team_a, win_rate_team_a, win_rate_last_5_team_a, win_rate_a_team_a, win_rate_a_last_5_team_a, draw_rate_team_a, draw_rate_last_5_team_a, draw_rate_a_team_a, draw_rate_a_last_5_team_a, loss_rate_team_a, loss_rate_last_5_team_a, loss_rate_a_team_a, loss_rate_a_last_5_team_a, avg_scored_goals_team_a,

In [292]:
categorical = pd.Index(np.setdiff1d(features, numerical))
print(f"Categorical features: {', '.join(categorical)}")

Categorical features: season


In [293]:
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('encoder', OneHotEncoder(drop = 'first', handle_unknown = 'error', sparse = False))
])

preprocessors = ColumnTransformer(transformers = [
    ('num', numerical_pipe, numerical),
    ('cat', categorical_pipe, categorical)
])

pipe_to_cv = Pipeline([
    ('preprocessors', preprocessors)
])

### Predicting home-win

In [294]:
target = 'result_home_win'

#### CV

In [295]:
fit_to_csv = pipe_to_cv.fit(X[features])

In [296]:
X_to_csv = pipe_to_cv.transform(X[features])

In [297]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 40, 60],
    'max_features': [2, 3, 5, 10, 20],
    'min_samples_leaf': [3, 4, 5, 10, 20],
    'n_estimators': [10, 50, 100]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [298]:
grid_search.fit(X_to_csv, X[target])

Fitting 3 folds for each of 225 candidates, totalling 675 fits


In [299]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 40,
 'max_features': 10,
 'min_samples_leaf': 20,
 'n_estimators': 10}

#### Final model

In [300]:
pipe_rf = Pipeline([
    ('preprocessors', preprocessors),
    ('model', RandomForestClassifier(bootstrap = True, max_depth = 40, max_features = 5, min_samples_leaf = 5, n_estimators = 100))
])

In [301]:
X_train, X_test, y_train, y_test = train_test_split(X[features], 
                                                    X[target], 
                                                    test_size = .2, 
                                                    random_state = seed, 
                                                    stratify = X[target])

In [302]:
pipe_rf.fit(X_train, y_train)

In [303]:
def calculate_roc_auc(model_pipe, X, y):
    """Calculate roc auc score. 
    
    Parameters:
    ===========
    model_pipe: sklearn model or pipeline
    X: features
    y: true target
    """
    y_proba = model_pipe.predict_proba(X)[:,1]
    return roc_auc_score(y, y_proba)

print(f"Train ROC-AUC: {calculate_roc_auc(pipe_rf, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe_rf, X_test, y_test):.4f}")

Train ROC-AUC: 0.9890
Test ROC-AUC: 0.7015


In [304]:
pipe_rf.predict_proba(score[features])[:, 1]

ValueError: Input contains infinity or a value too large for dtype('float64').

In [305]:
score

Unnamed: 0.1,Unnamed: 0,id,team_h,team_a,result,match_day,scored_goals_team_h,conceded_goals_team_h,win_team_h,draw_team_h,loss_team_h,win_rate_team_h,win_rate_last_5_team_h,win_rate_h_team_h,win_rate_h_last_5_team_h,draw_rate_team_h,draw_rate_last_5_team_h,draw_rate_h_team_h,draw_rate_h_last_5_team_h,loss_rate_team_h,loss_rate_last_5_team_h,loss_rate_h_team_h,loss_rate_h_last_5_team_h,avg_scored_goals_team_h,avg_scored_goals_last_5_team_h,avg_scored_goals_h_team_h,avg_scored_goals_h_last_5_team_h,avg_conceded_goals_team_h,avg_conceded_goals_last_5_team_h,avg_conceded_goals_h_team_h,avg_conceded_goals_h_last_5_team_h,scored_goals_team_a,conceded_goals_team_a,win_team_a,draw_team_a,loss_team_a,win_rate_team_a,win_rate_last_5_team_a,win_rate_a_team_a,win_rate_a_last_5_team_a,draw_rate_team_a,draw_rate_last_5_team_a,draw_rate_a_team_a,draw_rate_a_last_5_team_a,loss_rate_team_a,loss_rate_last_5_team_a,loss_rate_a_team_a,loss_rate_a_last_5_team_a,avg_scored_goals_team_a,avg_scored_goals_last_5_team_a,...,avg_conceded_goals_last_5_team_a,avg_conceded_goals_a_team_a,avg_conceded_goals_a_last_5_team_a,max_value_player_h,max_value_position_g_player_h,max_value_position_d_player_h,max_value_position_m_player_h,max_value_position_f_player_h,avg_goals_per_hour_5_player_h,avg_goals_per_hour_10_player_h,avg_assists_per_hour_5_player_h,avg_assists_per_hour_10_player_h,avg_goals_conceded_per_hour_5_player_h,avg_goals_conceded_per_hour_10_player_h,most_valuable_player_playing_player_h,most_valuable_g_player_playing_player_h,most_valuable_f_player_playing_player_h,most_valuable_m_player_playing_player_h,best_goal_scorer_playing_player_h,best_m_goal_scorer_playing_player_h,best_f_goal_scorer_playing_player_h,best_assists_player_playing_player_h,best_assists_d_player_playing_player_h,best_assists_m_player_playing_player_h,best_assists_f_player_playing_player_h,max_value_player_a,max_value_position_g_player_a,max_value_position_d_player_a,max_value_position_m_player_a,max_value_position_f_player_a,avg_goals_per_hour_5_player_a,avg_goals_per_hour_10_player_a,avg_assists_per_hour_5_player_a,avg_assists_per_hour_10_player_a,avg_goals_conceded_per_hour_5_player_a,avg_goals_conceded_per_hour_10_player_a,most_valuable_player_playing_player_a,most_valuable_g_player_playing_player_a,most_valuable_f_player_playing_player_a,most_valuable_m_player_playing_player_a,best_goal_scorer_playing_player_a,best_m_goal_scorer_playing_player_a,best_f_goal_scorer_playing_player_a,best_assists_player_playing_player_a,best_assists_d_player_playing_player_a,best_assists_m_player_playing_player_a,best_assists_f_player_playing_player_a,team_name_home,team_name_away,season
776,777,46,10,14,,2022-09-01,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.25,0.25,0.5,0.5,0.75,0.75,0.5,0.5,1.5,1.5,1.5,1.5,2.5,2.5,2.0,2.0,1.0,0.0,1.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,...,1.75,2.0,2.0,94,40,45,69,94,0.050595,0.050595,0.140625,0.140625,1.634854,1.634854,1.0,0,1,0,0.0,0,0,0.0,1,1,1,104,49,48,98,104,0.039816,0.039816,0.035714,0.035714,0.956887,0.956887,1.0,1,1,1,1.0,1,0,1.0,1,1,0,LEI,MUN,2022/2023
777,778,55,8,12,,2022-09-03,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.60,0.60,0.5,0.5,0.40,0.40,0.5,0.5,0.8,0.8,0.5,0.5,1.2,1.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.2,0.2,0.0,0.0,0.4,0.4,0.5,0.5,0.4,0.4,0.5,0.5,2.6,2.6,...,1.20,2.0,2.0,55,45,49,55,0,0.036364,0.036364,0.190909,0.190909,1.016779,1.016779,0.0,1,0,1,1.0,1,0,1.0,1,1,0,130,55,75,130,80,0.150380,0.150380,0.101628,0.101628,0.900592,0.900592,1.0,1,0,1,1.0,1,1,1.0,0,1,1,EVE,LIV,2022/2023
778,779,52,4,11,,2022-09-03,1.0,1.0,0.0,1.0,0.0,0.2,0.2,0.5,0.5,0.60,0.60,0.5,0.5,0.20,0.20,0.0,0.0,2.0,2.0,2.5,2.5,1.4,1.4,0.5,0.5,1.0,1.0,0.0,1.0,0.0,0.4,0.4,0.0,0.0,0.4,0.4,0.5,0.5,0.2,0.2,0.5,0.5,1.6,1.6,...,1.00,1.5,1.5,72,45,50,55,72,0.186075,0.186075,0.108079,0.108079,0.982072,0.982072,1.0,1,1,1,1.0,1,1,1.0,1,1,1,73,45,49,65,73,0.075419,0.075419,0.073659,0.073659,0.591396,0.591396,1.0,1,1,0,1.0,1,0,1.0,1,1,1,BRE,LEE,2022/2023
779,780,54,6,19,,2022-09-03,1.0,2.0,0.0,0.0,1.0,0.4,0.4,0.5,0.5,0.20,0.20,0.5,0.5,0.40,0.40,0.0,0.0,1.2,1.2,2.0,2.0,1.6,1.6,1.5,1.5,0.0,1.0,0.0,0.0,1.0,0.2,0.2,0.5,0.5,0.0,0.0,0.0,0.0,0.8,0.8,0.5,0.5,0.2,0.2,...,1.20,0.5,0.5,99,50,58,99,78,0.050511,0.050511,0.051930,0.051930,2.364683,2.364683,1.0,1,0,1,1.0,1,0,1.0,1,1,0,82,50,49,82,73,0.012121,0.012121,0.012121,0.012121,0.909502,0.909502,1.0,1,1,1,1.0,1,0,1.0,0,1,0,CHE,WHU,2022/2023
780,781,57,15,7,,2022-09-03,1.0,0.0,1.0,0.0,0.0,0.4,0.4,0.5,0.5,0.60,0.60,0.5,0.5,0.00,0.00,0.0,0.0,1.4,1.4,2.5,2.5,0.8,0.8,1.5,1.5,1.0,1.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,0.4,0.4,0.5,0.5,0.4,0.4,0.5,0.5,1.4,1.4,...,1.80,2.5,2.5,70,50,51,60,70,0.146637,0.146637,0.036364,0.036364,0.324614,0.324614,0.0,1,0,0,0.0,1,0,0.0,0,0,0,71,45,45,71,54,0.105802,0.105802,0.048517,0.048517,1.311427,1.311427,1.0,1,0,1,1.0,1,1,1.0,1,1,1,NEW,CRY,2022/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782,783,59,18,9,,2022-09-03,1.0,0.0,1.0,0.0,0.0,0.8,0.8,1.0,1.0,0.20,0.20,0.0,0.0,0.00,0.00,0.0,0.0,2.0,2.0,2.5,2.5,0.6,0.6,0.5,0.5,2.0,1.0,1.0,0.0,0.0,0.4,0.4,0.0,0.0,0.4,0.4,0.5,0.5,0.2,0.2,0.5,0.5,1.6,1.6,...,1.40,1.0,1.0,118,55,56,118,114,0.085561,0.085561,0.212197,0.212197,0.299322,0.299322,1.0,1,1,1,1.0,1,1,1.0,1,1,1,67,45,45,55,67,0.071795,0.071795,0.045321,0.045321,0.813154,0.813154,1.0,1,1,0,1.0,1,1,1.0,1,1,0,TOT,FUL,2022/2023
783,784,60,20,17,,2022-09-03,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.60,0.60,1.0,1.0,0.40,0.40,0.0,0.0,0.4,0.4,0.5,0.5,0.8,0.8,0.5,0.5,2.0,1.0,1.0,0.0,0.0,0.4,0.4,0.5,0.5,0.2,0.2,0.0,0.0,0.4,0.4,0.5,0.5,1.4,1.4,...,1.80,2.5,2.5,69,50,50,60,69,0.011111,0.011111,0.015244,0.015244,0.461716,0.461716,1.0,1,1,1,1.0,1,0,1.0,0,1,0,65,45,45,65,64,0.142341,0.142341,0.082899,0.082899,0.877343,0.877343,1.0,1,1,1,1.0,1,1,1.0,1,1,1,WOL,SOU,2022/2023
784,785,51,2,13,,2022-09-03,1.0,1.0,0.0,1.0,0.0,0.2,0.2,0.5,0.5,0.20,0.20,0.0,0.0,0.60,0.60,0.5,0.5,0.8,0.8,1.0,1.0,1.6,1.6,1.0,1.0,5.0,0.0,1.0,0.0,0.0,0.8,0.8,0.5,0.5,0.2,0.2,0.5,0.5,0.0,0.0,0.0,0.0,3.6,3.6,...,1.00,1.5,1.5,74,50,48,60,74,0.048485,0.048485,0.041026,0.041026,1.298491,1.298491,1.0,1,1,0,1.0,1,1,1.0,0,1,1,122,55,71,122,118,0.174128,0.174128,0.167445,0.167445,0.501060,0.501060,1.0,1,1,1,1.0,1,1,1.0,1,1,1,AVL,MCI,2022/2023
785,786,53,5,10,,2022-09-04,1.0,2.0,0.0,0.0,1.0,0.6,0.6,0.5,0.5,0.20,0.20,0.5,0.5,0.20,0.20,0.0,0.0,1.2,1.2,0.5,0.5,0.6,0.6,0.0,0.0,,,,,,,,,,,,,,,,,,,,...,,,,65,46,46,65,65,0.053333,0.053333,0.067317,0.067317,0.329982,0.329982,1.0,1,1,1,1.0,1,0,1.0,1,1,1,0,0,0,0,0,,,,,,,-inf,0,0,0,-inf,0,0,-inf,0,0,0,BHA,LEI,2022/2023


In [None]:
result = score[["id", "team_h", "team_name_home", "team_a", "team_name_away", "match_day"]].copy()
result["home_win_probability"] = pipe_rf.predict_proba(score[features])[:, 1]

In [None]:
result