In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Data Import and Feature Engineering

In [3]:
form_n = 5
form_stats = ['Goals', 'xG', 'npxG', 'Points', '538 xG', 'nsxG', 'Adj Score',]

df = pd.DataFrame()
for year in range(2018,2023):
    print(year)
    
    yr_df = pd.read_pickle(f'../data/{year}_matches.pkl') # load df for year
    
    # Add pts column
    hgoals = yr_df['Home Goals'].values
    agoals = yr_df['Away Goals'].values
    yr_df['Home Points'] = (hgoals > agoals)*3 + (hgoals==agoals)
    yr_df['Away Points'] = (agoals > hgoals)*3 + (hgoals==agoals)
    
    # Add clean sheet flag column
    yr_df['Home CS'] = np.array(hgoals == 0, dtype=int)
    yr_df['Away CS'] = np.array(agoals == 0, dtype=int)

    # Add form values
    for idx in yr_df.index:

        # Add for home team and away team
        hteam = yr_df.loc[idx,'Home Team']
        ateam = yr_df.loc[idx,'Away Team']
        for key, team in [('Home',hteam), ('Away',ateam)]:

            # Dataframe of all of team's games
            team_df = yr_df[(yr_df['Home Team']==team) | (yr_df['Away Team']==team)]
            # Dataframe of form_n most recent games
            team_recent_df = team_df[team_df['Date']<yr_df.loc[idx,'Date']][-form_n:]
            # Skip for teams that haven't played enough games yet
            if team_recent_df.shape[0]<form_n:
                continue
            assert team_recent_df.shape[0] == form_n # double-check

            # Add columns for form stats, one column for the stat in the form_n most recent games
            for stat in form_stats:
                team_recent_stat = (
                    ((team_recent_df['Home Team']==team) * team_recent_df[f'Home {stat}']) 
                    + ((team_recent_df['Away Team']==team) * team_recent_df[f'Away {stat}'])
                )
                for i, val in enumerate(team_recent_stat):
                    yr_df.loc[idx, f'{key} {stat} Form -{form_n-i}'] = val
        
    df = pd.concat([df, yr_df], ignore_index=True, axis=0)
    
# Drop rows with NaNs
print(df.shape)
df = df.dropna()
print(df.shape)

2018
2019
2020
2021
2022
(1900, 113)
(1634, 113)


# Model Training

In [58]:
feature_cols = [
    'Home ELO', 'Away ELO', 
    'Home SPI', 'Away SPI', 
    'Prob Home Win', 'Prob Away Win', 'Prob Tie', 
    'Home Proj Score', 'Away Proj Score', 
    'Home Importance', 'Away Importance',
]
# Add form columns
for stat in form_stats:
    for team in ['Home', 'Away']:
        feature_cols += [f'{team} {stat} Form -{i}' for i in range(1,form_n+1)]

label_col = 'Home CS'

train_size, validation_size, test_size = 0.7, 0.15, 0.15
np.random.seed(18)

# Train-test split
train_X, test_X, train_y, test_y = train_test_split(
    df[feature_cols], df[label_col],
    train_size=train_size+validation_size
)
# Train-validation split
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, 
    train_size=train_size/(train_size+validation_size)
)

In [83]:
model = GradientBoostingClassifier(verbose=1,
                                   n_estimators=10000)
model.fit(train_X, train_y)

      Iter       Train Loss   Remaining Time 
         1           1.0694            0.00s
         2           1.0487            1.30m
         3           1.0302            1.74m
         4           1.0139            1.95m
         5           0.9987            2.08m
         6           0.9872            2.17m
         7           0.9762            1.86m
         8           0.9633            2.09m
         9           0.9531            1.86m
        10           0.9445            1.93m
        20           0.8648            1.93m
        30           0.8113            1.93m
        40           0.7783            1.92m
        50           0.7422            1.87m
        60           0.7169            1.88m
        70           0.6835            1.90m
        80           0.6593            1.89m
        90           0.6266            1.92m
       100           0.6023            1.96m
       200           0.3722            1.97m
       300           0.2487            1.92m
       40

GradientBoostingClassifier(n_estimators=10000, verbose=1)

In [116]:
pd.DataFrame(data=np.vstack([train_X.columns,
                             model.feature_importances_,]).T).sort_values(by=1, ascending=False)

Unnamed: 0,0,1
5,Prob Away Win,0.051248
57,Away 538 xG Form -2,0.039767
0,Home ELO,0.035386
54,Home 538 xG Form -4,0.032573
4,Prob Home Win,0.03185
...,...,...
41,Home Points Form -1,0.000355
20,Away Goals Form -5,0.000255
43,Home Points Form -3,0.000125
45,Home Points Form -5,0.000076


In [97]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=validation_y, 
                 y_pred=model.predict(validation_X))

array([[156,  31],
       [ 49,   9]], dtype=int64)