In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Data Import and Feature Engineering

In [21]:
form_n = 5

df = pd.DataFrame()
for year in range(2018,2023):
    print(year)
    
    yr_df = pd.read_pickle(f'../data/{year}_matches.pkl') # load df for year
    
    # Add pts column
    hgoals = yr_df['Home Goals'].values
    agoals = yr_df['Away Goals'].values
    yr_df['Home Points'] = (hgoals > agoals)*3 + (hgoals==agoals)
    yr_df['Away Points'] = (agoals > hgoals)*3 + (hgoals==agoals)
    
    # Add clean sheet flag column
    yr_df['Home CS'] = np.array(hgoals == 0, dtype=int)
    yr_df['Away CS'] = np.array(agoals == 0, dtype=int)

    # Add form values
    for i in yr_df.index:

        hteam = yr_df.loc[i,'Home Team']
        ateam = yr_df.loc[i,'Away Team']
        for key, team in [('Home',hteam), ('Away',ateam)]:

            team_df = yr_df[(yr_df['Home Team']==team) | (yr_df['Away Team']==team)]
            team_recent_df = team_df[team_df['Date']<yr_df.loc[i,'Date']][-form_n:]
            if team_recent_df.shape[0]<form_n:
                continue
            assert team_recent_df.shape[0] == form_n

            # Goals
            team_recent_goals = (
                ((team_recent_df['Home Team']==team) * team_recent_df['Home Goals']) 
                + ((team_recent_df['Away Team']==team) * team_recent_df['Away Goals'])
            )
            yr_df.loc[i,f'{key} Goal Form'] = team_recent_goals.mean()

            # xG
            team_recent_xg = (
                ((team_recent_df['Home Team']==team) * team_recent_df['Home xG']) 
                + ((team_recent_df['Away Team']==team) * team_recent_df['Away xG'])
            )
            yr_df.loc[i,f'{key} xG Form'] = team_recent_xg.mean()
            
            # npxG
            team_recent_npxg = (
                ((team_recent_df['Home Team']==team) * team_recent_df['Home npxG']) 
                + ((team_recent_df['Away Team']==team) * team_recent_df['Away npxG'])
            )
            yr_df.loc[i,f'{key} npxG Form'] = team_recent_npxg.mean()
            
            # Points
            team_recent_npxg = (
                ((team_recent_df['Home Team']==team) * team_recent_df['Home Points']) 
                + ((team_recent_df['Away Team']==team) * team_recent_df['Away Points'])
            )
            yr_df.loc[i,f'{key} Points Form'] = team_recent_npxg.mean()
        

    df = pd.concat([df, yr_df], ignore_index=True, axis=0)
    
# Drop rows with NaNs
df = df.dropna()

2018
2019
2020
2021
2022


# Model Training

In [90]:
feature_cols = [
    'Home ELO', 'Away ELO', 'Home Goal Form', 'Away Goal Form', 'Home xG Form',
    'Away xG Form', 'Home npxG Form', 'Away npxG Form', 'Home Points Form',
    'Away Points Form',
]

train_size, validation_size, test_size = 0.7, 0.15, 0.15
np.random.seed(18)

# Train-test split
train_X, test_X, train_y, test_y = train_test_split(
    df[feature_cols], df['Home CS'],
    train_size=train_size+validation_size
)
# Train-validation split
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, 
    train_size=train_size/(train_size+validation_size)
)

In [91]:
model = GradientBoostingClassifier()
model.fit(train_X, train_y)

GradientBoostingClassifier()

In [92]:
yhat = model.predict_proba(validation_X)[:,1] # CS probas are in second col

In [94]:
temp = df.loc[validation_y[yhat>0.5].index, ['Date','Matchweek','Home Team','Away Team','Home CS','Away npxG Form']]
temp['yhat'] = yhat[yhat>0.5]
temp

Unnamed: 0,Date,Matchweek,Home Team,Away Team,Home CS,Away npxG Form,yhat
851,2019-10-26,10,Brighton & Hove Albion,Everton,0,1.58,0.662569
1341,2021-01-31,21,West Ham United,Liverpool,0,1.66,0.612061
112,2017-11-18,12,Leicester City,Manchester City,1,2.26,0.68043
1779,2022-03-05,28,Burnley,Chelsea,1,0.92,0.580526
1669,2021-12-10,16,Brentford,Watford,0,1.18,0.563552
1841,2022-04-23,34,Brentford,Tottenham Hotspur,1,1.88,0.673526
1035,2020-02-29,28,Watford,Liverpool,0,2.3,0.896342
1132,2020-07-26,38,Leicester City,Manchester United,1,1.4,0.84982
1729,2022-01-22,23,Southampton,Manchester City,0,1.5,0.615781
840,2019-10-19,9,Crystal Palace,Manchester City,1,2.88,0.795566
