In [138]:
import pandas as pd
#clean the data and format columns
nwsl_data = pd.read_csv('NWSL.csv')
nwsl_data.columns = nwsl_data.iloc[0]
nwsl_data = nwsl_data.drop([0])
nwsl_data

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
1,1,Sat,2023-03-25,14:00 (13:00),Courage,0.7,1–0,0.8,Current,4948,Sahlen's Stadium at WakeMed Soccer Park,Alex Billeter,Match Report,
2,1,Sun,2023-03-26,14:00 (16:00),Thorns,3.7,4–0,0.6,Pride,15204,Providence Park,Eric Tattersall,Match Report,
3,1,Sun,2023-03-26,16:00 (15:00),Spirit,0.7,1–0,2.2,Reign,11281,Audi Field,Alyssa Nichols,Match Report,
4,1,Sun,2023-03-26,18:00,Dash,1.3,0–0,1.2,Louisville,5722,Shell Energy Stadium,Danielle Chesky,Match Report,
5,1,Sat,2023-03-25,19:00 (21:00),Wave,2.1,3–2,2.5,Red Stars,30854,Snapdragon Stadium,Elton Garcia,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,22,Sun,2023-10-15,14:00 (16:00),Angel City,,,,Thorns,,,,Head-to-Head,
155,22,Sun,2023-10-15,16:00,Red Stars,,,,Reign,,,,Head-to-Head,
156,22,Sun,2023-10-15,17:00 (16:00),Pride,,,,Dash,,,,Head-to-Head,
157,22,Sun,2023-10-15,17:00 (16:00),Spirit,,,,Courage,,,,Head-to-Head,


In [139]:
#clean the data, dropping empty rows, empty columns, and extra rows specifying column name
nwsl_data = nwsl_data.dropna(how='all').reset_index(drop=True)
nwsl_data = nwsl_data.drop(nwsl_data[nwsl_data['Wk']=='Wk'].index).reset_index(drop=True)
column_names = list(nwsl_data.columns)
column_names[4] = 'home_team'
column_names[8]='away_team'
column_names[5]= 'Home_xG'
column_names[7] = 'Away_xG'
nwsl_data.columns = column_names
nwsl_data['Home_xG'] = pd.to_numeric(nwsl_data['Home_xG'])
nwsl_data['Away_xG'] = pd.to_numeric(nwsl_data['Away_xG'])
nwsl_data

Unnamed: 0,Wk,Day,Date,Time,home_team,Home_xG,Score,Away_xG,away_team,Attendance,Venue,Referee,Match Report,Notes
0,1,Sat,2023-03-25,14:00 (13:00),Courage,0.7,1–0,0.8,Current,4948,Sahlen's Stadium at WakeMed Soccer Park,Alex Billeter,Match Report,
1,1,Sun,2023-03-26,14:00 (16:00),Thorns,3.7,4–0,0.6,Pride,15204,Providence Park,Eric Tattersall,Match Report,
2,1,Sun,2023-03-26,16:00 (15:00),Spirit,0.7,1–0,2.2,Reign,11281,Audi Field,Alyssa Nichols,Match Report,
3,1,Sun,2023-03-26,18:00,Dash,1.3,0–0,1.2,Louisville,5722,Shell Energy Stadium,Danielle Chesky,Match Report,
4,1,Sat,2023-03-25,19:00 (21:00),Wave,2.1,3–2,2.5,Red Stars,30854,Snapdragon Stadium,Elton Garcia,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,22,Sun,2023-10-15,14:00 (16:00),Angel City,,,,Thorns,,,,Head-to-Head,
128,22,Sun,2023-10-15,16:00,Red Stars,,,,Reign,,,,Head-to-Head,
129,22,Sun,2023-10-15,17:00 (16:00),Pride,,,,Dash,,,,Head-to-Head,
130,22,Sun,2023-10-15,17:00 (16:00),Spirit,,,,Courage,,,,Head-to-Head,


In [140]:
# Train a Poisson Bradley-Terry model ----

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Wrangle data for the Bradley-Terry model. We need two rows for every game, one for expected goals 
# by the home team and one for expected goals  by the away team.
goal_model_data = pd.concat(
    objs=[
        nwsl_data[['home_team','away_team','Home_xG']].assign(
            home=1
        ).rename(
            columns={'home_team':'offense', 'away_team':'defense', 'Home_xG':'xGoals'}
        ),
        nwsl_data[['away_team','home_team','Away_xG']].assign(
            home=0
        ).rename(
            columns={'away_team': 'offense', 'home_team':'defense', 'Away_xG':'xGoals'}
        )
    ]
)
goal_model_data


Unnamed: 0,offense,defense,xGoals,home
0,Courage,Current,0.7,1
1,Thorns,Pride,3.7,1
2,Spirit,Reign,0.7,1
3,Dash,Louisville,1.3,1
4,Wave,Red Stars,2.1,1
...,...,...,...,...
127,Thorns,Angel City,,0
128,Reign,Red Stars,,0
129,Dash,Pride,,0
130,Courage,Spirit,,0


In [141]:

'''
Fit a Poisson Bradley-Terry model to predict number of expected goals using home, offense and defense
'''
poisson_model = smf.glm(formula="xGoals ~ home + offense + defense", data=goal_model_data, 
                        family=sm.families.Poisson()).fit()


# Step 3: Produce predictions ----

import numpy as np
from itertools import product
from scipy.stats import skellam

all_teams = np.unique(raw_data['home_team'])

# Create a dataframe with all possible combinations of home (0/1), offense and defense
pred_data = pd.DataFrame(
    product([1, 0], all_teams, all_teams),
    columns=['home', 'offense', 'defense']
).query(
    'offense != defense'    # remove rows where the same team is the offense and the defense
)

pred_raw = pred_data.assign(
    pred_goals=poisson_model.predict(exog=pred_data)
)

pred_home = pred_raw.query(
    'home == 1'
).rename(
    columns={'offense':'home_team', 'defense':'away_team', 'pred_goals':'pred_goals_home'}
).loc[
    :, ['home_team', 'away_team', 'pred_goals_home']
]

pred_away = pred_raw.query(
    'home == 0'
).rename(
    columns={'defense':'home_team', 'offense':'away_team', 'pred_goals':'pred_goals_away'}
).loc[
    :, ['home_team', 'away_team', 'pred_goals_away']
]
poisson_model.summary()

0,1,2,3
Dep. Variable:,xGoals,No. Observations:,216.0
Model:,GLM,Df Residuals:,192.0
Model Family:,Poisson,Df Model:,23.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-257.7
Date:,"Thu, 07 Sep 2023",Deviance:,48.536
Time:,17:30:48,Pearson chi2:,50.6
No. Iterations:,4,Pseudo R-squ. (CS):,0.07267
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.2567,0.304,0.845,0.398,-0.339,0.852
offense[T.Courage],-0.2206,0.309,-0.713,0.476,-0.827,0.386
offense[T.Current],0.0609,0.292,0.209,0.834,-0.511,0.632
offense[T.Dash],-0.2803,0.314,-0.891,0.373,-0.897,0.336
offense[T.Gotham FC],0.0510,0.292,0.175,0.861,-0.521,0.623
offense[T.Louisville],0.0187,0.293,0.064,0.949,-0.557,0.594
offense[T.Pride],-0.0020,0.293,-0.007,0.995,-0.576,0.572
offense[T.Red Stars],-0.1768,0.311,-0.568,0.570,-0.787,0.434
offense[T.Reign],-0.0026,0.294,-0.009,0.993,-0.580,0.574


In [142]:
    '''
    Fill in the probability of the home team winning and the probability of the away team winning.
    Check that prob_home_win + prob_away_win + prob_draw = 1!
    '''
pred = pd.merge(pred_home, pred_away, on=['home_team', 'away_team']).assign(

    prob_home_win=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is positive
        sum(skellam.pmf(range(1, 10), x['pred_goals_home'][i], x['pred_goals_away'][i])) for i in range(0, x.shape[0])],
    prob_away_win=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is negative
        sum(skellam.pmf(range(-9, 0), x['pred_goals_home'][i], x['pred_goals_away'][i])) for i in range(0, x.shape[0])],
    prob_draw=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is exactly zero
        skellam.pmf(0, x['pred_goals_home'][i], x['pred_goals_away'][i]) for i in range(0, x.shape[0])
    ]
).loc[
    :, ['home_team', 'away_team', 'prob_home_win', 'prob_away_win', 'prob_draw']
]

print(pred['prob_home_win'] + pred['prob_away_win'] + pred['prob_draw'])

0      0.999999
1      0.999999
2      0.999999
3      0.999999
4      0.999999
         ...   
127    0.999999
128    0.999971
129    1.000000
130    0.999999
131    0.999998
Length: 132, dtype: float64


In [143]:
#create validation dataframe excluding games not played yet
data = nwsl_data.copy()
# Filter out rows where the 'Score' column is not NaN
data = data.dropna(subset=['Score'])

# Extract integer home_goals and away_goals from score string
data = data.assign(
home_goals=data['Score'].str[0].astype(int),
away_goals=data['Score'].str[2].astype(int)
)
data

Unnamed: 0,Wk,Day,Date,Time,home_team,Home_xG,Score,Away_xG,away_team,Attendance,Venue,Referee,Match Report,Notes,home_goals,away_goals
0,1,Sat,2023-03-25,14:00 (13:00),Courage,0.7,1–0,0.8,Current,4948,Sahlen's Stadium at WakeMed Soccer Park,Alex Billeter,Match Report,,1,0
1,1,Sun,2023-03-26,14:00 (16:00),Thorns,3.7,4–0,0.6,Pride,15204,Providence Park,Eric Tattersall,Match Report,,4,0
2,1,Sun,2023-03-26,16:00 (15:00),Spirit,0.7,1–0,2.2,Reign,11281,Audi Field,Alyssa Nichols,Match Report,,1,0
3,1,Sun,2023-03-26,18:00,Dash,1.3,0–0,1.2,Louisville,5722,Shell Energy Stadium,Danielle Chesky,Match Report,,0,0
4,1,Sat,2023-03-25,19:00 (21:00),Wave,2.1,3–2,2.5,Red Stars,30854,Snapdragon Stadium,Elton Garcia,Match Report,,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,18,Sat,2023-09-02,19:30 (18:30),Courage,0.8,3–3,2.8,Gotham FC,6389,Sahlen's Stadium at WakeMed Soccer Park,Abdou Ndiaye,Match Report,,3,3
104,18,Sat,2023-09-02,19:30 (18:30),Louisville,1.9,2–1,1.1,Thorns,5778,Lynn Family Stadium,Karen Callado,Match Report,,2,1
105,18,Sun,2023-09-03,15:00 (17:00),Reign,1.3,1–0,0.3,Pride,10106,Lumen Field,Ricardo Fierro,Match Report,,1,0
106,18,Sun,2023-09-03,17:00 (16:00),Spirit,2.7,0–2,1.3,Red Stars,10035,Audi Field,Brandon Stevis,Match Report,,0,2


In [194]:
# functionalize basic Bradley-Terry model
def train_and_predict_goals(data):
    '''Train a Poisson Bradley-Terry model and produce predictions

    Args:
        data (pandas df): dataframe with cols 'home_team', 'away_team', 'home_goals', 'away_goals'

    Returns:
        pred (pandas df): dataframe with cols 'home_team', 'away_team',
            'prob_home_win', 'prob_away_win', 'prob_draw'
    '''
    # Wrangle data for the Bradley-Terry model. We need two rows for every game, one for expected goals 
    # by the home team and one for expected goals by the away team.
    goal_model_data = pd.concat(
    objs=[
        data[['home_team','away_team','home_goals']].assign(
            home=1
        ).rename(
            columns={'home_team':'offense', 'away_team':'defense', 'home_goals':'goals'}
        ),
        data[['away_team','home_team','Away_xG']].assign(
            home=0
        ).rename(
            columns={'away_team': 'offense', 'home_team':'defense', 'away_goals':'goals'}
            )
        ]
    )
 
    
    #Fit a Poisson Bradley-Terry model to predict number of goals scored using home, offense and defense
    poisson_model = smf.glm(formula="goals ~ home + offense + defense", data=goal_model_data, 
                        family=sm.families.Poisson()).fit()
    all_teams = np.unique(data['home_team'])
    
    # Create a dataframe with all possible combinations of home (0/1), offense and defense
    pred_data = pd.DataFrame(
        product([1, 0], all_teams, all_teams),
        columns=['home', 'offense', 'defense']
    ).query(
        'offense != defense'    # remove rows where the same team is the offense and the defense
    )

    pred_raw = pred_data.assign(
        pred_goals=poisson_model.predict(exog=pred_data)
    )

    pred_home = pred_raw.query(
        'home == 1'
    ).rename(
        columns={'offense':'home_team', 'defense':'away_team', 'pred_goals':'pred_goals_home'}
    ).loc[
        :, ['home_team', 'away_team', 'pred_goals_home']
    ]

    pred_away = pred_raw.query(
        'home == 0'
    ).rename(
        columns={'defense':'home_team', 'offense':'away_team', 'pred_goals':'pred_goals_away'}
    ).loc[
        :, ['home_team', 'away_team', 'pred_goals_away']
    ]
    
    pred = pd.merge(pred_home, pred_away, on=['home_team', 'away_team']).assign(

    prob_home_win=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is positive
        sum(skellam.pmf(range(1, 10), x['pred_goals_home'][i], x['pred_goals_away'][i])) for i in range(0, x.shape[0])],
    prob_away_win=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is negative
        sum(skellam.pmf(range(-9, 0), x['pred_goals_home'][i], x['pred_goals_away'][i])) for i in range(0, x.shape[0])],
    prob_draw=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is exactly zero
        skellam.pmf(0, x['pred_goals_home'][i], x['pred_goals_away'][i]) for i in range(0, x.shape[0])
    ]
    ).loc[
        :, ['home_team', 'away_team', 'prob_home_win', 'prob_away_win', 'prob_draw']
    ]
    return(pred)


In [195]:
#validate predictions for basic Bradley-Terry model


import math                                     # for math.log()
from sklearn.model_selection import KFold

'''
To valid our projections, we're going to use k-fold cross-validation. This works by partitioning
our dataset into k equal-sized subsets (called folds). For each fold, we hold out that fold and
train the model using all other folds. Then we evaluate how well our predictions for the held-out
fold compare with the actual results in the held-out fold (hence it's an out-of-sample validation).
'''

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up a dataframe to hold cross-validation results
cv_data = pd.DataFrame()
'''
        Create a column prob that reflects the predicted probability of the outcome that happened.
        For example, if the home team won and prob_home_win = 0.5, the value of prob would be 0.5.
'''
for train_idx, test_idx in kf.split(data):

    pred = train_and_predict_goals(data.iloc[train_idx])

    test = data.iloc[test_idx]
    
    # Calculate the log of the predicted probability for the outcome that occurred
    cv_data_k = pd.merge(test, pred, on = ['home_team', 'away_team']).assign(
        #determine which outcome occured
        home_win = lambda x: (x['home_goals'] > x['away_goals']),
        away_win = lambda x: (x['home_goals'] < x['away_goals']),
        draw = lambda x: (x['home_goals'] == x['away_goals']),
        #get the log-probability of the event that actually happened
        prob=lambda x: x['home_win'] * x['prob_home_win'] + x['away_win'] * x['prob_away_win'] + x['draw'] * x['prob_draw'],
        log_prob=lambda x: [math.log(p) for p in x['prob']]
    ).loc[
        :, ['Date', 'home_team', 'away_team', 'log_prob']
    ]
   
    cv_data = pd.concat(objs=[cv_data, cv_data_k])

print(np.mean(cv_data['log_prob']))


-1.2154901281012374


In [199]:
#functionalize model based on expected goals instead of raw goals

def train_and_predict_xgoals(data):
    '''Train a Poisson Bradley-Terry model and produce predictions

    Args:
        data (pandas df): dataframe with cols 'Home', 'Away', 'Home_xG', 'Away_xG'

    Returns:
        pred (pandas df): dataframe with cols 'Home', 'Away',
            'prob_home_win', 'prob_away_win', 'prob_draw'
    '''
    # Wrangle data for the Bradley-Terry model. We need two rows for every game, one for expected goals 
    # by the home team and one for expected goals by the away team.
    goal_model_data = pd.concat(
    objs=[
        data[['home_team','away_team','Home_xG']].assign(
            home=1
        ).rename(
            columns={'home_team':'offense', 'away_team':'defense', 'Home_xG':'xGoals'}
        ),
        data[['away_team','home_team','Away_xG']].assign(
            home=0
        ).rename(
            columns={'away_team': 'offense', 'home_team':'defense', 'Away_xG':'xGoals'}
            )
        ]
    )
 
    
    #Fit a Poisson Bradley-Terry model to predict number of goals scored using home, offense and defense
    poisson_model = smf.glm(formula="xGoals ~ home + offense + defense", data=goal_model_data, 
                        family=sm.families.Poisson()).fit()
    all_teams = np.unique(data['home_team'])
    
    # Create a dataframe with all possible combinations of home (0/1), offense and defense
    pred_data = pd.DataFrame(
        product([1, 0], all_teams, all_teams),
        columns=['home', 'offense', 'defense']
    ).query(
        'offense != defense'    # remove rows where the same team is the offense and the defense
    )

    pred_raw = pred_data.assign(
        pred_goals=poisson_model.predict(exog=pred_data)
    )

    pred_home = pred_raw.query(
        'home == 1'
    ).rename(
        columns={'offense':'home_team', 'defense':'away_team', 'pred_goals':'pred_goals_home'}
    ).loc[
        :, ['home_team', 'away_team', 'pred_goals_home']
    ]

    pred_away = pred_raw.query(
        'home == 0'
    ).rename(
        columns={'defense':'home_team', 'offense':'away_team', 'pred_goals':'pred_goals_away'}
    ).loc[
        :, ['home_team', 'away_team', 'pred_goals_away']
    ]
    
    pred = pd.merge(pred_home, pred_away, on=['home_team', 'away_team']).assign(

    prob_home_win=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is positive
        sum(skellam.pmf(range(1, 10), x['pred_goals_home'][i], x['pred_goals_away'][i])) for i in range(0, x.shape[0])],
    prob_away_win=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is negative
        sum(skellam.pmf(range(-9, 0), x['pred_goals_home'][i], x['pred_goals_away'][i])) for i in range(0, x.shape[0])],
    prob_draw=lambda x: [
        # Calculate the probability that the Skellam-distributed difference is exactly zero
        skellam.pmf(0, x['pred_goals_home'][i], x['pred_goals_away'][i]) for i in range(0, x.shape[0])
    ]
    ).loc[
        :, ['home_team', 'away_team', 'prob_home_win', 'prob_away_win', 'prob_draw']
    ]
    return(pred)


In [201]:
#validate predictions for improved model using expected goals


import math                                     # for math.log()
from sklearn.model_selection import KFold

'''
To valid our projections, we're going to use k-fold cross-validation. This works by partitioning
our dataset into k equal-sized subsets (called folds). For each fold, we hold out that fold and
train the model using all other folds. Then we evaluate how well our predictions for the held-out
fold compare with the actual results in the held-out fold (hence it's an out-of-sample validation).
'''

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up a dataframe to hold cross-validation results
cv_data = pd.DataFrame()
'''
        Create a column prob that reflects the predicted probability of the outcome that happened.
        For example, if the home team won and prob_home_win = 0.5, the value of prob would be 0.5.
'''
for train_idx, test_idx in kf.split(data):

    pred = train_and_predict_xgoals(data.iloc[train_idx])

    test = data.iloc[test_idx]
    
    # Calculate the log of the predicted probability for the outcome that occurred
    cv_data_k = pd.merge(test, pred, on = ['home_team', 'away_team']).assign(
        #determine which outcome occured
        home_win = lambda x: (x['home_goals'] > x['away_goals']),
        away_win = lambda x: (x['home_goals'] < x['away_goals']),
        draw = lambda x: (x['home_goals'] == x['away_goals']),
        #get the log-probability of the event that actually happened
        prob=lambda x: x['home_win'] * x['prob_home_win'] + x['away_win'] * x['prob_away_win'] + x['draw'] * x['prob_draw'],
        log_prob=lambda x: [math.log(p) for p in x['prob']]
    ).loc[
        :, ['Date', 'home_team', 'away_team', 'log_prob']
    ]
   
    cv_data = pd.concat(objs=[cv_data, cv_data_k])

print(np.mean(cv_data['log_prob']))


-1.097331821431348


In [256]:
#download predicitons
pred_nwsl = train_and_predict_xgoals(nwsl_data)
pred_nwsl.to_csv('pred_nwsl.csv')

In [None]:
#MLS DATA

In [247]:
#read in mls data
mls_data = pd.read_csv('MLS.csv', skiprows=[0])
mls_data.columns = mls_data.iloc[0]
mls_data = mls_data.drop([0])
mls_data

Unnamed: 0,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
1,Sat,2023-02-25,15:30,Nashville,1.3,2–0,0.4,NYCFC,28051,Geodis Park,Armando Villarreal,Match Report,
2,Sat,2023-02-25,19:30 (18:30),FC Cincinnati,1.7,2–1,1.4,Dynamo FC,25513,TQL Stadium,Chris Penso,Match Report,
3,Sat,2023-02-25,19:30,FC Dallas,0.9,0–1,0.8,Minnesota Utd,19096,Toyota Stadium,Ramy Touchan,Match Report,
4,Sat,2023-02-25,19:30 (18:30),Atlanta Utd,1.9,2–1,1.2,San Jose,67538,Mercedes-Benz Stadium,Jon Freemon,Match Report,
5,Sat,2023-02-25,19:30 (18:30),Philadelphia,3.2,4–1,0.6,Columbus Crew,18510,Subaru Park,Lukasz Szpala,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,Sat,2023-10-21,18:00 (20:00),Portland Timbers,,,,Dynamo FC,,Providence Park,,Head-to-Head,
509,Sat,2023-10-21,18:00 (20:00),Vancouver,,,,Los Angeles FC,,BC Place Stadium,,Head-to-Head,
510,Sat,2023-10-21,19:00 (20:00),Colorado Rapids,,,,Real Salt Lake,,Dick's Sporting Goods Park,,Head-to-Head,
511,Sat,2023-10-21,20:00,St. Louis,,,,Seattle,,Citypark,,Head-to-Head,


In [248]:
#clean the data, dropping empty rows, empty columns, and extra rows specifying column name
mls_data = mls_data.dropna(how='all').reset_index(drop=True)
mls_data = mls_data.drop(mls_data[mls_data['Notes']=='Notes'].index).reset_index(drop=True)
column_names = list(mls_data.columns)
column_names[3] = 'home_team'
column_names[7]='away_team'
column_names[4]= 'Home_xG'
column_names[5]='Score'
column_names[6] = 'Away_xG'
column_names[8]='Attendance'
mls_data.columns = column_names

mls_data

Unnamed: 0,Day,Date,Time,home_team,Home_xG,Score,Away_xG,away_team,Attendance,Venue,Referee,Match Report,Notes
0,Sat,2023-02-25,15:30,Nashville,1.3,2–0,0.4,NYCFC,28051,Geodis Park,Armando Villarreal,Match Report,
1,Sat,2023-02-25,19:30 (18:30),FC Cincinnati,1.7,2–1,1.4,Dynamo FC,25513,TQL Stadium,Chris Penso,Match Report,
2,Sat,2023-02-25,19:30,FC Dallas,0.9,0–1,0.8,Minnesota Utd,19096,Toyota Stadium,Ramy Touchan,Match Report,
3,Sat,2023-02-25,19:30 (18:30),Atlanta Utd,1.9,2–1,1.2,San Jose,67538,Mercedes-Benz Stadium,Jon Freemon,Match Report,
4,Sat,2023-02-25,19:30 (18:30),Philadelphia,3.2,4–1,0.6,Columbus Crew,18510,Subaru Park,Lukasz Szpala,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,Sat,2023-10-21,18:00 (20:00),Portland Timbers,,,,Dynamo FC,,Providence Park,,Head-to-Head,
489,Sat,2023-10-21,18:00 (20:00),Vancouver,,,,Los Angeles FC,,BC Place Stadium,,Head-to-Head,
490,Sat,2023-10-21,19:00 (20:00),Colorado Rapids,,,,Real Salt Lake,,Dick's Sporting Goods Park,,Head-to-Head,
491,Sat,2023-10-21,20:00,St. Louis,,,,Seattle,,Citypark,,Head-to-Head,


In [253]:
#create validation dataframe excluding games not played yet
validate_mls = mls_data.copy()
# Filter out rows where the 'Score' and '' column is not NaN
validate_mls = validate_mls.dropna(subset=['Score'])
# Extract integer home_goals and away_goals from score string
validate_mls = validate_mls.assign(
home_goals=validate_mls['Score'].str[0].astype(int),
away_goals=validate_mls['Score'].str[2].astype(int)
)
validate_mls['Home_xG'] = validate_mls['Home_xG'].astype(float)
validate_mls['Away_xG'] = validate_mls['Away_xG'].astype(float)

validate_mls

Unnamed: 0,Day,Date,Time,home_team,Home_xG,Score,Away_xG,away_team,Attendance,Venue,Referee,Match Report,Notes,home_goals,away_goals
0,Sat,2023-02-25,15:30,Nashville,1.3,2–0,0.4,NYCFC,28051,Geodis Park,Armando Villarreal,Match Report,,2,0
1,Sat,2023-02-25,19:30 (18:30),FC Cincinnati,1.7,2–1,1.4,Dynamo FC,25513,TQL Stadium,Chris Penso,Match Report,,2,1
2,Sat,2023-02-25,19:30,FC Dallas,0.9,0–1,0.8,Minnesota Utd,19096,Toyota Stadium,Ramy Touchan,Match Report,,0,1
3,Sat,2023-02-25,19:30 (18:30),Atlanta Utd,1.9,2–1,1.2,San Jose,67538,Mercedes-Benz Stadium,Jon Freemon,Match Report,,2,1
4,Sat,2023-02-25,19:30 (18:30),Philadelphia,3.2,4–1,0.6,Columbus Crew,18510,Subaru Park,Lukasz Szpala,Match Report,,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,Sat,2023-09-02,19:30 (21:30),San Jose,1.7,1–1,1.9,Minnesota Utd,16151,PayPal Park,Alexis Da Silva,Match Report,,1,1
383,Sat,2023-09-02,19:30 (21:30),Seattle,1.8,2–2,1.1,Portland Timbers,37031,Lumen Field,Jon Freemon,Match Report,,2,2
384,Sun,2023-09-03,19:30 (18:30),Philadelphia,2.0,4–1,1.3,NY Red Bulls,19361,Subaru Park,Rubiel Vazquez,Match Report,,4,1
385,Sat,2023-09-02,19:30,Nashville,1.4,1–1,0.4,Charlotte,27902,Geodis Park,Alex Chilowicz,Match Report,,1,1


In [254]:
#validate predictions for basic MLS Bradley-Terry model using goals 


import math                                     # for math.log()
from sklearn.model_selection import KFold

'''
To valid our projections, we're going to use k-fold cross-validation. This works by partitioning
our dataset into k equal-sized subsets (called folds). For each fold, we hold out that fold and
train the model using all other folds. Then we evaluate how well our predictions for the held-out
fold compare with the actual results in the held-out fold (hence it's an out-of-sample validation).
'''

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up a dataframe to hold cross-validation results
cv_data = pd.DataFrame()
'''
        Create a column prob that reflects the predicted probability of the outcome that happened.
        For example, if the home team won and prob_home_win = 0.5, the value of prob would be 0.5.
'''
for train_idx, test_idx in kf.split(validate_mls):

    pred = train_and_predict_goals(validate_mls.iloc[train_idx])

    test = validate_mls.iloc[test_idx]
    
    # Calculate the log of the predicted probability for the outcome that occurred
    cv_data_k = pd.merge(test, pred, on = ['home_team', 'away_team']).assign(
        #determine which outcome occured
        home_win = lambda x: (x['home_goals'] > x['away_goals']),
        away_win = lambda x: (x['home_goals'] < x['away_goals']),
        draw = lambda x: (x['home_goals'] == x['away_goals']),
        #get the log-probability of the event that actually happened
        prob=lambda x: x['home_win'] * x['prob_home_win'] + x['away_win'] * x['prob_away_win'] + x['draw'] * x['prob_draw'],
        log_prob=lambda x: [math.log(p) for p in x['prob']]
    ).loc[
        :, ['Date', 'home_team', 'away_team', 'log_prob']
    ]
   
    cv_data = pd.concat(objs=[cv_data, cv_data_k])

print(np.mean(cv_data['log_prob']))


-1.0759520019564595


In [255]:
#validate predictions for improved MLS Bradley-Terry model using expected goals 


import math                                     # for math.log()
from sklearn.model_selection import KFold

'''
To valid our projections, we're going to use k-fold cross-validation. This works by partitioning
our dataset into k equal-sized subsets (called folds). For each fold, we hold out that fold and
train the model using all other folds. Then we evaluate how well our predictions for the held-out
fold compare with the actual results in the held-out fold (hence it's an out-of-sample validation).
'''

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up a dataframe to hold cross-validation results
cv_data = pd.DataFrame()
'''
        Create a column prob that reflects the predicted probability of the outcome that happened.
        For example, if the home team won and prob_home_win = 0.5, the value of prob would be 0.5.
'''
for train_idx, test_idx in kf.split(validate_mls):

    pred = train_and_predict_xgoals(validate_mls.iloc[train_idx])

    test = validate_mls.iloc[test_idx]
    
    # Calculate the log of the predicted probability for the outcome that occurred
    cv_data_k = pd.merge(test, pred, on = ['home_team', 'away_team']).assign(
        #determine which outcome occured
        home_win = lambda x: (x['home_goals'] > x['away_goals']),
        away_win = lambda x: (x['home_goals'] < x['away_goals']),
        draw = lambda x: (x['home_goals'] == x['away_goals']),
        #get the log-probability of the event that actually happened
        prob=lambda x: x['home_win'] * x['prob_home_win'] + x['away_win'] * x['prob_away_win'] + x['draw'] * x['prob_draw'],
        log_prob=lambda x: [math.log(p) for p in x['prob']]
    ).loc[
        :, ['Date', 'home_team', 'away_team', 'log_prob']
    ]
   
    cv_data = pd.concat(objs=[cv_data, cv_data_k])

print(np.mean(cv_data['log_prob']))


-1.052318572188805


In [257]:
#download predicitons
pred_mls = train_and_predict_xgoals(validate_mls)
pred_mls.to_csv('pred_mls.csv')