<a href="https://colab.research.google.com/github/mndore/football_regression/blob/main/football_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Football Regression on FiveThirtyEight Premier League Data
Reading the data from FiveThirtyEight

In [65]:
import pandas as pd
import numpy as np
import sklearn as sk

full_data = pd.read_csv('https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv')
full_data.head()

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016,2016-07-09,7921,FA Women's Super League,Liverpool Women,Reading,51.56,50.42,0.4389,0.2767,...,,,2.0,0.0,,,,,,
1,2016,2016-07-10,7921,FA Women's Super League,Arsenal Women,Notts County Ladies,46.61,54.03,0.3572,0.3608,...,,,2.0,0.0,,,,,,
2,2016,2016-07-10,7921,FA Women's Super League,Chelsea FC Women,Birmingham City,59.85,54.64,0.4799,0.2487,...,,,1.0,1.0,,,,,,
3,2016,2016-07-16,7921,FA Women's Super League,Liverpool Women,Notts County Ladies,53.0,52.35,0.4289,0.2699,...,,,0.0,0.0,,,,,,
4,2016,2016-07-17,7921,FA Women's Super League,Chelsea FC Women,Arsenal Women,59.43,60.99,0.4124,0.3157,...,,,1.0,2.0,,,,,,


## Preprocessing the data

In [66]:
# Remove games with no score
played_games = full_data[full_data['score1'].notnull()]

# Get only games from the Premier League
played_games = played_games[played_games['league'] == 'Barclays Premier League']

# Hot encode the team names, season, and the league and add to df
played_games = pd.concat([played_games, pd.get_dummies(played_games['league'])], axis=1)
played_games = pd.concat([played_games, pd.get_dummies(played_games['team1'])], axis=1)
played_games = pd.concat([played_games, pd.get_dummies(played_games['team2'])], axis=1)
played_games = pd.concat([played_games, pd.get_dummies(played_games['season'])], axis=1)

# Drop NaN values
played_games = played_games.dropna()

## Normalize the data

In [67]:
# Perform z-score normalization on the following columns 'spi1', 'spi2', 'prob1', 'prob2', 'probtie', 'proj_score1', 'proj_score2'
def z_score_normalization(df, column):
    df[column] = (df[column] - df[column].mean()) / df[column].std()
    return df

columns = ['spi1', 'spi2', 'prob1', 'prob2', 'probtie', 'proj_score1', 'proj_score2']
for column in columns:
    played_games = z_score_normalization(played_games, column)

## Create the train test split

In [68]:
from sklearn.model_selection import train_test_split

# Train test split using sklearn
X = played_games.drop(['date', 'league', 'team1', 'team2', 'score1', 'score2', 'xg1', 'xg2', 'nsxg1', 'nsxg2', 'adj_score1', 'adj_score2', 'season', 'league_id'], axis=1)
y = played_games[['score1', 'score2']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.columns = X_train.columns.astype(str)

X.head()

Unnamed: 0,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,Barclays Premier League,...,West Bromwich Albion,West Ham United,Wolverhampton,2016,2017,2018,2019,2020,2021,2022
12,-1.493717,-0.34815,-0.505824,0.236361,1.147953,-0.766964,0.002413,38.1,22.2,1,...,0,0,0,1,0,0,0,0,0,0
13,-1.353483,-1.055253,-0.12492,-0.13708,0.999424,-0.401949,-0.193114,43.6,34.6,1,...,1,0,0,1,0,0,0,0,0,0
14,-0.242864,0.210592,-0.27829,0.115896,0.682294,-0.171413,0.27615,31.9,48.0,1,...,0,0,0,1,0,0,0,0,0,0
15,-1.025405,-0.961551,0.010288,-0.288209,1.013474,-0.363526,-0.369088,36.5,29.1,1,...,0,0,0,1,0,0,0,0,0,0
16,-0.115615,-0.997123,0.654545,-0.720239,0.035992,0.673885,-0.369088,34.1,30.7,1,...,0,0,0,1,0,0,0,0,0,0


## Train the model
A multi-layer perceptron with two outputs, one for the number of goals scored by the home team and one for the number of goals scored by the away team.


In [74]:
# Import the necessary libraries
import warnings
from sklearn.neural_network import MLPRegressor
warnings.filterwarnings("ignore")

# Create the model
model = MLPRegressor(hidden_layer_sizes=(81, 100, 45), activation='relu', solver='adam', max_iter=10000)

# Fit the model
model.fit(X_train, y_train)

# Predict the values
y_pred = model.predict(X_test)

# Calculate the accuracy of the model using the R^2 score
model.score(X_test, y_test)

-0.26577924033562406