# Project 2: Predict a football match outcome

In [55]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [56]:
df = pd.read_csv('~/Documents/GitHub/complete-football-analytics/Projects/project2.csv')

In [57]:
df.head()

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
0,1.0,Fri,2018-08-17,20:15,Girona,0.7,0–0,0.0,Valladolid,10368.0,Estadi Municipal de Montilivi,Guillermo Cuadra
1,1.0,Fri,2018-08-17,22:15,Betis,0.9,0–3,2.5,Levante,46225.0,Estadio Benito Villamarín,Ignacio Iglesias
2,1.0,Sat,2018-08-18,18:15,Celta Vigo,0.6,1–1,0.6,Espanyol,16215.0,Estadio de Balaídos,Santiago Jaime
3,1.0,Sat,2018-08-18,20:15,Villarreal,1.6,1–2,0.7,Real Sociedad,16250.0,Estadio de la Cerámica,Mario Melero
4,1.0,Sat,2018-08-18,22:15,Barcelona,3.2,3–0,0.3,Alavés,52356.0,Camp Nou,José Sánchez


In [5]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
1029,24.0,Tue,2020-03-10,20:00,Eibar,2.0,1–2,1.2,Real Sociedad,,Estadio Municipal de Ipurúa,David Medié
1030,28.0,Thu,2020-06-11,22:00,Sevilla,1.7,2–0,0.5,Betis,,Estadio Ramón Sánchez Pizjuán,Antonio Matéu Lahoz
1031,28.0,Fri,2020-06-12,19:30,Granada,2.3,2–1,0.6,Getafe,,Estadio Nuevo Los Cármenes,David Medié
1032,28.0,Fri,2020-06-12,22:00,Valencia,1.0,1–1,1.4,Levante,,Estadio de Mestalla,Alberola Rojas
1033,28.0,Sat,2020-06-13,14:00,Espanyol,2.5,2–0,0.1,Alavés,,RCDE Stadium,Pablo González
...,...,...,...,...,...,...,...,...,...,...,...,...
4555,38.0,Sun,2024-05-19,16:00,Brentford,,,,Newcastle Utd,,Gtech Community Stadium,
4556,38.0,Sun,2024-05-19,16:00,Chelsea,,,,Bournemouth,,Stamford Bridge,
4557,38.0,Sun,2024-05-19,16:00,Crystal Palace,,,,Aston Villa,,Selhurst Park,
4558,38.0,Sun,2024-05-19,16:00,Liverpool,,,,Wolves,,Anfield,


In [6]:
df[df['Score'].isnull()]

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
4110,32.0,Fri,2024-04-19,21:00,Athletic Club,,,,Granada,,San Mamés,
4111,32.0,Sat,2024-04-20,14:00,Celta Vigo,,,,Las Palmas,,Estadio de Balaídos,
4112,32.0,Sat,2024-04-20,16:15,Rayo Vallecano,,,,Osasuna,,Estadio del Rayo Vallecano,
4113,32.0,Sat,2024-04-20,18:30,Valencia,,,,Betis,,Estadio de Mestalla,
4114,32.0,Sat,2024-04-20,21:00,Girona,,,,Cádiz,,Estadi Municipal de Montilivi,
...,...,...,...,...,...,...,...,...,...,...,...,...
4555,38.0,Sun,2024-05-19,16:00,Brentford,,,,Newcastle Utd,,Gtech Community Stadium,
4556,38.0,Sun,2024-05-19,16:00,Chelsea,,,,Bournemouth,,Stamford Bridge,
4557,38.0,Sun,2024-05-19,16:00,Crystal Palace,,,,Aston Villa,,Selhurst Park,
4558,38.0,Sun,2024-05-19,16:00,Liverpool,,,,Wolves,,Anfield,


In [7]:
# Drop rows where there are missing values in 'Score', 'xG', 'xG.1', and 'Referee'
df = df.dropna(subset=['Score', 'xG', 'xG.1', 'Referee'])

# We'll also choose to drop the 'Attendance' column if not used for prediction
# Drop 'Attendance' and 'Time' columns as they may not be essential for the initial model
df = df.drop(columns=['Attendance', 'Time'])

In [8]:
df.head()

Unnamed: 0,Wk,Day,Date,Home,xG,Score,xG.1,Away,Venue,Referee
0,1.0,Fri,2018-08-17,Girona,0.7,0–0,0.0,Valladolid,Estadi Municipal de Montilivi,Guillermo Cuadra
1,1.0,Fri,2018-08-17,Betis,0.9,0–3,2.5,Levante,Estadio Benito Villamarín,Ignacio Iglesias
2,1.0,Sat,2018-08-18,Celta Vigo,0.6,1–1,0.6,Espanyol,Estadio de Balaídos,Santiago Jaime
3,1.0,Sat,2018-08-18,Villarreal,1.6,1–2,0.7,Real Sociedad,Estadio de la Cerámica,Mario Melero
4,1.0,Sat,2018-08-18,Barcelona,3.2,3–0,0.3,Alavés,Camp Nou,José Sánchez


In [9]:
# Split the 'Score' column into 'HomeGoals' and 'AwayGoals' columns
df[['home_goals', 'away_goals']] = df['Score'].str.split('–', expand=True).astype(float)

In [10]:
# Create a season_start column 
df['Date'] = pd.to_datetime(df['Date'])
df['season_start'] = df['Date'].apply(lambda x: x.year - 1 if x.month < 8 else x.year)

In [11]:
# Create a target variable for the match result
def determine_result(row):
    if row['home_goals'] > row['away_goals']:
        return 'Home win'
    elif row['home_goals'] < row['away_goals']:
        return 'Away win'
    else:
        return 'Draw'

df['result'] = df.apply(determine_result, axis=1)

In [12]:
# Encode 'Day' as a categorical feature using pd.get_dummies
df['Day'] = df['Date'].dt.day_name()
df = pd.get_dummies(df, columns=['Day'])

# Display the updated dataset with new features
df.head()

Unnamed: 0,Wk,Date,Home,xG,Score,xG.1,Away,Venue,Referee,home_goals,away_goals,season_start,result,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,1.0,2018-08-17,Girona,0.7,0–0,0.0,Valladolid,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018,Draw,True,False,False,False,False,False,False
1,1.0,2018-08-17,Betis,0.9,0–3,2.5,Levante,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0,2018,Away win,True,False,False,False,False,False,False
2,1.0,2018-08-18,Celta Vigo,0.6,1–1,0.6,Espanyol,Estadio de Balaídos,Santiago Jaime,1.0,1.0,2018,Draw,False,False,True,False,False,False,False
3,1.0,2018-08-18,Villarreal,1.6,1–2,0.7,Real Sociedad,Estadio de la Cerámica,Mario Melero,1.0,2.0,2018,Away win,False,False,True,False,False,False,False
4,1.0,2018-08-18,Barcelona,3.2,3–0,0.3,Alavés,Camp Nou,José Sánchez,3.0,0.0,2018,Home win,False,False,True,False,False,False,False


In [13]:
# Let's create some new features
df.reset_index(drop=True, inplace=True)
df.sort_values(['Date'], inplace=True)

In [14]:
for x in df.Home.unique():
    temp_df = df[(df['Home'] == x) | (df['Away'] == x)]
    break

In [15]:
temp_df

Unnamed: 0,Wk,Date,Home,xG,Score,xG.1,Away,Venue,Referee,home_goals,away_goals,season_start,result,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
380,1.0,2018-08-10,Manchester Utd,1.5,2–1,1.8,Leicester City,Old Trafford,Andre Marriner,2.0,1.0,2018,Home win,True,False,False,False,False,False,False
398,2.0,2018-08-19,Brighton,1.7,3–2,1.4,Manchester Utd,The American Express Community Stadium,Kevin Friend,3.0,2.0,2018,Home win,False,False,False,True,False,False,False
409,3.0,2018-08-27,Manchester Utd,1.5,0–3,1.2,Tottenham,Old Trafford,Craig Pawson,0.0,3.0,2018,Away win,False,True,False,False,False,False,False
419,4.0,2018-09-02,Burnley,0.8,0–2,2.5,Manchester Utd,Turf Moor,Jonathan Moss,0.0,2.0,2018,Away win,False,False,False,True,False,False,False
426,5.0,2018-09-15,Watford,1.3,1–2,1.9,Manchester Utd,Vicarage Road Stadium,Mike Dean,1.0,2.0,2018,Away win,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378,28.0,2024-03-09,Manchester Utd,2.7,2–0,1.6,Everton,Old Trafford,Simon Hooper,2.0,0.0,2023,Home win,False,False,True,False,False,False,False
4400,30.0,2024-03-30,Brentford,3.2,1–1,0.6,Manchester Utd,Gtech Community Stadium,Simon Hooper,1.0,1.0,2023,Draw,False,False,True,False,False,False,False
4412,31.0,2024-04-04,Chelsea,3.0,4–3,1.5,Manchester Utd,Stamford Bridge,Jarred Gillett,4.0,3.0,2023,Home win,False,False,False,False,True,False,False
4420,32.0,2024-04-07,Manchester Utd,0.7,2–2,3.6,Liverpool,Old Trafford,Anthony Taylor,2.0,2.0,2023,Draw,False,False,False,True,False,False,False


In [16]:
# Update the rolling average goals for the home team in the main dataset
for x in df.Home.unique():
    temp_df = df[(df['Home'] == x) | (df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])
    
    temp_df['goal_value_to_calculate'] = temp_df.apply(lambda y: y['home_goals'] if y['Home'] == x else y['away_goals'], axis=1)
    temp_df['rolling_avg_goals'] = temp_df['goal_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()
    
    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            df.at[index, 'home_rolling_avg_goals'] = row['rolling_avg_goals']
        else:
            df.at[index, 'away_rolling_avg_goals'] = row['rolling_avg_goals']

In [17]:
df[(df['Home'] == 'Brighton') | (df['Away'] == 'Brighton')][['Wk', 'Date', 'Home', 'Away', 'home_goals', 'away_goals','home_rolling_avg_goals', 'away_rolling_avg_goals']]

Unnamed: 0,Wk,Date,Home,Away,home_goals,away_goals,home_rolling_avg_goals,away_rolling_avg_goals
384,1.0,2018-08-11,Watford,Brighton,2.0,0.0,,
398,2.0,2018-08-19,Brighton,Manchester Utd,3.0,2.0,0.0,2.000000
405,3.0,2018-08-25,Liverpool,Brighton,1.0,0.0,3.0,1.500000
412,4.0,2018-09-01,Brighton,Fulham,2.0,2.0,1.0,1.666667
429,5.0,2018-09-17,Southampton,Brighton,2.0,2.0,1.0,1.250000
...,...,...,...,...,...,...,...,...
4385,28.0,2024-03-10,Brighton,Nott'ham Forest,1.0,0.0,2.2,1.400000
4401,30.0,2024-03-31,Liverpool,Brighton,2.0,1.0,2.6,1.600000
4409,31.0,2024-04-03,Brentford,Brighton,0.0,0.0,1.4,1.600000
4419,32.0,2024-04-06,Brighton,Arsenal,0.0,3.0,0.6,2.800000


In [18]:
# Do the same thing for xG
for x in df.Home.unique():
    temp_df = df[(df['Home'] == x) | (df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])
    
    temp_df['xG_value_to_calculate'] = temp_df.apply(lambda y: y['xG'] if y['Home'] == x else y['xG.1'], axis=1)
    temp_df['rolling_avg_xG'] = temp_df['xG_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()
    
    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            df.at[index, 'home_rolling_avg_xG'] = row['rolling_avg_xG']
        else:
            df.at[index, 'away_rolling_avg_xG'] = row['rolling_avg_xG']

In [19]:
# One last bit of clean up is to drop the rows where the rolling averages are null
df = df.dropna(subset=['home_rolling_avg_goals', 'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG'])

In [20]:
df[(df['Home'] == 'Brighton') | (df['Away'] == 'Brighton')][['Date', 'Home', 'Away', 'xG', 'xG.1','home_rolling_avg_xG', 'away_rolling_avg_xG']]

Unnamed: 0,Date,Home,Away,xG,xG.1,home_rolling_avg_xG,away_rolling_avg_xG
398,2018-08-19,Brighton,Manchester Utd,1.7,1.4,0.300000,1.50
405,2018-08-25,Liverpool,Brighton,1.6,0.6,3.300000,1.00
412,2018-09-01,Brighton,Fulham,2.8,1.6,0.866667,1.20
429,2018-09-17,Southampton,Brighton,2.0,1.4,1.500000,1.35
437,2018-09-22,Brighton,Tottenham,0.8,1.9,1.360000,1.54
...,...,...,...,...,...,...,...
4385,2024-03-10,Brighton,Nott'ham Forest,0.4,0.7,1.640000,1.68
4401,2024-03-31,Liverpool,Brighton,2.8,0.5,2.720000,1.44
4409,2024-04-03,Brentford,Brighton,0.6,1.1,1.600000,1.30
4419,2024-04-06,Brighton,Arsenal,0.5,3.4,0.940000,1.56


In [28]:
# Let's train a model to predict the match result
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [25]:
print(df.columns)
df.drop(columns=['Date', 'xG', 'xG.1', 'Home', 'Away', 'Referee', 'Venue', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns

Index(['Wk', 'Date', 'Home', 'xG', 'Score', 'xG.1', 'Away', 'Venue', 'Referee',
       'home_goals', 'away_goals', 'season_start', 'result', 'Day_Friday',
       'Day_Monday', 'Day_Saturday', 'Day_Sunday', 'Day_Thursday',
       'Day_Tuesday', 'Day_Wednesday', 'home_rolling_avg_goals',
       'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG'],
      dtype='object')


Index(['Wk', 'Venue', 'Day_Friday', 'Day_Monday', 'Day_Saturday', 'Day_Sunday',
       'Day_Thursday', 'Day_Tuesday', 'Day_Wednesday',
       'home_rolling_avg_goals', 'away_rolling_avg_goals',
       'home_rolling_avg_xG', 'away_rolling_avg_xG'],
      dtype='object')

In [30]:
# Define the features and target variable
features = [column for column in df.drop(columns=['Date', 'xG', 'xG.1', 'Home', 'Away', 'Referee', 'Venue', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns]

# Split the data into train and test sets
train_data = df[df['season_start'] <= 2022]
test_data = df[df['season_start'] == 2023]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']


In [31]:
# Train a Random Forest model
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)

In [32]:
# Make predictions
predictions = clf.predict(X_test)

In [33]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.45166402535657685

In [34]:
# confusion matrix
confusion_matrix(y_test, predictions)

array([[ 76,  22,  86],
       [ 50,  16,  98],
       [ 65,  25, 193]])

In [35]:
# Our baseline model has an accuracy of
df['result'].value_counts(normalize=True)

result
Home win    0.445884
Away win    0.303547
Draw        0.250568
Name: proportion, dtype: float64

So based on the baseline model, we can see that our model is right around a baseline model
This baseline model predicts the majority class, which is 'Home win', for all the matches
We can try to improve the model by adding more features, tuning the hyperparameters, or using a different model

In [36]:
# Let's first try to tune the hyperparameters
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [37]:
grid_search.best_params_

{'max_depth': 5, 'n_estimators': 200}

In [38]:
# Train a Random Forest model with the best hyperparameters
clf = RandomForestClassifier(random_state=1, n_estimators=200, max_depth=5)
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.49920760697305866

The accuracy of the model has increased after tuning the hyperparameters!
Let's try a adding in those features we ommitted earlier and see if we can improve the model further

In [39]:
df = pd.get_dummies(df, columns=['Home', 'Away', 'Referee', 'Venue'])
df.head()

Unnamed: 0,Wk,Date,xG,Score,xG.1,home_goals,away_goals,season_start,result,Day_Friday,...,Venue_The American Express Community Stadium,Venue_The City Ground,Venue_The Hawthorns,Venue_The John Smith's Stadium,Venue_Tottenham Hotspur Stadium,Venue_Turf Moor,Venue_Vicarage Road Stadium,Venue_Villa Park,Venue_Vitality Stadium,Venue_Wembley Stadium
395,2.0,2018-08-18,1.6,3–2,2.4,3.0,2.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,False
391,2.0,2018-08-18,1.2,1–2,1.3,1.0,2.0,2018,Away win,False,...,False,False,False,False,False,False,False,False,False,False
392,2.0,2018-08-18,1.5,2–1,1.7,2.0,1.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,False
393,2.0,2018-08-18,0.2,2–0,1.1,2.0,0.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,False
394,2.0,2018-08-18,2.9,3–1,1.6,3.0,1.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,True


In [40]:
# Define the features and target variable
features = [column for column in df.drop(columns=['Date', 'xG', 'xG.1', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns]

# Split the data into train and test sets
train_data = df[df['season_start'] <= 2022]
test_data = df[df['season_start'] == 2023]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

In [42]:
# Find the best hyperparameters for the Random Forest model
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'max_depth': 15, 'n_estimators': 50}

In [53]:
# Train a Random Forest model with the new features
clf = RandomForestClassifier(random_state=1, n_estimators=50, max_depth=15)
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.5150554675118859

Even more improvements! The accuracy of the model has increased after adding more features
We can continue to improve the model by adding more features, tuning the hyperparameters, or using a different model
The process of improving the model can be very iterative as you try different combinations of models, features, and hyperparameters
Let's try one last thing by seeing if we can improve the model by using a different model

In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

models = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=1),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, 20]
        },
        'accuracy': ''
    },
    'Gaussian Naive Bayes': {
        'model': GaussianNB(),
        'params': {},
        'accuracy': ''
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=1),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        },
        'accuracy': ''
    }
}

In [45]:
for model_name, model in models.items():
    grid_search = GridSearchCV(estimator=model['model'], param_grid=model['params'], cv=5)
    grid_search.fit(X_train, y_train)
    
    model['accuracy'] = grid_search.best_score_
    model['best_params'] = grid_search.best_params_

In [46]:
models

{'Random Forest': {'model': RandomForestClassifier(random_state=1),
  'params': {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15, 20]},
  'accuracy': 0.502257988382456,
  'best_params': {'max_depth': 15, 'n_estimators': 50}},
 'Gaussian Naive Bayes': {'model': GaussianNB(),
  'params': {},
  'accuracy': 0.4557955622250168,
  'best_params': {}},
 'Logistic Regression': {'model': LogisticRegression(random_state=1),
  'params': {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
  'accuracy': 0.4982785040210511,
  'best_params': {'C': 0.1, 'penalty': 'l2'}}}

In [None]:
# Now let's see how we can use to model to predict the outcome of a match
# We'll use the model to predict the outcome of a hypothetical match

In [47]:
home_team = 'Chelsea'
away_team = 'Manchester Utd'
referee = 'Anthony Taylor'

In [50]:
# Essentially, we need to create a row of data that represents the match we want to predict
# We'll use the features we used to train the model

# Create a row of data for the hypothetical match
data = {
    'Wk': [25],
    'home_rolling_avg_goals': [1.9],
    'away_rolling_avg_goals': [1.2],
    'home_rolling_avg_xG': [2.1],
    'away_rolling_avg_xG': [1.3],
    'Day_Saturday': [1],
    'Home_Chelsea': [1],
    'Away_Manchester Utd': [1],
    'Referee_Anthony Taylor': [1],
    'Venue_Stamford Bridge': [1]
}

match = pd.DataFrame(columns=X_train.columns, data=data)

match.fillna(0, inplace=True)

In [54]:
# Use the Random Forest model to predict the outcome of the match
prediction = clf.predict(match)
prediction

array(['Home win'], dtype=object)