# Predicting Soccer Match Results

As a fan of the English Premier League (EPL), particularly Tottenham Hotspur (Come On You Spurs!), I've always been obsessed with tracking the stats of different players and teams. In this project, I'm going to channel that obsession to see if I am able to predict the outcome of a match based on certain team stats. This project will involve a heavy amount of web scraping to pull in the data I need from [FBRef](fbref.com), as well as machine learning to then analyze and then make predictions off of that data.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

In [93]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [44]:
# The years I will use for scraping data
years = ['2021-2022','2020-2021','2019-2020','2018-2019','2017-2018']

In [45]:
# This list will hold my dataframes
dfs = []

In [46]:
domain = "https://fbref.com/"

In [47]:
# Creating a function that will pull all of the links to the individual teams' pages for each season
def pull_team_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # The top table is the one we want. It is the final season standings with all of the teams
    table = soup.table
    a_tags = table.find_all('a')
    links = []
    teams = []
    for tag in a_tags:
        if 'squads' in tag['href']:
            team = tag.text
            teams.append(team)
            link = tag['href']
            links.append(link)
    team_links = [link for link in links if 'squads' in link]
    usable_links = [domain + link for link in team_links]
    return teams, usable_links

In [48]:
# Scrape each team's matchlogs into a dataframe
def scrape_team(team, url):
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    matchlogs = soup.find("table", {"id": "matchlogs_for"})
    match_df = pd.read_html(str(matchlogs))[0]
    # I am now going to pull the shooting stats, which I will merge with the matchlogs I just pulled
    shooting_links = soup.find_all("a", text="Shooting")
    shooting_url = domain + shooting_links[0]['href']
    r_2 = requests.get(shooting_url)
    html_2 = r_2.text
    soup_2 = BeautifulSoup(html_2, 'html.parser')
    shooting_matchlogs = soup_2.find("table", {"id": "matchlogs_for"})
    shooting_df = pd.read_html(str(shooting_matchlogs))[0]
    shooting_df.columns = shooting_df.columns.droplevel()
    combined_df = match_df.join(shooting_df[['Sh', 'SoT', 'Dist','PKatt']], how='left')
    combined_df['Team'] = team
    combined_df['Season'] = year
    epl_df = combined_df.loc[combined_df['Comp']=='Premier League']
    return epl_df

In [49]:
for year in years:
    year_url = "https://fbref.com/en/comps/9/" + year + "/" + year + "-Premier-League-Stats"
    teams, links = pull_team_links(year_url)
    for team, link in zip(teams, links):
        df = scrape_team(team, link)
        dfs.append(df)
        time.sleep(random.randint(5,16))

In [50]:
master_df = dfs[0]
for df in dfs[1:]:
    master_df = master_df.append(df)
master_df.reset_index(inplace=True, drop=True)
master_df.shape

(3800, 25)

In [51]:
master_df.tail()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,PKatt,Team,Season
3795,2018-04-15,16:00,Premier League,Matchweek 34,Sun,Away,W,1,0,Manchester Utd,0.8,1.0,30.0,75095.0,Chris Brunt,4-4-1-1,Paul Tierney,Match Report,,10.0,4.0,18.8,0.0,West Brom,2017-2018
3796,2018-04-21,12:30,Premier League,Matchweek 35,Sat,Home,D,2,2,Liverpool,1.4,1.4,39.0,24520.0,Chris Brunt,4-4-1-1,Stuart Attwell,Match Report,,12.0,5.0,18.1,0.0,West Brom,2017-2018
3797,2018-04-28,15:00,Premier League,Matchweek 36,Sat,Away,W,1,0,Newcastle Utd,0.8,1.5,39.0,52283.0,Chris Brunt,4-4-1-1,David Coote,Match Report,,9.0,2.0,18.5,0.0,West Brom,2017-2018
3798,2018-05-05,15:00,Premier League,Matchweek 37,Sat,Home,W,1,0,Tottenham,1.4,1.0,24.0,23685.0,Chris Brunt,4-4-1-1,Mike Jones,Match Report,,10.0,1.0,10.4,0.0,West Brom,2017-2018
3799,2018-05-13,15:00,Premier League,Matchweek 38,Sun,Away,L,0,2,Crystal Palace,0.3,2.4,42.0,25357.0,Chris Brunt,4-4-1-1,Jonathan Moss,Match Report,,7.0,1.0,24.5,0.0,West Brom,2017-2018


In [52]:
master_df.to_csv('EPL_Stats.csv', index=False)

In [4]:
matches = pd.read_csv('EPL_Stats.csv')
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,PKatt,Team,Season
0,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.9,1.3,64.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,,18.0,4.0,16.9,0.0,Manchester City,2021-2022
1,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.7,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,,16.0,4.0,17.3,0.0,Manchester City,2021-2022
2,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,3.8,0.1,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,,25.0,10.0,14.3,0.0,Manchester City,2021-2022
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.9,0.8,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,,25.0,8.0,14.0,0.0,Manchester City,2021-2022
4,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,1.1,0.4,63.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,,16.0,1.0,15.7,0.0,Manchester City,2021-2022


In [5]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3800 entries, 0 to 3799
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          3800 non-null   object 
 1   Time          3800 non-null   object 
 2   Comp          3800 non-null   object 
 3   Round         3800 non-null   object 
 4   Day           3800 non-null   object 
 5   Venue         3800 non-null   object 
 6   Result        3800 non-null   object 
 7   GF            3800 non-null   int64  
 8   GA            3800 non-null   int64  
 9   Opponent      3800 non-null   object 
 10  xG            3800 non-null   float64
 11  xGA           3800 non-null   float64
 12  Poss          3800 non-null   float64
 13  Attendance    2920 non-null   float64
 14  Captain       3800 non-null   object 
 15  Formation     3800 non-null   object 
 16  Referee       3800 non-null   object 
 17  Match Report  3800 non-null   object 
 18  Notes         0 non-null    

In [6]:
matches[matches['Dist'].isnull()]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,PKatt,Team,Season
1258,2020-10-18,14:00,Premier League,Matchweek 5,Sun,Home,D,1,1,Brighton,0.8,1.1,34.0,,Gary Cahill,4-4-2,Stuart Attwell,Match Report,,0.0,0.0,,1.0,Crystal Palace,2020-2021
2802,2019-03-02,15:00,Premier League,Matchweek 29,Sat,Home,L,0,1,Manchester City,0.0,1.6,20.0,10699.0,Andrew Surman,3-4-3,Kevin Friend,Match Report,,0.0,0.0,,0.0,Bournemouth,2018-2019
3715,2018-03-10,15:00,Premier League,Matchweek 30,Sat,Away,D,0,0,Huddersfield,0.0,1.6,19.0,23567.0,Federico Fernández,3-4-3,Michael Oliver,Match Report,,0.0,0.0,,0.0,Swansea City,2017-2018


In [7]:
matches['Dist'] = matches['Dist'].fillna(matches.groupby('Team')['Dist'].transform('mean'))

In [8]:
match_stats = matches[['Date','Venue','Result','GF','GA','Opponent','xG','xGA','Poss','Sh','SoT','Dist','PKatt','Team','Referee']].copy()
match_stats.head()

Unnamed: 0,Date,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Sh,SoT,Dist,PKatt,Team,Referee
0,2021-08-15,Away,L,0,1,Tottenham,1.9,1.3,64.0,18.0,4.0,16.9,0.0,Manchester City,Anthony Taylor
1,2021-08-21,Home,W,5,0,Norwich City,2.7,0.1,67.0,16.0,4.0,17.3,0.0,Manchester City,Graham Scott
2,2021-08-28,Home,W,5,0,Arsenal,3.8,0.1,80.0,25.0,10.0,14.3,0.0,Manchester City,Martin Atkinson
3,2021-09-11,Away,W,1,0,Leicester City,2.9,0.8,61.0,25.0,8.0,14.0,0.0,Manchester City,Paul Tierney
4,2021-09-18,Home,D,0,0,Southampton,1.1,0.4,63.0,16.0,1.0,15.7,0.0,Manchester City,Jonathan Moss


I mostly only kept the actual stats of the game. I kept date for two reasons. First, because I am going to use that to join this table with itself, matching on date and opponent=team so I can have defensive stats as well as offensive. Basically, I want to consider not just the stats of the team playing, but also of the team they're playing against. Additionally, I kept referee because I will need to delete duplicates of the same game under different teams. For example, Arsenal vs Bournemouth is the same as Bournemouth vs Arsenal. The most accurate way to find that duplicate is by date and referee, as a referee cannot referee two games at once.

Before we get to that, first we need to create rolling averages to get a sense of how the teams are playing going into the game. Premier League usually uses 5 games as a form indicator, so I'll do the same. 

I also need date because in order to calculate the form each team is (the 5 game averages).

In [9]:
match_stats = match_stats.sort_values(['Team','Date'])
match_stats.head(40)

Unnamed: 0,Date,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Sh,SoT,Dist,PKatt,Team,Referee
3230,2017-08-11,Home,W,4,3,Leicester City,2.3,1.3,69.0,27.0,10.0,19.4,0.0,Arsenal,Mike Dean
3231,2017-08-19,Away,L,0,1,Stoke City,1.6,1.0,76.0,19.0,7.0,17.0,0.0,Arsenal,Andre Marriner
3232,2017-08-27,Away,L,0,4,Liverpool,0.7,3.0,51.0,8.0,0.0,16.4,0.0,Arsenal,Craig Pawson
3233,2017-09-09,Home,W,3,0,Bournemouth,2.0,0.9,58.0,17.0,9.0,15.3,0.0,Arsenal,Anthony Taylor
3234,2017-09-17,Away,D,0,0,Chelsea,1.4,0.8,48.0,11.0,2.0,16.4,0.0,Arsenal,Michael Oliver
3235,2017-09-25,Home,W,2,0,West Brom,2.4,0.7,69.0,15.0,5.0,18.7,1.0,Arsenal,Robert Madley
3236,2017-10-01,Home,W,2,0,Brighton,3.0,0.4,63.0,26.0,8.0,18.4,0.0,Arsenal,Kevin Friend
3237,2017-10-14,Away,L,1,2,Watford,1.1,1.8,54.0,9.0,6.0,19.7,0.0,Arsenal,Niel Swarbrick
3238,2017-10-22,Away,W,5,2,Everton,2.8,1.3,67.0,28.0,14.0,16.9,0.0,Arsenal,Craig Pawson
3239,2017-10-28,Home,W,2,1,Swansea City,1.6,0.6,72.0,17.0,5.0,16.2,0.0,Arsenal,Lee Mason


In [10]:
# Creating a 'points' column to measure the points
match_stats.loc[match_stats['Result'] == 'W', 'points'] = 2
match_stats.loc[match_stats['Result'] == 'D', 'points'] = 1
match_stats.loc[match_stats['Result'] == 'L', 'points'] = 0

A note here. Normally in soccer, a win is worth 3 points. However, I believe this would skew the data unfairly, as in terms of actual precitions, the gap from win to draw is the same as from draw to loss, not bigger.

In [11]:
def rolling_stats(df, team_name):
    team_df = df.loc[df['Team']==team_name]
    team_rolling = team_df.rolling(19,min_periods=10,closed='left').mean()
    return team_rolling

In [12]:
for team in match_stats['Team'].unique():
    team_rolling = rolling_stats(match_stats, team)
    # Create a new df for the first team in the list
    if team == 'Arsenal':
        rolling_df = team_rolling
    # Otherwise append to the existing df
    else:
        rolling_df = pd.concat([rolling_df, team_rolling])
print(rolling_df.head())
print(rolling_df.shape)

      GF  GA  xG  xGA  Poss  Sh  SoT  Dist  PKatt  points
3230 NaN NaN NaN  NaN   NaN NaN  NaN   NaN    NaN     NaN
3231 NaN NaN NaN  NaN   NaN NaN  NaN   NaN    NaN     NaN
3232 NaN NaN NaN  NaN   NaN NaN  NaN   NaN    NaN     NaN
3233 NaN NaN NaN  NaN   NaN NaN  NaN   NaN    NaN     NaN
3234 NaN NaN NaN  NaN   NaN NaN  NaN   NaN    NaN     NaN
(3800, 10)


Great! Now we have rolling averages that look backwards at the previous five matches for each of our numerical columns. Now we need to join these to the original dataframe so we can make predictions based off of them.

In [13]:
rolling_match_stats = pd.merge(match_stats, rolling_df, left_index=True, right_index=True, suffixes=['','_last19'])

In [14]:
rolling_match_stats.head()

Unnamed: 0,Date,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Sh,SoT,Dist,PKatt,Team,Referee,points,GF_last19,GA_last19,xG_last19,xGA_last19,Poss_last19,Sh_last19,SoT_last19,Dist_last19,PKatt_last19,points_last19
3230,2017-08-11,Home,W,4,3,Leicester City,2.3,1.3,69.0,27.0,10.0,19.4,0.0,Arsenal,Mike Dean,2.0,,,,,,,,,,
3231,2017-08-19,Away,L,0,1,Stoke City,1.6,1.0,76.0,19.0,7.0,17.0,0.0,Arsenal,Andre Marriner,0.0,,,,,,,,,,
3232,2017-08-27,Away,L,0,4,Liverpool,0.7,3.0,51.0,8.0,0.0,16.4,0.0,Arsenal,Craig Pawson,0.0,,,,,,,,,,
3233,2017-09-09,Home,W,3,0,Bournemouth,2.0,0.9,58.0,17.0,9.0,15.3,0.0,Arsenal,Anthony Taylor,2.0,,,,,,,,,,
3234,2017-09-17,Away,D,0,0,Chelsea,1.4,0.8,48.0,11.0,2.0,16.4,0.0,Arsenal,Michael Oliver,1.0,,,,,,,,,,


Now I need to drop all of the non-rolling stats (besides the results, which I will use as targets). These stats tell me how the game actually went, which I obviously would not have if I was trying to predict the game in advance. I will also need to drop the NA columns (these are the first 3 matches played by a team in the dataframe, so we couldn't really make a viable prediction anyway.

In [15]:
rolling_match_stats = rolling_match_stats.drop(columns=['xG','xGA','Poss','Sh','SoT','Dist','PKatt','GF','GA'])

In [16]:
rolling_match_stats = rolling_match_stats.dropna()

In [17]:
rolling_match_stats.head()

Unnamed: 0,Date,Venue,Result,Opponent,Team,Referee,points,GF_last19,GA_last19,xG_last19,xGA_last19,Poss_last19,Sh_last19,SoT_last19,Dist_last19,PKatt_last19,points_last19
3240,2017-11-05,Away,L,Manchester City,Arsenal,Michael Oliver,0.0,1.9,1.3,1.89,1.18,62.7,17.7,6.6,17.44,0.1,1.3
3241,2017-11-18,Home,W,Tottenham,Arsenal,Mike Dean,2.0,1.818182,1.454545,1.754545,1.254545,60.818182,16.636364,6.272727,17.772727,0.090909,1.181818
3242,2017-11-26,Away,W,Burnley,Arsenal,Lee Mason,2.0,1.833333,1.333333,1.725,1.266667,59.25,16.416667,6.166667,17.658333,0.083333,1.25
3243,2017-11-29,Home,W,Huddersfield,Arsenal,Graham Scott,2.0,1.769231,1.230769,1.730769,1.2,59.538462,16.461538,5.769231,17.953846,0.153846,1.307692
3244,2017-12-02,Home,L,Manchester Utd,Arsenal,Andre Marriner,0.0,2.0,1.142857,1.928571,1.171429,60.214286,16.785714,5.785714,17.757143,0.142857,1.357143


We now have all the rolling stats and the outcomes in the dataframe. The last thing to do before building models is joining the table on itself to get the opponent data.

In [18]:
df = pd.merge(rolling_match_stats, rolling_match_stats, left_on=['Date','Team'], right_on=['Date','Opponent'], suffixes=['','_opp'])

The last thing we need to do is to convert Venue into a categorical column, with 1 representing Home and 0 representing away so that we can use it with the models.

In [19]:
df['Venue'].replace(['Home','Away'], [1,0], inplace=True)

One more thing: We need to delete the duplicates, which we'll do by deleting any rows that have the same data and the same referee.

In [20]:
df.shape

(3444, 33)

In [21]:
df = df.drop_duplicates(subset=['Date','Referee'])

In [22]:
df.shape

(1722, 33)

In [23]:
df.head()

Unnamed: 0,Date,Venue,Result,Opponent,Team,Referee,points,GF_last19,GA_last19,xG_last19,xGA_last19,Poss_last19,Sh_last19,SoT_last19,Dist_last19,PKatt_last19,points_last19,Venue_opp,Result_opp,Opponent_opp,Team_opp,Referee_opp,points_opp,GF_last19_opp,GA_last19_opp,xG_last19_opp,xGA_last19_opp,Poss_last19_opp,Sh_last19_opp,SoT_last19_opp,Dist_last19_opp,PKatt_last19_opp,points_last19_opp
0,2017-11-05,0,L,Manchester City,Arsenal,Michael Oliver,0.0,1.9,1.3,1.89,1.18,62.7,17.7,6.6,17.44,0.1,1.3,Home,W,Arsenal,Manchester City,Michael Oliver,2.0,3.5,0.6,2.58,0.53,70.8,18.4,7.7,16.65,0.2,1.9
1,2017-11-18,1,W,Tottenham,Arsenal,Mike Dean,2.0,1.818182,1.454545,1.754545,1.254545,60.818182,16.636364,6.272727,17.772727,0.090909,1.181818,Away,L,Arsenal,Tottenham,Mike Dean,0.0,1.818182,0.636364,1.627273,0.663636,61.181818,17.363636,5.363636,17.809091,0.0,1.454545
2,2017-11-26,0,W,Burnley,Arsenal,Lee Mason,2.0,1.833333,1.333333,1.725,1.266667,59.25,16.416667,6.166667,17.658333,0.083333,1.25,Home,L,Arsenal,Burnley,Lee Mason,0.0,1.0,0.75,0.775,1.35,41.916667,10.583333,3.0,17.091667,0.0,1.333333
3,2017-11-29,1,W,Huddersfield,Arsenal,Graham Scott,2.0,1.769231,1.230769,1.730769,1.2,59.538462,16.461538,5.769231,17.953846,0.153846,1.307692,Away,L,Arsenal,Huddersfield,Graham Scott,0.0,0.692308,1.461538,0.684615,1.361538,45.307692,8.923077,2.692308,19.784615,0.0,0.846154
4,2017-12-02,1,L,Manchester Utd,Arsenal,Andre Marriner,0.0,2.0,1.142857,1.928571,1.171429,60.214286,16.785714,5.785714,17.757143,0.142857,1.357143,Away,W,Arsenal,Manchester Utd,Andre Marriner,2.0,2.285714,0.571429,1.907143,0.928571,54.357143,14.5,5.214286,17.028571,0.142857,1.571429


In [24]:
df = df.sample(n=1722, random_state = 10)
df.head()

Unnamed: 0,Date,Venue,Result,Opponent,Team,Referee,points,GF_last19,GA_last19,xG_last19,xGA_last19,Poss_last19,Sh_last19,SoT_last19,Dist_last19,PKatt_last19,points_last19,Venue_opp,Result_opp,Opponent_opp,Team_opp,Referee_opp,points_opp,GF_last19_opp,GA_last19_opp,xG_last19_opp,xGA_last19_opp,Poss_last19_opp,Sh_last19_opp,SoT_last19_opp,Dist_last19_opp,PKatt_last19_opp,points_last19_opp
1722,2018-12-26,1,W,Newcastle Utd,Liverpool,Graham Scott,2.0,2.263158,0.368421,2.036842,0.847368,59.894737,16.210526,6.157895,16.389474,0.052632,1.842105,Away,L,Liverpool,Newcastle Utd,Graham Scott,0.0,0.894737,1.157895,1.005263,1.410526,40.842105,11.0,3.526316,18.105263,0.052632,0.789474
3220,2020-06-20,1,L,Wolves,West Ham,Anthony Taylor,0.0,1.210526,1.894737,1.026316,1.721053,41.789474,9.736842,4.105263,15.331579,0.105263,0.526316,Away,W,West Ham,Wolves,Anthony Taylor,2.0,1.473684,1.105263,1.426316,0.884211,47.842105,14.368421,4.473684,16.5,0.052632,1.210526
483,2019-12-21,1,L,Sheffield Utd,Brighton,Robert Jones,0.0,1.210526,1.578947,1.226316,1.542105,52.526316,12.052632,4.052632,16.589474,0.105263,0.842105,Away,W,Brighton,Sheffield Utd,Robert Jones,2.0,1.235294,0.941176,1.170588,1.141176,44.588235,10.352941,3.529412,15.858824,0.0,1.117647
1040,2020-01-18,0,D,Manchester City,Crystal Palace,Graham Scott,1.0,0.947368,1.157895,0.947368,1.273684,45.315789,10.157895,3.105263,16.852632,0.157895,1.0,Home,D,Crystal Palace,Manchester City,Graham Scott,1.0,2.736842,1.157895,2.284211,1.026316,64.684211,19.421053,6.736842,16.042105,0.157895,1.421053
2283,2019-12-08,1,W,Southampton,Newcastle Utd,David Coote,2.0,1.315789,1.421053,0.868421,1.442105,34.263158,10.578947,3.578947,19.689474,0.0,1.0,Away,L,Newcastle Utd,Southampton,David Coote,0.0,1.157895,2.157895,1.305263,1.573684,45.526316,12.105263,4.0,18.078947,0.105263,0.736842


In [25]:
corr = df.corr()['points'].abs().sort_values(ascending=False)
corr

points               1.000000
points_opp           1.000000
xG_last19_opp        0.319289
Sh_last19_opp        0.306303
SoT_last19_opp       0.300776
GF_last19_opp        0.299568
points_last19_opp    0.290775
Poss_last19_opp      0.278974
xGA_last19_opp       0.265598
points_last19        0.254063
Poss_last19          0.248569
xG_last19            0.246833
GA_last19_opp        0.246765
GF_last19            0.243617
xGA_last19           0.233019
SoT_last19           0.228022
Sh_last19            0.217934
GA_last19            0.204495
Dist_last19_opp      0.119337
Venue                0.118839
PKatt_last19_opp     0.115364
PKatt_last19         0.094610
Dist_last19          0.019910
Name: points, dtype: float64

In [74]:
features = corr[corr.between(0.15,0.35)].index
features

Index(['xG_last19_opp', 'Sh_last19_opp', 'SoT_last19_opp', 'GF_last19_opp',
       'points_last19_opp', 'Poss_last19_opp', 'xGA_last19_opp',
       'points_last19', 'Poss_last19', 'xG_last19', 'GA_last19_opp',
       'GF_last19', 'xGA_last19', 'SoT_last19', 'Sh_last19', 'GA_last19'],
      dtype='object')

In [72]:
X = df[features]
X.head()

Unnamed: 0,xG_last19_opp,Sh_last19_opp,SoT_last19_opp,GF_last19_opp,points_last19_opp,Poss_last19_opp,xGA_last19_opp,points_last19,Poss_last19,xG_last19,GA_last19_opp,GF_last19,xGA_last19,SoT_last19,Sh_last19,GA_last19,Dist_last19_opp,Venue,PKatt_last19_opp,PKatt_last19
1722,1.005263,11.0,3.526316,0.894737,0.789474,40.842105,1.410526,1.842105,59.894737,2.036842,1.157895,2.263158,0.847368,6.157895,16.210526,0.368421,18.105263,1,0.052632,0.052632
3220,1.426316,14.368421,4.473684,1.473684,1.210526,47.842105,0.884211,0.526316,41.789474,1.026316,1.105263,1.210526,1.721053,4.105263,9.736842,1.894737,16.5,1,0.052632,0.105263
483,1.170588,10.352941,3.529412,1.235294,1.117647,44.588235,1.141176,0.842105,52.526316,1.226316,0.941176,1.210526,1.542105,4.052632,12.052632,1.578947,15.858824,1,0.0,0.105263
1040,2.284211,19.421053,6.736842,2.736842,1.421053,64.684211,1.026316,1.0,45.315789,0.947368,1.157895,0.947368,1.273684,3.105263,10.157895,1.157895,16.042105,0,0.157895,0.157895
2283,1.305263,12.105263,4.0,1.157895,0.736842,45.526316,1.573684,1.0,34.263158,0.868421,2.157895,1.315789,1.442105,3.578947,10.578947,1.421053,18.078947,1,0.105263,0.0


In [28]:
y = df['points']

In [29]:
lr = LinearRegression()

In [30]:
rfr = RandomForestRegressor()

In [31]:
# Predicting the number of points
lr_scores = cross_val_score(lr, X, y)
#print each lr score (accuracy) and average them
print(lr_scores)
print('lr_scores mean:{}'.format(np.mean(lr_scores)))

[0.09681767 0.20567836 0.11937721 0.22664157 0.18025221]
lr_scores mean:0.1657534030712156


In [32]:
lr_pts_predictions = cross_val_predict(lr, X, y)

In [33]:
# Predicting the number of goals scored by the team
rfr_scores = cross_val_score(rfr, X, y)
#print each rfr score (accuracy) and average them
print(rfr_scores)
print('rfr_scores mean:{}'.format(np.mean(rfr_scores)))

[0.044868   0.16314808 0.09590827 0.19342316 0.11693731]
rfr_scores mean:0.12285696321212128


In [34]:
df['predicted_pts'] = lr_pts_predictions

In [35]:
df.head()

Unnamed: 0,Date,Venue,Result,Opponent,Team,Referee,points,GF_last19,GA_last19,xG_last19,xGA_last19,Poss_last19,Sh_last19,SoT_last19,Dist_last19,PKatt_last19,points_last19,Venue_opp,Result_opp,Opponent_opp,Team_opp,Referee_opp,points_opp,GF_last19_opp,GA_last19_opp,xG_last19_opp,xGA_last19_opp,Poss_last19_opp,Sh_last19_opp,SoT_last19_opp,Dist_last19_opp,PKatt_last19_opp,points_last19_opp,predicted_pts
1722,2018-12-26,1,W,Newcastle Utd,Liverpool,Graham Scott,2.0,2.263158,0.368421,2.036842,0.847368,59.894737,16.210526,6.157895,16.389474,0.052632,1.842105,Away,L,Liverpool,Newcastle Utd,Graham Scott,0.0,0.894737,1.157895,1.005263,1.410526,40.842105,11.0,3.526316,18.105263,0.052632,0.789474,1.620795
3220,2020-06-20,1,L,Wolves,West Ham,Anthony Taylor,0.0,1.210526,1.894737,1.026316,1.721053,41.789474,9.736842,4.105263,15.331579,0.105263,0.526316,Away,W,West Ham,Wolves,Anthony Taylor,2.0,1.473684,1.105263,1.426316,0.884211,47.842105,14.368421,4.473684,16.5,0.052632,1.210526,0.427201
483,2019-12-21,1,L,Sheffield Utd,Brighton,Robert Jones,0.0,1.210526,1.578947,1.226316,1.542105,52.526316,12.052632,4.052632,16.589474,0.105263,0.842105,Away,W,Brighton,Sheffield Utd,Robert Jones,2.0,1.235294,0.941176,1.170588,1.141176,44.588235,10.352941,3.529412,15.858824,0.0,1.117647,0.999196
1040,2020-01-18,0,D,Manchester City,Crystal Palace,Graham Scott,1.0,0.947368,1.157895,0.947368,1.273684,45.315789,10.157895,3.105263,16.852632,0.157895,1.0,Home,D,Crystal Palace,Manchester City,Graham Scott,1.0,2.736842,1.157895,2.284211,1.026316,64.684211,19.421053,6.736842,16.042105,0.157895,1.421053,0.128834
2283,2019-12-08,1,W,Southampton,Newcastle Utd,David Coote,2.0,1.315789,1.421053,0.868421,1.442105,34.263158,10.578947,3.578947,19.689474,0.0,1.0,Away,L,Newcastle Utd,Southampton,David Coote,0.0,1.157895,2.157895,1.305263,1.573684,45.526316,12.105263,4.0,18.078947,0.105263,0.736842,0.920422


In [36]:
r2_score(df['points'],df['predicted_pts'])

0.16932199960240468

In [37]:
accuracy_score(df['points'], df['predicted_pts'].round())

0.34727061556329847

In [38]:
pd.crosstab(df['points'],df['predicted_pts'].round(), margins=True, normalize=True)

predicted_pts,0.0,1.0,2.0,All
points,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.080139,0.292683,0.012195,0.385017
1.0,0.012195,0.198606,0.015679,0.226481
2.0,0.009872,0.310105,0.068525,0.388502
All,0.102207,0.801394,0.0964,1.0


Hmmm. The accuracy score is low because when I round, a remarkable 84% of the time the predicted points rounds to a draw. Let's narrow the prediction for drawing from anything from -0.5 to 0.5 to a tighter amount until the proporiton of draws is correct.

In [39]:
df.loc[df['predicted_pts'] >= 1.105, 'pred_pts_2'] = 2
df.loc[df['predicted_pts'].between(0.895, 1.105, inclusive='left'), 'pred_pts_2'] = 1
df.loc[df['predicted_pts'] < 0.895, 'pred_pts_2'] = 0

In [40]:
pd.crosstab(df['points'],df['pred_pts_2'], margins=True, normalize=True)

pred_pts_2,0.0,1.0,2.0,All
points,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.218351,0.083624,0.083043,0.385017
1.0,0.079559,0.05633,0.090592,0.226481
2.0,0.076074,0.084204,0.228223,0.388502
All,0.373984,0.224158,0.401858,1.0


In [41]:
accuracy_score(df['points'], df['pred_pts_2'])

0.502903600464576

In [43]:
precision_score(df['points'], df['pred_pts_2'], average='weighted')

0.5023439084324564

That's it! Overall, we did a good job building a predictor that could predict wins and losses, but perhaps struggled a little bit more with predicting draws. This makes sense for a couple of reasons. First, draws are the least common outcome, only occuring about 22.6% of the time over the last 5 seaasons. Secondly, a draw means the teams are relatively evenly matched, which in turn means the outcome is a bit more of a tossup.

I'm creating one more prediction just to see if a team will pick up any points (win or draw).

In [44]:
df.loc[df['predicted_pts'] >= 0.89, 'pred_pts_3'] = 1
df.loc[df['predicted_pts'] < 0.89, 'pred_pts_3'] = 0

In [45]:
# Creating a 'points' column to measure the points
df.loc[df['Result'] == 'W', 'points_2'] = 1
df.loc[df['Result'] == 'D', 'points_2'] = 1
df.loc[df['Result'] == 'L', 'points_2'] = 0

In [46]:
accuracy_score(df['points_2'], df['pred_pts_3'])

0.6771196283391405

In [47]:
precision_score(df['points_2'], df['pred_pts_3'])

0.7322253000923361

In [48]:
pd.crosstab(df['points_2'],df['pred_pts_3'], margins=True, normalize=True)

pred_pts_3,0.0,1.0,All
points_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.216609,0.168409,0.385017
1.0,0.154472,0.460511,0.614983
All,0.37108,0.62892,1.0


Looks like we can say with about 67.7% accuracy whether a team will pick up a win or a draw. That's a pretty good score!

Let's try using classification algorithms. First svc, then random forest.

In [77]:
clf_features = corr[corr.between(0.01,0.35)].index
clf_X = df[clf_features]

In [78]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(clf_X, y)

In [80]:
top_features = clf_X.columns[selector.support_]
print(top_features)

Index(['xG_last19_opp', 'Sh_last19_opp', 'GF_last19_opp', 'points_last19_opp',
       'points_last19', 'xG_last19', 'GA_last19_opp', 'GF_last19',
       'xGA_last19', 'SoT_last19', 'GA_last19', 'Dist_last19_opp', 'Venue',
       'PKatt_last19_opp', 'PKatt_last19', 'Dist_last19'],
      dtype='object')


In [82]:
X2 = clf_X[top_features]

In [81]:
svc = make_pipeline(StandardScaler(), SVC())

In [83]:
scores = cross_val_score(svc, X2, y)
print(scores)
print(scores.mean())

[0.52173913 0.55652174 0.50290698 0.55232558 0.51162791]
0.5290242669362992


In [84]:
predictions_svc = cross_val_predict(svc, X2, y)

In [86]:
pd.crosstab(predictions_svc, y, normalize=True, margins=True)

points,0.0,1.0,2.0,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.259001,0.103949,0.117305,0.480256
1.0,0.001161,0.0,0.001161,0.002323
2.0,0.124855,0.122532,0.270035,0.517422
All,0.385017,0.226481,0.388502,1.0


Fascinatingly, using a classifier actually got around the difficulty of predicting draws... by not predicting any draws (technically 4 out of all the matches). That being said, it did get a higher accuracy score still!

In [92]:
precision_score(y, predictions_svc, average = 'micro')

0.5290360046457607

In [94]:
rfc = RandomForestClassifier()

In [95]:
scores = cross_val_score(rfc, X2, y)
print(scores)
print(scores.mean())

[0.48985507 0.53623188 0.51453488 0.51162791 0.51744186]
0.513938321536906


In [96]:
y2 = df['points_2']

In [98]:
estimator2 = SVC(kernel="linear")
selector2 = RFECV(estimator2, step=1, cv=5)
selector2 = selector.fit(clf_X, y2)

In [99]:
top_features_2 = clf_X.columns[selector.support_]
X3 = clf_X[top_features]

In [101]:
scores = cross_val_score(svc, X3, y2)
print(scores)
print(scores.mean())

[0.67246377 0.72463768 0.65988372 0.70930233 0.68604651]
0.6904668014829795


In [102]:
scores = cross_val_score(rfc, X3, y2)
print(scores)
print(scores.mean())

[0.66376812 0.70724638 0.63081395 0.69767442 0.67732558]
0.675365689248399
