In [1]:
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Scrape schedule
# Downloads the fixtures page, parses its first <table>, and keeps one row per
# match that has a score, with the goals split into integer columns.
url = 'https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures'
response = requests.get(url, timeout = 30)
response.raise_for_status() # Fail on an HTTP error status instead of trying to parse an error page
soup = BeautifulSoup(response.content, 'lxml')
for header_row in soup.find_all('tr', class_ = 'thead'): # Decompose all headers
    header_row.decompose()
# Wrap the markup in StringIO: read_html accepts file-like input in every
# pandas version, whereas literal HTML strings are deprecated.
data = pd.read_html(StringIO(str(soup.find('table'))))[0]
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the parsed table.
data = data.loc[data['Score'].notna(), ['Date', 'Home', 'Away', 'Score']].copy()
data['Date'] = pd.to_datetime(data['Date'])
# The score is split on an en dash (U+2013), not an ASCII hyphen; the first
# piece is the home side's goals and the last piece the away side's.
score_parts = data['Score'].astype(str).str.split('–')
data['G_home'] = score_parts.str[0].astype(int)
data['G_away'] = score_parts.str[-1].astype(int)

In [164]:
# Build the match-by-match feature table for a single team (tm).
# Every feature on a row only uses matches played BEFORE that row, so the
# team's first match has NaN for all of them.
teams = sorted(set(data['Home']))
tm = 'Manchester City'
# Filter first, then sort; reset_index gives a 0..n-1 index in date order,
# which the PTS% denominator below relies on as "matches played".
df = data[(data['Home'] == tm) | (data['Away'] == tm)].sort_values('Date').reset_index(drop = True) # Team results
df['H'] = df['Home'] == tm # True when tm is the home side
df['Team'] = tm
df['Opp'] = np.where(df['H'], df['Away'], df['Home']) # Opponent is whichever side is not tm
df['GF'] = np.where(df['H'], df['G_home'], df['G_away']) # Goals for
df['GA'] = np.where(df['H'], df['G_away'], df['G_home']) # Goals against
df['GD'] = df['GF'] - df['GA'] # Goal difference
df['W'] = np.sign(df['GD']) # 1 = win, 0 = draw, -1 = loss
# .copy() so the feature columns are added to an independent frame
df = df[['Date', 'Team', 'Opp', 'H', 'W', 'GF', 'GA', 'GD']].copy()

# Share of available points won so far (3 per win, 1 per draw), shifted one
# row so the current match's own result is excluded.
points = df['W'].map({-1: 0, 0: 1, 1: 3})
matches_played = np.arange(1, len(df) + 1)
df['PTS%'] = (points.cumsum() / (3 * matches_played)).shift(1)
features = ['PTS%']
for f in ['W', 'GF', 'GA', 'GD']:
    for w in [1, 2, 5, 10]:
        # Mean over the previous w matches (fewer while the history is shorter)
        df[f'{f}_{w}'] = df[f].rolling(w, min_periods = 1).mean().shift(1)
        features.append(f'{f}_{w}')
    # Mean over all previous matches; expanding() has no window-size cap
    df[f'{f}_inf'] = df[f].expanding(min_periods = 1).mean().shift(1)
    features.append(f'{f}_inf')
# Days since the previous match; dividing by a Timedelta is correct for any
# datetime resolution, not only nanoseconds.
df['Rest'] = df['Date'].diff() / pd.Timedelta(days = 1)
features.append('Rest')

# Boolean masks yield an empty frame (rather than raising) when the team has
# no home or no away matches.
home = df[df['H']].copy()
away = df[~df['H']].copy()

In [165]:
# Relabel the home-match rows: the team becomes 'Home', the opponent 'Away',
# and every result/feature column gets a '_home' suffix.
suffixed = {col: f'{col}_home' for col in ['W', 'GF', 'GA', 'GD'] + features}
renameDict = {'Team': 'Home', 'Opp': 'Away', **suffixed}
home = home.rename(columns = renameDict)
home

Unnamed: 0,Date,Home,Away,H,W_home,GF_home,GA_home,GD_home,PTS%_home,W_1_home,...,GA_2_home,GA_5_home,GA_10_home,GA_inf_home,GD_1_home,GD_2_home,GD_5_home,GD_10_home,GD_inf_home,Rest_home
1,2021-08-21,Manchester City,Norwich City,True,1,5,0,5,0.0,-1.0,...,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,6.0
2,2021-08-28,Manchester City,Arsenal,True,1,5,0,5,0.5,1.0,...,0.5,0.5,0.5,0.5,5.0,2.0,2.0,2.0,2.0,7.0
4,2021-09-18,Manchester City,Southampton,True,0,0,0,0,0.75,1.0,...,0.0,0.25,0.25,0.25,1.0,3.0,2.5,2.5,2.5,7.0
7,2021-10-16,Manchester City,Burnley,True,1,2,0,2,0.666667,0.0,...,1.0,0.4,0.428571,0.428571,0.0,0.5,1.4,1.571429,1.571429,13.0
9,2021-10-30,Manchester City,Crystal Palace,True,-1,0,2,-2,0.740741,1.0,...,0.5,0.6,0.444444,0.444444,3.0,2.5,1.2,1.777778,1.777778,7.0
11,2021-11-21,Manchester City,Everton,True,1,3,0,3,0.69697,1.0,...,1.0,1.0,0.5,0.545455,2.0,0.0,1.0,1.7,1.454545,15.0
12,2021-11-28,Manchester City,West Ham,True,1,2,1,1,0.722222,1.0,...,0.0,0.6,0.5,0.5,3.0,2.5,1.6,1.5,1.583333,7.0
15,2021-12-11,Manchester City,Wolves,True,1,1,0,1,0.777778,1.0,...,1.0,0.6,0.8,0.6,2.0,1.5,1.8,1.3,1.533333,7.0
16,2021-12-14,Manchester City,Leeds United,True,1,7,0,7,0.791667,1.0,...,0.5,0.6,0.8,0.5625,1.0,1.5,1.6,1.3,1.5,3.0
18,2021-12-26,Manchester City,Leicester City,True,1,6,3,3,0.814815,1.0,...,0.0,0.4,0.6,0.5,4.0,5.5,3.0,2.2,1.944444,7.0


In [166]:
# Relabel the away-match rows: the team becomes 'Away', the opponent 'Home',
# and every result/feature column gets an '_away' suffix.
suffixed = {col: f'{col}_away' for col in ['W', 'GF', 'GA', 'GD'] + features}
renameDict = {'Team': 'Away', 'Opp': 'Home', **suffixed}
away = away.rename(columns = renameDict)
away

Unnamed: 0,Date,Away,Home,H,W_away,GF_away,GA_away,GD_away,PTS%_away,W_1_away,...,GA_2_away,GA_5_away,GA_10_away,GA_inf_away,GD_1_away,GD_2_away,GD_5_away,GD_10_away,GD_inf_away,Rest_away
0,2021-08-15,Manchester City,Tottenham,False,-1,0,1,-1,,,...,,,,,,,,,,
3,2021-09-11,Manchester City,Leicester City,False,1,1,0,1,0.666667,1.0,...,0.0,0.333333,0.333333,0.333333,5.0,5.0,3.0,3.0,3.0,14.0
5,2021-09-25,Manchester City,Chelsea,False,1,1,0,1,0.666667,0.0,...,0.0,0.2,0.2,0.2,0.0,0.5,2.0,2.0,2.0,7.0
6,2021-10-03,Manchester City,Liverpool,False,0,2,2,0,0.722222,1.0,...,0.0,0.0,0.166667,0.166667,1.0,0.5,2.4,1.833333,1.833333,8.0
8,2021-10-23,Manchester City,Brighton,False,1,4,1,3,0.708333,1.0,...,1.0,0.4,0.375,0.375,2.0,1.0,0.8,1.625,1.625,7.0
10,2021-11-06,Manchester City,Manchester Utd,False,1,2,0,2,0.666667,-1.0,...,1.5,1.0,0.6,0.6,-2.0,0.5,0.8,1.4,1.4,7.0
13,2021-12-01,Manchester City,Aston Villa,False,1,2,1,1,0.74359,1.0,...,0.5,0.8,0.6,0.538462,1.0,2.0,1.4,1.1,1.538462,3.0
14,2021-12-04,Manchester City,Watford,False,1,3,1,2,0.761905,1.0,...,1.0,0.8,0.7,0.571429,1.0,1.0,1.0,1.1,1.5,3.0
17,2021-12-19,Manchester City,Newcastle Utd,False,1,4,0,4,0.803922,1.0,...,0.0,0.6,0.6,0.529412,7.0,4.0,2.4,2.0,1.823529,5.0
19,2021-12-29,Manchester City,Brentford,False,1,1,0,1,0.824561,1.0,...,1.5,0.8,0.8,0.631579,3.0,3.5,3.4,2.2,2.0,3.0


In [3]:
# Alphabetical list of the distinct values in the 'Home' column
teams = sorted(set(data['Home']))