In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("matches.csv", index_col=0)

In [9]:
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2022-09-18,17:00,WSL,Matchweek 2,Sun,Away,L,1.0,2.0,Liverpool,...,Match Report,,9.0,3.0,17.2,0.0,1,1,2024,Chelsea Women
1,2022-09-25,16:00,WSL,Matchweek 3,Sun,Home,W,2.0,0.0,Manchester City,...,Match Report,,11.0,2.0,20.3,0.0,1,1,2024,Chelsea Women
2,2022-09-28,19:05,WSL,Matchweek 1,Wed,Home,W,3.0,1.0,West Ham,...,Match Report,,20.0,7.0,15.1,0.0,0,1,2024,Chelsea Women
3,2022-10-16,13:00,WSL,Matchweek 4,Sun,Away,W,3.0,1.0,Everton,...,Match Report,,21.0,7.0,17.3,0.0,1,1,2024,Chelsea Women
5,2022-10-23,18:45,WSL,Matchweek 5,Sun,Away,W,2.0,0.0,Brighton,...,Match Report,,21.0,9.0,16.0,0.0,0,0,2024,Chelsea Women


In [4]:
matches.shape

(964, 27)

In [6]:
matches["Team"].value_counts()

Manchester City Women             82
Brighton and Hove Albion Women    82
Chelsea Women                     81
Arsenal Women                     81
Manchester United Women           80
Everton Women                     80
West Ham United Women             80
Tottenham Women                   80
Reading Women                     80
Aston Villa Women                 66
Birmingham City Women             56
Leicester City Women              44
Liverpool Women                   36
Bristol City Women                36
Name: Team, dtype: int64

In [7]:
matches["Round"].value_counts()

Regular season    174
Matchweek 14       36
Matchweek 22       36
Matchweek 21       36
Matchweek 19       36
Matchweek 18       36
Matchweek 20       36
Matchweek 12       36
Matchweek 17       36
Matchweek 16       36
Matchweek 15       36
Matchweek 2        36
Matchweek 3        36
Matchweek 10       36
Matchweek 9        36
Matchweek 8        36
Matchweek 7        36
Matchweek 6        36
Matchweek 5        36
Matchweek 4        36
Matchweek 1        36
Matchweek 13       36
Matchweek 11       34
Name: Round, dtype: int64

In [10]:
matches.dtypes

Date             object
Time             object
Comp             object
Round            object
Day              object
Venue            object
Result           object
GF              float64
GA              float64
Opponent         object
xG              float64
xGA             float64
Poss            float64
Attendance      float64
Captain          object
Formation        object
Referee          object
Match Report     object
Notes           float64
Sh              float64
SoT             float64
Dist            float64
FK              float64
PK                int64
PKatt             int64
Season            int64
Team             object
dtype: object

In [11]:
matches["Date"] = pd.to_datetime(matches["Date"])

In [12]:
matches.dtypes

Date            datetime64[ns]
Time                    object
Comp                    object
Round                   object
Day                     object
Venue                   object
Result                  object
GF                     float64
GA                     float64
Opponent                object
xG                     float64
xGA                    float64
Poss                   float64
Attendance             float64
Captain                 object
Formation               object
Referee                 object
Match Report            object
Notes                  float64
Sh                     float64
SoT                    float64
Dist                   float64
FK                     float64
PK                       int64
PKatt                    int64
Season                   int64
Team                    object
dtype: object

In [13]:
matches["Venue_Code"] = matches["Venue"].astype("category").cat.codes

In [15]:
matches["Opp_Code"] = matches["Opponent"].astype("category").cat.codes

In [17]:
matches["Hour"] = matches["Time"].str.replace(":.+", "", regex=True).astype("int")

In [19]:
matches["Day_Code"] = matches["Date"].dt.dayofweek

In [21]:
matches["Target"] = (matches["Result"] == "W").astype("int")

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [26]:
train = matches[matches["Date"] < '2023-01-01']

In [27]:
test = matches[matches["Date"] > '2023-01-01']

In [28]:
predictors = ["Venue_Code", "Opp_Code", "Hour", "Day_Code"]

In [29]:
rf.fit(train[predictors], train["Target"])

In [30]:
preds = rf.predict(test[predictors])

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
acc = accuracy_score(test["Target"], preds)

In [33]:
acc

0.551948051948052

In [34]:
combined = pd.DataFrame(dict(actual=test["Target"], prediction=preds))

In [39]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,65,25
1,44,20


In [40]:
from sklearn.metrics import precision_score

In [41]:
precision_score(test["Target"], preds)

0.4444444444444444

Improving predictors with rolling averages

In [42]:
grouped_matches = matches.groupby("Team")

In [43]:
group = grouped_matches.get_group("Chelsea Women")

In [44]:
group

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,FK,PK,PKatt,Season,Team,Venue_Code,Opp_Code,Hour,Day_Code,Target
0,2022-09-18,17:00,WSL,Matchweek 2,Sun,Away,L,1.0,2.0,Liverpool,...,0.0,1,1,2024,Chelsea Women,0,8,17,6,0
1,2022-09-25,16:00,WSL,Matchweek 3,Sun,Home,W,2.0,0.0,Manchester City,...,0.0,1,1,2024,Chelsea Women,1,9,16,6,1
2,2022-09-28,19:05,WSL,Matchweek 1,Wed,Home,W,3.0,1.0,West Ham,...,0.0,0,1,2024,Chelsea Women,1,13,19,2,1
3,2022-10-16,13:00,WSL,Matchweek 4,Sun,Away,W,3.0,1.0,Everton,...,0.0,1,1,2024,Chelsea Women,0,6,13,6,1
5,2022-10-23,18:45,WSL,Matchweek 5,Sun,Away,W,2.0,0.0,Brighton,...,0.0,0,0,2024,Chelsea Women,0,3,18,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,2020-01-12,14:00,WSL,Regular season,Sun,Home,W,6.0,1.0,Bristol City,...,0.0,0,0,2021,Chelsea Women,1,4,14,6,1
11,2020-01-19,14:00,WSL,Regular season,Sun,Away,W,4.0,1.0,Arsenal,...,0.0,0,0,2021,Chelsea Women,0,0,14,6,1
12,2020-02-02,14:00,WSL,Regular season,Sun,Home,W,8.0,0.0,West Ham,...,2.0,0,0,2021,Chelsea Women,1,13,14,6,1
13,2020-02-12,19:00,WSL,Regular season,Wed,Home,W,2.0,0.0,Birmingham City,...,0.0,0,0,2021,Chelsea Women,1,2,19,2,1


In [45]:
def rolling_averages(group, cols, new_cols):
  group = group.sort_values("Date")
  rolling_stats = group[cols].rolling(3, closed='left').mean()
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group

In [46]:
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [47]:
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [48]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day_Code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
3,2019-10-13,15:00,WSL,Regular season,Sun,Home,W,2.0,1.0,Arsenal,...,6,1,2.000000,0.333333,21.333333,8.333333,16.133333,0.000000,0.000000,0.000000
4,2019-10-27,15:00,WSL,Regular season,Sun,Away,W,3.0,1.0,West Ham,...,6,1,2.333333,0.666667,22.000000,8.333333,17.200000,0.333333,0.000000,0.000000
5,2019-11-17,14:00,WSL,Regular season,Sun,Home,W,1.0,0.0,Manchester Utd,...,6,1,3.000000,0.666667,24.666667,10.000000,17.366667,0.333333,0.000000,0.000000
6,2019-11-24,14:45,WSL,Regular season,Sun,Away,W,6.0,0.0,Birmingham City,...,6,1,2.000000,0.666667,19.333333,7.000000,18.766667,0.333333,0.333333,0.333333
7,2019-12-08,12:00,WSL,Regular season,Sun,Home,W,2.0,1.0,Manchester City,...,6,1,3.333333,0.333333,17.000000,8.333333,15.866667,0.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,2023-05-07,18:45,WSL,Matchweek 20,Sun,Home,W,7.0,0.0,Everton,...,6,1,1.666667,1.000000,16.000000,5.333333,16.433333,0.333333,0.000000,0.000000
28,2023-05-10,19:00,WSL,Matchweek 18,Wed,Home,W,6.0,0.0,Leicester City,...,2,1,4.000000,0.333333,18.666667,7.666667,14.466667,0.333333,0.000000,0.000000
29,2023-05-17,20:15,WSL,Matchweek 19,Wed,Away,W,4.0,0.0,West Ham,...,2,1,5.000000,0.333333,21.333333,9.000000,14.333333,0.333333,0.000000,0.000000
30,2023-05-21,12:30,WSL,Matchweek 21,Sun,Home,W,2.0,0.0,Arsenal,...,6,1,5.666667,0.000000,19.666667,9.666667,13.066667,0.000000,0.000000,0.000000


In [49]:
matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [50]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day_Code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal Women,5,2019-10-13,15:00,WSL,Regular season,Sun,Away,L,1.0,2.0,Chelsea,...,6,0,2.333333,0.333333,16.666667,6.000000,14.933333,0.000000,0.000000,0.000000
Arsenal Women,7,2019-10-27,14:30,WSL,Regular season,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.000000,0.666667,13.333333,5.333333,15.233333,0.000000,0.000000,0.000000
Arsenal Women,9,2019-11-17,15:00,WSL,Regular season,Sun,Away,W,2.0,0.0,Tottenham,...,6,1,2.000000,0.666667,9.000000,3.666667,17.100000,0.000000,0.000000,0.000000
Arsenal Women,10,2019-11-24,14:00,WSL,Regular season,Sun,Home,W,1.0,0.0,Liverpool,...,6,1,1.333333,0.666667,11.000000,3.000000,17.866667,0.333333,0.000000,0.000000
Arsenal Women,11,2019-12-01,12:30,WSL,Regular season,Sun,Home,W,11.0,1.0,Bristol City,...,6,1,1.333333,0.000000,14.000000,2.666667,18.333333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Ham United Women,17,2023-04-23,18:45,WSL,Matchweek 18,Sun,Away,L,2.0,6.0,Manchester City,...,6,0,0.333333,2.000000,6.666667,3.000000,19.533333,0.333333,0.000000,0.000000
West Ham United Women,18,2023-05-07,14:00,WSL,Matchweek 20,Sun,Away,L,0.0,1.0,Brighton,...,6,0,0.666667,3.333333,7.000000,2.666667,19.233333,0.333333,0.000000,0.000000
West Ham United Women,19,2023-05-17,20:15,WSL,Matchweek 19,Wed,Home,L,0.0,4.0,Chelsea,...,2,0,0.666667,2.333333,5.666667,1.333333,19.300000,0.000000,0.000000,0.000000
West Ham United Women,20,2023-05-21,15:00,WSL,Matchweek 21,Sun,Away,W,2.0,1.0,Leicester City,...,6,1,0.666667,3.666667,6.000000,1.666667,16.033333,0.000000,0.000000,0.000000


In [51]:
matches_rolling = matches_rolling.droplevel('Team')

In [52]:
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day_Code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
5,2019-10-13,15:00,WSL,Regular season,Sun,Away,L,1.0,2.0,Chelsea,...,6,0,2.333333,0.333333,16.666667,6.000000,14.933333,0.000000,0.000000,0.000000
7,2019-10-27,14:30,WSL,Regular season,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.000000,0.666667,13.333333,5.333333,15.233333,0.000000,0.000000,0.000000
9,2019-11-17,15:00,WSL,Regular season,Sun,Away,W,2.0,0.0,Tottenham,...,6,1,2.000000,0.666667,9.000000,3.666667,17.100000,0.000000,0.000000,0.000000
10,2019-11-24,14:00,WSL,Regular season,Sun,Home,W,1.0,0.0,Liverpool,...,6,1,1.333333,0.666667,11.000000,3.000000,17.866667,0.333333,0.000000,0.000000
11,2019-12-01,12:30,WSL,Regular season,Sun,Home,W,11.0,1.0,Bristol City,...,6,1,1.333333,0.000000,14.000000,2.666667,18.333333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,2023-04-23,18:45,WSL,Matchweek 18,Sun,Away,L,2.0,6.0,Manchester City,...,6,0,0.333333,2.000000,6.666667,3.000000,19.533333,0.333333,0.000000,0.000000
18,2023-05-07,14:00,WSL,Matchweek 20,Sun,Away,L,0.0,1.0,Brighton,...,6,0,0.666667,3.333333,7.000000,2.666667,19.233333,0.333333,0.000000,0.000000
19,2023-05-17,20:15,WSL,Matchweek 19,Wed,Home,L,0.0,4.0,Chelsea,...,2,0,0.666667,2.333333,5.666667,1.333333,19.300000,0.000000,0.000000,0.000000
20,2023-05-21,15:00,WSL,Matchweek 21,Sun,Away,W,2.0,1.0,Leicester City,...,6,1,0.666667,3.666667,6.000000,1.666667,16.033333,0.000000,0.000000,0.000000


In [53]:
matches_rolling.index = range(matches_rolling.shape[0])

In [54]:
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day_Code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2019-10-13,15:00,WSL,Regular season,Sun,Away,L,1.0,2.0,Chelsea,...,6,0,2.333333,0.333333,16.666667,6.000000,14.933333,0.000000,0.000000,0.000000
1,2019-10-27,14:30,WSL,Regular season,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.000000,0.666667,13.333333,5.333333,15.233333,0.000000,0.000000,0.000000
2,2019-11-17,15:00,WSL,Regular season,Sun,Away,W,2.0,0.0,Tottenham,...,6,1,2.000000,0.666667,9.000000,3.666667,17.100000,0.000000,0.000000,0.000000
3,2019-11-24,14:00,WSL,Regular season,Sun,Home,W,1.0,0.0,Liverpool,...,6,1,1.333333,0.666667,11.000000,3.000000,17.866667,0.333333,0.000000,0.000000
4,2019-12-01,12:30,WSL,Regular season,Sun,Home,W,11.0,1.0,Bristol City,...,6,1,1.333333,0.000000,14.000000,2.666667,18.333333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,2023-04-23,18:45,WSL,Matchweek 18,Sun,Away,L,2.0,6.0,Manchester City,...,6,0,0.333333,2.000000,6.666667,3.000000,19.533333,0.333333,0.000000,0.000000
915,2023-05-07,14:00,WSL,Matchweek 20,Sun,Away,L,0.0,1.0,Brighton,...,6,0,0.666667,3.333333,7.000000,2.666667,19.233333,0.333333,0.000000,0.000000
916,2023-05-17,20:15,WSL,Matchweek 19,Wed,Home,L,0.0,4.0,Chelsea,...,2,0,0.666667,2.333333,5.666667,1.333333,19.300000,0.000000,0.000000,0.000000
917,2023-05-21,15:00,WSL,Matchweek 21,Sun,Away,W,2.0,1.0,Leicester City,...,6,1,0.666667,3.666667,6.000000,1.666667,16.033333,0.000000,0.000000,0.000000


Retraining ML model

In [55]:
def make_predictions(data, predictor):
  train = data[data["Date"] < '2023-01-01']
  test = data[data["Date"] > '2023-01-01']
  rf.fit(train[predictors], train["Target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["Target"], predicted=preds), index=test.index)
  precision = precision_score(test["Target"], preds)
  return combined, precision

In [56]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [57]:
precision

0.45454545454545453

In [58]:
combined

Unnamed: 0,actual,predicted
65,0,0
66,0,1
67,0,0
68,1,0
69,1,1
...,...,...
914,0,1
915,0,0
916,0,0
917,1,1


In [59]:
combined = combined.merge(matches_rolling[["Date", "Team", "Opponent", "Result"]], left_index=True, right_index=True)

In [60]:
combined

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result
65,0,0,2023-01-15,Arsenal Women,Chelsea,D
66,0,1,2023-02-05,Arsenal Women,West Ham,D
67,0,0,2023-02-11,Arsenal Women,Manchester City,L
68,1,0,2023-03-08,Arsenal Women,Liverpool,W
69,1,1,2023-03-12,Arsenal Women,Reading,W
...,...,...,...,...,...,...
914,0,1,2023-04-23,West Ham United Women,Manchester City,L
915,0,0,2023-05-07,West Ham United Women,Brighton,L
916,0,0,2023-05-17,West Ham United Women,Chelsea,L
917,1,1,2023-05-21,West Ham United Women,Leicester City,W


In [64]:
class MissingDict(dict):
  __missing__ = lambda self, key: key

map_values = {
    "Arsenal Women": "Arsenal",
    "Aston Villa Women": "Aston Villa",
    "Brighton and Hove Albion Women": "Brighton",
    "Chelsea Women": "Chelsea",
    "Everton Women": "Everton",
    "Leicester City Women": "Lecester City",
    "Liverpool Women": "Liverpool",
    "Manchester City Women": "Manchester City",
    "Manchester United Women": "Manchester Utd",
    "Reading Women": "Reading",
    "Tottenham Women": "Tottenham",
    "West Ham United Women": "West Ham"
}
mapping = MissingDict(**map_values)

In [61]:
combined["Team"].unique()

array(['Arsenal Women', 'Aston Villa Women',
       'Brighton and Hove Albion Women', 'Chelsea Women', 'Everton Women',
       'Leicester City Women', 'Liverpool Women', 'Manchester City Women',
       'Manchester United Women', 'Reading Women', 'Tottenham Women',
       'West Ham United Women'], dtype=object)

In [68]:
mapping["Arsenal Women"]

'Arsenal'

In [69]:
combined["New_Team"] = combined["Team"].map(mapping)

In [70]:
combined

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result,New_Team
65,0,0,2023-01-15,Arsenal Women,Chelsea,D,Arsenal
66,0,1,2023-02-05,Arsenal Women,West Ham,D,Arsenal
67,0,0,2023-02-11,Arsenal Women,Manchester City,L,Arsenal
68,1,0,2023-03-08,Arsenal Women,Liverpool,W,Arsenal
69,1,1,2023-03-12,Arsenal Women,Reading,W,Arsenal
...,...,...,...,...,...,...,...
914,0,1,2023-04-23,West Ham United Women,Manchester City,L,West Ham
915,0,0,2023-05-07,West Ham United Women,Brighton,L,West Ham
916,0,0,2023-05-17,West Ham United Women,Chelsea,L,West Ham
917,1,1,2023-05-21,West Ham United Women,Leicester City,W,West Ham


In [75]:
merged = combined.merge(combined, left_on=["Date", "New_Team"], right_on=["Date", "Opponent"])

In [76]:
merged

Unnamed: 0,actual_x,predicted_x,Date,Team_x,Opponent_x,Result_x,New_Team_x,actual_y,predicted_y,Team_y,Opponent_y,Result_y,New_Team_y
0,0,0,2023-01-15,Arsenal Women,Chelsea,D,Arsenal,0,0,Chelsea Women,Arsenal,D,Chelsea
1,0,1,2023-02-05,Arsenal Women,West Ham,D,Arsenal,0,0,West Ham United Women,Arsenal,D,West Ham
2,0,0,2023-02-11,Arsenal Women,Manchester City,L,Arsenal,1,0,Manchester City Women,Arsenal,W,Manchester City
3,1,0,2023-03-08,Arsenal Women,Liverpool,W,Arsenal,0,0,Liverpool Women,Arsenal,L,Liverpool
4,1,1,2023-03-12,Arsenal Women,Reading,W,Arsenal,0,0,Reading Women,Arsenal,L,Reading
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,0,1,2023-04-23,West Ham United Women,Manchester City,L,West Ham,1,1,Manchester City Women,West Ham,W,Manchester City
137,0,0,2023-05-07,West Ham United Women,Brighton,L,West Ham,1,0,Brighton and Hove Albion Women,West Ham,W,Brighton
138,0,0,2023-05-17,West Ham United Women,Chelsea,L,West Ham,1,0,Chelsea Women,West Ham,W,Chelsea
139,1,1,2023-05-21,West Ham United Women,Leicester City,W,West Ham,0,0,Leicester City Women,West Ham,L,Lecester City


In [77]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

1    18
0    16
Name: actual_x, dtype: int64

In [78]:
18/34

0.5294117647058824