In [46]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score


In [13]:
matches = pd.read_csv('matches.csv', index_col= 0)

In [14]:
matches.shape

(1680, 27)

In [15]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-08-13,14:00,Premier League,Matchweek 1,Sun,Away,D,2.0,2.0,Brentford,...,Match Report,,18.0,6.0,19.6,0.0,0.0,0.0,2023,Tottenham Hotspur
1,2023-08-19,17:30,Premier League,Matchweek 2,Sat,Home,W,2.0,0.0,Manchester Utd,...,Match Report,,17.0,6.0,13.8,0.0,0.0,0.0,2023,Tottenham Hotspur
2,2023-08-26,12:30,Premier League,Matchweek 3,Sat,Away,W,2.0,0.0,Bournemouth,...,Match Report,,17.0,6.0,16.6,1.0,0.0,0.0,2023,Tottenham Hotspur
4,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Away,W,5.0,2.0,Burnley,...,Match Report,,21.0,11.0,19.3,0.0,0.0,0.0,2023,Tottenham Hotspur
5,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Home,W,2.0,1.0,Sheffield Utd,...,Match Report,,28.0,10.0,16.4,0.0,0.0,0.0,2023,Tottenham Hotspur


In [16]:
matches['team'].value_counts()

Tottenham Hotspur           84
Crystal Palace              84
Everton                     84
Brentford                   84
Wolverhampton Wanderers     84
Arsenal                     84
Chelsea                     84
Manchester United           84
Newcastle United            84
West Ham United             84
Brighton and Hove Albion    84
Aston Villa                 84
Liverpool                   84
Manchester City             84
Southampton                 76
Leeds United                76
Leicester City              76
Bournemouth                 46
Nottingham Forest           46
Burnley                     46
Fulham                      46
Watford                     38
Norwich City                38
Sheffield United             8
Luton Town                   8
Name: team, dtype: int64

In [17]:
matches['round'].value_counts()

Matchweek 1     60
Matchweek 3     60
Matchweek 4     60
Matchweek 5     60
Matchweek 6     60
Matchweek 7     60
Matchweek 8     60
Matchweek 2     60
Matchweek 33    40
Matchweek 26    40
Matchweek 27    40
Matchweek 29    40
Matchweek 30    40
Matchweek 31    40
Matchweek 35    40
Matchweek 34    40
Matchweek 28    40
Matchweek 24    40
Matchweek 36    40
Matchweek 37    40
Matchweek 32    40
Matchweek 25    40
Matchweek 21    40
Matchweek 12    40
Matchweek 23    40
Matchweek 22    40
Matchweek 20    40
Matchweek 19    40
Matchweek 18    40
Matchweek 17    40
Matchweek 16    40
Matchweek 15    40
Matchweek 14    40
Matchweek 13    40
Matchweek 11    40
Matchweek 10    40
Matchweek 9     40
Matchweek 38    40
Name: round, dtype: int64

In [18]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [19]:
matches['date']=pd.to_datetime(matches['date'])

In [20]:
# Create a predictor for the ML model (venue column as the model will only work with numerical values. Home advantage is a great predictor for possible win or loss)
matches['venue_code']= matches['venue'].astype('category').cat.codes

In [22]:
# Create a predictor for the ML model(Opponent column as the model will only work with numerical values)
matches['opp_code']= matches['opponent'].astype('category').cat.codes

In [24]:
# Create a predictor for the ML model(Hour column as the model will only work with numerical values. We are assuming that the time the match is been played affects performance)
matches['hour']= matches['time'].str.replace(":.+","",regex=True).astype('int')

In [26]:
# Create a predictor for the ML model()
matches['day_code']= matches['date'].dt.dayofweek

In [29]:
matches['target'] = (matches['result'] == 'W').astype('int')

In [30]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-13,14:00,Premier League,Matchweek 1,Sun,Away,D,2.0,2.0,Brentford,...,0.0,0.0,0.0,2023,Tottenham Hotspur,0,3,14,6,0
1,2023-08-19,17:30,Premier League,Matchweek 2,Sat,Home,W,2.0,0.0,Manchester Utd,...,0.0,0.0,0.0,2023,Tottenham Hotspur,1,15,17,5,1
2,2023-08-26,12:30,Premier League,Matchweek 3,Sat,Away,W,2.0,0.0,Bournemouth,...,1.0,0.0,0.0,2023,Tottenham Hotspur,0,2,12,5,1
4,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Away,W,5.0,2.0,Burnley,...,0.0,0.0,0.0,2023,Tottenham Hotspur,0,5,15,5,1
5,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Home,W,2.0,1.0,Sheffield Utd,...,0.0,0.0,0.0,2023,Tottenham Hotspur,1,19,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0.0,2.0,Aston Villa,...,0.0,0.0,0.0,2021,Norwich City,0,1,15,5,0
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0.0,4.0,West Ham,...,1.0,0.0,0.0,2021,Norwich City,1,23,14,6,0
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0.0,3.0,Leicester City,...,0.0,0.0,0.0,2021,Norwich City,0,11,19,2,0
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1.0,1.0,Wolves,...,0.0,0.0,0.0,2021,Norwich City,0,24,14,6,0


In [32]:
#initialiaze the model
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [90]:
# Split training data
train = matches[matches['date'] < '2022-01-01']

In [91]:
# Split testing data
test = matches[matches['date'] > '2022-01-01']

In [120]:
# The predictors
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

In [119]:
rf.fit(train[predictors], train['target'])

In [94]:
preds = rf.predict(test[predictors])

In [95]:
# accuracy of the model
acc = accuracy_score(test['target'],preds)

In [96]:
print("Random Forest Classifier accuracy is : {:.2f}".format(acc))

Random Forest Classifier accuracy is : 0.59


In [43]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [45]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,617,180
1,357,154


In [143]:
precision_score(test['target'],preds)

0.4561933534743202

In [144]:
grouped_matches = matches.groupby("team")

In [145]:
group = grouped_matches.get_group('Manchester United')

In [146]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-14,20:00,Premier League,Matchweek 1,Mon,Home,W,1.0,0.0,Wolves,...,0.0,0.0,0.0,2023,Manchester United,1,24,20,0,1
1,2023-08-19,17:30,Premier League,Matchweek 2,Sat,Away,L,0.0,2.0,Tottenham,...,1.0,0.0,0.0,2023,Manchester United,0,21,17,5,0
2,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,W,3.0,2.0,Nott'ham Forest,...,0.0,1.0,1.0,2023,Manchester United,1,18,15,5,1
3,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Away,L,1.0,3.0,Arsenal,...,0.0,0.0,0.0,2023,Manchester United,0,0,16,6,0
4,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Home,L,1.0,3.0,Brighton,...,1.0,0.0,0.0,2023,Manchester United,1,4,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-04-23,12:30,Premier League,Matchweek 34,Sat,Away,L,1.0,3.0,Arsenal,...,0.0,0.0,1.0,2021,Manchester United,0,0,12,5,0
45,2022-04-28,19:45,Premier League,Matchweek 37,Thu,Home,D,1.0,1.0,Chelsea,...,0.0,0.0,0.0,2021,Manchester United,1,6,19,3,0
46,2022-05-02,20:00,Premier League,Matchweek 35,Mon,Home,W,3.0,0.0,Brentford,...,1.0,1.0,1.0,2021,Manchester United,1,3,20,0,1
47,2022-05-07,17:30,Premier League,Matchweek 36,Sat,Away,L,0.0,4.0,Brighton,...,1.0,0.0,0.0,2021,Manchester United,0,4,17,5,0


In [147]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [148]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,1.0,Newcastle Utd,...,5,1,2.333333,0.666667,13.666667,4.666667,17.366667,0.666667,0.000000,0.000000
5,2021-09-19,14:00,Premier League,Matchweek 5,Sun,Away,W,2.0,1.0,West Ham,...,6,1,2.000000,0.666667,15.333333,4.000000,18.133333,0.666667,0.000000,0.000000
7,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Home,L,0.0,1.0,Aston Villa,...,5,0,2.333333,0.666667,16.000000,6.333333,18.800000,0.333333,0.000000,0.000000
9,2021-10-02,12:30,Premier League,Matchweek 7,Sat,Home,D,1.0,1.0,Everton,...,5,0,2.000000,1.000000,21.666667,6.666667,19.100000,0.666667,0.000000,0.333333
10,2021-10-16,15:00,Premier League,Matchweek 8,Sat,Away,L,2.0,4.0,Leicester City,...,5,0,1.000000,1.000000,19.000000,6.666667,17.300000,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Away,L,1.0,3.0,Arsenal,...,6,0,1.333333,1.333333,18.000000,5.333333,16.866667,0.333333,0.333333,0.333333
4,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Home,L,1.0,3.0,Brighton,...,5,0,1.333333,2.333333,16.333333,5.333333,17.466667,0.333333,0.333333,0.333333
6,2023-09-23,20:00,Premier League,Matchweek 6,Sat,Away,W,1.0,0.0,Burnley,...,5,1,1.666667,2.666667,13.666667,4.666667,18.466667,0.333333,0.333333,0.333333
8,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Home,L,0.0,1.0,Crystal Palace,...,5,0,1.000000,2.000000,11.666667,3.333333,16.700000,0.333333,0.000000,0.000000


In [149]:
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))

In [150]:
matches_rolling[matches_rolling['team'] == 'Manchester United']

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Manchester United,3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,1.0,Newcastle Utd,...,5,1,2.333333,0.666667,13.666667,4.666667,17.366667,0.666667,0.000000,0.000000
Manchester United,5,2021-09-19,14:00,Premier League,Matchweek 5,Sun,Away,W,2.0,1.0,West Ham,...,6,1,2.000000,0.666667,15.333333,4.000000,18.133333,0.666667,0.000000,0.000000
Manchester United,7,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Home,L,0.0,1.0,Aston Villa,...,5,0,2.333333,0.666667,16.000000,6.333333,18.800000,0.333333,0.000000,0.000000
Manchester United,9,2021-10-02,12:30,Premier League,Matchweek 7,Sat,Home,D,1.0,1.0,Everton,...,5,0,2.000000,1.000000,21.666667,6.666667,19.100000,0.666667,0.000000,0.333333
Manchester United,10,2021-10-16,15:00,Premier League,Matchweek 8,Sat,Away,L,2.0,4.0,Leicester City,...,5,0,1.000000,1.000000,19.000000,6.666667,17.300000,0.666667,0.000000,0.333333
Manchester United,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Manchester United,3,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Away,L,1.0,3.0,Arsenal,...,6,0,1.333333,1.333333,18.000000,5.333333,16.866667,0.333333,0.333333,0.333333
Manchester United,4,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Home,L,1.0,3.0,Brighton,...,5,0,1.333333,2.333333,16.333333,5.333333,17.466667,0.333333,0.333333,0.333333
Manchester United,6,2023-09-23,20:00,Premier League,Matchweek 6,Sat,Away,W,1.0,0.0,Burnley,...,5,1,1.666667,2.666667,13.666667,4.666667,18.466667,0.333333,0.333333,0.333333
Manchester United,8,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Home,L,0.0,1.0,Crystal Palace,...,5,0,1.000000,2.000000,11.666667,3.333333,16.700000,0.333333,0.000000,0.000000


In [151]:
matches_rolling = matches_rolling.droplevel('team')

In [152]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.0,0.0
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.0,0.0
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.0,0.0
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.0,0.0
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Away,L,2.0,3.0,Crystal Palace,...,6,0,0.666667,1.666667,16.666667,4.333333,16.400000,0.333333,0.0,0.0
5,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Home,L,1.0,3.0,Liverpool,...,5,0,1.333333,2.333333,13.000000,3.666667,16.700000,0.333333,0.0,0.0
6,2023-09-23,15:00,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,Luton Town,...,5,0,1.333333,2.000000,11.333333,2.666667,17.566667,0.333333,0.0,0.0
8,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Home,W,2.0,1.0,Manchester City,...,5,1,1.333333,2.333333,8.666667,3.000000,13.966667,0.000000,0.0,0.0


In [153]:
matches_rolling.index = range(matches_rolling.shape[0])

In [154]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [155]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [156]:
precision

0.5070921985815603

In [68]:
combined

Unnamed: 0,actual,predicted
17,0,0
18,1,0
19,1,0
20,1,1
21,1,1
...,...,...
1600,0,0
1601,0,0
1602,0,0
1603,1,0


In [69]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [70]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
17,0,0,2022-01-23,Arsenal,Burnley,D
18,1,0,2022-02-10,Arsenal,Wolves,W
19,1,0,2022-02-19,Arsenal,Brentford,W
20,1,1,2022-02-24,Arsenal,Wolves,W
21,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1600,0,0,2023-09-03,Wolverhampton Wanderers,Crystal Palace,L
1601,0,0,2023-09-16,Wolverhampton Wanderers,Liverpool,L
1602,0,0,2023-09-23,Wolverhampton Wanderers,Luton Town,D
1603,1,0,2023-09-30,Wolverhampton Wanderers,Manchester City,W


In [73]:
# RESOLVING THE NAMES IN TEAM AND OPPPONENT
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [74]:
combined["new_team"] = combined["team"].map(mapping)


In [75]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [76]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,0,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,0,0,2023-09-03,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,1,Crystal Palace,Wolves,W,Crystal Palace
1227,0,0,2023-09-16,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
1228,0,0,2023-09-23,Wolverhampton Wanderers,Luton Town,D,Wolves,0,0,Luton Town,Wolves,D,Luton Town
1229,1,0,2023-09-30,Wolverhampton Wanderers,Manchester City,W,Wolves,0,0,Manchester City,Wolves,L,Manchester City


In [77]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()


1    121
0    106
Name: actual_x, dtype: int64

In [83]:
# Using GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [84]:
# Instantiate Gradient Boosting Regressor
gbc = GradientBoostingClassifier(n_estimators=300,
                                 learning_rate=0.05,
                                 random_state=1)

In [122]:
# Fit to training set
gbc.fit(train[predictors], train['target'])

In [123]:
# Predict on test set
preds = gbc.predict(test[predictors])

In [124]:
# accuracy
acc = accuracy_score(test['target'],preds)

In [125]:
print("Gradient Boosting Classifier accuracy is : {:.2f}".format(acc))

Gradient Boosting Classifier accuracy is : 0.59


In [126]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [127]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,617,180
1,360,151


In [128]:
precision_score(test['target'],preds)

0.4561933534743202

In [157]:
# Using GBC with the new set of predictors
def make_predictions2(data, predictors):
    train = matches_rolling[matches_rolling["date"] < '2022-01-01']
    test = matches_rolling[matches_rolling["date"] > '2022-01-01']
    gbc.fit(train[predictors], train["target"])
    preds = gbc.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [158]:
combined, precision = make_predictions2(matches_rolling, (predictors + new_cols))

In [159]:
print("Gradient Boosting Classifier accuracy is : {:.2f}".format(precision))

Gradient Boosting Classifier accuracy is : 0.50
