In [54]:
import pandas as pd

In [55]:
# Load the cleaned match results. index_col=0 turns the first CSV column into
# the index (shown as Season_End_Year in the displayed frame).
matches = pd.read_csv('clean-matches.csv', index_col=0)

In [56]:
# Sanity check: (rows, columns) of the freshly loaded frame.
matches.shape

(12406, 7)

In [57]:
# Parse Date to datetime. format='mixed' infers the format per value, and
# errors='coerce' silently turns anything unparseable into NaT.
# NOTE(review): some parsed dates look day/month-swapped (e.g. a Wk 8 game dated
# 1992-12-09, a Wk 35 game dated 2023-06-05) — confirm the raw format and
# consider dayfirst=True; also check matches["Date"].isna().sum() for coerced rows.
matches["Date"] = pd.to_datetime(matches["Date"], format='mixed', errors='coerce')

In [58]:
# Derive the model inputs and the label in a single step:
#   opponent  - integer category code of the away team name
#   home_code - integer category code of the home team name
#   day_code  - day of week of the match (Monday=0 ... Sunday=6)
#   target    - 1 when the full-time result is a home win ("H"), otherwise 0
# The two codes come from separate category sets, so the same club does not
# necessarily share a number across `opponent` and `home_code`.
# NOTE(review): team names appear in more than one spelling in the displayed
# data (e.g. "Wolves" vs "WolverhamptonWanderers") — presumably these should be
# normalised before encoding; confirm against the source CSV.
matches = matches.assign(
    opponent=matches["Away"].astype("category").cat.codes,
    home_code=matches["Home"].astype("category").cat.codes,
    day_code=matches["Date"].dt.day_of_week,
    target=(matches["FTR"] == "H").astype(int),
)

In [59]:
# Inspect the frame with the newly added feature and target columns.
matches

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,opponent,home_code,day_code,target
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H,29,17,5,1
1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H,49,26,5,1
1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H,28,44,5,1
1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D,4,18,5,0
1993,1,1992-08-15,Arsenal,2,4,Norwich City,A,31,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...
2024,38,2024-05-19,Arsenal,2,1,Everton,H,18,0,6,1
2024,38,2024-05-19,Chelsea,2,1,Bournemouth,H,7,16,6,1
2024,38,2024-05-19,ManchesterCity,3,1,West Ham,H,47,32,6,1
2024,38,2024-05-19,BrightonandHoveAlbion,0,2,Manchester Utd,A,28,12,6,0


In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
# Baseline classifier: 50 trees, a node needs at least 10 samples to be split,
# and a fixed random_state so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10,random_state=42)

In [62]:
# Chronological hold-out: everything before 2024 trains the model, everything
# from 2024-01-01 onward is used for evaluation. Rows whose Date is NaT fail
# both comparisons and therefore land in neither split.
train = matches.loc[matches["Date"] < "2024-01-01"]
test = matches.loc[matches["Date"] >= "2024-01-01"]

In [63]:
# Feature columns for the baseline model (all derived in the encoding cell).
predictors = ["opponent", "home_code", "day_code"]

In [64]:
# Fit the forest on the training split only; `target` is the home-win label.
rf.fit(train[predictors], train["target"])

In [65]:
# Predicted label (1 = home win, 0 = otherwise) for every held-out match.
preds = rf.predict(test[predictors])

In [66]:
from sklearn.metrics import accuracy_score

In [67]:
# Share of held-out matches whose predicted label equals the actual label.
acc = accuracy_score(test["target"], preds)

In [68]:
# Show the baseline accuracy.
acc

0.6630434782608695

In [69]:
# Pair each held-out match's true label with the model's prediction; the frame
# takes its index from the `actual` Series.
combined = pd.DataFrame({"actual": test["target"], "predicted": preds})

In [70]:
# Confusion table: rows are the actual label, columns the predicted label.
pd.crosstab(combined["actual"], combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,73,30
1,32,49


In [71]:
from sklearn.metrics import precision_score

In [72]:
# Precision for the positive class: of the matches predicted as home wins,
# the fraction that really were home wins.
precision_score(test["target"], preds)

np.float64(0.620253164556962)

In [73]:
# Group matches by the home team's name (one group per distinct Home string).
grouped_matches = matches.groupby("Home")

In [74]:
# Pull one team's home games as a worked example for the rolling features.
group = grouped_matches.get_group("Arsenal")
group

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,opponent,home_code,day_code,target
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993,1,1992-08-15,Arsenal,2,4,Norwich City,A,31,0,5,0
1993,4,1992-08-26,Arsenal,2,0,Oldham Athletic,H,33,0,2,1
1993,5,1992-08-29,Arsenal,2,1,Sheffield Weds,H,38,0,5,1
1993,8,1992-12-09,Arsenal,0,1,Blackburn,A,4,0,2,0
1993,10,1992-09-28,Arsenal,1,0,Manchester City,H,27,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2024,29,2024-04-23,Arsenal,5,0,Chelsea,H,14,0,1,1
2024,31,2024-04-03,Arsenal,2,0,Luton Town,H,26,0,2,1
2024,33,2024-04-14,Arsenal,0,2,Aston Villa,A,1,0,6,0
2024,36,2024-05-04,Arsenal,3,0,Bournemouth,H,7,0,5,1


In [75]:
def rolling_avg(group, cols, new_cols, window=3):
    """Add rolling means of earlier matches to one team's games.

    The games are ordered by ``Date`` and, for every row, the mean of each
    column in ``cols`` over the ``window`` immediately preceding rows is stored
    under the matching name in ``new_cols``. The current row is excluded from
    its own window, so the features never contain the result being predicted.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches of a single team; must contain ``Date`` and every column in
        ``cols``. The frame passed in is not modified.
    cols : list of str
        Numeric columns to average.
    new_cols : list of str
        Output column names, positionally matched to ``cols``.
    window : int, default 3
        Number of previous matches to average. The default reproduces the
        original fixed three-match window.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with ``new_cols`` added; rows that do not
        yet have ``window`` earlier matches (NaN features) are dropped.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    ordered = group.sort_values("Date")
    # closed='left' shifts the window to the previous `window` rows only.
    prior_means = ordered[cols].rolling(window, closed="left").mean()
    ordered[new_cols] = prior_means
    return ordered.dropna(subset=new_cols)

In [76]:
# Columns to average over previous matches, and the names the averages get.
cols = ["HomeGoals", "AwayGoals", "target"]
new_cols = [name + "_rolling" for name in cols]

In [77]:
# Try the helper on the single-team example before applying it to every team.
rolling_avg(group,cols,new_cols)

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,opponent,home_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling,target_rolling
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1993,4,1992-08-26,Arsenal,2,0,Oldham Athletic,H,33,0,2,1,2.333333,1.666667,0.666667
1993,5,1992-08-29,Arsenal,2,1,Sheffield Weds,H,38,0,5,1,2.333333,1.333333,0.666667
1993,10,1992-09-28,Arsenal,1,0,Manchester City,H,27,0,0,1,2.000000,1.666667,0.666667
1993,13,1992-10-24,Arsenal,2,0,Everton,H,18,0,5,1,1.666667,0.333333,1.000000
1993,17,1992-11-28,Arsenal,0,1,Manchester Utd,A,28,0,5,0,1.666667,0.333333,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,31,2024-04-03,Arsenal,2,0,Luton Town,H,26,0,2,1,3.000000,1.000000,1.000000
2024,33,2024-04-14,Arsenal,0,2,Aston Villa,A,1,0,6,0,2.666667,0.666667,1.000000
2024,29,2024-04-23,Arsenal,5,0,Chelsea,H,14,0,1,1,1.333333,1.000000,0.666667
2024,36,2024-05-04,Arsenal,3,0,Bournemouth,H,7,0,5,1,2.333333,0.666667,0.666667


In [78]:
# Compute the rolling features team by team and stack the results.
# groupby(...).apply(...) on a frame that still contains the grouping column is
# deprecated in recent pandas and will eventually stop passing "Home" to the
# function, which would remove that column from the result. Iterating over the
# groups keeps every column and avoids the warning. Passing a dict to pd.concat
# recreates the same (Home, Season_End_Year) MultiIndex that apply produced.
matches_rolling = pd.concat(
    {
        home: rolling_avg(team_games, cols, new_cols)
        for home, team_games in matches.groupby("Home")
    },
    names=["Home"],
)
matches_rolling

  matches_rolling = matches.groupby("Home").apply(lambda x: rolling_avg(x,cols,new_cols))


Unnamed: 0_level_0,Unnamed: 1_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,opponent,home_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling,target_rolling
Home,Season_End_Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Arsenal,1993,4,1992-08-26,Arsenal,2,0,Oldham Athletic,H,33,0,2,1,2.333333,1.666667,0.666667
Arsenal,1993,5,1992-08-29,Arsenal,2,1,Sheffield Weds,H,38,0,5,1,2.333333,1.333333,0.666667
Arsenal,1993,10,1992-09-28,Arsenal,1,0,Manchester City,H,27,0,0,1,2.000000,1.666667,0.666667
Arsenal,1993,13,1992-10-24,Arsenal,2,0,Everton,H,18,0,5,1,1.666667,0.333333,1.000000
Arsenal,1993,17,1992-11-28,Arsenal,0,1,Manchester Utd,A,28,0,5,0,1.666667,0.333333,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,2023,31,2023-04-15,Wolves,2,0,Brentford,H,9,61,5,1,2.000000,1.333333,0.666667
Wolves,2023,33,2023-04-25,Wolves,2,0,Crystal Palace,H,16,61,1,1,2.000000,0.000000,1.000000
Wolves,2023,37,2023-05-20,Wolves,1,1,Everton,D,18,61,5,0,1.666667,0.000000,1.000000
Wolves,2023,35,2023-06-05,Wolves,1,0,Aston Villa,H,1,61,0,1,1.666667,0.333333,0.666667


In [79]:
# Remove the outer "Home" index level added by the per-team grouping; the team
# name is still available as a regular column.
matches_rolling = matches_rolling.reset_index(level="Home", drop=True)
matches_rolling

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,opponent,home_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling,target_rolling
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1993,4,1992-08-26,Arsenal,2,0,Oldham Athletic,H,33,0,2,1,2.333333,1.666667,0.666667
1993,5,1992-08-29,Arsenal,2,1,Sheffield Weds,H,38,0,5,1,2.333333,1.333333,0.666667
1993,10,1992-09-28,Arsenal,1,0,Manchester City,H,27,0,0,1,2.000000,1.666667,0.666667
1993,13,1992-10-24,Arsenal,2,0,Everton,H,18,0,5,1,1.666667,0.333333,1.000000
1993,17,1992-11-28,Arsenal,0,1,Manchester Utd,A,28,0,5,0,1.666667,0.333333,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,31,2023-04-15,Wolves,2,0,Brentford,H,9,61,5,1,2.000000,1.333333,0.666667
2023,33,2023-04-25,Wolves,2,0,Crystal Palace,H,16,61,1,1,2.000000,0.000000,1.000000
2023,37,2023-05-20,Wolves,1,1,Everton,D,18,61,5,0,1.666667,0.000000,1.000000
2023,35,2023-06-05,Wolves,1,0,Aston Villa,H,1,61,0,1,1.666667,0.333333,0.666667


In [81]:
# Season_End_Year repeats for every match of a season, so replace it with a
# unique 0..n-1 index; later cells rely on it to align predictions with rows.
matches_rolling = matches_rolling.reset_index(drop=True)
matches_rolling

Unnamed: 0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,opponent,home_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling,target_rolling
0,4,1992-08-26,Arsenal,2,0,Oldham Athletic,H,33,0,2,1,2.333333,1.666667,0.666667
1,5,1992-08-29,Arsenal,2,1,Sheffield Weds,H,38,0,5,1,2.333333,1.333333,0.666667
2,10,1992-09-28,Arsenal,1,0,Manchester City,H,27,0,0,1,2.000000,1.666667,0.666667
3,13,1992-10-24,Arsenal,2,0,Everton,H,18,0,5,1,1.666667,0.333333,1.000000
4,17,1992-11-28,Arsenal,0,1,Manchester Utd,A,28,0,5,0,1.666667,0.333333,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12215,31,2023-04-15,Wolves,2,0,Brentford,H,9,61,5,1,2.000000,1.333333,0.666667
12216,33,2023-04-25,Wolves,2,0,Crystal Palace,H,16,61,1,1,2.000000,0.000000,1.000000
12217,37,2023-05-20,Wolves,1,1,Everton,D,18,61,5,0,1.666667,0.000000,1.000000
12218,35,2023-06-05,Wolves,1,0,Aston Villa,H,1,61,0,1,1.666667,0.333333,0.666667


In [82]:
def make_predictions(data, predictors, cutoff="2024-01-01"):
    """Train a random forest on matches before ``cutoff`` and score the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``Date`` column, a binary ``target`` column and
        every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default "2024-01-01"
        Rows with ``Date`` strictly before this value form the training set;
        rows on or after it form the test set. Rows with a missing ``Date``
        match neither comparison and are ignored.

    Returns
    -------
    combined : pandas.DataFrame
        ``actual`` and ``predicted`` labels, indexed like the test rows.
    precision : float
        Precision of the positive (home win) predictions on the test set.

    Raises
    ------
    ValueError
        If either side of the split is empty.
    """
    train = data[data["Date"] < cutoff]
    test = data[data["Date"] >= cutoff]
    if train.empty or test.empty:
        # Fail with a readable message instead of an opaque estimator error.
        raise ValueError(
            f"cutoff {cutoff!r} leaves {len(train)} training and {len(test)} test rows"
        )
    # Fixed random_state so repeated calls on the same data give the same forest.
    rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=42)
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    # Keep the test index so callers can join the predictions back to match rows.
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [83]:
# Re-run the model with the rolling-form features added to the baseline ones.
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [84]:
# Show the precision obtained with the rolling features included.
precision

np.float64(0.6825396825396826)

In [86]:
# Attach match details to each prediction by joining on the shared row index.
# Selecting only the two prediction columns first makes the cell safe to re-run:
# merging an already-merged `combined` would otherwise create duplicated
# Home_x/Home_y-style columns.
combined = combined[["actual", "predicted"]].merge(
    matches_rolling[["Home", "Date", "Away", "FTR"]],
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,actual,predicted,Home,Date,Away,FTR
602,1,1,Arsenal,2024-01-20,Crystal Palace,H
603,1,0,Arsenal,2024-02-04,Liverpool,H
604,1,1,Arsenal,2024-02-24,Newcastle Utd,H
605,1,1,Arsenal,2024-03-09,Brentford,H
606,1,1,Arsenal,2024-04-03,Luton Town,H
...,...,...,...,...,...,...
12047,0,0,WolverhamptonWanderers,2024-04-06,West Ham,A
12048,0,0,WolverhamptonWanderers,2024-04-20,Arsenal,A
12049,0,0,WolverhamptonWanderers,2024-04-24,Bournemouth,A
12050,1,0,WolverhamptonWanderers,2024-04-27,Luton Town,H
