In [None]:
import pandas as pd


In [None]:
matches = pd.read_csv("matches.csv", index_col=0)

In [7]:
del matches["comp"]


In [8]:
del matches["notes"]

In [18]:
matches["date"] = pd.to_datetime(matches["date"])

In [19]:
matches["target"] = (matches["result"] == "W").astype("int")


In [13]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes


In [14]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes


In [15]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")


In [20]:
matches["day_code"] = matches["date"].dt.dayofweek


In [24]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)


In [27]:
train = matches[matches["date"] < '2022-01-01']


In [28]:
test = matches[matches["date"] > '2022-01-01']


In [29]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]


In [30]:
rf.fit(train[predictors], train["target"])


In [31]:
preds = rf.predict(test[predictors])


In [32]:
from sklearn.metrics import accuracy_score


In [34]:
error = accuracy_score(test["target"], preds)


In [35]:
error

0.6123188405797102

In [36]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))


In [37]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])


predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


In [38]:
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

0.4745762711864407

In [39]:
grouped_matches = matches.groupby("team")


In [40]:
group = grouped_matches.get_group("Manchester City").sort_values("date")


In [42]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [44]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]



In [None]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [47]:
matches_rolling = matches_rolling.droplevel('team')


In [48]:
matches_rolling.index = range(matches_rolling.shape[0])


In [50]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [54]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)


In [55]:
precision


0.625

In [56]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)


In [57]:
combined.head(10)


Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
60,1,1,2022-03-13,Arsenal,Leicester City,W
61,0,1,2022-03-16,Arsenal,Liverpool,L
62,1,0,2022-03-19,Arsenal,Aston Villa,W
63,0,0,2022-04-04,Arsenal,Crystal Palace,L
64,0,0,2022-04-09,Arsenal,Brighton,L


In [58]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [59]:
combined["new_team"] = combined["team"].map(mapping)


In [60]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])


In [61]:
merged


Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,0,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W,Wolves,0,0,Everton,Wolves,L,Everton
258,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves,1,0,Leeds United,Wolves,W,Leeds United
259,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,0,Aston Villa,Wolves,L,Aston Villa
260,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,0,Newcastle United,Wolves,W,Newcastle Utd


In [62]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()


actual_x
1    27
0    13
Name: count, dtype: int64