In [50]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score
import numpy as np

In [24]:
# read in matches data
matches = pd.read_csv("matches.csv", index_col=0)

In [25]:
# data clean up
matches.dtypes
matches["date"] = pd.to_datetime(matches["date"])
matches["H/A_code"] = matches["venue"].astype("category").cat.codes # 0 = away, 1 = home
matches["opponent_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") # extract hour only from the match time
matches["day_code"] = matches["date"].dt.dayofweek

In [26]:
# set target which model aims to predict
matches["target"] = (matches["result"] == "W").astype("int") # code wins as 1 and losses or draws as 0

In [28]:
# create random forest classifier
model = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=42)


In [39]:
def recent_stats(group, stats, rolling_stats):
    group = group.sort_values("date")
    # stats from 3 previous matches
    rolling = group[stats].rolling(3, closed='left').mean()
    group[rolling_stats] = rolling
    # drop empty data (for example if only 2 previous matches have been played)
    group = group.dropna(subset=rolling_stats)
    return group

In [89]:
stats = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
rolling_stats = [f"{s}_rolling" for s in stats]
recent_matches = matches.groupby("team").apply(lambda x: recent_stats(x, stats, rolling_stats))
recent_matches = recent_matches.droplevel("team")
recent_matches.index = range(recent_matches.shape[0]) # set unique indices
recent_matches.columns = recent_matches.columns.str.strip()


  recent_matches = matches.groupby("team").apply(lambda x: recent_stats(x, stats, rolling_stats))


In [101]:
def make_prediction(data, predictors):
    # split into training and test data
    training_set = data[data["date"] < "2022-01-01"]
    test_set  = data[data["date"] >= "2022-01-01"]
    model.fit(training_set[predictors], training_set["target"]) 
    predictions = model.predict(test_set[predictors])
    combined = pd.DataFrame(dict(actual=test_set["target"], predictions=predictions))
    precision = precision_score(test_set["target"], predictions)
    accuracy = accuracy_score(test_set["target"], predictions)
    return combined, precision, accuracy
    
    

In [102]:
predictors = recent_matches.columns[27:]
combined, precision, accuracy = make_prediction(recent_matches, predictors)
print(precision)
print(accuracy)

1.0
1.0
