In [1]:
import pandas as pd
games = pd.read_csv("game.csv", index_col=0)

In [2]:
games["game_date"] = pd.to_datetime(games["game_date"])
games["team_code_home"] = games["team_name_home"].astype("category").cat.codes
games["team_code_away"] = games["team_name_away"].astype("category").cat.codes
games["day_code"] = games["game_date"].dt.dayofweek
games["target_home"] = (games["wl_home"] == "W").astype("int")

home = games.rename(columns={"team_id_home":"team_id"}

In [3]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
train = games[(games["game_date"] < '2021-01-01') & (games["game_date"] > '2000-09-01')]
test = games[games["game_date"] > '2021-01-01']
predictors = ["team_code_home", "team_code_away", "day_code"]

In [4]:
rf.fit(train[predictors], train["target_home"])
preds = rf.predict(test[predictors])

In [5]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(test["target_home"], preds)
print(acc)

0.4971502590673575


In [6]:
combined = pd.DataFrame(dict(actual=test["target_home"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,518,1183
1,758,1401


In [7]:
from sklearn.metrics import precision_score
precision_score(test["target_home"], preds)

0.5421826625386997

In [8]:
grouped_games = games.groupby("team_name_home")

In [9]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("game_date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [10]:
cols = ["pts_home", "pts_away", "fg_pct_home", "ft_pct_home", "oreb_home","dreb_home","stl_home","blk_home","tov_home",
       "plus_minus_home","fta_away","fg_pct_away", "ft_pct_away", "reb_away","blk_away","plus_minus_away"]
new_cols = [f"{c}_rolling" for c in cols]

In [11]:
games_rolling = games.groupby("team_name_home").apply(lambda x: rolling_averages(x, cols, new_cols))
games_rolling = games_rolling.droplevel("team_name_home")
games_rolling.index = range(games_rolling.shape[0])

  games_rolling = games.groupby("team_name_home").apply(lambda x: rolling_averages(x, cols, new_cols))


In [12]:
def make_predictions(data, predictors):
    train = data[(data["game_date"] < '2021-01-01') & (data["game_date"] > '2000-09-01')]
    test = data[data["game_date"] > '2021-01-01']
    rf.fit(train[predictors], train["target_home"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target_home"], prediction=preds), index=test.index)
    precision = precision_score(test["target_home"], preds)
    return combined, precision

In [13]:
combined, precision = make_predictions(games_rolling, predictors + new_cols)
combined = combined.merge(games_rolling[["game_date", "team_name_home", "team_name_away", "wl_home"]], left_index=True, right_index=True)
precision

0.581886278515492