In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import calibration
import sklearn.linear_model
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
import numpy as np
import scipy.stats
from sklearn.base import clone
from sklearn.model_selection._search import ParameterSampler
from sklearn.metrics import brier_score_loss, make_scorer, log_loss, mean_squared_error
import os
# Remember the directory the kernel started in, then make its parent the
# working directory; the relative "data/..." paths used below are resolved
# against it (presumably the `src` import relies on it as well — TODO confirm).
# NOTE(review): not idempotent — re-running this cell in the same kernel reads
# the already-changed cwd and moves up one more level.
notebook_dir = os.getcwd()
os.chdir(os.path.join(notebook_dir, '..'))
# Absolute path of that same parent directory.
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
# os.chdir('/Users/lucashaupt/Documents/GitHub/nfl-live-win-probability')
from src.generate_data import get_game_data, get_division_data, get_odds_data, get_schedule_data
import statsmodels.api as sm

# Data
* Pulls data from:
    * yards_gained_all_seasons.csv: proprietary metrics created by Kyle Cunningham-Rhodes and Bill Wang (df)
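    * game_data.parquet: cached parquet file loaded by the `get_game_data` function (game_data)
        * contains game-level data keyed by `game_code` and `season` (TODO: list the columns)
    * division_data.parquet: cached parquet file loaded by the `get_division_data` function (division_data)
        * contains team division data keyed by `season` and `team_id` (TODO: list the columns)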

In [2]:
# With cache=True the loaders return their cached frames (see the printed output).
cache = True

# Yards-gained metrics.
df = pd.read_csv("data/yards_gained_all_seasons.csv")

# Game data, reduced to a single row per game_code before joining.
game_data = get_game_data(cache=cache).drop_duplicates(subset="game_code")
division_data = get_division_data(cache=cache)

# Left join: every row of `df` is kept, with the game columns attached.
combined_df = pd.merge(df, game_data, how="left", on=["game_code", "season"], copy=False)


Running get_game_data... 

get_game_data is returning cached df
get_game_data done in 0 sec.
Running get_division_data... 

get_division_data is returning cached df
get_division_data done in 0 sec.


In [3]:
def get_rolling_value(df, freq=16, min_periods=1, shift=1):
    """Return lagged rolling means of the yards metrics next to the game keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one team, in the order they should be rolled over. Must hold
        the key columns and the four metric columns listed below.
    freq : int
        Size of the rolling window, counted in rows.
    min_periods : int
        Minimum number of rows in the window needed to produce a value.
    shift : int
        Number of rows the rolling means are pushed down, so that (with the
        default of 1) a row only carries the means of the rows before it.

    Returns
    -------
    pandas.DataFrame
        The key columns of ``df`` followed by the shifted rolling means,
        aligned on the index of ``df``.
    """
    key_cols = ["game_code", "season", "game_date", "offense_team",
                "defense_team", "home_team_id", "away_team_id"]
    metric_cols = ["total_expected_yards_gained", "total_yards_gained",
                   "total_yards_added", "play_count"]

    window = df.rolling(freq, on="game_date", min_periods=min_periods)
    lagged_means = window[metric_cols].mean().shift(shift)

    # Re-select the metric columns: the rolling result may carry extra columns.
    return pd.concat([df[key_cols], lagged_means[metric_cols]], axis=1)
# Metrics summed per game, and the keys that identify one offense/defense row of a game.
aggregate_values = ["total_expected_yards_gained", "total_yards_gained", "total_yards_added", "play_count"]
group_values = ["game_code", "season", "game_date", "offense_team", "defense_team", "home_team_id", "away_team_id"]

# Collapse to one row per key combination.
game_yards = combined_df[[*group_values, *aggregate_values]].groupby(group_values, as_index=False).sum()

# Lagged rolling means, computed per team in game_date order: first grouped by
# the team on offense, then by the team on defense.
rolling_offense, rolling_defense = (
    game_yards.sort_values("game_date").groupby(team_col, as_index=False).apply(get_rolling_value)
    for team_col in ("offense_team", "defense_team")
)

In [4]:
# Distinct combinations of the game identity, team, score and date columns.
game_codes = combined_df[
    [
        "game_code",
        "home_team_id", "away_team_id",
        "home_team_abbrev", "away_team_abbrev",
        "home_score", "away_score",
        "game_date", "season",
    ]
].drop_duplicates()

In [5]:
# Rolling metrics attached to each game for the home and the away team.
yards_values = ['total_expected_yards_gained', 'total_yards_gained', 'total_yards_added', 'play_count']

# Home team: its rolling offense row, then its rolling defense row. The first
# merge has no overlapping metric columns, so the "_offense"/"_defense"
# suffixes are only applied by the second merge.
games_home_team = game_codes.merge(rolling_offense[["game_code", "offense_team"] + yards_values], left_on=["game_code", "home_team_id"], right_on=["game_code", "offense_team"], suffixes=["", "_offense"])
games_home_team = games_home_team.merge(rolling_defense[["game_code", "defense_team"] + yards_values], left_on=["game_code", "home_team_id"], right_on=["game_code", "defense_team"], suffixes=["_offense", "_defense"])

# Away team: same two merges keyed on away_team_id.
games_away_team = game_codes.merge(rolling_offense[["game_code", "offense_team"] + yards_values], left_on=["game_code", "away_team_id"], right_on=["game_code", "offense_team"], suffixes=["", "_offense"])
games_away_team = games_away_team.merge(rolling_defense[["game_code", "defense_team"] + yards_values], left_on=["game_code", "away_team_id"], right_on=["game_code", "defense_team"], suffixes=["_offense", "_defense"])

# One row per game with "_home"/"_away" suffixed team columns.
games = games_home_team.merge(games_away_team, on=['game_code', 'season', 'home_team_id', 'away_team_id', 'home_team_abbrev', 'away_team_abbrev', 'home_score', 'away_score', 'game_date'], suffixes=["_home", "_away"])

# Keep seasons from 2009 onward (NOTE(review): the reason for this cutoff is
# not stated in the notebook). `.copy()` makes `games` an independent frame:
# new columns are assigned to it further down, and assigning to a boolean-mask
# slice raises SettingWithCopyWarning.
games = games[games["season"] >= 2009].copy()


In [6]:
# Model inputs: rolling yards-added columns for the home team, then the away team.
features = (
    ["total_yards_added_offense_home", "total_yards_added_defense_home"]
    + ["total_yards_added_offense_away", "total_yards_added_defense_away"]
)

# Score columns of the two teams.
target = [
    "home_score",
    "away_score",
]

In [7]:
# 0/1 outcome flags derived from the final scores.
games["home_team_win"] = np.where(games["home_score"].gt(games["away_score"]), 1, 0)
games["away_team_win"] = np.where(games["home_score"].lt(games["away_score"]), 1, 0)
games["tie"] = np.where(games["home_score"].eq(games["away_score"]), 1, 0)

In [8]:
import datetime as dt 

schedule = get_schedule_data(cache=cache)

# Keep only scheduled games dated after the last game present in `games`.
# The previous form compared `datetime.date` objects with a pandas Timestamp,
# which pandas deprecated and newer versions treat as non-comparable. Both
# sides are reduced to calendar dates and compared as midnight timestamps,
# which selects the same rows: midnight of date d is later than a timestamp
# on date D exactly when d > D.
last_played_date = pd.to_datetime(games["game_date"]).max().date()
schedule_dates = pd.to_datetime(schedule["game_date"].dt.date)
schedule = schedule[schedule_dates > pd.Timestamp(last_played_date)]


Running get_schedule_data... 

get_schedule_data is returning cached df
get_schedule_data done in 0 sec.


  result = libops.scalar_compare(x.ravel(), y, op)


In [9]:
# Latest available row per team: stack the home and away frames, order by
# game_date descending and keep the first row seen for each offense_team.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
current_values = (
    pd.concat([games_home_team, games_away_team])
    .sort_values("game_date", ascending=False)
    .drop_duplicates(["offense_team"])
)
# Attach the division data for that team and season.
current_values = current_values.merge(division_data, left_on=["season", "offense_team"], right_on=["season", "team_id"])

In [10]:
# Attach each team's current ratings to the upcoming schedule: first for the
# home team, then for the away team. The second merge is the one that applies
# the "_home"/"_away" suffixes, because only then do the rating columns overlap.
predictions = (
    schedule
    .merge(
        current_values[['offense_team', 'total_yards_added_offense', 'total_yards_added_defense']],
        left_on=["home_team_id"],
        right_on=["offense_team"],
        suffixes=["", "_home"],
    )
    .merge(
        current_values[['offense_team', 'total_yards_added_offense', 'total_yards_added_defense']],
        left_on=["away_team_id"],
        right_on=["offense_team"],
        suffixes=["_home", "_away"],
    )
)

In [36]:
# Logistic regression of the home-win flag on the four rating features plus an intercept.
X2 = sm.add_constant(games[features])
est = sm.Logit(endog=games["home_team_win"], exog=X2)
est_home_win = est.fit()
home_win_params = est_home_win.params

# Fitted probabilities for the games the model was fit on.
games["x_home_team_win"] = est_home_win.predict(exog=X2)

# Upcoming games: supply the intercept column explicitly, in the same column
# order as the training design matrix (constant first, then the features).
predictions["const"] = 1
predictions["x_home_team_win"] = est_home_win.predict(exog=predictions[["const", *features]])


Optimization terminated successfully.
         Current function value: 0.643942
         Iterations 5


0    0.409751
Name: x_home_team_win, dtype: float64

In [42]:
# Second, separately fitted logistic regression: probability that a game ends in a tie.
X2 = sm.add_constant(games[features])
est = sm.Logit(endog=games["tie"], exog=X2)
est_tie = est.fit()
tie_params = est_tie.params

# Tie probability for fitted and upcoming games.
# `predictions` must already carry the "const" column at this point.
games["x_tie"] = est_tie.predict(exog=X2)
predictions["x_tie"] = est_tie.predict(exog=predictions[["const", *features]])

# The away-win probability is whatever remains after the home-win and tie probabilities.
# NOTE(review): the two models are fit independently and nothing here clips
# the remainder, so it is not constrained to stay within [0, 1] — confirm.
games["x_away_team_win"] = 1 - games["x_tie"] - games["x_home_team_win"]
predictions["x_away_team_win"] = 1 - predictions["x_tie"] - predictions["x_home_team_win"]


Optimization terminated successfully.
         Current function value: 0.019140
         Iterations 10


In [43]:
# Stack the priors of the fitted games on top of those of the upcoming games,
# rename the probability columns and write them to data/game_priors.csv.
pd.concat(
    [
        frame[["game_code", "home_team_id", "away_team_id", "home_team_abbrev", "away_team_abbrev", "x_home_team_win", "x_away_team_win", "game_date"]]
        for frame in (games, predictions)
    ]
).rename(
    columns={"x_home_team_win": "prior_home", "x_away_team_win": "prior_away"}
).to_csv("data/game_priors.csv", index=False)
# Shell out to `say` as a completion signal (presumably the macOS
# text-to-speech command); the exit status becomes the cell output.
os.system('say "done notebook"')


0

In [44]:
import json

# Labels written next to the two parameter vectors below.
param_order = [
    "constant",
    "total_yards_added_offense_home",
    "total_yards_added_defense_home",
    "total_yards_added_offense_away",
    "total_yards_added_defense_away",
]

# Per-team ratings plus the coefficients of both fitted models, as plain lists.
current_values_dict = {
    "team_ids": current_values["offense_team"].values.tolist(),
    **{
        rating: current_values[rating].values.tolist()
        for rating in ("total_yards_added_offense", "total_yards_added_defense")
    },
    "home_win_parameters": home_win_params.values.tolist(),
    "tie_parameters": tie_params.values.tolist(),
    "param_order": param_order,
}

# Serialise first, then write the text in one go.
with open("data/team_ratings_dict.json", 'w') as f:
    f.write(json.dumps(current_values_dict))


In [45]:
# Pull the schedule with cache=False and keep the 2021 season through week 18
# (presumably the regular season — TODO confirm).
schedule_2021 = get_schedule_data(cache=False)
schedule_2021 = schedule_2021[(schedule_2021["season"]==2021)&(schedule_2021["week"]<=18)]


# Ratings going into 2021: the latest 2020-season row of every team.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
current_values_2021 = pd.concat([games_home_team, games_away_team]).sort_values("game_date", ascending=False)
current_values_2021 = current_values_2021[current_values_2021["season"]==2020].drop_duplicates(["offense_team"])
current_values_2021 = current_values_2021.merge(division_data, left_on=["season", "offense_team"], right_on=["season", "team_id"])

Running get_schedule_data... 

Running get_schedule_data
queries/schedule_data.sql
get_schedule_data done in 3 sec.


In [46]:
# Abbreviation of the rated team: the home abbreviation when the team was the
# home side in its latest row, otherwise the away abbreviation.
current_values["team_abbrev"] = np.where(
    current_values["home_team_id"].eq(current_values["team_id"]),
    current_values["home_team_abbrev"],
    current_values["away_team_abbrev"],
)
# Single combined rating: offense value minus defense value.
current_values["yards_added_combined"] = current_values["total_yards_added_offense"].sub(
    current_values["total_yards_added_defense"]
)

In [47]:
# Attach the 2020 end-of-season ratings to the 2021 schedule, home team first
# and away team second; the second merge applies the "_home"/"_away" suffixes.
predictions_2021 = schedule_2021.merge(current_values_2021[['offense_team', 'total_yards_added_offense', 'total_yards_added_defense']], left_on=["home_team_id"], right_on=["offense_team"], suffixes=["", "_home"])
predictions_2021 = predictions_2021.merge(current_values_2021[['offense_team', 'total_yards_added_offense', 'total_yards_added_defense']], left_on=["away_team_id"], right_on=["offense_team"], suffixes=["_home", "_away"])

# Build the design matrix once. has_constant="add" forces the intercept column:
# the default ("skip") leaves it out whenever a feature column happens to be
# constant (for example a one-game schedule), and the matrix would then no
# longer line up with the fitted parameters.
X_2021 = sm.add_constant(predictions_2021[features], has_constant="add")
predictions_2021["x_home_team_win"] = est_home_win.predict(X_2021)
predictions_2021["x_tie"] = est_tie.predict(X_2021)
# The away-win probability is the remainder after home win and tie.
predictions_2021["x_away_team_win"] = 1 - predictions_2021["x_tie"] - predictions_2021["x_home_team_win"]
# Keep a single row per home team and week.
predictions_2021 = predictions_2021.drop_duplicates(["home_team_id", "week"])

In [48]:
# Write the 2021 priors in the same column layout as data/game_priors.csv.
(
    predictions_2021[["game_code", "home_team_id", "away_team_id", "home_team_abbrev", "away_team_abbrev", "x_home_team_win", "x_away_team_win", "game_date"]]
    .rename(columns={"x_home_team_win": "prior_home", "x_away_team_win": "prior_away"})
    .to_csv("data/game_priors_2021.csv", index=False)
)


predictions_2021

In [49]:
# Non-null row counts per home team, one count per column.
(
    predictions_2021
    .groupby("home_team_abbrev", as_index=False)
    .count()
)

Unnamed: 0,home_team_abbrev,game_code,game_date,home_team_id,away_team_id,season,away_team_abbrev,week,game_type_id,offense_team_home,total_yards_added_offense_home,total_yards_added_defense_home,offense_team_away,total_yards_added_offense_away,total_yards_added_defense_away,x_home_team_win,x_tie,x_away_team_win
0,Ari,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
1,Atl,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
2,Bal,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
3,Buf,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
4,Car,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
5,Chi,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
6,Cin,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
7,Cle,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
8,Dal,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
9,Den,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
