In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import calibration
import sklearn.linear_model
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
import numpy as np
import scipy.stats
from sklearn.base import clone
from sklearn.model_selection._search import ParameterSampler
from sklearn.metrics import brier_score_loss, make_scorer, log_loss, mean_squared_error
import os
# Move the working directory one level up from where the notebook was started,
# presumably so that the `src` package import and the relative `data/...`
# paths used below resolve against the repository root.
# NOTE(review): this is not idempotent -- re-running this cell in the same
# kernel moves up one more directory each time; restart the kernel first.
notebook_dir = os.getcwd()
os.chdir(os.path.join(notebook_dir, '..'))
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
# os.chdir('/Users/lucashaupt/Documents/GitHub/nfl-live-win-probability')
from src.generate_data import get_game_data, get_division_data, get_odds_data, get_schedule_data
import statsmodels.api as sm

# Data
* Pulls data from:
    * yards_gained_all_seasons.csv: proprietary metrics created by Kyle Cunningham-Rhodes and Bill Wang (df)
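    * game_data.parquet: loaded through the get_game_data function, which returns a cached parquet file (game_data)
        * contains game-level fields and is reduced to one row per game_code before being merged onto df (TODO: list the columns it provides)
    * division_data.parquet: loaded through the get_division_data function, which returns a cached parquet file with season and team_id columns (division_data)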

In [2]:
# Flag forwarded unchanged as `cache=` to the project data loaders.
cache = True

# Yards metrics CSV (path is relative to the working directory set in the first cell).
df = pd.read_csv("data/yards_gained_all_seasons.csv")

# Keep a single row per game_code so the left join below cannot multiply rows of df.
game_data = get_game_data(cache=cache).drop_duplicates("game_code")
division_data = get_division_data(cache=cache)

combined_df = pd.merge(
    df,
    game_data,
    how="left",
    on=["game_code", "season"],
    copy=False,
)


Running get_game_data... 

get_game_data is returning cached df
get_game_data done in 0 sec.
Running get_division_data... 

get_division_data is returning cached df
get_division_data done in 0 sec.


In [3]:
def get_rolling_value(df, freq=16, min_periods=1, shift=1):
    """Return game identifiers next to lagged rolling means of the yards metrics.

    Args:
        df: Frame holding the identifier columns and the four metric columns
            listed below. Rows are processed in their existing order, so the
            caller is expected to have sorted them by ``game_date``.
        freq: Window passed to ``DataFrame.rolling``.
        min_periods: Minimum number of observations needed for a non-null mean.
        shift: Number of rows the rolling means are moved down by; with the
            default of 1 each row only reflects the rows before it.

    Returns:
        DataFrame with the identifier columns of ``df`` followed by the
        shifted rolling means of the metric columns.
    """
    id_columns = [
        "game_code",
        "season",
        "game_date",
        "offense_team",
        "defense_team",
        "home_team_id",
        "away_team_id",
    ]
    metric_columns = [
        "total_expected_yards_gained",
        "total_yards_gained",
        "total_yards_added",
        "play_count",
    ]
    windowed = df.rolling(window=freq, min_periods=min_periods, on="game_date")
    # Lag the means so a row never includes its own game.
    lagged_means = windowed[metric_columns].mean().shift(shift)
    return pd.concat([df[id_columns], lagged_means[metric_columns]], axis=1)
# Identifier columns that define one (game, offense, defense) row, and the
# metric columns that are summed within each such row.
group_values = ["game_code", "season", "game_date", "offense_team", "defense_team", "home_team_id", "away_team_id"]
aggregate_values = ["total_expected_yards_gained", "total_yards_gained", "total_yards_added", "play_count"]

# Collapse combined_df to one row per identifier combination by summing the metrics.
game_yards = (
    combined_df[group_values + aggregate_values]
    .groupby(group_values, as_index=False)
    .sum()
)

# Order chronologically once, then compute lagged rolling means per team,
# separately for the team's offense rows and its defense rows.
game_yards_by_date = game_yards.sort_values("game_date")
rolling_offense = game_yards_by_date.groupby("offense_team", as_index=False).apply(get_rolling_value)
rolling_defense = game_yards_by_date.groupby("defense_team", as_index=False).apply(get_rolling_value)

In [4]:
# Distinct game rows: ids, team abbreviations, scores, date and season.
game_code_columns = [
    "game_code",
    "home_team_id",
    "away_team_id",
    "home_team_abbrev",
    "away_team_abbrev",
    "home_score",
    "away_score",
    "game_date",
    "season",
]
game_codes = combined_df[game_code_columns].drop_duplicates()

In [5]:
yards_values = ['total_expected_yards_gained', 'total_yards_gained', 'total_yards_added', 'play_count']


def _attach_rolling_yards(game_frame, team_id_col, offense_frame, defense_frame, value_cols):
    """Join one team's rolling offense and defense metrics onto each game row.

    Args:
        game_frame: One row per game; must contain ``game_code`` and ``team_id_col``.
        team_id_col: Column of ``game_frame`` holding the team to look up
            (``"home_team_id"`` or ``"away_team_id"``).
        offense_frame: Rolling metrics keyed by ``game_code`` / ``offense_team``.
        defense_frame: Rolling metrics keyed by ``game_code`` / ``defense_team``.
        value_cols: Metric columns to bring across from both frames.

    Returns:
        ``game_frame`` inner-joined with both frames; the metric columns end
        up suffixed ``_offense`` and ``_defense``.
    """
    # No metric columns exist on the left yet, so this join adds them unsuffixed.
    with_offense = game_frame.merge(
        offense_frame[["game_code", "offense_team"] + value_cols],
        left_on=["game_code", team_id_col],
        right_on=["game_code", "offense_team"],
        suffixes=["", "_offense"],
    )
    # The metric names now collide, which produces the *_offense / *_defense pairs.
    return with_offense.merge(
        defense_frame[["game_code", "defense_team"] + value_cols],
        left_on=["game_code", team_id_col],
        right_on=["game_code", "defense_team"],
        suffixes=["_offense", "_defense"],
    )


games_home_team = _attach_rolling_yards(game_codes, "home_team_id", rolling_offense, rolling_defense, yards_values)
games_away_team = _attach_rolling_yards(game_codes, "away_team_id", rolling_offense, rolling_defense, yards_values)

# One row per game with the home team's metrics suffixed *_home and the away team's *_away.
games = games_home_team.merge(
    games_away_team,
    on=['game_code', 'season', 'home_team_id', 'away_team_id', 'home_team_abbrev', 'away_team_abbrev', 'home_score', 'away_score', 'game_date'],
    suffixes=["_home", "_away"],
)

# Only seasons from 2009 onwards are kept for modelling.
# .copy() makes `games` an independent frame: later cells assign new columns to
# it, which on a bare boolean-filtered slice triggers SettingWithCopyWarning.
games = games[games["season"] >= 2009].copy()


In [6]:
# Model inputs: the rolling "yards added" columns for each venue (home, away)
# and each unit (offense, defense), in home-first / offense-first order.
features = [
    f"total_yards_added_{unit}_{venue}"
    for venue in ("home", "away")
    for unit in ("offense", "defense")
]

# Score columns of the two teams.
target = ["home_score", "away_score"]

In [7]:
# 0/1 outcome flags from comparing the two score columns; exactly one of the
# three is 1 for any row where both scores are present.
games["home_team_win"] = games["home_score"].gt(games["away_score"]).astype(int)
games["away_team_win"] = games["home_score"].lt(games["away_score"]).astype(int)
games["tie"] = games["home_score"].eq(games["away_score"]).astype(int)

In [8]:
import datetime as dt  # not used in this cell; kept in case other cells rely on `dt`

schedule = get_schedule_data(cache=cache)

# Keep only games scheduled on a calendar day strictly after the most recent
# game present in `games`.
# Both sides are midnight-normalised Timestamps. The previous form compared
# `datetime.date` objects against a Timestamp, which pandas deprecated (it
# emitted a warning here) and which newer versions no longer support.
# NOTE(review): assumes schedule["game_date"] is tz-naive like games["game_date"] -- confirm.
last_game_day = pd.to_datetime(games["game_date"]).max().normalize()
schedule = schedule[schedule["game_date"].dt.normalize() > last_game_day]


Running get_schedule_data... 

get_schedule_data is returning cached df
get_schedule_data done in 0 sec.


  result = libops.scalar_compare(x.ravel(), y, op)


In [9]:
# Most recent row per team: stack the home-side and away-side frames, order
# newest first, and keep the first row seen for each offense_team.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
current_values = (
    pd.concat([games_home_team, games_away_team])
    .sort_values("game_date", ascending=False)
    .drop_duplicates(["offense_team"])
)

# Attach the division_data row for that team in the season of its latest game.
current_values = current_values.merge(
    division_data,
    left_on=["season", "offense_team"],
    right_on=["season", "team_id"],
)

In [10]:
# Latest rolling offense/defense "yards added" per team, to attach to each scheduled game.
team_form = current_values[['offense_team', 'total_yards_added_offense', 'total_yards_added_defense']]

# The first join adds the columns unsuffixed (this relies on `schedule` not
# already having them); the second join's name collision then yields the
# *_home / *_away column pairs.
predictions = (
    schedule
    .merge(team_form, left_on=["home_team_id"], right_on=["offense_team"], suffixes=["", "_home"])
    .merge(team_form, left_on=["away_team_id"], right_on=["offense_team"], suffixes=["_home", "_away"])
)

In [11]:
# Logistic regression of the home-win flag on the four rolling yards-added
# features plus an intercept, fitted on the played games.
X2 = sm.add_constant(games[features])
est = sm.Logit(games["home_team_win"], X2)
est2 = est.fit()

# Fitted probabilities for the played games.
games["x_home_team_win"] = est2.predict(X2)

# has_constant="add" forces the intercept column. The default ("skip") omits
# it whenever a feature column is already constant -- e.g. when only one
# scheduled game remains -- and predict() then fails on a column-count mismatch.
predictions["x_home_team_win"] = est2.predict(
    sm.add_constant(predictions[features], has_constant="add")
)


Optimization terminated successfully.
         Current function value: 0.644433
         Iterations 5


In [12]:
# Separate logistic regression for the tie flag on the same features.
# X2 / est / est2 are rebound here, so est2 refers to the tie model from now on.
X2 = sm.add_constant(games[features])
est = sm.Logit(games["tie"], X2)
est2 = est.fit()

# Away-win probability is the remainder after the tie and home-win probabilities.
# NOTE(review): the two models are fitted independently, so the remainder is
# not guaranteed to be non-negative.
games["x_tie"] = est2.predict(X2)
games["x_away_team_win"] = 1 - games["x_tie"] - games["x_home_team_win"]

# has_constant="add" forces the intercept column. The default ("skip") omits
# it whenever a feature column is already constant -- e.g. when only one
# scheduled game remains -- and predict() then fails on a column-count mismatch.
predictions["x_tie"] = est2.predict(
    sm.add_constant(predictions[features], has_constant="add")
)
predictions["x_away_team_win"] = 1 - predictions["x_tie"] - predictions["x_home_team_win"]



Optimization terminated successfully.
         Current function value: 0.019513
         Iterations 10


In [16]:
# Display the modelling frame, which now carries the outcome flags and the
# fitted x_home_team_win / x_tie / x_away_team_win columns.
games

Unnamed: 0,game_code,home_team_id,away_team_id,home_team_abbrev,away_team_abbrev,home_score,away_score,game_date,season,offense_team_home,...,total_expected_yards_gained_defense_away,total_yards_gained_defense_away,total_yards_added_defense_away,play_count_defense_away,home_team_win,away_team_win,tie,x_home_team_win,x_tie,x_away_team_win
266,887191,329,347,Cle,Min,20,34,2009-09-13 13:00:00,2009,329,...,313.874291,284.3125,-29.561791,56.7500,0,1,0,0.301116,0.002444,0.696440
267,887208,334,347,Det,Min,13,27,2009-09-20 13:00:00,2009,334,...,312.220221,284.8750,-27.345221,56.3750,0,1,0,0.202565,0.000998,0.796437
268,887257,347,327,Min,Cin,30,10,2009-12-13 12:00:00,2009,347,...,314.871557,276.6250,-38.246557,55.6250,1,0,0,0.676199,0.001892,0.321909
269,887265,347,361,Min,Sea,35,9,2009-11-22 12:00:00,2009,347,...,337.998959,352.5000,14.501041,61.2500,1,0,0,0.768603,0.003105,0.228291
270,887274,347,335,Min,GB,30,23,2009-10-05 19:30:00,2009,347,...,322.699806,316.3125,-6.387306,60.3125,1,0,0,0.600957,0.003593,0.395451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,2337471,325,364,Hou,Car,9,24,2021-09-23 19:20:00,2021,325,...,331.730530,329.5625,-2.168030,60.4375,0,1,0,0.512325,0.000605,0.487070
3676,2337660,364,354,Car,Phi,18,21,2021-10-10 13:00:00,2021,364,...,334.602363,357.3125,22.710137,63.0625,0,1,0,0.617452,0.002705,0.379842
3677,2337587,354,357,Phi,LAC,24,27,2021-11-07 16:05:00,2021,354,...,309.724075,322.0000,12.275925,58.5000,0,1,0,0.522317,0.001887,0.475796
3678,2337654,357,348,LAC,NE,24,27,2021-10-31 13:05:00,2021,357,...,339.589628,339.0000,-0.589628,62.3125,0,1,0,0.504407,0.002183,0.493410


In [18]:
# The same column subset is exported for the played games (`games`) and the
# scheduled games (`predictions`), stacked in that order.
prior_columns = [
    "game_code",
    "home_team_id",
    "away_team_id",
    "home_team_abbrev",
    "away_team_abbrev",
    "x_home_team_win",
    "x_away_team_win",
    "game_date",
]
game_priors = pd.concat([games[prior_columns], predictions[prior_columns]]).rename(
    columns={"x_home_team_win": "prior_home", "x_away_team_win": "prior_away"}
)
game_priors.to_csv("data/game_priors.csv", index=False)

# Audible completion cue through the `say` shell command; its exit status is the cell's output.
os.system('say "done notebook"')


0