In [4]:
import os
# Resolve project paths from the kernel's current working directory: its parent
# is treated as the project root and data files are read from "<root>/data".
# NOTE(review): this presumably assumes the kernel was started in the
# notebook's own folder - confirm before running from another location.
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
data_dir = os.path.join(root_dir, 'data')

import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import ipywidgets as widgets
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import brier_score_loss, make_scorer, log_loss, mean_squared_error
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.metrics import log_loss
from IPython.display import display, HTML
import pickle
from sklearn.model_selection import GroupKFold, RandomizedSearchCV, cross_val_predict
import scipy
def uniform_distribution(lo, hi):
    """Return a frozen SciPy uniform distribution covering ``[lo, hi]``.

    ``scipy.stats.uniform`` is parameterised by ``loc`` and ``scale`` (its
    support is ``[loc, loc + scale]``), so the interval width is passed as
    the scale.
    """
    width = hi - lo
    return scipy.stats.uniform(loc=lo, scale=width)
def ProbaScoreProxy(y_true, y_probs, proxied_func, **kwargs):
    """Call ``proxied_func(y_true, y_probs, **kwargs)`` and return its result.

    The wrapper adds no logic of its own; it only fixes the argument order
    ``(y_true, y_probs, proxied_func)`` and forwards extra keyword arguments
    unchanged to the wrapped callable.
    """
    score = proxied_func(y_true, y_probs, **kwargs)
    return score
from sklearn import calibration
from utils import get_model_outputs
import functools
import sys

# NOTE(review): this attaches an attribute named "as_index" to the
# DataFrame.groupby function object. It does not change the default value of
# groupby's `as_index` parameter, so calls that need as_index=False must still
# pass it explicitly. Consider removing this line once that is confirmed.
pd.DataFrame.groupby.__setattr__("as_index",  False)

In [2]:
# NOTE(review): the event_df load below is commented out, yet later statements
# in this notebook read and write event_df - on a fresh kernel it must be
# defined some other way before they run.
# event_df = pd.read_parquet(os.path.join(data_dir, "event_data.parquet"))

# Load the odds table and keep only the first row seen for each game_code.
odds_df = (
    pd.read_parquet(os.path.join(data_dir, "odds_data.parquet"))
    .drop_duplicates(subset="game_code")
)

def create_train_test_val_df(
    df,
    input_names,
    output_name,
    group_col="game_code",
    mask_test_season=2021,
    mask_val_season=2020,
):
    """Split ``df`` by season into train / test / validation pieces.

    Only rows with ``continuation == 0`` are kept. Rows whose ``season``
    equals ``mask_test_season`` form the test split, rows whose ``season``
    equals ``mask_val_season`` form the validation split, and rows from every
    other season form the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``season``, ``continuation``, ``group_col``,
        ``output_name`` and every column listed in ``input_names``.
    input_names : list of str
        Feature columns returned in each ``X_*`` frame.
    output_name : str
        Target column returned in each ``y_*`` series.
    group_col : str
        Column returned in each ``group_*`` series.
    mask_test_season, mask_val_season
        Season values held out for the test and validation splits.

    Returns
    -------
    tuple
        ``(X_train, y_train, group_train, X_test, y_test, group_test,
        X_val, y_val, group_val)`` - note the order is train, test, val.
    """
    season = df["season"]
    not_continuation = df["continuation"] == 0
    held_out = season.isin([mask_test_season, mask_val_season])

    # Row selectors in the order the pieces are returned: train, test, val.
    row_masks = (
        ~held_out & not_continuation,
        (season == mask_test_season) & not_continuation,
        (season == mask_val_season) & not_continuation,
    )

    pieces = []
    for row_mask in row_masks:
        for columns in (input_names, output_name, group_col):
            pieces.append(df.loc[row_mask, columns])
    return tuple(pieces)
# --- Feature engineering on event_df (statement order matters: later columns
# --- are built from earlier ones) ---

# Attach cur_spread / cur_over_under from odds_df via a left join on game_code;
# rows left without a value are filled with the corresponding odds_df column mean.
# NOTE(review): merge returns a frame with a fresh index, so assigning the
# result back presumably relies on event_df having a default RangeIndex - confirm.
event_df[["cur_spread", "cur_over_under"]] = event_df.merge(odds_df, how="left", on="game_code")[["cur_spread", "cur_over_under"]].fillna({"cur_spread": np.mean(odds_df["cur_spread"]), "cur_over_under": np.mean(odds_df["cur_over_under"])})
# Running sum of `continuation` inside each run of rows that begins at a
# continuation == 0 row (assumes continuation is a 0/1 flag - TODO confirm).
event_df["sequence"] = event_df["continuation"].groupby(event_df["continuation"].eq(0).cumsum()).cumsum()
# nevent minus that running sum; used below, together with game_code, as the play key.
event_df["play_start_id"] = event_df["nevent"] - event_df["sequence"]

# event_id values that are flagged as turnovers below.
turnover_ids = [9, 16]
# 4th down, no field-goal attempt, no punt, yards_gained < ytg, and possession
# differs on the next row.
# NOTE(review): this mask is not referenced again in the visible cells, so
# these rows are not included in the "turnover" flag - confirm whether intended.
mask_turnover_on_downs = (event_df["down"]==4)&(event_df["field_goal_attempt"]==0)&(event_df["punt"]==0)&(event_df["yards_gained"]<event_df["ytg"])&(event_df["home_team_has_ball"]!=event_df.shift(-1)["home_team_has_ball"])
event_df["turnover"] = np.where(event_df["event_id"].isin(turnover_ids), 1, 0)
# Number of turnover-flagged rows per (game_code, play_start_id).
turnover_key_df =event_df[["game_code", "play_start_id", "turnover"]].groupby(["game_code", "play_start_id"], as_index=False).sum()

# Broadcast the per-play count back to every row and clip it to a 0/1 flag.
# "turnover_y" is the merged-in count (suffixed because both frames have a
# "turnover" column). The same index-alignment caveat as the odds merge applies.
event_df["turnover_in_play"] = np.clip(event_df.merge(turnover_key_df,on=["game_code", "play_start_id"], how="left")["turnover_y"], 0, 1)

# Label per row: "td" when home_score_added + away_score_added >= 6, otherwise
# "turnover" when turnover_in_play == 1, otherwise "none".
event_df["play_outcome"] = np.where(event_df["home_score_added"]+event_df["away_score_added"]>=6, "td", np.where(event_df["turnover_in_play"]==1, "turnover", "none"))
# Maps quarters 1-2 to 1.0 and 3-4 to 2.0; the +0.01 keeps the value off an
# exact .5 before rounding.
event_df["half"] = round((event_df["quarter"] + 0.01) / 2)
# A timeout is credited to the home team when event_id == 57 and home has the
# ball, or event_id == 58 and home does not; the away flag mirrors this.
# NOTE(review): presumably 57/58 are offense/defense timeout codes - confirm.
event_df["home_timeout"] = np.where(((event_df["event_id"]==57)&(event_df["home_team_has_ball"]==1))|((event_df["event_id"]==58)&(event_df["home_team_has_ball"]==0)), 1, 0)
event_df["away_timeout"] = np.where(((event_df["event_id"]==57)&(event_df["home_team_has_ball"]==0))|((event_df["event_id"]==58)&(event_df["home_team_has_ball"]==1)), 1, 0)
# 3 minus the cumulative timeout count within each (game_code, half), clipped to [0, 3].
event_df["home_timeouts_remaining"] = np.clip(3 - event_df.groupby(["game_code", "half"])["home_timeout"].cumsum(), 0, 3)
event_df["away_timeouts_remaining"] = np.clip(3 - event_df.groupby(["game_code", "half"])["away_timeout"].cumsum(), 0, 3)
# For quarter <= 4: play_start_time plus 900 per remaining quarter; for later
# quarters only play_start_time is used (900 is presumably seconds per quarter).
event_df["time_left_in_game"] = np.where(event_df["quarter"] <= 4, event_df["play_start_time"] + (4 - event_df["quarter"]) * 900, event_df["play_start_time"])
event_df["time_elapsed"] = 900 - event_df["play_start_time"] + (event_df["quarter"] - 1) * 900


NameError: name 'get_event_data' is not defined

In [56]:
# Target column for the play-outcome classifier.
output_name = "play_outcome"

# Game-state columns copied from event_df as-is.
input_names_start = [
    "time_left_in_game",
    "current_score_diff",
    "current_score_total",
    "cur_spread",
    "cur_over_under",
    'home_timeouts_remaining',
    'away_timeouts_remaining',
]

# Features actually fed to the model (commented-out entries are kept in
# model_df but not used as inputs).
input_names = [
    'time_left_in_game',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    # 'kick_off',
    'punt',
    # 'point_after_kick',
    # 'two_point_attempt',
    'field_goal_attempt',
    # 'from_scrimmage',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
]

# Build the modelling frame in one chain:
#   1. copy the needed event_df columns;
#   2. force from_scrimmage to 0 for the listed event_ids, and use sentinel
#      values (-1 for ytg / yd_from_goal, 0 for down) on non-scrimmage rows;
#   3. flip home_team_has_ball for event_id 5;
#   4. keep only from-scrimmage rows with no missing input or target value.
model_df = (
    deepcopy(
        event_df[
            input_names_start
            + ['season', 'continuation', 'game_code', 'kick_off', 'punt', 'point_after_kick', 'two_point_attempt', 'field_goal_attempt', output_name]
        ]
    )
    .assign(
        from_scrimmage=np.where(
            event_df["event_id"].isin([22, 52, 53, 55, 47, 54, 56]),
            0,
            event_df["from_scrimmage"],
        ),
        ytg=lambda frame: np.where(frame["from_scrimmage"] == 0, -1, event_df["ytg"]),
        down=lambda frame: np.where(frame["from_scrimmage"] == 0, 0, event_df["down"]),
        home_team_has_ball=np.where(
            event_df["event_id"].isin([5]),
            1 - event_df["home_team_has_ball"],
            event_df["home_team_has_ball"],
        ),
        yd_from_goal=lambda frame: np.where(frame["from_scrimmage"] == 0, -1, event_df["yd_from_goal"]),
    )
    .loc[
        lambda frame: (frame[input_names + [output_name]].notna().all(axis=1))
        & (frame["from_scrimmage"] == 1)
    ]
)

# Season-based split using the helper's default test / validation seasons.
(
    X_train, y_train, group_train,
    X_test, y_test, group_test,
    X_val, y_val, group_val,
) = create_train_test_val_df(model_df, input_names, output_name)



In [57]:
# Fit a random forest on the training split (fit() returns the estimator, so
# it can be chained onto the constructor). verbose=100 logs every tree built.
rf_play_outcome = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    verbose=100,
    n_jobs=-1,
    random_state=1,
).fit(X_train, y_train)

# Side-by-side view of the training features and the in-sample predicted
# class probabilities (one probability column per fitted class label).
x_view = pd.concat(
    [
        X_train.reset_index(drop=True),
        pd.DataFrame(
            rf_play_outcome.predict_proba(X_train),
            columns=rf_play_outcome.classes_,
        ),
    ],
    axis=1,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 500
building tree 2 of 500building tree 3 of 500building tree 4 of 500


building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.6s
building tree 10 of 500

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.6s
building tree 11 of 500[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    1.6s

building tree 12 of 500
building tree 13 of 500[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.7s

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.7s
building tree 14 of 500[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.8s

building tree 15 of 500[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    1.9s

building tree 16 of 500[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.2s

building tree 17 of 500[Parallel(n_jobs=-1)]: Do

In [52]:
# Display the training features next to the model's predicted class
# probabilities (pandas truncates the middle rows of a large frame).
x_view

Unnamed: 0,time_left_in_game,current_score_diff,current_score_total,cur_spread,cur_over_under,home_timeouts_remaining,away_timeouts_remaining,punt,field_goal_attempt,ytg,yd_from_goal,down,home_team_has_ball,away_team_td,home_team_td,none,turnover
0,3595.0,0,0,-4.5,41.5,3,3,0,0,10,84,1,1,0.000000,0.007678,0.978346,0.013976
1,3560.0,0,0,-4.5,41.5,3,3,0,0,7,81,2,1,0.000000,0.009266,0.974692,0.016042
2,3554.0,0,0,-4.5,41.5,3,3,0,0,7,81,3,1,0.000246,0.012037,0.959405,0.028312
3,3523.0,0,0,-4.5,41.5,3,3,0,0,10,73,1,1,0.000000,0.008646,0.977451,0.013903
4,3476.0,0,0,-4.5,41.5,3,3,0,0,7,70,2,1,0.000000,0.014706,0.968090,0.017204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481460,68.0,11,51,-1.5,53.0,3,2,0,0,10,75,2,0,0.012363,0.000000,0.942933,0.044703
481461,57.0,11,51,-1.5,53.0,3,2,0,0,10,59,1,1,0.000000,0.008464,0.977708,0.013827
481462,53.0,11,51,-1.5,53.0,3,1,0,0,15,64,2,1,0.000000,0.007818,0.978151,0.014031
481463,51.0,11,51,-1.5,53.0,3,0,0,0,18,67,3,1,0.000396,0.011274,0.966248,0.022082
