In [1]:
import os
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
data_dir = os.path.join(root_dir, 'data')

import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import ipywidgets as widgets
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import brier_score_loss, make_scorer, log_loss, mean_squared_error
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.metrics import log_loss
from IPython.display import display, HTML
import pickle
from sklearn.model_selection import GroupKFold, RandomizedSearchCV, cross_val_predict, GridSearchCV
import scipy
from sklearn.preprocessing import OneHotEncoder

def uniform_distribution(lo, hi):
    return scipy.stats.uniform(lo, hi - lo)
def ProbaScoreProxy(y_true, y_probs, proxied_func, **kwargs):
    return proxied_func(y_true, y_probs, **kwargs)
from sklearn import calibration
from utils import get_model_outputs
import functools
import sys

In [2]:
def normalize_df(df, anchor_df=None):
    for col in df.columns:
        data = df[col]
        if anchor_df is None:
            df[col] = (data - np.min(data)) / (np.max(data) - np.min(data))
        else:
            df[col] = (data - np.min(anchor_df[col])) / (np.max(anchor_df[col]) - np.min(anchor_df[col]))
    return df

def create_train_test_val_df(
    df,
    input_names,
    output_name,
    group_col="game_code",
    mask_test_season=2021,
    mask_val_season=[2019, 2020],
    normalize=False
):
    mask_train = ~(df.season.isin([mask_test_season, mask_val_season]))
    mask_test = (df.season == mask_test_season)
    mask_val = (df.season.isin(mask_val_season))
    if normalize==False:
        X_train = df.loc[mask_train, input_names]
        X_test = df.loc[mask_test, input_names]
        X_val = df.loc[mask_val, input_names]
    else:
        X_train = normalize_df(df.loc[mask_train, input_names])
        X_test = normalize_df(df.loc[mask_test, input_names], df.loc[mask_train, input_names])
        X_val = normalize_df(df.loc[mask_val, input_names], df.loc[mask_train, input_names])
    y_train = df.loc[mask_train, output_name]
    group_train = df.loc[mask_train, group_col]
    y_test = df.loc[mask_test, output_name]
    group_test = df.loc[mask_test, group_col]
    y_val = df.loc[mask_val, output_name]
    group_val = df.loc[mask_val, group_col]
    return X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val

In [12]:
event_df = pd.read_parquet(os.path.join(data_dir, "event_data_cfb.parquet"))
event_df = event_df.drop_duplicates(["nevent", "game_code"]).reset_index(drop=True)
odds_df = pd.read_parquet(os.path.join(data_dir, "odds_data_cfb.parquet"))
odds_df = odds_df.drop_duplicates("game_code")


event_df.merge(odds_df, how="left", on="game_code")

Unnamed: 0,game_code,game_date_x,season_x,home_team_id,home_team_x,home_team_abbrev,home_conf_name,home_division_type,away_team_id,away_team_x,...,game_date_y,home_team_y,away_team_y,cur_favorite_id,cur_spread,cur_over_under,home_odds,away_odds,favorite_odds,underdog_odds
0,1178084,2012-08-30 17:15:00,2012,3499,Utah Utes,Utah,Pac-12,FBS,3710,Northern Colorado Bears,...,NaT,,,,,,,,,
1,1178084,2012-08-30 17:15:00,2012,3499,Utah Utes,Utah,Pac-12,FBS,3710,Northern Colorado Bears,...,NaT,,,,,,,,,
2,1178084,2012-08-30 17:15:00,2012,3499,Utah Utes,Utah,Pac-12,FBS,3710,Northern Colorado Bears,...,NaT,,,,,,,,,
3,1178084,2012-08-30 17:15:00,2012,3499,Utah Utes,Utah,Pac-12,FBS,3710,Northern Colorado Bears,...,NaT,,,,,,,,,
4,1178084,2012-08-30 17:15:00,2012,3499,Utah Utes,Utah,Pac-12,FBS,3710,Northern Colorado Bears,...,NaT,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1930683,2405840,2022-01-10 20:00:00,2021,3478,Alabama Crimson Tide,Ala,Southeastern,FBS,3473,Georgia Bulldogs,...,2022-01-10 20:00:00,Alabama Crimson Tide,Georgia Bulldogs,2.0,3.0,53.0,0.434783,0.6,0.534884,0.512195
1930684,2405840,2022-01-10 20:00:00,2021,3478,Alabama Crimson Tide,Ala,Southeastern,FBS,3473,Georgia Bulldogs,...,2022-01-10 20:00:00,Alabama Crimson Tide,Georgia Bulldogs,2.0,3.0,53.0,0.434783,0.6,0.534884,0.512195
1930685,2405840,2022-01-10 20:00:00,2021,3478,Alabama Crimson Tide,Ala,Southeastern,FBS,3473,Georgia Bulldogs,...,2022-01-10 20:00:00,Alabama Crimson Tide,Georgia Bulldogs,2.0,3.0,53.0,0.434783,0.6,0.534884,0.512195
1930686,2405840,2022-01-10 20:00:00,2021,3478,Alabama Crimson Tide,Ala,Southeastern,FBS,3473,Georgia Bulldogs,...,2022-01-10 20:00:00,Alabama Crimson Tide,Georgia Bulldogs,2.0,3.0,53.0,0.434783,0.6,0.534884,0.512195


In [14]:
test = event_df.merge(odds_df, how="left", on="game_code")

In [16]:
no_odds = test[test["home_odds"].isna()]


In [20]:
no_odds[["home_division_type", "away_division_type"]].value_counts()

home_division_type  away_division_type
FBS                 FCS                   196362
                    FBS                   145087
FCS                 FBS                      403
FBS                 I-NAIA                   212
                    II                       189
dtype: int64

In [7]:
event_df = pd.read_parquet(os.path.join(data_dir, "event_data_cfb.parquet"))
event_df = event_df.drop_duplicates(["nevent", "game_code"]).reset_index(drop=True)
odds_df = pd.read_parquet(os.path.join(data_dir, "odds_data_cfb.parquet"))
odds_df = odds_df.drop_duplicates("game_code")
event_ids = event_df[["event_id", "event_name"]].drop_duplicates().sort_values("event_id").reset_index(drop=True)
event_ids.to_csv(os.path.join(data_dir, "event_ids.csv"))


event_df["half"] = round((event_df["quarter"] + 0.01) / 2)
event_df["home_timeout"] = np.where(((event_df["event_id"]==57)&(event_df["home_team_has_ball"]==1))|((event_df["event_id"]==58)&(event_df["home_team_has_ball"]==0)), 1, 0)
event_df["away_timeout"] = np.where(((event_df["event_id"]==57)&(event_df["home_team_has_ball"]==0))|((event_df["event_id"]==58)&(event_df["home_team_has_ball"]==1)), 1, 0)
event_df["home_timeouts_remaining"] = np.clip(3 - event_df.groupby(["game_code", "half"])["home_timeout"].cumsum(), 0, 3)
event_df["away_timeouts_remaining"] = np.clip(3 - event_df.groupby(["game_code", "half"])["away_timeout"].cumsum(), 0, 3)
event_df["time_left_in_game"] = np.where(event_df["quarter"] <= 4, event_df["play_start_time"] + (4 - event_df["quarter"]) * 900, event_df["play_start_time"])
event_df["time_elapsed"] = 900 - event_df["play_start_time"] + (event_df["quarter"] - 1) * 900


event_df[["cur_spread", "cur_over_under"]] = event_df.merge(odds_df, how="left", on="game_code")[["cur_spread", "cur_over_under"]].fillna({"cur_spread": np.mean(odds_df["cur_spread"]), "cur_over_under": np.mean(odds_df["cur_over_under"])})
event_df["sequence"] = event_df["continuation"].groupby(event_df["continuation"].eq(0).cumsum()).cumsum()
event_df["play_start_id"] = event_df["nevent"] - event_df["sequence"]

turnover_ids = [9, 16]
# mask_turnover_on_downs = (event_df["down"]==4)&(event_df["field_goal_attempt"]==0)&(event_df["punt"]==0)&(event_df["yards_gained"]<event_df["ytg"])&(event_df["home_team_has_ball"]!=event_df.shift(-1)["home_team_has_ball"])
event_df["turnover"] = np.where(event_df["event_id"].isin(turnover_ids), 1, 0)
event_df["touchdown_scored"] = np.where(event_df["home_score_added"]+event_df["away_score_added"]>=6, 1, 0)
event_df["fieldgoal_made"] = np.where(event_df["home_score_added"]+event_df["away_score_added"]==3, 1, 0)

play_outcome_aggregate =event_df[["game_code", "play_start_id", "turnover", "touchdown_scored", "fieldgoal_made", "first_down"]].groupby(["game_code", "play_start_id"], as_index=False).sum()
# touchdown_key_df =event_df[["game_code", "play_start_id", "turnover"]].groupby(["game_code", "play_start_id"], as_index=False).sum()
event_df["touchdown_in_play"] = np.clip(event_df.merge(play_outcome_aggregate,on=["game_code", "play_start_id"], how="left")["touchdown_scored_y"], 0, 1)
event_df["turnover_in_play"] = np.clip(event_df.merge(play_outcome_aggregate,on=["game_code", "play_start_id"], how="left")["turnover_y"], 0, 1)
event_df["field_goal_in_play"] = np.clip(event_df.merge(play_outcome_aggregate,on=["game_code", "play_start_id"], how="left")["fieldgoal_made_y"], 0, 1)
event_df["first_down_in_play"] = np.clip(event_df.merge(play_outcome_aggregate,on=["game_code", "play_start_id"], how="left")["first_down_y"], 0, 1)


# drive_outcome_aggregate =event_df[["game_code", "off_team_id", "drive_id", "turnover", "touchdown_scored", "fieldgoal_made"]].groupby(["game_code", "off_team_id", "drive_id"], as_index=False).sum()
# event_df["touchdown_in_drive"] = np.clip(event_df.merge(drive_outcome_aggregate,on=["game_code", "drive_id", "off_team_id"], how="left")["touchdown_scored_y"], 0, 1)
# event_df["turnover_in_drive"] = np.clip(event_df.merge(drive_outcome_aggregate,on=["game_code", "drive_id", "off_team_id"], how="left")["turnover_y"], 0, 1)
# event_df["field_goal_in_drive"] = np.clip(event_df.merge(drive_outcome_aggregate,on=["game_code", "drive_id", "off_team_id"], how="left")["fieldgoal_made_y"], 0, 1)

event_df["play_outcome"] = (
    np.where((event_df["turnover_in_play"]==1), "turnover",
    # (event_df["touchdown_in_play"]==1)&(event_df["turnover_in_play"]==1), "defensive_touchdown", 
    np.where((event_df["punt"]==1), "punt",
    np.where((event_df["field_goal_in_play"]==1), "field_goal_made",
    np.where((event_df["field_goal_attempt"]==1)&(event_df["field_goal_in_play"]==0), "field_goal_missed",
    np.where((event_df["first_down_in_play"]==1)&(event_df["touchdown_in_play"]==0)&(event_df["turnover_in_play"]==0)&(event_df["punt"]==0), "first_down",
    np.where((event_df["touchdown_in_play"]==1)&(event_df["turnover_in_play"]==0), "offensive_touchdown", "none"))))))
)
drive_description_matrix = {
    7: "punt",
    9: "turnover",
    14: "turnover",
    17: "field_goal_made",
    18: "punt",
    20: "safety",
    35: "field_goal_missed",
    36: "field_goal_missed",
    37: "touch_down",
    38: "clock",
    39: "clock",
    40: "turnover_on_downs",
    42: "field_goal_made",
    51: "clock",
}
event_df["drive_outcome_desc_basic"] = event_df["drive_outcome_id"].map(drive_description_matrix)

# event_df["drive_outcome"] = np.where(
#     (event_df["touchdown_in_drive"]==1)&(event_df["turnover_in_drive"]==1), "defensive_touchdown", 
#     np.where((event_df["touchdown_in_drive"]==1)&(event_df["turnover_in_drive"]==0), "offensive_touchdown",
#     np.where((event_df["field_goal_in_drive"]==1), "field_goal_made",
#     np.where((event_df["touchdown_in_drive"]==0)&(event_df["turnover_in_drive"]==1), "turnover", "none"
# ))))
scrimmage_plays_we_want = [1, 2, 3, 4, 7, 9, 14, 17, 18, 35]
game_end_of_regulation_total_score = event_df[event_df.overtime==0].groupby("game_code", as_index=False).max()[["game_code", "home_start_score", "away_start_score"]]
game_end_of_regulation_total_score["end_of_regulation_score_total"] = game_end_of_regulation_total_score["home_start_score"] + game_end_of_regulation_total_score["away_start_score"]
# event_df["end_of_regulation_score_total_diff"] = 
event_df["end_of_regulation_score_total_diff"] = (
    event_df.merge(game_end_of_regulation_total_score, on="game_code")["end_of_regulation_score_total"]
    - (event_df["home_start_score"] + event_df["away_start_score"])
)

  game_end_of_regulation_total_score = event_df[event_df.overtime==0].groupby("game_code", as_index=False).max()[["game_code", "home_start_score", "away_start_score"]]


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)
def ProbaScoreProxy(y_true, y_probs, proxied_func, **kwargs):
    return proxied_func(y_true, y_probs, **kwargs)


log_loss_scorer = make_scorer(
    ProbaScoreProxy,
    greater_is_better=False,  # True
    needs_proba=True,
    proxied_func=log_loss,
)

In [8]:
output_name = "play_outcome"

model_df = deepcopy(event_df)
model_df["time_left_in_half"] = event_df["time_left_in_game"] - ((2 - event_df["half"]) * 1800)
model_df["from_scrimmage"] = np.where(event_df["event_id"].isin([22, 52, 53, 55, 47, 54, 56]), 0, event_df["from_scrimmage"])
model_df["ytg"] = np.where(model_df["from_scrimmage"] == 0, -1, event_df["ytg"])
model_df["down"] = np.where(model_df["from_scrimmage"] == 0, 0, event_df["down"])
model_df["home_team_has_ball"] = np.where(event_df["event_id"].isin([5]), 1 - event_df["home_team_has_ball"], event_df["home_team_has_ball"])
model_df["yd_from_goal"] = np.where(model_df["from_scrimmage"] == 0, -1, event_df["yd_from_goal"])


model_df["away_vegas_score_pred"] = np.clip(model_df["cur_over_under"], 30, 80) * 0.5 + model_df["cur_spread"] * 0.5
model_df["home_vegas_score_pred"] = np.clip(model_df["cur_over_under"], 30, 80) * 0.5 - model_df["cur_spread"] * 0.5
model_df["away_vegas_score_pred_weighted"] = model_df["away_vegas_score_pred"] * (model_df["time_left_in_game"] / 3600)
model_df["home_vegas_score_pred_weighted"] = model_df["home_vegas_score_pred"] * (model_df["time_left_in_game"] / 3600)
model_df["cur_over_under"] = np.clip(event_df["cur_over_under"], 30, 80)

input_names = [
    # 'time_left_in_game',
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    # 'kick_off',
    # 'punt',
    # 'point_after_kick',
    # 'two_point_attempt',
    # 'field_goal_attempt',
    # 'from_scrimmage',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
]
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name)

# mlp_play_outcome = MLPClassifier(hidden_layer_sizes=[100], verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# mlp_play_outcome.fit(X_train, y_train)
# rf_play_outcome = RandomForestClassifier(n_estimators=200, max_depth=15, verbose=100, n_jobs=-1, random_state=1)
# rf_play_outcome.fit(X_train, y_train)
# pickle.dump(mlp_play_outcome, open(os.path.join(root_dir, 'models/mlp_play_outcome.p'), 'wb'))
# pickle.dump(rf_play_outcome, open(os.path.join(root_dir, 'models/rf_play_outcome.p'), 'wb'))



# logit_play_outcome_basic = LogisticRegression(solver='liblinear', random_state=0)
# logit_play_outcome_basic.fit(X_train, y_train)
# pickle.dump(logit_play_outcome_basic, open(os.path.join(root_dir, 'models/logit_play_outcome_basic.p'), 'wb'))


mlp_play_outcome = pickle.load(open(os.path.join(root_dir, "models/mlp_play_outcome.p"), 'rb'))
rf_play_outcome = pickle.load(open(os.path.join(root_dir, "models/rf_play_outcome.p"), 'rb'))
logit_play_outcome_basic = pickle.load(open(os.path.join(root_dir, "models/logit_play_outcome_basic.p"), 'rb'))
# os.system('say "done"')


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [9]:
X_train

Unnamed: 0,time_left_in_half,half,current_score_diff,current_score_total,cur_spread,cur_over_under,home_timeouts_remaining,away_timeouts_remaining,ytg,yd_from_goal,down,home_team_has_ball
2,1800.0,1.0,0,0,-3.308451,55.707539,3,3,10,75,1,1
3,1766.0,1.0,0,0,-3.308451,55.707539,3,3,5,70,2,1
4,1739.0,1.0,0,0,-3.308451,55.707539,3,3,1,66,3,1
5,1705.0,1.0,0,0,-3.308451,55.707539,3,3,10,55,1,1
6,1660.0,1.0,0,0,-3.308451,55.707539,3,3,3,48,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1732375,183.0,2.0,28,76,-9.500000,76.000000,3,1,10,59,1,1
1732376,140.0,2.0,28,76,-9.500000,76.000000,3,1,11,60,2,1
1732377,97.0,2.0,28,76,-9.500000,76.000000,3,1,9,58,3,1
1732378,52.0,2.0,28,76,-9.500000,76.000000,3,1,10,45,1,1


In [None]:


output_name = "play_outcome"
input_names = [
    # 'time_left_in_game',
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    # 'kick_off',
    # 'punt',
    # 'point_after_kick',
    # 'two_point_attempt',
    # 'field_goal_attempt',
    # 'from_scrimmage',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
]
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name)
cv=GroupKFold(n_splits=3)

rf_grid = {
    "n_estimators": [200],
    "max_depth": [15],
}
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
rf_play_outcome_search_model = RandomForestClassifier(verbose=100, n_jobs=-1, random_state=1)
# search_rf_play_outcome = RandomizedSearchCV(rf_play_outcome_search_model, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100, scoring=log_loss_scorer)
# search_rf_play_outcome.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_play_outcome, open(os.path.join(root_dir, 'models/search_rf_play_outcome.p'), 'wb'))
search_rf_play_outcome = pickle.load(open(os.path.join(root_dir, "models/search_rf_play_outcome.p"), 'rb'))

mlp_grid = {
    'hidden_layer_sizes': [(100,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001],
    'learning_rate': ['constant'],
}
mlp_grid = {
    'hidden_layer_sizes': [(50,50,50), (10,30,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name, normalize=True)

mlp_play_outcome_search_model = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_play_outcome_normalized_new_hpo = GridSearchCV(mlp_play_outcome_search_model, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer)
# search_mlp_play_outcome_normalized_new_hpo.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_play_outcome_normalized_new_hpo, open(os.path.join(root_dir, 'models/search_mlp_play_outcome_normalized_new_hpo.p'), 'wb'))
search_mlp_play_outcome_normalized_new_hpo = pickle.load(open(os.path.join(root_dir, "models/search_mlp_play_outcome_normalized_new_hpo.p"), 'rb'))
# os.system('say "done"')
# print(search_rf_play_outcome.best_score_)
# print(search_mlp_play_outcome.best_score_)


In [None]:
output_name = "drive_outcome_desc_basic"
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)


X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name)


cv=GroupKFold(n_splits=3)
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}
# rf_grid = {
#     "n_estimators": [200],
#     "max_depth": [15],
# }

log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
rf_drive_outcome_search_model = RandomForestClassifier(verbose=100, n_jobs=-1, random_state=1)
# search_rf_drive_outcome = RandomizedSearchCV(rf_drive_outcome_search_model, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100, scoring=log_loss_scorer)
# search_rf_drive_outcome.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_drive_outcome, open(os.path.join(root_dir, 'models/search_rf_drive_outcome.p'), 'wb'))
search_rf_drive_outcome = pickle.load(open(os.path.join(root_dir, "models/search_rf_drive_outcome.p"), 'rb'))

#     'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,100,100)],
mlp_grid = {
    'hidden_layer_sizes': [(50,50,50), (10,30,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# mlp_grid = {
#     'hidden_layer_sizes': [(100,)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.0001],
#     'learning_rate': ['constant'],
# }

X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name, normalize=True)


mlp_drive_outcome_search_model = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_drive_outcome_normalized_new_hpo = GridSearchCV(mlp_drive_outcome_search_model, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer)
# search_mlp_drive_outcome_normalized_new_hpo.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_drive_outcome_normalized_new_hpo, open(os.path.join(root_dir, 'models/search_mlp_drive_outcome_normalized_new_hpo.p'), 'wb'))
search_mlp_drive_outcome_normalized_new_hpo = pickle.load(open(os.path.join(root_dir, "models/search_mlp_drive_outcome_normalized_new_hpo.p"), 'rb'))

# os.system("say 'done'")
# print(search_rf_drive_outcome.best_score_)


# print(search_mlp_drive_outcome_normalized_new_hpo.best_score_)



In [None]:
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df[input_names].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)

search_rf_play_outcome.best_estimator_.verbose = 0
search_rf_drive_outcome.best_estimator_.verbose = 0

search_rf_play_class_names = ["search_rf_play_" + x for x in search_rf_play_outcome.classes_]
search_rf_drive_class_names = ["search_rf_drive_" + x for x in search_rf_drive_outcome.classes_]
model_df[search_rf_play_class_names] = pd.DataFrame(search_rf_play_outcome.predict_proba(model_df[mask_model][input_names]), index=model_df[mask_model].index).fillna(0)
model_df[search_rf_play_class_names] = model_df[search_rf_play_class_names].fillna(0)
model_df[search_rf_drive_class_names] = pd.DataFrame(search_rf_drive_outcome.predict_proba(model_df[mask_model][input_names]), index=model_df[mask_model].index).fillna(0)
model_df[search_rf_drive_class_names] = model_df[search_rf_drive_class_names].fillna(0)

search_mlp_play_class_names = ["search_mlp_play_" + x for x in search_mlp_play_outcome_normalized_new_hpo.classes_]
search_mlp_drive_class_names = ["search_mlp_drive_" + x for x in search_mlp_drive_outcome_normalized_new_hpo.classes_]
# model_df[search_mlp_play_class_names] = pd.DataFrame(search_mlp_play_outcome.predict_proba(model_df[mask_model][input_names]), index=model_df[mask_model].index).fillna(0)
# model_df[search_mlp_play_class_names] = model_df[search_mlp_play_class_names].fillna(0)
# model_df[search_mlp_drive_class_names] = pd.DataFrame(search_mlp_drive_outcome.predict_proba(model_df[mask_model][input_names]), index=model_df[mask_model].index).fillna(0)
# model_df[search_mlp_drive_class_names] = model_df[search_mlp_drive_class_names].fillna(0)


anchor_df = model_df[mask_model&(model_df["season"]<2020)&(model_df.play_counts==1)&(model_df.event_id.isin(scrimmage_plays_we_want))]
normalized_model_df = normalize_df(model_df[mask_model][input_names], anchor_df)

model_df[search_mlp_play_class_names] = pd.DataFrame(search_mlp_play_outcome_normalized_new_hpo.predict_proba(normalized_model_df), index=model_df[mask_model].index).fillna(0)
model_df[search_mlp_play_class_names] = model_df[search_mlp_play_class_names].fillna(0)
model_df[search_mlp_drive_class_names] = pd.DataFrame(search_mlp_drive_outcome_normalized_new_hpo.predict_proba(normalized_model_df), index=model_df[mask_model].index).fillna(0)
model_df[search_mlp_drive_class_names] = model_df[search_mlp_drive_class_names].fillna(0)




model_df


In [None]:
model_df["end_of_regulation_score_diff_change_clipped"] = np.clip(model_df["end_of_regulation_score_diff_change"], -35, 35)
model_df["point_after_play"] = np.where(model_df["point_after_kick"] + model_df["two_point_attempt"]==1, 1, 0)
search_rf_play_class_names_home = [x + "_home" for x in search_rf_play_class_names]
search_rf_play_class_names_away = [x + "_away" for x in search_rf_play_class_names]
search_rf_drive_class_names_home = [x + "_home" for x in search_rf_drive_class_names]
search_rf_drive_class_names_away = [x + "_away" for x in search_rf_drive_class_names]
model_df[search_rf_play_class_names_home] = model_df[search_rf_play_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_rf_play_class_names_away] = model_df[search_rf_play_class_names].where(model_df.home_team_has_ball==0, 0)
model_df[search_rf_drive_class_names_home] = model_df[search_rf_drive_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_rf_drive_class_names_away] = model_df[search_rf_drive_class_names].where(model_df.home_team_has_ball==0, 0)


search_mlp_play_class_names_home = [x + "_home" for x in search_mlp_play_class_names]
search_mlp_play_class_names_away = [x + "_away" for x in search_mlp_play_class_names]
search_mlp_drive_class_names_home = [x + "_home" for x in search_mlp_drive_class_names]
search_mlp_drive_class_names_away = [x + "_away" for x in search_mlp_drive_class_names]
model_df[search_mlp_play_class_names_home] = model_df[search_mlp_play_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_mlp_play_class_names_away] = model_df[search_mlp_play_class_names].where(model_df.home_team_has_ball==0, 0)
model_df[search_mlp_drive_class_names_home] = model_df[search_mlp_drive_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_mlp_drive_class_names_away] = model_df[search_mlp_drive_class_names].where(model_df.home_team_has_ball==0, 0)

In [None]:
model_df["end_of_regulation_score_diff_change_clipped"] = np.clip(model_df["end_of_regulation_score_diff_change"], -35, 35)
model_df["point_after_play"] = np.where(model_df["point_after_kick"] + model_df["two_point_attempt"]==1, 1, 0)
search_rf_play_class_names_home = [x + "_home" for x in search_rf_play_class_names]
search_rf_play_class_names_away = [x + "_away" for x in search_rf_play_class_names]
search_rf_drive_class_names_home = [x + "_home" for x in search_rf_drive_class_names]
search_rf_drive_class_names_away = [x + "_away" for x in search_rf_drive_class_names]
model_df[search_rf_play_class_names_home] = model_df[search_rf_play_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_rf_play_class_names_away] = model_df[search_rf_play_class_names].where(model_df.home_team_has_ball==0, 0)
model_df[search_rf_drive_class_names_home] = model_df[search_rf_drive_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_rf_drive_class_names_away] = model_df[search_rf_drive_class_names].where(model_df.home_team_has_ball==0, 0)


search_mlp_play_class_names_home = [x + "_home" for x in search_mlp_play_class_names]
search_mlp_play_class_names_away = [x + "_away" for x in search_mlp_play_class_names]
search_mlp_drive_class_names_home = [x + "_home" for x in search_mlp_drive_class_names]
search_mlp_drive_class_names_away = [x + "_away" for x in search_mlp_drive_class_names]
model_df[search_mlp_play_class_names_home] = model_df[search_mlp_play_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_mlp_play_class_names_away] = model_df[search_mlp_play_class_names].where(model_df.home_team_has_ball==0, 0)
model_df[search_mlp_drive_class_names_home] = model_df[search_mlp_drive_class_names].where(model_df.home_team_has_ball==1, 0)
model_df[search_mlp_drive_class_names_away] = model_df[search_mlp_drive_class_names].where(model_df.home_team_has_ball==0, 0)


input_names_score_pred = [item for item in input_names if item not in ["punt", "field_goal_attempt"]] + ["kick_off", "point_after_play"] + ["search_rf_play_first_down_home", "search_rf_play_first_down_away"] + search_rf_drive_class_names_home[1:] + search_rf_drive_class_names_away[1:]
output_name = "end_of_regulation_score_diff_change_clipped"
mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred+[output_name]].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name)


cv=GroupKFold(n_splits=3)
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    # "n_estimators": [1],
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}
# rf_grid = {
#     # "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
#     "n_estimators": [200],
#     # "max_features": ["auto", "sqrt"],
#     # "max_depth": np.linspace(5, 15, num=11, dtype=int),
#     "max_depth": [15],
#     # "min_samples_split": [2, 5, 10],
#     # "min_samples_leaf": [1, 2, 4],
#     # "bootstrap": [True, False],
# }
mlp_grid = {
    'hidden_layer_sizes': [(50,50,50), (10,30,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# mlp_grid = {
#     'hidden_layer_sizes': [(100,)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.0001],
#     'learning_rate': ['constant'],
# }

score_diff_change_list_clipped = list(model_df.end_of_regulation_score_diff_change_clipped.drop_duplicates().sort_values())

log_loss_scorer_score_diff = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=score_diff_change_list_clipped)
rf_score_diff_search_base = RandomForestClassifier(verbose=100, n_jobs=-1, random_state=1)
# search_rf_score_diff_clipped_rf_drive_preds = RandomizedSearchCV(rf_score_diff_search_base, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_rf_score_diff_clipped_rf_drive_preds.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_score_diff_clipped_rf_drive_preds, open(os.path.join(root_dir, 'models/search_rf_score_diff_clipped_rf_drive_preds.p'), 'wb'))
search_rf_score_diff_clipped_rf_drive_preds = pickle.load(open(os.path.join(root_dir, "models/search_rf_score_diff_clipped_rf_drive_preds.p"), 'rb'))

X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name, normalize=True)
mlp_score_diff_search_base = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_score_diff_clipped_rf_drive_preds = GridSearchCV(mlp_score_diff_search_base, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_mlp_score_diff_clipped_rf_drive_preds.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_score_diff_clipped_rf_drive_preds, open(os.path.join(root_dir, 'models/search_mlp_score_diff_clipped_rf_drive_preds.p'), 'wb'))
search_mlp_score_diff_clipped_rf_drive_preds = pickle.load(open(os.path.join(root_dir, "models/search_mlp_score_diff_clipped_rf_drive_preds.p"), 'rb'))
# os.system("say 'done'")
print(search_mlp_score_diff_clipped_rf_drive_preds.best_score_)
print(search_rf_score_diff_clipped_rf_drive_preds.best_score_)

In [None]:
pd.set_option("display.max_columns", None)
X_train.head(30)

In [None]:

input_names_score_pred = [item for item in input_names if item not in ["punt", "field_goal_attempt"]] + ["kick_off", "point_after_play"] + ["search_mlp_play_first_down_home", "search_mlp_play_first_down_away"] + search_mlp_drive_class_names_home[1:] + search_mlp_drive_class_names_away[1:]
output_name = "end_of_regulation_score_diff_change_clipped"

mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred+[output_name]].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name)


cv=GroupKFold(n_splits=3)
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    # "n_estimators": [1],
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}
# rf_grid = {
#     # "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
#     "n_estimators": [200],
#     # "max_features": ["auto", "sqrt"],
#     # "max_depth": np.linspace(5, 15, num=11, dtype=int),
#     "max_depth": [15],
#     # "min_samples_split": [2, 5, 10],
#     # "min_samples_leaf": [1, 2, 4],
#     # "bootstrap": [True, False],
# }
mlp_grid = {
    'hidden_layer_sizes': [(50,50,50), (10,30,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# mlp_grid = {
#     'hidden_layer_sizes': [(100,)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.0001],
#     'learning_rate': ['constant'],
# }

score_diff_change_list_clipped = list(model_df.end_of_regulation_score_diff_change_clipped.drop_duplicates().sort_values())

log_loss_scorer_score_diff = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=score_diff_change_list_clipped)
rf_score_diff_search_base = RandomForestClassifier(verbose=100, n_jobs=1, random_state=1)
# search_rf_score_diff_clipped_mlp_drive_preds = RandomizedSearchCV(rf_score_diff_search_base, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=-1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_rf_score_diff_clipped_mlp_drive_preds.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_score_diff_clipped_mlp_drive_preds, open(os.path.join(root_dir, 'models/search_rf_score_diff_clipped_mlp_drive_preds.p'), 'wb'))
search_rf_score_diff_clipped_mlp_drive_preds = pickle.load(open(os.path.join(root_dir, "models/search_rf_score_diff_clipped_mlp_drive_preds.p"), 'rb'))


X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name, normalize=True)

mlp_score_diff_search_base = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_score_diff_clipped_mlp_drive_preds = GridSearchCV(mlp_score_diff_search_base, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_mlp_score_diff_clipped_mlp_drive_preds.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_score_diff_clipped_mlp_drive_preds, open(os.path.join(root_dir, 'models/search_mlp_score_diff_clipped_mlp_drive_preds.p'), 'wb'))
search_mlp_score_diff_clipped_mlp_drive_preds = pickle.load(open(os.path.join(root_dir, "models/search_mlp_score_diff_clipped_mlp_drive_preds.p"), 'rb'))


In [None]:
# os.system("say 'done'")

for key in search_mlp_score_diff_clipped_mlp_drive_preds.best_params_.keys():
    mlp_grid[key] = [search_mlp_score_diff_clipped_mlp_drive_preds.best_params_[key]]
mlp_grid

In [None]:
for x in search_mlp_score_diff_clipped_mlp_drive_preds.best_params_:
    print([search_mlp_score_diff_clipped_mlp_drive_preds.best_params_[x]])

In [None]:

input_names_score_pred = [
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'home_vegas_score_pred_weighted',
    'away_vegas_score_pred_weighted',
    # 'cur_spread',
    # 'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
    'kick_off',
    # 'point_after_kick',
    # 'two_point_attempt',
    'point_after_play',
    'search_rf_play_first_down_home',
    'search_rf_play_first_down_away',
    'search_rf_drive_field_goal_made_home',
    'search_rf_drive_field_goal_missed_home',
    'search_rf_drive_punt_home',
    'search_rf_drive_safety_home',
    'search_rf_drive_touch_down_home',
    'search_rf_drive_turnover_home',
    'search_rf_drive_turnover_on_downs_home',
    'search_rf_drive_field_goal_made_away',
    'search_rf_drive_field_goal_missed_away',
    'search_rf_drive_punt_away',
    'search_rf_drive_safety_away',
    'search_rf_drive_touch_down_away',
    'search_rf_drive_turnover_away',
    'search_rf_drive_turnover_on_downs_away'
 ]
output_name = "end_of_regulation_score_diff_change_clipped"

mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred+[output_name]].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name)


cv=GroupKFold(n_splits=3)
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    # "n_estimators": [1],
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

# rf_grid = {
#     # "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
#     "n_estimators": [200],
#     # "max_features": ["auto", "sqrt"],
#     # "max_depth": np.linspace(5, 15, num=11, dtype=int),
#     "max_depth": [15],
#     # "min_samples_split": [2, 5, 10],
#     # "min_samples_leaf": [1, 2, 4],
#     # "bootstrap": [True, False],
# }

# mlp_grid = {
#     'hidden_layer_sizes': [(100,)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.0001],
#     'learning_rate': ['constant'],
# }
for key in search_rf_score_diff_clipped_rf_drive_preds.best_params_.keys():
    rf_grid[key] = [search_rf_score_diff_clipped_rf_drive_preds.best_params_[key]]
for key in search_mlp_score_diff_clipped_rf_drive_preds.best_params_.keys():
    mlp_grid[key] = [search_mlp_score_diff_clipped_rf_drive_preds.best_params_[key]]

score_diff_change_list_clipped = list(model_df.end_of_regulation_score_diff_change_clipped.drop_duplicates().sort_values())

log_loss_scorer_score_diff = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=score_diff_change_list_clipped)
rf_score_diff_search_base = RandomForestClassifier(verbose=100, n_jobs=-1, random_state=1)
# search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted = RandomizedSearchCV(rf_score_diff_search_base, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted, open(os.path.join(root_dir, 'models/search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.p'), 'wb'))
search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted = pickle.load(open(os.path.join(root_dir, "models/search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.p"), 'rb'))


X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name, normalize=True)

mlp_score_diff_search_base = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted = GridSearchCV(mlp_score_diff_search_base, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted, open(os.path.join(root_dir, 'models/search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted.p'), 'wb'))
search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted = pickle.load(open(os.path.join(root_dir, "models/search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted.p"), 'rb'))

print(search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.best_score_)
print(search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted.best_score_)
# os.system("say 'done'")




In [None]:
rf_grid

In [None]:
input_names_score_pred = [
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'home_vegas_score_pred_weighted',
    'away_vegas_score_pred_weighted',
    # 'cur_spread',
    # 'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
    'kick_off',
    'point_after_play',
    'point_after_kick',
    'two_point_attempt',
    'search_mlp_play_first_down_home',
    'search_mlp_play_first_down_away',
    'search_mlp_drive_field_goal_made_home',
    'search_mlp_drive_field_goal_missed_home',
    'search_mlp_drive_punt_home',
    'search_mlp_drive_safety_home',
    'search_mlp_drive_touch_down_home',
    'search_mlp_drive_turnover_home',
    'search_mlp_drive_turnover_on_downs_home',
    'search_mlp_drive_field_goal_made_away',
    'search_mlp_drive_field_goal_missed_away',
    'search_mlp_drive_punt_away',
    'search_mlp_drive_safety_away',
    'search_mlp_drive_touch_down_away',
    'search_mlp_drive_turnover_away',
    'search_mlp_drive_turnover_on_downs_away'
 ]
output_name = "end_of_regulation_score_diff_change_clipped"

mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred+[output_name]].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name)


cv=GroupKFold(n_splits=3)
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    # "n_estimators": [1],
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}
# rf_grid = {
#     # "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
#     "n_estimators": [200],
#     # "max_features": ["auto", "sqrt"],
#     # "max_depth": np.linspace(5, 15, num=11, dtype=int),
#     "max_depth": [15],
#     # "min_samples_split": [2, 5, 10],
#     # "min_samples_leaf": [1, 2, 4],
#     # "bootstrap": [True, False],
# }
mlp_grid = {
    'hidden_layer_sizes': [(50,50,50), (10,30,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# mlp_grid = {
#     'hidden_layer_sizes': [(100,)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.0001],
#     'learning_rate': ['constant'],
# }
for key in search_rf_score_diff_clipped_mlp_drive_preds.best_params_.keys():
    rf_grid[key] = [search_rf_score_diff_clipped_mlp_drive_preds.best_params_[key]]
for key in search_mlp_score_diff_clipped_mlp_drive_preds.best_params_.keys():
    mlp_grid[key] = [search_mlp_score_diff_clipped_mlp_drive_preds.best_params_[key]]

score_diff_change_list_clipped = list(model_df.end_of_regulation_score_diff_change_clipped.drop_duplicates().sort_values())

log_loss_scorer_score_diff = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=score_diff_change_list_clipped)
rf_score_diff_search_base = RandomForestClassifier(verbose=100, n_jobs=-1, random_state=1)

# search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted = RandomizedSearchCV(rf_score_diff_search_base, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted, open(os.path.join(root_dir, 'models/search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.p'), 'wb'))
search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted = pickle.load(open(os.path.join(root_dir, "models/search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.p"), 'rb'))


X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name, normalize=True)

mlp_score_diff_search_base = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted = GridSearchCV(mlp_score_diff_search_base, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer_score_diff)
# search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted, open(os.path.join(root_dir, 'models/search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted.p'), 'wb'))
search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted = pickle.load(open(os.path.join(root_dir, "models/search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted.p"), 'rb'))

print(search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.best_score_)
print(search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted.best_score_)
# os.system("say 'done'")




In [None]:
search_mlp_score_diff_clipped_mlp_drive_preds.best_params_.keys()

In [None]:
score_diff_change_list = list(model_df.end_of_regulation_score_diff_change.drop_duplicates().sort_values())


score_diff_change_list_clipped = list(model_df.end_of_regulation_score_diff_change_clipped.drop_duplicates().sort_values())


search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.best_estimator_.verbose = 0
rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds = pd.DataFrame(search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.predict_proba(model_df[mask_model][search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.feature_names_in_]), index=model_df[mask_model].index)
score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix = pd.DataFrame(np.zeros(rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.shape), index=rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.index)

for column in score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.columns:
    score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]

model_df["xhome_win_rf_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.T>0], axis=0)
model_df["xovertime_rf_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.T==0], axis=0)
model_df["xaway_win_rf_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_rf_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix * rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds, axis=1)
search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.best_estimator_.verbose = 0
rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds = pd.DataFrame(search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.predict_proba(model_df[mask_model][search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.feature_names_in_]), index=model_df[mask_model].index)
score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix = pd.DataFrame(np.zeros(rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.shape), index=rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.index)

for column in score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.columns:
    score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]

model_df["xhome_win_rf_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.T>0], axis=0)
model_df["xovertime_rf_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.T==0], axis=0)
model_df["xaway_win_rf_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_rf_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix * rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds, axis=1)


normalized_score_pred_df = normalize_df(model_df[mask_model][search_rf_score_diff_clipped_rf_drive_preds_vegas_adjusted.feature_names_in_], model_df[mask_model & (model_df.season<2020)])
mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds = pd.DataFrame(search_mlp_score_diff_clipped_rf_drive_preds_vegas_adjusted.predict_proba(normalized_score_pred_df), index=model_df[mask_model].index)
score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix = pd.DataFrame(np.zeros(mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.shape), index=mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.index)

for column in score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.columns:
    score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]



model_df["xhome_win_mlp_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.T>0], axis=0)
model_df["xovertime_mlp_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.T==0], axis=0)
model_df["xaway_win_mlp_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds_vegas_adjusted"] = np.sum(score_diff_clipped_rf_drive_preds_vegas_adjusted_matrix * mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds, axis=1)
search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.best_estimator_.verbose = 0


normalized_score_pred_df = normalize_df(model_df[mask_model][search_rf_score_diff_clipped_mlp_drive_preds_vegas_adjusted.feature_names_in_], model_df[mask_model & (model_df.season<2020)])

mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds = pd.DataFrame(search_mlp_score_diff_clipped_mlp_drive_preds_vegas_adjusted.predict_proba(normalized_score_pred_df), index=model_df[mask_model].index)
score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix = pd.DataFrame(np.zeros(mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.shape), index=mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.index)

for column in score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.columns:
    score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]

model_df["xhome_win_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.T>0], axis=0)
model_df["xovertime_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.T==0], axis=0)
model_df["xaway_win_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds.T[score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"] = np.sum(score_diff_clipped_mlp_drive_preds_vegas_adjusted_matrix * mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds, axis=1)



# os.system("say 'done'")




In [None]:
score_diff_change_list = list(model_df.end_of_regulation_score_diff_change.drop_duplicates().sort_values())


score_diff_change_list_clipped = list(model_df.end_of_regulation_score_diff_change_clipped.drop_duplicates().sort_values())


search_rf_score_diff_clipped_rf_drive_preds.best_estimator_.verbose = 0
rf_search_score_diff_clipped_rf_drive_preds_preds = pd.DataFrame(search_rf_score_diff_clipped_rf_drive_preds.predict_proba(model_df[mask_model][search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_]), index=model_df[mask_model].index)
score_diff_clipped_rf_drive_preds_matrix = pd.DataFrame(np.zeros(rf_search_score_diff_clipped_rf_drive_preds_preds.shape), index=rf_search_score_diff_clipped_rf_drive_preds_preds.index)

for column in score_diff_clipped_rf_drive_preds_matrix.columns:
    score_diff_clipped_rf_drive_preds_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]

model_df["xhome_win_rf_search_clipped_rf_drive_preds"] = np.sum(rf_search_score_diff_clipped_rf_drive_preds_preds.T[score_diff_clipped_rf_drive_preds_matrix.T>0], axis=0)
model_df["xovertime_rf_search_clipped_rf_drive_preds"] = np.sum(rf_search_score_diff_clipped_rf_drive_preds_preds.T[score_diff_clipped_rf_drive_preds_matrix.T==0], axis=0)
model_df["xaway_win_rf_search_clipped_rf_drive_preds"] = np.sum(rf_search_score_diff_clipped_rf_drive_preds_preds.T[score_diff_clipped_rf_drive_preds_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_rf_search_clipped_rf_drive_preds"] = np.sum(score_diff_clipped_rf_drive_preds_matrix * rf_search_score_diff_clipped_rf_drive_preds_preds, axis=1)
search_rf_score_diff_clipped_mlp_drive_preds.best_estimator_.verbose = 0
rf_search_score_diff_clipped_mlp_drive_preds_preds = pd.DataFrame(search_rf_score_diff_clipped_mlp_drive_preds.predict_proba(model_df[mask_model][search_rf_score_diff_clipped_mlp_drive_preds.feature_names_in_]), index=model_df[mask_model].index)
score_diff_clipped_mlp_drive_preds_matrix = pd.DataFrame(np.zeros(rf_search_score_diff_clipped_mlp_drive_preds_preds.shape), index=rf_search_score_diff_clipped_mlp_drive_preds_preds.index)

for column in score_diff_clipped_mlp_drive_preds_matrix.columns:
    score_diff_clipped_mlp_drive_preds_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]

model_df["xhome_win_rf_search_clipped_mlp_drive_preds"] = np.sum(rf_search_score_diff_clipped_mlp_drive_preds_preds.T[score_diff_clipped_mlp_drive_preds_matrix.T>0], axis=0)
model_df["xovertime_rf_search_clipped_mlp_drive_preds"] = np.sum(rf_search_score_diff_clipped_mlp_drive_preds_preds.T[score_diff_clipped_mlp_drive_preds_matrix.T==0], axis=0)
model_df["xaway_win_rf_search_clipped_mlp_drive_preds"] = np.sum(rf_search_score_diff_clipped_mlp_drive_preds_preds.T[score_diff_clipped_mlp_drive_preds_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_rf_search_clipped_mlp_drive_preds"] = np.sum(score_diff_clipped_mlp_drive_preds_matrix * rf_search_score_diff_clipped_mlp_drive_preds_preds, axis=1)


normalized_score_pred_df = normalize_df(model_df[mask_model][search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], model_df[mask_model & (model_df.season<2020)])
mlp_search_score_diff_clipped_rf_drive_preds_preds = pd.DataFrame(search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(normalized_score_pred_df), index=model_df[mask_model].index)
score_diff_clipped_rf_drive_preds_matrix = pd.DataFrame(np.zeros(mlp_search_score_diff_clipped_rf_drive_preds_preds.shape), index=mlp_search_score_diff_clipped_rf_drive_preds_preds.index)

for column in score_diff_clipped_rf_drive_preds_matrix.columns:
    score_diff_clipped_rf_drive_preds_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]



model_df["xhome_win_mlp_search_clipped_rf_drive_preds"] = np.sum(mlp_search_score_diff_clipped_rf_drive_preds_preds.T[score_diff_clipped_rf_drive_preds_matrix.T>0], axis=0)
model_df["xovertime_mlp_search_clipped_rf_drive_preds"] = np.sum(mlp_search_score_diff_clipped_rf_drive_preds_preds.T[score_diff_clipped_rf_drive_preds_matrix.T==0], axis=0)
model_df["xaway_win_mlp_search_clipped_rf_drive_preds"] = np.sum(mlp_search_score_diff_clipped_rf_drive_preds_preds.T[score_diff_clipped_rf_drive_preds_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds"] = np.sum(score_diff_clipped_rf_drive_preds_matrix * mlp_search_score_diff_clipped_rf_drive_preds_preds, axis=1)
search_rf_score_diff_clipped_mlp_drive_preds.best_estimator_.verbose = 0


normalized_score_pred_df = normalize_df(model_df[mask_model][search_rf_score_diff_clipped_mlp_drive_preds.feature_names_in_], model_df[mask_model & (model_df.season<2020)])

mlp_search_score_diff_clipped_mlp_drive_preds_preds = pd.DataFrame(search_mlp_score_diff_clipped_mlp_drive_preds.predict_proba(normalized_score_pred_df), index=model_df[mask_model].index)
score_diff_clipped_mlp_drive_preds_matrix = pd.DataFrame(np.zeros(mlp_search_score_diff_clipped_mlp_drive_preds_preds.shape), index=mlp_search_score_diff_clipped_mlp_drive_preds_preds.index)

for column in score_diff_clipped_mlp_drive_preds_matrix.columns:
    score_diff_clipped_mlp_drive_preds_matrix[column] = score_diff_change_list_clipped[column] + model_df["current_score_diff"]

model_df["xhome_win_mlp_search_clipped_mlp_drive_preds"] = np.sum(mlp_search_score_diff_clipped_mlp_drive_preds_preds.T[score_diff_clipped_mlp_drive_preds_matrix.T>0], axis=0)
model_df["xovertime_mlp_search_clipped_mlp_drive_preds"] = np.sum(mlp_search_score_diff_clipped_mlp_drive_preds_preds.T[score_diff_clipped_mlp_drive_preds_matrix.T==0], axis=0)
model_df["xaway_win_mlp_search_clipped_mlp_drive_preds"] = np.sum(mlp_search_score_diff_clipped_mlp_drive_preds_preds.T[score_diff_clipped_mlp_drive_preds_matrix.T<0], axis=0)
model_df["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds"] = np.sum(score_diff_clipped_mlp_drive_preds_matrix * mlp_search_score_diff_clipped_mlp_drive_preds_preds, axis=1)



# os.system("say 'done'")




In [None]:
# rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds
naive_search_score_diff_clipped_cols = ["score_diff_dummy_" + str(x) for x in score_diff_change_list_clipped]
# naive_search_score_diff_clipped_cols


mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)

naive_score_change_dummies = pd.get_dummies(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], columns=naive_search_score_diff_clipped_cols)
naive_score_change_preds = pd.DataFrame(np.zeros((len(model_df[mask_model].index), len(naive_score_change_dummies.columns))), index= model_df[mask_model].index, columns=naive_score_change_dummies.columns)
for col in naive_score_change_preds.columns:
    naive_score_change_preds[col] = np.mean(naive_score_change_dummies)[col]

# pd.DataFrame(np.mean(naive_score_change_preds)).to_clipboard()
print("naive: ", log_loss(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], naive_score_change_preds))
print("search_mlp mlp_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season<2020)]))
print("search_mlp rf_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season<2020)]))
print("search_rf mlp_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season<2020)]))
print("search_rf rf_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season<2020)]))


# naive_score_change_preds

In [None]:
# rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds
naive_search_score_diff_clipped_cols = ["score_diff_dummy_" + str(x) for x in score_diff_change_list_clipped]
# naive_search_score_diff_clipped_cols


mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)

naive_score_change_dummies = pd.get_dummies(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], columns=naive_search_score_diff_clipped_cols)
naive_score_change_preds = pd.DataFrame(np.zeros((len(model_df[mask_model].index), len(naive_score_change_dummies.columns))), index= model_df[mask_model].index, columns=naive_score_change_dummies.columns)
for col in naive_score_change_preds.columns:
    naive_score_change_preds[col] = np.mean(naive_score_change_dummies)[col]

# pd.DataFrame(np.mean(naive_score_change_preds)).to_clipboard()
print("naive: ", log_loss(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], naive_score_change_preds))
print("search_mlp mlp_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_mlp rf_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_rf mlp_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_rf rf_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))


# naive_score_change_preds

In [None]:
# rf_search_score_diff_clipped_mlp_drive_preds_preds
naive_search_score_diff_clipped_cols = ["score_diff_dummy_" + str(x) for x in score_diff_change_list_clipped]
# naive_search_score_diff_clipped_cols


mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)

naive_score_change_dummies = pd.get_dummies(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], columns=naive_search_score_diff_clipped_cols)
naive_score_change_preds = pd.DataFrame(np.zeros((len(model_df[mask_model].index), len(naive_score_change_dummies.columns))), index= model_df[mask_model].index, columns=naive_score_change_dummies.columns)
for col in naive_score_change_preds.columns:
    naive_score_change_preds[col] = np.mean(naive_score_change_dummies)[col]

# pd.DataFrame(np.mean(naive_score_change_preds)).to_clipboard()
print("naive: ", log_loss(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], naive_score_change_preds))
print("search_mlp mlp_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_mlp_drive_preds_preds[(model_df[mask_model].season<2020)]))
print("search_mlp rf_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_rf_drive_preds_preds[(model_df[mask_model].season<2020)]))
print("search_rf mlp_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_mlp_drive_preds_preds[(model_df[mask_model].season<2020)]))
print("search_rf rf_drive: ", log_loss(model_df[mask_model&(model_df.season<2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_rf_drive_preds_preds[(model_df[mask_model].season<2020)]))


# naive_score_change_preds

In [None]:
# rf_search_score_diff_clipped_mlp_drive_preds_preds
naive_search_score_diff_clipped_cols = ["score_diff_dummy_" + str(x) for x in score_diff_change_list_clipped]
# naive_search_score_diff_clipped_cols


mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)

naive_score_change_dummies = pd.get_dummies(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], columns=naive_search_score_diff_clipped_cols)
naive_score_change_preds = pd.DataFrame(np.zeros((len(model_df[mask_model].index), len(naive_score_change_dummies.columns))), index= model_df[mask_model].index, columns=naive_score_change_dummies.columns)
for col in naive_score_change_preds.columns:
    naive_score_change_preds[col] = np.mean(naive_score_change_dummies)[col]

# pd.DataFrame(np.mean(naive_score_change_preds)).to_clipboard()
print("naive: ", log_loss(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], naive_score_change_preds))
print("search_mlp mlp_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_mlp_drive_preds_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_mlp rf_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_rf_drive_preds_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_rf mlp_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_mlp_drive_preds_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_rf rf_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_rf_drive_preds_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))


# naive_score_change_preds

In [None]:
# rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds
naive_search_score_diff_clipped_cols = ["score_diff_dummy_" + str(x) for x in score_diff_change_list_clipped]
# naive_search_score_diff_clipped_cols


mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred].notna().all(axis=1))&
    ~(model_df.event_id.isin([12,57,58,13]))&
    (model_df["overtime"]==0)
)

naive_score_change_dummies = pd.get_dummies(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], columns=naive_search_score_diff_clipped_cols)
naive_score_change_preds = pd.DataFrame(np.zeros((len(model_df[mask_model].index), len(naive_score_change_dummies.columns))), index= model_df[mask_model].index, columns=naive_score_change_dummies.columns)
for col in naive_score_change_preds.columns:
    naive_score_change_preds[col] = np.mean(naive_score_change_dummies)[col]

# pd.DataFrame(np.mean(naive_score_change_preds)).to_clipboard()
print("naive: ", log_loss(model_df[mask_model]["end_of_regulation_score_diff_change_clipped"], naive_score_change_preds))
print("search_mlp mlp_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_mlp rf_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], mlp_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_rf mlp_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_mlp_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))
print("search_rf rf_drive: ", log_loss(model_df[mask_model&(model_df.season==2020)]["end_of_regulation_score_diff_change_clipped"], rf_search_score_diff_clipped_rf_drive_preds_vegas_adjusted_preds[(model_df[mask_model].season==2020)], labels=score_diff_change_list_clipped))


# naive_score_change_preds

In [None]:
model_df["home_team_win_in_regulation"] = np.where(model_df["end_of_regulation_score_diff"]>0, 1, 0)
model_df["away_team_win_in_regulation"] = np.where(model_df["end_of_regulation_score_diff"]<0, 1, 0)
model_df["tie_at_end_of_regulation"] = np.where(model_df["end_of_regulation_score_diff"]==0, 1, 0)
calb_df = model_df[(model_df["season"]==2020)&~(model_df.xhome_win_mlp_search_clipped_rf_drive_preds.isna())]
# calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_score_diff_mlp"], 0, 1), n_bins=10, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
bins = 20

calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_mlp_search_clipped_rf_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_mlp_search_clipped_mlp_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_rf_search_clipped_rf_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_rf_search_clipped_mlp_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
model_df["home_team_win_in_regulation"] = np.where(model_df["end_of_regulation_score_diff"]>0, 1, 0)
model_df["away_team_win_in_regulation"] = np.where(model_df["end_of_regulation_score_diff"]<0, 1, 0)
model_df["tie_at_end_of_regulation"] = np.where(model_df["end_of_regulation_score_diff"]==0, 1, 0)
calb_df = model_df[(model_df["season"]<2020)&~(model_df.xhome_win_mlp_search_clipped_rf_drive_preds_vegas_adjusted.isna())]
# calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_score_diff_mlp"], 0, 1), n_bins=10, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
bins = 20

calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_mlp_search_clipped_rf_drive_preds_vegas_adjusted"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_rf_search_clipped_rf_drive_preds_vegas_adjusted"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win_in_regulation"], np.clip(calb_df["xhome_win_rf_search_clipped_mlp_drive_preds_vegas_adjusted"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
# estimator = search_rf_score_diff_clipped.best_estimator_.estimators_[0]
# from sklearn.tree import export_graphviz
# export_graphviz(estimator, out_file='tree.dot', 
#                 feature_names = list(search_rf_score_diff_clipped.feature_names_in_),
#                 max_depth=3,
#                 class_names = list(search_rf_score_diff_clipped.classes_.astype(str)),
#                 rounded = True, proportion = False, 
#                 precision = 2, filled = True)

# import pydot

# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])


In [None]:
model_df["xend_of_regulation_score_diff_change_mlp_search_clipped_rf_drive_preds"] = model_df["xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds"] - model_df["current_score_diff"]
input_names_total_score_pred = input_names_score_pred + ["xend_of_regulation_score_diff_change_mlp_search_clipped_rf_drive_preds"]
model_df["end_of_regulation_score_total_diff_clipped"] = np.clip(model_df["end_of_regulation_score_total_diff"], 0, 83)
output_name = "end_of_regulation_score_total_diff_clipped"
mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_total_score_pred+[output_name]].notna().all(axis=1))&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_total_score_pred, output_name)


cv=GroupKFold(n_splits=3)
rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=10, dtype=int),
    "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(5, 15, num=11, dtype=int),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}
rf_grid = {
    "n_estimators": [400],
    "max_depth": [14],
    "min_samples_split": [10],
    "min_samples_leaf": [4],
}
rf_total_score_search_base = RandomForestClassifier(verbose=100, n_jobs=-1, random_state=1)
# search_rf_total_score = RandomizedSearchCV(rf_total_score_search_base, rf_grid, cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100, scoring=log_loss_scorer)
# search_rf_total_score.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_rf_total_score, open(os.path.join(root_dir, 'models/search_rf_total_score.p'), 'wb'))
search_rf_total_score = pickle.load(open(os.path.join(root_dir, "models/search_rf_total_score.p"), 'rb'))


mlp_grid = {
    'hidden_layer_sizes': [(50,50,50), (10,30,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
mlp_grid = {
    'hidden_layer_sizes': [(10,30,10)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001],
    'learning_rate': ['constant'],
}
# search_mlp_play_outcome_normalized = GridSearchCV(mlp_play_outcome_search_model, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer)

mlp_total_score_search_base = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# search_mlp_total_score = GridSearchCV(mlp_total_score_search_base, mlp_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer)
# search_mlp_total_score.fit(X_train,y_train,groups=group_train)
# pickle.dump(search_mlp_total_score, open(os.path.join(root_dir, 'models/search_mlp_total_score.p'), 'wb'))
search_mlp_total_score = pickle.load(open(os.path.join(root_dir, "models/search_mlp_total_score.p"), 'rb'))




# os.system("say 'done'")

In [None]:
search_rf_total_score.best_estimator_.verbose = 100
rf_search_score_total_preds = pd.DataFrame(search_rf_total_score.predict_proba(model_df[mask_model][search_rf_total_score.feature_names_in_]), index=model_df[mask_model].index)
score_total_matrix = pd.DataFrame(np.zeros(rf_search_score_total_preds.shape), index=rf_search_score_total_preds.index)
score_total_change_list = list(model_df[mask_model].end_of_regulation_score_total_diff_clipped.drop_duplicates().sort_values())

for column in score_total_matrix.columns:
    score_total_matrix[column] = score_total_change_list[column] + model_df["home_start_score"] + model_df["away_start_score"]

# model_df["xend_of_regulation_score_total_rf_search"] = np.sum(score_total_matrix * rf_search_score_total_preds, axis=1)
model_df["xend_of_regulation_score_total_rf_search"] = np.sum(score_total_matrix * rf_search_score_total_preds, axis=1)



mlp_search_score_total_preds = pd.DataFrame(search_mlp_total_score.predict_proba(model_df[mask_model][search_rf_total_score.feature_names_in_]), index=model_df[mask_model].index)
score_total_matrix = pd.DataFrame(np.zeros(mlp_search_score_total_preds.shape), index=mlp_search_score_total_preds.index)
score_total_change_list = list(model_df[mask_model].end_of_regulation_score_total_diff_clipped.drop_duplicates().sort_values())

for column in score_total_matrix.columns:
    score_total_matrix[column] = score_total_change_list[column] + model_df["home_start_score"] + model_df["away_start_score"]

# model_df["xend_of_regulation_score_total_mlp_search"] = np.sum(score_total_matrix * mlp_search_score_total_preds, axis=1)
model_df["xend_of_regulation_score_total_mlp_search"] = np.sum(score_total_matrix * mlp_search_score_total_preds, axis=1)
# os.system("say 'done'")



In [None]:
model_df["game_info"] = (
    model_df["home_team"]
    + " "
    + model_df["away_team"]
    + " "
    + model_df["game_date"].apply(lambda x: x.strftime("%Y-%m-%d"))
    + " "
    + model_df["season"].apply(str)
    + " ("
    + (model_df["game_code"]).apply(str)
    + ")"
)
def ordinaltg(n):
    return n.replace({1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th", 6: "6th"})

model_df["absolute_score_diff"] = abs(model_df["home_start_score"] - model_df["away_start_score"])

model_df["minutes"] = (model_df["play_start_time"] // 60).fillna(0).apply(int)
model_df["seconds"] = (model_df["play_start_time"] - (model_df["play_start_time"] // 60) * 60).fillna(0).apply(
    int
)
model_df["seconds_str"] = np.where(
    model_df["seconds"] >= 10, model_df["seconds"].apply(str), "0" + model_df["seconds"].apply(str)
)
model_df["time_str"] = model_df["minutes"].apply(str) + ":" + model_df["seconds_str"]

model_df["team_score_desc"] = np.where(
    model_df["home_team_has_ball"] == 1,
    np.where(
        model_df["home_start_score"] > model_df["away_start_score"],
        "Up by " + model_df["absolute_score_diff"].apply(str),
        np.where(
            model_df["home_start_score"] < model_df["away_start_score"],
            "Down by " + model_df["absolute_score_diff"].apply(str),
            "Tied",
        ),
    ),
    np.where(
        model_df["home_start_score"] < model_df["away_start_score"],
        "Up by " + model_df["absolute_score_diff"].apply(str),
        np.where(
            model_df["home_start_score"] > model_df["away_start_score"],
            "Down by " + model_df["absolute_score_diff"].apply(str),
            "Tied",
        ),
    ),
)
model_df["play_description"] = (
    ordinaltg(model_df["quarter"])
    + " Qtr "
    + model_df["minutes"].apply(str)
    + ":"
    + model_df["seconds_str"]
    + ", "
    + model_df["team_score_desc"]
    + ", "
    + ordinaltg(model_df["down"]).apply(str)
    + " & "
    + model_df["ytg"].apply(str)
    + ", "
    + model_df["yd_from_goal"].apply(str)
    + " Yards From Goal, "
    + np.where(
        model_df["home_team_has_ball"] == 1,
        model_df["home_team"],
        model_df["away_team"],
    )
    + " has ball, "
    + "Off TO: "
    + np.where(
        model_df["home_team_has_ball"] == 1,
        model_df["home_timeouts_remaining"],
        model_df["away_timeouts_remaining"],
    ).astype(str)
    + ", Def TO: "
    + np.where(
        model_df["home_team_has_ball"] == 0,
        model_df["home_timeouts_remaining"],
        model_df["away_timeouts_remaining"],
    ).astype(str)
    + " ("
    + model_df["nevent"].apply(str)
    + ")"
)




In [None]:
# pd.DataFrame(mlp_search_score_diff_clipped_rf_drive_preds_preds[mask_model & (model_df.game_code==2337728)]).to_clipboard()
model_df[mask_model&(model_df.game_code==2337728)][search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_.tolist() + ["nevent", "event_name", "event_id", ]].to_clipboard()
# search_mlp_play_class_names

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import plotly
import plotly.graph_objects as go
from plotly.offline import iplot
from plotly.subplots import make_subplots
import math


plotly.offline.init_notebook_mode()
model_df["home_team_has_ball_fixed"] = model_df["home_team_has_ball"]
def plot_game_lwp(game_info, model, width, height):
    game_df = deepcopy(model_df[(model_df["game_info"] == game_info)&(model_df["event_id"].isin([1,2,3,4,5,7,9,14,17,18,22,35,41,47,52,53,54,55,56]))&(model_df["continuation"]==0)&(model_df["overtime"]==0)])
    # print(game_df)
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    if model =="mlp":
        y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp"]
        y1="xscore_diff_end_of_regulation_mlp"
    elif model =="rf":
        y=["xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"]
        y1="xscore_diff_end_of_regulation_rf"
    elif model =="rf_search_rf":
        y = ["xhome_win_rf_search_clipped_rf_drive_preds", "xaway_win_rf_search_clipped_rf_drive_preds", "xovertime_rf_search_clipped_rf_drive_preds"]
        y1 = "xend_of_regulation_score_diff_rf_search_clipped_rf_drive_preds"
        # y2 = "xend_of_regulation_score_total_rf_search"
        y3 = search_rf_drive_class_names
        y4 = "search_rf_play_first_down"
    elif model =="rf_search_mlp":
        y = ["xhome_win_rf_search_clipped_mlp_drive_preds", "xaway_win_rf_search_clipped_mlp_drive_preds", "xovertime_rf_search_clipped_mlp_drive_preds"]
        y1 = "xend_of_regulation_score_diff_rf_search_clipped_mlp_drive_preds"
        y2 = "xend_of_regulation_score_total_rf_search"
        y3 = search_mlp_drive_class_names
        y4 = "search_mlp_play_first_down"
    elif model =="mlp_search_rf":
        y = ["xhome_win_mlp_search_clipped_rf_drive_preds", "xaway_win_mlp_search_clipped_rf_drive_preds", "xovertime_mlp_search_clipped_rf_drive_preds"]
        y1 = "xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds"
        y2 = "xend_of_regulation_score_total_mlp_search"
        y3 = search_rf_drive_class_names
        y4 = "search_rf_play_first_down"
    elif model =="mlp_search_mlp":
        y = ["xhome_win_mlp_search_clipped_mlp_drive_preds", "xaway_win_mlp_search_clipped_mlp_drive_preds", "xovertime_mlp_search_clipped_mlp_drive_preds"]
        y1 = "xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds"
        y2 = "xend_of_regulation_score_total_mlp_search"
        y3 = search_mlp_drive_class_names
        y4 = "search_mlp_play_first_down"
    elif model =="rf_search_rf_vegas_adjusted":
        y = ["xhome_win_rf_search_clipped_rf_drive_preds_vegas_adjusted", "xaway_win_rf_search_clipped_rf_drive_preds_vegas_adjusted", "xovertime_rf_search_clipped_rf_drive_preds_vegas_adjusted"]
        y1 = "xend_of_regulation_score_diff_rf_search_clipped_rf_drive_preds_vegas_adjusted"
        y2 = "xend_of_regulation_score_total_rf_search"
        y3 = search_rf_drive_class_names
        y4 = "search_rf_play_first_down"
    elif model =="rf_search_mlp_vegas_adjusted":
        y = ["xhome_win_rf_search_clipped_mlp_drive_preds_vegas_adjusted", "xaway_win_rf_search_clipped_mlp_drive_preds_vegas_adjusted", "xovertime_rf_search_clipped_mlp_drive_preds_vegas_adjusted"]
        y1 = "xend_of_regulation_score_diff_rf_search_clipped_mlp_drive_preds_vegas_adjusted"
        y2 = "xend_of_regulation_score_total_rf_search"
        y3 = search_mlp_drive_class_names
        y4 = "search_mlp_play_first_down"
    elif model =="mlp_search_rf_vegas_adjusted":
        y = ["xhome_win_mlp_search_clipped_rf_drive_preds_vegas_adjusted", "xaway_win_mlp_search_clipped_rf_drive_preds_vegas_adjusted", "xovertime_mlp_search_clipped_rf_drive_preds_vegas_adjusted"]
        y1 = "xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds_vegas_adjusted"
        y2 = "xend_of_regulation_score_total_mlp_search"
        y3 = search_rf_drive_class_names
        y4 = "search_rf_play_first_down"
    elif model =="mlp_search_mlp_vegas_adjusted":
        y = ["xhome_win_mlp_search_clipped_mlp_drive_preds_vegas_adjusted", "xaway_win_mlp_search_clipped_mlp_drive_preds_vegas_adjusted", "xovertime_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"]
        y1 = "xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"
        y2 = "xend_of_regulation_score_total_mlp_search"
        y3 = search_mlp_drive_class_names
        y4 = "search_mlp_play_first_down"
    # elif model == "loglin_basic":
    #     y=['xhome_win_basic_loglin', 'xhome_no_win_basic_loglin']
    # elif model == "loglin_by_minute":
    #     y=['xhome_win_loglin_minute', 'xhome_no_win_loglin_minute']
    # elif model == "mlp_basic":
    #     y=['xhome_win_basic_mlp', 'xhome_no_win_basic_mlp']
    # elif model =="rf_old":
    #     y=["xhome_win", "xaway_win", "xdraw"]
    # elif model =="mlp_old":
    #     y=["xhome_team_win_mlp", "xaway_team_win_mlp", "xdraw_mlp"]
    colors = ["darkkhaki", "skyblue", "gray"]
    game_df["xhome_final_score"] = (game_df[y2] - game_df[y1]) / 2 + game_df[y1] * .5
    game_df["xaway_final_score"] = (game_df[y2] - game_df[y1]) / 2 - game_df[y1] * .5
    game_df["yards_description"] = (
        game_df["event_name"]
        + " for "
        + game_df["yards_gained"].fillna(0).apply(int).apply(str)
        + " yards"
    )
    game_df["wpa"] = game_df[y[0]] - game_df[y[0]].shift(1)

    v = pd.DataFrame(game_df[["quarter", "nevent"]]).reset_index(drop=True)
    mask_ticks = v["quarter"][1:].reset_index(drop=True) == v["quarter"][
        :-1
    ].reset_index(drop=True)
    ticks_idx = [min(v["nevent"])] + list(v[:-1][~mask_ticks]["nevent"] + 1)
    if len(ticks_idx) == 4:
        ticks_values = [1, 2, 3, 4]
    else:
        ticks_values = [1, 2, 3, 4, "OT"]
    ['search_rf_drive_clock',
    'search_rf_drive_field_goal_made',
    'search_rf_drive_field_goal_missed',
    'search_rf_drive_punt',
    'search_rf_drive_safety',
    'search_rf_drive_touch_down',
    'search_rf_drive_turnover',
    'search_rf_drive_turnover_on_downs']

    fig.add_trace(
        go.Scatter(
            x=game_df["nevent"],
            y=game_df[y[0]],
            customdata=game_df[["yards_description", "play_description", "cur_spread", "wpa", y2, ]+y3 + [y4, "xhome_final_score", "xaway_final_score", "cur_over_under", y1]],
            stackgroup="one",
            mode="lines",
            line=dict(width=0.5, color=colors[0]),
            name=game_df["home_team"].tolist()[0],
            hovertemplate="<br>".join(
                [
                    "%{y}",
                    "%{customdata[0]}",
                    "%{customdata[1]}",
                    "Spread: %{customdata[2]}",
                    "xScore Diff: %{customdata[17]}",
                    "Over/Under: %{customdata[16]}",
                    "xScore Total: %{customdata[4]}",
                    "xHome Final Score: %{customdata[14]}",
                    "xAway Final Score: %{customdata[15]}",
                    "Clock: %{customdata[5]}",
                    "FG Made: %{customdata[6]}",
                    "FG Miss: %{customdata[7]}",
                    "Punt: %{customdata[8]}",
                    "TD: %{customdata[10]}",
                    "TO: %{customdata[11]}",
                    "TOD: %{customdata[12]}",
                    "First Down: %{customdata[13]}",
                    "WPA: %{customdata[3]}",
                ]
            ),
        )
    )
    try:
        fig.add_trace(go.Scatter(
            x=game_df["nevent"],
            y=game_df[y[2]],
            # hovertext=game_df[hover_values],
            stackgroup="one",
            mode="lines",
            line=dict(width=0.5, color=colors[2]),
            name="Overtime",
        ))
    except:
        pass
    fig.add_trace(go.Scatter(
        x=game_df["nevent"],
        y=game_df[y[1]],
        # hovertext=game_df[hover_values],
        stackgroup="one",
        mode="lines",
        line=dict(width=0.5, color=colors[1]),
        name=game_df["away_team"].tolist()[0],
    ))
    try:
        fig.add_trace(go.Scatter(
            x=game_df["nevent"],
            y=game_df[y1],
            # hovertext=game_df[hover_values],
            # stackgroup="one",
            mode="lines",
            line=dict(width=0.5, color="black", dash='dash'),
            name="xfinal_score_diff",
        ), secondary_y=True)
    except:
        pass
    
    fig.update_xaxes(range=[1, np.max(game_df["nevent"])])
    fig.update_yaxes(range=[0, 1], secondary_y=False)
    fig.update_yaxes(range=[-20, 20], secondary_y=True, showgrid=False)
    # x_range = [-math.ceil(np.max(abs(game_df["xscore_diff_end_of_regulation"]))/10) * 10, math.ceil(np.max(abs(game_df["xscore_diff_end_of_regulation"]))/10) *10]
    # print(x_range)
    # fig.update_yaxes(range=x_range, secondary_y=True)
    # fig.update_yaxes(range=[-30, 30], secondary_y=True, showgrid=False)

    # iplot([fig1, fig2, fig3])
    
    fig.update_layout(
        title = game_df["game_info"].tolist()[0] + " (" + model + " model)",
        hovermode="x unified",
        width=width,
        height=height,
        xaxis=dict(
            tickvals=ticks_idx, ticktext=ticks_values, gridwidth=2
        ),
        yaxis=dict(tick0=0, dtick=0.25),
        # yaxis=dict(tick0=-30, dtick=30, gridcolor="black", gridwidth=2),
    )
    game_df["score_change"] = game_df["home_score_added"] + game_df["away_score_added"]
    game_df["score_str"] = (
        (game_df["away_score_added"] + game_df["away_start_score"]).apply(str)
        + "-"
        + (game_df["home_score_added"] + game_df["home_start_score"]).apply(str)
    )
    scores_idx_home = game_df[game_df["home_score_added"] >= 3]["nevent"].tolist()
    score_y_home = game_df[game_df["home_score_added"] >= 3][y[0]].tolist()
    score_home_value = game_df[game_df["home_score_added"] >= 3]["home_score_added"]
    score_home_str = np.where(score_home_value == 3, " FG", " TD")
    score_display_home = game_df[game_df["home_score_added"] >= 3]["score_str"].tolist()
    scores_idx_away = game_df[game_df["away_score_added"] >= 3]["nevent"].tolist()
    score_y_away = (game_df[game_df["away_score_added"] >= 3][y[0]]).tolist()
    score_away_value = game_df[game_df["away_score_added"] >= 3]["away_score_added"]
    score_away_str = np.where(score_away_value == 3, " FG", " TD")
    score_display_away = game_df[game_df["away_score_added"] >= 3]["score_str"].tolist()
    mask_poss_change = (game_df["home_team_has_ball_fixed"].shift(-1)!=game_df["home_team_has_ball_fixed"])&(game_df["score_change"]==0)&(game_df["score_change"].shift(-1)==0)
    ball_change_idx = game_df[mask_poss_change]["nevent"].tolist()
    ball_change_y = (game_df[mask_poss_change][y[0]]).tolist()
    # print(ball_change_idx)
    # print(ball_change_y)
    for x in range(len(scores_idx_home)):
        fig.add_annotation(
            x=scores_idx_home[x],
            y=score_y_home[x],
            text=game_df["home_team_abbrev"].tolist()[0]
            + score_home_str[x]
            + " "
            + score_display_home[x],
            showarrow=True,
        )
    for x in range(len(scores_idx_away)):
        fig.add_annotation(
            x=scores_idx_away[x],
            y=score_y_away[x],
            text=game_df["away_team_abbrev"].tolist()[0]
            + score_away_str[x]
            + " "
            + score_display_away[x],
            showarrow=True,
        )
    fig.add_trace(go.Scatter(
        x=ball_change_idx,
        y=ball_change_y,
        mode="markers",
        name="Possession Change",
        textposition="bottom center",
        marker=dict(color="blue")
    ))

    iplot(fig)
interact(
    plot_game_lwp, 
    game_info=widgets.Dropdown(options=list(model_df.sort_values("game_date", ascending=False)["game_info"].drop_duplicates()), value="Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)"),
    model=widgets.Dropdown(options=["mlp", "rf", "rf_search_rf", "rf_search_mlp", "mlp_search_rf", "mlp_search_mlp", "rf_search_rf_vegas_adjusted", "rf_search_mlp_vegas_adjusted", "mlp_search_rf_vegas_adjusted", "mlp_search_mlp_vegas_adjusted"], value="mlp_search_rf"),
    width=widgets.IntSlider(min=500, max=1500, step=50, value=1200),
    height=widgets.IntSlider(min=500, max=1500, step=50, value=700),
)
    # return ()


In [None]:
os.system("say 'done'")

In [None]:
plot_subset = model_df[(model_df["nevent"]==1)]
plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"], alpha = .5)
plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds_vegas_adjusted"], alpha = .2)
plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds"], alpha = .1)
plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds"], alpha = .1)
# plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds_vegas_adjusted"])
# plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_rf_drive_preds_vegas_adjusted"])
plt.plot([-20, 20], [-20, 20])



In [None]:
plot_subset = model_df[(model_df["nevent"]==1)&(model_df["home_team_has_ball"]==1)]
plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds"])
plot_subset = model_df[(model_df["nevent"]==1)&(model_df["home_team_has_ball"]==0)]
plt.scatter(-plot_subset["cur_spread"], plot_subset["xend_of_regulation_score_diff_mlp_search_clipped_mlp_drive_preds"])




In [None]:
model_df["vegas_home_expected_points"] = model_df["cur_over_under"] * 0.5 - model_df["cur_spread"] * 0.5
model_df["vegas_away_expected_points"] = model_df["cur_over_under"] * 0.5 + model_df["cur_spread"] * 0.5
model_df["offense_expected_points"] = np.where(model_df["home_team_has_ball"]==1, model_df["vegas_home_expected_points"], model_df["vegas_away_expected_points"])
model_df["defense_expected_points"] = np.where(model_df["home_team_has_ball"]==0, model_df["vegas_home_expected_points"], model_df["vegas_away_expected_points"])


model_df[(model_df["two_point_attempt"]==1)&(model_df["play_counts"]==1)][["off_score_change", "event_name"]].value_counts()
# model_df["play_outcome"]
model_df["two_point_attempt_success"] = np.where(model_df["off_score_change"]==2, 1, 0)
input_names_two_point_conversion = ["offense_expected_points", "defense_expected_points", "home_team_has_ball"]
output_name_two_point_conversion = "two_point_attempt_success"
mask_model_two_point_conversion = (
    (model_df["two_point_attempt"]==1)&
    (model_df[input_names_total_score_pred+[output_name]].notna().all(axis=1))&
    (model_df["play_counts"]==1)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model_two_point_conversion], input_names_two_point_conversion, output_name_two_point_conversion)
cv=GroupKFold(n_splits=3)

rf_grid = {
    "n_estimators": np.linspace(start=50, stop=500, num=5, dtype=int),
    # "max_features": ["auto", "sqrt"],
    "max_depth": np.linspace(1, 15, num=5, dtype=int).tolist() + [None],
    # "min_samples_split": [2, 5, 10],
    # "min_samples_leaf": [1, 2, 4],
    # "bootstrap": [True, False],
}
rf_two_point_conversion = RandomForestClassifier(n_estimators=100, max_depth=3, verbose=0)

search_rf_two_point_conversion = GridSearchCV(rf_two_point_conversion, rf_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer)
search_rf_two_point_conversion.fit(X_train, y_train, groups=group_train)

logit_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': [100, 10, 1.0, 0.1, 0.01],
}
logit_two_point_conversion = LogisticRegression()
search_logit_two_point_conversion = GridSearchCV(logit_two_point_conversion, logit_grid, cv=cv,n_jobs=-1,verbose=100, scoring=log_loss_scorer)
search_logit_two_point_conversion.fit(X_train, y_train, groups=group_train)




In [None]:
pd.DataFrame(search_logit_two_point_conversion.cv_results_)

In [None]:

estimator = search_rf_two_point_conversion.best_estimator_.estimators_[0]
from sklearn.tree import export_graphviz
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = list(search_rf_two_point_conversion.feature_names_in_),
                # max_depth=3,
                class_names = list(search_rf_two_point_conversion.classes_.astype(str)),
                rounded = True, proportion = False, 
                precision = 2, filled = True)

import pydot

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=100'])


In [None]:
val = pd.concat([X_val.reset_index(drop=True), y_val.reset_index(drop=True), pd.DataFrame(search_rf_two_point_conversion.predict_proba(X_val), columns=["xtwo_point_failure", "xtwo_point_success"])], axis=1)
for x in [0, 1]:
    plt.scatter(val[val["home_team_has_ball"]==x]["offense_expected_points"] - val[val["home_team_has_ball"]==x]["defense_expected_points"], val[val["home_team_has_ball"]==x]["xtwo_point_success"])
# for x in [0, 1]:
#     plt.scatter(val[val["home_team_has_ball"]==x]["defense_expected_points"], val[val["home_team_has_ball"]==x]["xtwo_point_success"])

log_loss(val["two_point_attempt_success"], val["xtwo_point_success"])

In [None]:
val = pd.concat([X_val.reset_index(drop=True), y_val.reset_index(drop=True), pd.DataFrame(search_logit_two_point_conversion.predict_proba(X_val), columns=["xtwo_point_failure", "xtwo_point_success"])], axis=1)
for x in [0, 1]:
    plt.scatter(val[val["home_team_has_ball"]==x]["offense_expected_points"] - val[val["home_team_has_ball"]==x]["defense_expected_points"], val[val["home_team_has_ball"]==x]["xtwo_point_success"])
# for x in [0, 1]:
#     plt.scatter(val[val["home_team_has_ball"]==x]["defense_expected_points"], val[val["home_team_has_ball"]==x]["xtwo_point_success"])

log_loss(val["two_point_attempt_success"], val["xtwo_point_success"])

In [None]:
val

In [None]:
# pd.DataFrame(search_logit_two_point_conversion.best_estimator_.coef_[0], input_names_two_point_conversion)
search_logit_two_point_conversion.best_estimator_.coef_

In [None]:
plt.hist(X_train["offense_expected_points"])

In [None]:

estimator = search_rf_play_outcome.best_estimator_.estimators_[0]
from sklearn.tree import export_graphviz
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = list(search_rf_play_outcome.feature_names_in_),
                max_depth=3,
                class_names = list(search_rf_play_outcome.classes_.astype(str)),
                rounded = True, proportion = False, 
                precision = 2, filled = True)

import pydot

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=10000'])


In [None]:
def build_go_for_it_success_and_fail_data(input_feature_go_for_it):
    input_feature_go_for_it_success = deepcopy(input_feature_go_for_it)
    if input_feature_go_for_it["ytg"][0] == input_feature_go_for_it["yd_from_goal"][0]:
        input_feature_go_for_it_success["yd_from_goal"] = 15
        input_feature_go_for_it_success["ytg"] = -1
        input_feature_go_for_it_success["point_after_kick"] = 1
        input_feature_go_for_it_success["down"] = 0
        input_feature_go_for_it_success["current_score_diff"] = input_feature_go_for_it_success["current_score_diff"] + (input_feature_go_for_it["home_team_has_ball"] * 2 - 1) * 6
        input_feature_go_for_it_success["current_score_total"] = input_feature_go_for_it_success["current_score_total"] + 6
        input_feature_go_for_it_success[search_rf_play_class_names_home + search_rf_play_class_names_away + search_rf_drive_class_names_home + search_rf_drive_class_names_away] = 0
    else:
        input_feature_go_for_it_success["yd_from_goal"] = input_feature_go_for_it["yd_from_goal"] - input_feature_go_for_it["ytg"]
        input_feature_go_for_it_success["ytg"] = np.where(input_feature_go_for_it_success["yd_from_goal"] < 10, input_feature_go_for_it_success["yd_from_goal"], 10)
        input_feature_go_for_it_success["down"] = 1
        input_feature_go_for_it_success[search_rf_play_class_names_home] = np.where(
            input_feature_go_for_it["home_team_has_ball"] ==1, 
            search_rf_play_outcome.predict_proba(input_feature_go_for_it_success[input_names]), 0)
        input_feature_go_for_it_success[search_rf_play_class_names_away] = np.where(
            input_feature_go_for_it["home_team_has_ball"] ==0, 
            search_rf_play_outcome.predict_proba(input_feature_go_for_it_success[input_names]), 0)
        input_feature_go_for_it_success[search_rf_drive_class_names_home] = np.where(
            input_feature_go_for_it["home_team_has_ball"] ==1, 
            search_rf_drive_outcome.predict_proba(input_feature_go_for_it_success[input_names]), 0)
        input_feature_go_for_it_success[search_rf_drive_class_names_away] = np.where(
            input_feature_go_for_it["home_team_has_ball"] ==0, 
            search_rf_drive_outcome.predict_proba(input_feature_go_for_it_success[input_names]), 0)
    input_feature_go_for_it_success["time_left_in_half"] = input_feature_go_for_it_success["time_left_in_half"] - 10



    input_feature_go_for_it_fail = deepcopy(input_feature_go_for_it)
    input_feature_go_for_it_fail["home_team_has_ball"] = 1 - input_feature_go_for_it["home_team_has_ball"]
    input_feature_go_for_it_fail["ytg"] = 10
    input_feature_go_for_it_fail["yd_from_goal"] = 100 - input_feature_go_for_it["yd_from_goal"]
    input_feature_go_for_it_fail["down"] = 1
    input_feature_go_for_it_fail[search_rf_play_class_names_home] = np.where(
        input_feature_go_for_it_fail["home_team_has_ball"] ==1, 
        search_rf_play_outcome.predict_proba(input_feature_go_for_it_fail[input_names]), 0)
    input_feature_go_for_it_fail[search_rf_play_class_names_away] = np.where(
        input_feature_go_for_it_fail["home_team_has_ball"] ==0, 
        search_rf_play_outcome.predict_proba(input_feature_go_for_it_fail[input_names]), 0)
    input_feature_go_for_it_fail[search_rf_drive_class_names_home] = np.where(
        input_feature_go_for_it_fail["home_team_has_ball"] ==1, 
        search_rf_drive_outcome.predict_proba(input_feature_go_for_it_fail[input_names]), 0)
    input_feature_go_for_it_fail[search_rf_drive_class_names_away] = np.where(
        input_feature_go_for_it_fail["home_team_has_ball"] ==0, 
        search_rf_drive_outcome.predict_proba(input_feature_go_for_it_fail[input_names]), 0)
    input_feature_go_for_it_fail["time_left_in_half"] = input_feature_go_for_it_fail["time_left_in_half"] - 10


    return input_feature_go_for_it_success, input_feature_go_for_it_fail



def build_field_goal_success_and_fail_data(input_feature_field_goal):
    input_feature_field_goal_success_fail = deepcopy(input_feature_field_goal)
    input_feature_field_goal_success_fail["field_goal_attempt"] = 0
    input_feature_field_goal_success_fail["time_left_in_half"] = input_feature_field_goal["time_left_in_half"] - 5
    input_feature_field_goal_success_fail["home_team_has_ball"] = 1 - input_feature_field_goal_success_fail["home_team_has_ball"]

    input_feature_field_goal_success = deepcopy(input_feature_field_goal_success_fail)
    input_feature_field_goal_success["yd_from_goal"] = 65
    input_feature_field_goal_success["ytg"] = -1
    input_feature_field_goal_success["down"] = 0
    input_feature_field_goal_success["kick_off"] = 1
    input_feature_field_goal_success["current_score_diff"] = input_feature_field_goal_success["current_score_diff"] + (input_feature_field_goal["home_team_has_ball"] * 2 - 1) * 3
    input_feature_field_goal_success["current_score_total"] = input_feature_field_goal_success["current_score_total"] + 3
    input_feature_field_goal_success[search_rf_play_class_names_home + search_rf_play_class_names_away + search_rf_drive_class_names_home + search_rf_drive_class_names_away] = 0

    input_feature_field_goal_fail = deepcopy(input_feature_field_goal_success_fail)
    input_feature_field_goal_fail["ytg"] = 10
    input_feature_field_goal_fail["yd_from_goal"] = 100 - (input_feature_field_goal["yd_from_goal"] + 7)
    input_feature_field_goal_fail["down"] = 1
    input_feature_field_goal_fail[search_rf_play_class_names_home] = np.where(
        input_feature_field_goal_fail["home_team_has_ball"] ==1, 
        search_rf_play_outcome.predict_proba(input_feature_field_goal_fail[input_names]), 0)
    input_feature_field_goal_fail[search_rf_play_class_names_away] = np.where(
        input_feature_field_goal_fail["home_team_has_ball"] ==0, 
        search_rf_play_outcome.predict_proba(input_feature_field_goal_fail[input_names]), 0)
    input_feature_field_goal_fail[search_rf_drive_class_names_home] = np.where(
        input_feature_field_goal_fail["home_team_has_ball"] ==1, 
        search_rf_drive_outcome.predict_proba(input_feature_field_goal_fail[input_names]), 0)
    input_feature_field_goal_fail[search_rf_drive_class_names_away] = np.where(
        input_feature_field_goal_fail["home_team_has_ball"] ==0, 
        search_rf_drive_outcome.predict_proba(input_feature_field_goal_fail[input_names]), 0)

    return input_feature_field_goal_success, input_feature_field_goal_fail
def build_punt(input_feature_punt):
    input_feature_punt_success = deepcopy(input_feature_punt)
    input_feature_punt_success["punt"] = 0
    input_feature_punt_success["time_left_in_half"] = input_feature_punt["time_left_in_half"] - 10
    input_feature_punt_success["home_team_has_ball"] = 1 - input_feature_punt_success["home_team_has_ball"]


    input_feature_punt_success = deepcopy(input_feature_punt_success)
    input_feature_punt_success["ytg"] = 10
    input_feature_punt_success["yd_from_goal"] = np.clip(100 - (input_feature_punt_success["yd_from_goal"] -40), 20, 80)
    input_feature_punt_success["down"] = 1
    input_feature_punt_success[search_rf_play_class_names_home] = np.where(
        input_feature_punt_success["home_team_has_ball"] ==1, 
        search_rf_play_outcome.predict_proba(input_feature_punt_success[input_names]), 0)
    input_feature_punt_success[search_rf_play_class_names_away] = np.where(
        input_feature_punt_success["home_team_has_ball"] ==0, 
        search_rf_play_outcome.predict_proba(input_feature_punt_success[input_names]), 0)
    input_feature_punt_success[search_rf_drive_class_names_home] = np.where(
        input_feature_punt_success["home_team_has_ball"] ==1, 
        search_rf_drive_outcome.predict_proba(input_feature_punt_success[input_names]), 0)
    input_feature_punt_success[search_rf_drive_class_names_away] = np.where(
        input_feature_punt_success["home_team_has_ball"] ==0, 
        search_rf_drive_outcome.predict_proba(input_feature_punt_success[input_names]), 0)

    return input_feature_punt_success



In [None]:
print(search_rf_play_class_names)
print(search_rf_drive_class_names)

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
from IPython.display import display_html

def plot_model_probabilities(input_feature):
    anchor_df = model_df[mask_model]
    # input_feature = normalize_df(input_feature, anchor_df=anchor_df).fillna(0)
    end_of_regulation_scores =  [x + input_feature.current_score_diff[0] for x in search_mlp_score_diff_clipped_rf_drive_preds.classes_]
    # display_html(input_feature.T)
    input_feature["ytg"] = np.where(input_feature["ytg"] > input_feature["yd_from_goal"], input_feature["yd_from_goal"], input_feature["ytg"])
    # display_html(input_feature.T)
    fig, axs = plt.subplots(1, 3, figsize=(20,5)) 
    search_rf_play_outcome.best_estimator_.verbose = 0
    search_rf_drive_outcome.best_estimator_.verbose = 0
    search_mlp_score_diff_clipped_rf_drive_preds.best_estimator_.verbose = 0
    pred_output_rf_play = search_rf_play_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_rf_drive = search_rf_drive_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    input_feature_go_for_it = deepcopy(input_feature)
    input_feature_punt = deepcopy(input_feature)
    input_feature_field_goal = deepcopy(input_feature)
    input_feature_go_for_it[["kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"]] = 0
    input_feature_punt[["kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"]] = [0, 1, 0, 0, 0]
    input_feature_field_goal[["kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"]] = [0, 0, 1, 0, 0]
    pred_output_rf_play_go_for_it = search_rf_play_outcome.predict_proba(pd.DataFrame(input_feature_go_for_it, columns = input_names))
    pred_output_rf_drive_go_for_it = search_rf_drive_outcome.predict_proba(pd.DataFrame(input_feature_go_for_it, columns = input_names))
    pred_output_rf_play_punt = search_rf_play_outcome.predict_proba(pd.DataFrame(input_feature_punt, columns = input_names))
    pred_output_rf_drive_punt = search_rf_drive_outcome.predict_proba(pd.DataFrame(input_feature_punt, columns = input_names))
    pred_output_rf_play_field_goal = search_rf_play_outcome.predict_proba(pd.DataFrame(input_feature_field_goal, columns = input_names))
    pred_output_rf_drive_field_goal = search_rf_drive_outcome.predict_proba(pd.DataFrame(input_feature_field_goal, columns = input_names))
    # display_html(input_feature_go_for_it)
    # print(pred_output_rf_play_go_for_it[0][2])
    # print(pred_output_rf_drive_field_goal)
    input_feature_go_for_it_success, input_feature_go_for_it_fail = build_go_for_it_success_and_fail_data(input_feature_go_for_it)  
    input_feature_field_goal_success, input_feature_field_goal_fail = build_field_goal_success_and_fail_data(input_feature_field_goal)
    input_feature_punt_success = build_punt(input_feature_punt)

        





    success_df = pd.DataFrame(
        [
            (pred_output_rf_play_go_for_it[0][2] + pred_output_rf_play_go_for_it[0][4])
            / (1 - (
                pred_output_rf_play_go_for_it[0][0]
                + pred_output_rf_play_go_for_it[0][1]
            )),
            pred_output_rf_play_field_goal[0][0]
            / (
                pred_output_rf_play_field_goal[0][0]
                + pred_output_rf_play_field_goal[0][1]
            ),
            None,
        ], columns=["Success %"]
    )
    display_html(success_df)
    # pred_input
    # print(pd.DataFrame(input_feature, columns = input_names))
    # print(pred_output_rf_play_go_for_it)
    input_feature[search_rf_play_class_names] = pred_output_rf_play
    input_feature[search_rf_drive_class_names] = pred_output_rf_drive
    input_feature_go_for_it[search_rf_play_class_names] = pred_output_rf_play_go_for_it
    input_feature_go_for_it[search_rf_drive_class_names] = pred_output_rf_drive_go_for_it
    input_feature_punt[search_rf_play_class_names] = pred_output_rf_play_punt
    input_feature_punt[search_rf_drive_class_names] = pred_output_rf_drive_punt
    input_feature_field_goal[search_rf_play_class_names] = pred_output_rf_play_field_goal
    input_feature_field_goal[search_rf_drive_class_names] = pred_output_rf_drive_field_goal

    input_feature[search_rf_play_class_names_home] = input_feature[search_rf_play_class_names].where(input_feature.home_team_has_ball==1, 0)
    input_feature_go_for_it[search_rf_play_class_names_home] = input_feature_go_for_it[search_rf_play_class_names].where(input_feature_go_for_it.home_team_has_ball==1, 0)
    input_feature_punt[search_rf_play_class_names_home] = input_feature_punt[search_rf_play_class_names].where(input_feature_punt.home_team_has_ball==1, 0)
    input_feature_field_goal[search_rf_play_class_names_home] = input_feature_field_goal[search_rf_play_class_names].where(input_feature_field_goal.home_team_has_ball==1, 0)

    input_feature[search_rf_drive_class_names_home] = input_feature[search_rf_drive_class_names].where(input_feature.home_team_has_ball==1, 0)
    input_feature_go_for_it[search_rf_drive_class_names_home] = input_feature_go_for_it[search_rf_drive_class_names].where(input_feature_go_for_it.home_team_has_ball==1, 0)
    input_feature_punt[search_rf_drive_class_names_home] = input_feature_punt[search_rf_drive_class_names].where(input_feature_punt.home_team_has_ball==1, 0)
    input_feature_field_goal[search_rf_drive_class_names_home] = input_feature_field_goal[search_rf_drive_class_names].where(input_feature_field_goal.home_team_has_ball==1, 0)

    input_feature[search_rf_play_class_names_away] = input_feature[search_rf_play_class_names].where(input_feature.home_team_has_ball==0, 0)
    input_feature_go_for_it[search_rf_play_class_names_away] = input_feature_go_for_it[search_rf_play_class_names].where(input_feature_go_for_it.home_team_has_ball==0, 0)
    input_feature_punt[search_rf_play_class_names_away] = input_feature_punt[search_rf_play_class_names].where(input_feature_punt.home_team_has_ball==0, 0)
    input_feature_field_goal[search_rf_play_class_names_away] = input_feature_field_goal[search_rf_play_class_names].where(input_feature_field_goal.home_team_has_ball==0, 0)

    input_feature[search_rf_drive_class_names_away] = input_feature[search_rf_drive_class_names].where(input_feature.home_team_has_ball==0, 0)
    input_feature_go_for_it[search_rf_drive_class_names_away] = input_feature_go_for_it[search_rf_drive_class_names].where(input_feature_go_for_it.home_team_has_ball==0, 0)
    input_feature_punt[search_rf_drive_class_names_away] = input_feature_punt[search_rf_drive_class_names].where(input_feature_punt.home_team_has_ball==0, 0)
    input_feature_field_goal[search_rf_drive_class_names_away] = input_feature_field_goal[search_rf_drive_class_names].where(input_feature_field_goal.home_team_has_ball==0, 0)

    # input_feature_go_for_it_success = normalize_df(input_feature_go_for_it_success, anchor_df)
    # input_feature_go_for_it_success = input_feature_go_for_it_success.fillna(0)
    # input_feature_go_for_it_fail = normalize_df(input_feature_go_for_it_fail, anchor_df)
    # input_feature_go_for_it_fail = input_feature_go_for_it_fail.fillna(0)

    # input_feature_field_goal_success = normalize_df(input_feature_field_goal_success, anchor_df)
    # input_feature_field_goal_success = input_feature_field_goal_success.fillna(0)
    # input_feature_field_goal_fail = normalize_df(input_feature_field_goal_fail, anchor_df)
    # input_feature_field_goal_fail = input_feature_field_goal_fail.fillna(0)


    # input_feature_punt_success = normalize_df(input_feature_punt_success, anchor_df)
    # input_feature_punt_success = input_feature_punt_success.fillna(0)

    outcome_success_df = pd.DataFrame()
    for choice in ["go_for_it", "field_goal", "punt"]:
        # print(normalize_df(eval("input_feature_" + choice + "_success").loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], anchor_df).T)
        outcome_pred = search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(normalize_df(eval("input_feature_" + choice + "_success").loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], anchor_df).values)
        score_probs = pd.concat([pd.DataFrame(outcome_pred, index=["prob"]).T, pd.DataFrame(end_of_regulation_scores, columns=["score"])], axis=1)
        if input_feature["home_team_has_ball"][0]==1:
            win_probs = pd.DataFrame([np.sum(score_probs[score_probs.score > 0]["prob"])], index=[choice], columns=["home_win"])
        else:
            win_probs = pd.DataFrame([np.sum(score_probs[score_probs.score < 0]["prob"])], index=[choice], columns=["away_win"])
        overtime_probs = pd.DataFrame([np.sum(score_probs[score_probs.score == 0]["prob"])], index=[choice], columns=["overtime"])
            
        outcome_success_df = pd.concat([outcome_success_df, pd.concat([win_probs, overtime_probs], axis=1)])
    display_html(outcome_success_df)

    outcome_fail_df = pd.DataFrame()
    for choice in ["go_for_it", "field_goal"]:
        outcome_pred = search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(normalize_df(eval("input_feature_" + choice + "_fail").loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], anchor_df).values)
        score_probs = pd.concat([pd.DataFrame(outcome_pred, index=["prob"]).T, pd.DataFrame(end_of_regulation_scores, columns=["score"])], axis=1)
        # print("home_team_has_ball", input_feature["home_team_has_ball"][0])
        if input_feature["home_team_has_ball"][0]==1:
            win_probs = pd.DataFrame([np.sum(score_probs[score_probs.score > 0]["prob"])], index=[choice], columns=["home_win"])
        else:
            win_probs = pd.DataFrame([np.sum(score_probs[score_probs.score < 0]["prob"])], index=[choice], columns=["away_win"])
            
        overtime_probs = pd.DataFrame([np.sum(score_probs[score_probs.score == 0]["prob"])], index=[choice], columns=["overtime"])
        outcome_fail_df = pd.concat([outcome_fail_df, pd.concat([win_probs, overtime_probs], axis=1)])
    display_html(outcome_fail_df)



    # display_html(pd.concat([
    #     input_feature_go_for_it_success[search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], input_feature_go_for_it_fail[search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_],
    #     input_feature_field_goal_success[search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], input_feature_field_goal_fail[search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_],
    #     input_feature_punt_success[search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_],
        
    #     ]).T)



    # model_df[search_rf_drive_class_names_home] = model_df[search_rf_drive_class_names].where(model_df.home_team_has_ball==1, 0)
    pred_output_rf = search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(input_feature.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_].values)
    # display(pd.DataFrame(pred_output_rf, columns = search_mlp_score_diff_clipped_rf_drive_preds.classes_, index=["rf"]).T.sort_values("rf", ascending=False).head(15))
    head_value = 5
    pred_output_go_for_it_rf = search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(normalize_df(input_feature_go_for_it.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], anchor_df).values)
    pred_output_go_for_it_rf_df = (pd.DataFrame(pred_output_go_for_it_rf, columns = search_mlp_score_diff_clipped_rf_drive_preds.classes_, index=["rf_go_for_it"]).T.sort_values("rf_go_for_it", ascending=False).head(head_value))
    pred_output_punt_rf = search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(normalize_df(input_feature_punt.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], anchor_df).values)
    pred_output_punt_rf_df = (pd.DataFrame(pred_output_punt_rf, columns = search_mlp_score_diff_clipped_rf_drive_preds.classes_, index=["rf_punt"]).T.sort_values("rf_punt", ascending=False).head(head_value))
    pred_output_field_goal_rf = search_mlp_score_diff_clipped_rf_drive_preds.predict_proba(normalize_df(input_feature_field_goal.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_], anchor_df).values)
    pred_output_field_goal_rf_df = (pd.DataFrame(pred_output_field_goal_rf, columns = search_mlp_score_diff_clipped_rf_drive_preds.classes_, index=["rf_field_goal"]).T.sort_values("rf_field_goal", ascending=False).head(head_value))


    score_probs_go_for_it = pd.concat([pd.DataFrame(pred_output_go_for_it_rf, index=["prob"]).T, pd.DataFrame(end_of_regulation_scores, columns=["score"])], axis=1)
    home_win_go_for_it = np.sum(score_probs_go_for_it[score_probs_go_for_it.score > 0]["prob"])
    overtime_go_for_it = np.sum(score_probs_go_for_it[score_probs_go_for_it.score == 0]["prob"])
    away_win_go_for_it = np.sum(score_probs_go_for_it[score_probs_go_for_it.score < 0]["prob"])
    score_probs_punt = pd.concat([pd.DataFrame(pred_output_punt_rf, index=["prob"]).T, pd.DataFrame(end_of_regulation_scores, columns=["score"])], axis=1)
    home_win_punt = np.sum(score_probs_punt[score_probs_punt.score > 0]["prob"])
    overtime_punt = np.sum(score_probs_punt[score_probs_punt.score == 0]["prob"])
    away_win_punt = np.sum(score_probs_punt[score_probs_punt.score < 0]["prob"])
    score_probs_field_goal = pd.concat([pd.DataFrame(pred_output_field_goal_rf, index=["prob"]).T, pd.DataFrame(end_of_regulation_scores, columns=["score"])], axis=1)
    home_win_field_goal = np.sum(score_probs_field_goal[score_probs_field_goal.score > 0]["prob"])
    overtime_field_goal = np.sum(score_probs_field_goal[score_probs_field_goal.score == 0]["prob"])
    away_win_field_goal = np.sum(score_probs_field_goal[score_probs_field_goal.score < 0]["prob"])


    fourth_down_bot_outcomes = pd.concat(
        [
            pd.DataFrame([home_win_go_for_it, overtime_go_for_it, away_win_go_for_it], ["home_win", "overtime", "away_win"], ["go_for_it"]).T,
            pd.DataFrame([home_win_field_goal, overtime_field_goal, away_win_field_goal], ["home_win", "overtime", "away_win"], ["field_goal"]).T,
            pd.DataFrame([home_win_punt, overtime_punt, away_win_punt], ["home_win", "overtime", "away_win"], ["punt"]).T,
        ]
    )
    # display_html(fourth_down_bot_outcomes)





    df1_style = pred_output_go_for_it_rf_df.style.set_table_attributes("style='display:inline;'")
    df2_style = pred_output_punt_rf_df.style.set_table_attributes("style='display:inline'")
    df3_style = pred_output_field_goal_rf_df.style.set_table_attributes("style='display:inline'")

    display_df = pd.concat([
        input_feature_go_for_it.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_],
        input_feature_punt.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_],
        input_feature_field_goal.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_],
        
    ])
    # display_html(display_df.T)
    # print(input_feature.loc[:, search_rf_score_diff_clipped_rf_drive_preds.feature_names_in_].iloc[0])
    # print(end_of_regulation_scores)

    display_html("Change in Score Diff (rf)", df1_style._repr_html_() + df2_style._repr_html_() + df3_style._repr_html_(), raw=True)
    axs[0].bar(search_mlp_score_diff_clipped_rf_drive_preds.classes_, pred_output_go_for_it_rf[0])
    axs[1].bar(search_mlp_score_diff_clipped_rf_drive_preds.classes_, pred_output_punt_rf[0])
    axs[2].bar(search_mlp_score_diff_clipped_rf_drive_preds.classes_, pred_output_field_goal_rf[0])
    y_max = np.max([pred_output_go_for_it_rf[0], pred_output_punt_rf[0], pred_output_field_goal_rf[0]])
    for x in range(3):
        axs[x].set_xlim(-10.5, 10.5)
        axs[x].set_xticks(np.arange(-10, 11, 1))
        axs[x].set_ylim(0, y_max*1.05)





style = {'description_width': '200px'}
layout = Layout(width='450px')
@interact(
    minutes=widgets.IntSlider(min=0, max=15, step=1, value=4, style=style, layout=layout),
    seconds=widgets.IntSlider(min=0, max=59, step=1, value=52, style=style, layout=layout),
    quarter=widgets.RadioButtons(options = [1, 2, 3, 4], value=4, style=style, layout=layout),
    current_score_diff=widgets.IntSlider(min=-30, max=30, step=1, value=1, style=style, layout=layout),
    current_score_total=widgets.IntSlider(min=0, max=80, step=1, value=45, style=style, layout=layout),
    cur_spread=widgets.IntSlider(min=-20, max=20, step=1, value=5, style=style, layout=layout),
    cur_over_under=widgets.IntSlider(min=30, max=60, step=1, value=45, style=style, layout=layout),
    home_timeouts_remaining=widgets.RadioButtons(options = [0, 1, 2, 3], value=3, style=style, layout=layout),
    away_timeouts_remaining=widgets.RadioButtons(options = [0, 1, 2, 3], value=3, style=style, layout=layout),
    ytg=widgets.IntSlider(min=1, max=30, step=1, value=1, style=style, layout=layout),
    yd_from_goal=widgets.IntSlider(min=1, max=100, step=1, value=7, style=style, layout=layout),
    # down=widgets.IntSlider(min=1, max=4, step=1, value=4, style=style, layout=layout),
    home_team_has_ball=False,
    play_type=widgets.Dropdown(options=["scrimmage", "kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"], style=style, layout=layout),
)
def g(
    quarter,
    minutes,
    seconds,
    home_timeouts_remaining,
    away_timeouts_remaining,
    ytg,
    yd_from_goal,
    current_score_diff,
    current_score_total,
    cur_spread,
    cur_over_under,
    # down,
    home_team_has_ball,
    play_type,
):  
    seconds_left_in_quarter = minutes * 60 + seconds
    time_left_in_game = (4 - quarter) * 900 +  seconds_left_in_quarter
    time_left_in_half = np.where(quarter>2, time_left_in_game, time_left_in_game - 1800)
    half = np.where(quarter>2, 2, 1)
    punt = np.where(play_type == "punt", 1, 0)
    field_goal_attempt = np.where(play_type == "field_goal_attempt", 1, 0)
    kick_off = np.where(play_type == "kick_off", 1, 0)
    point_after_kick = np.where(play_type == "point_after_kick", 1, 0)
    two_point_attempt = np.where(play_type == "two_point_attempt", 1, 0)
    scrimmage = np.where(play_type == "scrimmage", 1, 0)
    # down = np.where((kick_off==1)|(point_after_kick==1)|(two_point_attempt==1), 0, down)
    ytg_adj = np.where(yd_from_goal < ytg, yd_from_goal, ytg)
    down=4

    input_feature = pd.DataFrame(np.array(
        [
            time_left_in_half,
            half,
            current_score_diff,
            current_score_total,
            cur_spread,
            cur_over_under,
            home_timeouts_remaining,
            away_timeouts_remaining,
            punt,
            field_goal_attempt,
            ytg_adj,
            yd_from_goal,
            down,
            home_team_has_ball,
        ],
    ).reshape(1, -1), columns=input_names)
    input_feature[["kick_off", "point_after_kick", "two_point_attempt"]] = [kick_off, point_after_kick, two_point_attempt]
    input_feature = input_feature
    # input_feature = normalize_df(input_feature, model_df[mask_model])
    # print(input_feature)
    # return input_feature
    
    # print(pd.DataFrame(input_feature, columns=mlp_play_outcome.feature_names_in_))
    plot_model_probabilities(input_feature)
    # interactive_output(plot_model_probabilities, input_feature)

In [None]:
from IPython.display import display
import matplotlib.pylab as plt
import numpy as np
from ipywidgets import FloatSlider, IntSlider, interactive_output, HBox
%matplotlib inline

def plotPP(p,p_LP,Tperiods):
    t=np.linspace(0,Tperiods*(np.pi*2),1000)
    f = plt.figure(figsize=(1.618*6,6))
    p_t = p*(1.0+np.cos(t))
    plt.plot(t,p_t,'b',lw=2.0)
    plt.axhline(p_LP,color='r',lw=2.0)
    plt.xlim([t[0],t[-1]])
    plt.ylim([min(p_LP,np.amin(p_t))-0.1,np.amax(p_t)+0.1])
    plt.ylabel(r'$p$')
    plt.xlabel(r'$t$')

p_s       = FloatSlider(min=0, max=2, step=0.01, value=1.0)
p_LP_s    = FloatSlider(min=0, max=2., step=0.01, value=0.481343)
Tperiods_s= IntSlider(min=1,max=10,value=1)
out = interactive_output(plotPP, {'p': p_s, 'p_LP': p_LP_s, 'Tperiods': Tperiods_s})
display(HBox([p_s, p_LP_s]), out)


In [None]:
print(search_rf_score_diff_clipped_rf_drive_preds.best_score_)
print(search_rf_score_diff_clipped_mlp_drive_preds.best_score_)


print(search_mlp_score_diff_clipped_rf_drive_preds.best_score_)
print(search_mlp_score_diff_clipped_mlp_drive_preds.best_score_)



In [None]:
print(search_rf_score_diff_clipped_rf_drive_preds.best_score_)
print(search_rf_score_diff_clipped_mlp_drive_preds.best_score_)


print(search_mlp_score_diff_clipped_rf_drive_preds.best_score_)
print(search_mlp_score_diff_clipped_mlp_drive_preds.best_score_)



In [None]:
calb_df = model_df[(model_df["season"]>=2020)&~(model_df.xhome_win_mlp_search_clipped_rf_drive_preds.isna())]
calb = calibration.calibration_curve(calb_df["away_team_win_in_regulation"], np.clip(calb_df["xaway_win_mlp_search_clipped_rf_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
# calb = calibration.calibration_curve(calb_df["away_team_win_in_regulation"], np.clip(calb_df["xaway_win_mlp_search_clipped_mlp_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
# calb = calibration.calibration_curve(calb_df["away_team_win_in_regulation"], np.clip(calb_df["xaway_win_rf_search_clipped_rf_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
# calb = calibration.calibration_curve(calb_df["away_team_win_in_regulation"], np.clip(calb_df["xaway_win_rf_search_clipped_mlp_drive_preds"], 0, 1), n_bins=bins, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
plt.plot([0, 1], [0, 1])



In [None]:

calb_df = model_df[(model_df["season"]==2020)&~(model_df.xhome_win_mlp_search_clipped_rf_drive_preds.isna())]
# calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_score_diff_mlp"], 0, 1), n_bins=10, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_rf_search_clipped_rf_drive_preds"], 0, 1), n_bins=10, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_rf_search_clipped_mlp_drive_preds"], 0, 1), n_bins=10, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
plt.scatter(-model_df[model_df.nevent==1]["cur_spread"], model_df[model_df.nevent==1]["xend_of_regulation_score_diff_mlp_search_clipped"])
plt.plot([-20, 20], [-20, 20])

In [None]:
# param_spaces={
#     "learning_rate": uniform_distribution(0.01, 0.1),
#     "max_depth": range(5, 16),
#     # "subsample": uniform_distribution(0.9, 1),
#     "l2_leaf_reg": uniform_distribution(1, 4),
#     "boosting_type": ["Plain"],
#     }
# import catboost as cb
# # COMPUTE_PARAMS = {"task_type":"GPU", "devices": "0:1", "bootstrap_type": "Poisson"}
# COMPUTE_PARAMS = {"thread_count": -1}
# X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name, normalize=False)
# categoricals = [
#     'half',
#     'punt',
#     'field_goal_attempt',
#     'down',
#     'home_team_has_ball',
# ]
# for x in categoricals:
#     X_train[x] = X_train[x].apply(int)
#     X_test[x] = X_test[x].apply(int)
#     X_val[x] = X_val[x].apply(int)


# val_pool_play_outcome =cb.Pool(
#         data=X_val,
#         label=y_val,
#         cat_features=categoricals,
#         feature_names=input_names,
# )
# FIT_PARAMS = {
#     "verbose": 1,
#     "early_stopping_rounds": 5,
#     "use_best_model": True,
# }

# CLASSIFIER_PARAMS = {
#     # "n_estimators": 10,
#     "num_trees": 1000,
#     "auto_class_weights": "Balanced",
#     # "loss_function": JITRmseObjective(),
#     # "eval_metric": JITRmseMetric(),
#     **COMPUTE_PARAMS,   
# }
# HPO_PARAMS = {"n_jobs": 1}

# cv=GroupKFold(n_splits=3)
# cb_play_outcome_base_model = cb.CatBoostClassifier(early_stopping_rounds=5,cat_features=categoricals, **CLASSIFIER_PARAMS)
# # search_cb_play_outcome = RandomizedSearchCV(cb_play_outcome_base_model, param_spaces, scoring=log_loss_scorer,cv=cv,random_state=42,n_iter=1,n_jobs=1,verbose=100)
# # search_cb_play_outcome.fit(X_train,y_train, eval_set=val_pool_play_outcome,groups=group_train, **FIT_PARAMS)
# # pickle.dump(search_cb_play_outcome, open(os.path.join(root_dir, 'models/search_cb_play_outcome.p'), 'wb'))
# search_cb_play_outcome = pickle.load(open(os.path.join(root_dir, "models/search_cb_play_outcome.p"), 'rb'))


In [None]:
model_df[mask_model_two_point_conversion]["yd_from_goal"]

In [None]:
X_train["down"]

In [None]:



output_name = "drive_outcome_desc_basic"
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names, output_name, normalize=False)
# pd.concat([X_train, model_df[model_df["season"]<=2019][["event_name"]], y_train], axis=1)

# mlp_drive_outcome = MLPClassifier(hidden_layer_sizes=[100], verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# mlp_drive_outcome.fit(X_train, y_train)
# rf_drive_outcome = RandomForestClassifier(n_estimators=200, max_depth=15, verbose=100, n_jobs=-1, random_state=1)
# rf_drive_outcome.fit(X_train, y_train)
# pickle.dump(mlp_drive_outcome, open(os.path.join(root_dir, 'models/mlp_drive_outcome.p'), 'wb'))


# pickle.dump(rf_drive_outcome, open(os.path.join(root_dir, 'models/rf_drive_outcome.p'), 'wb'))
# logit_drive_outcome_basic = LogisticRegression(solver='liblinear', random_state=0)
# logit_drive_outcome_basic.fit(X_train, y_train)
# pickle.dump(logit_drive_outcome_basic, open(os.path.join(root_dir, 'models/logit_drive_outcome_basic.p'), 'wb'))

# os.system('say "done"')

mlp_drive_outcome = pickle.load(open(os.path.join(root_dir, "models/mlp_drive_outcome.p"), 'rb'))
rf_drive_outcome = pickle.load(open(os.path.join(root_dir, "models/rf_drive_outcome.p"), 'rb'))
logit_drive_outcome_basic = pickle.load(open(os.path.join(root_dir, "models/logit_drive_outcome_basic.p"), 'rb'))

In [None]:
output_name = "play_outcome"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(model_df[["down"]])
down_one_hot_cols = ["down_" + str(x) for x in enc.categories_[0]]
model_df[down_one_hot_cols] = pd.DataFrame(enc.transform(model_df[["down"]]).toarray(), columns=enc.categories_)

def square_value(df, cols):
    for col in cols:
        df[col + "_sq"] = df[col] ** 2
    return df

down_ytg_one_hot_cols = ["down_ytg_" + str(x) for x in enc.categories_[0]]
model_df[down_ytg_one_hot_cols] = model_df[down_one_hot_cols].T.mul(model_df["ytg"]).T

down_yd_from_goal_cols = ["down_yd_from_goal_" + str(x) for x in enc.categories_[0]]
model_df[down_yd_from_goal_cols] = model_df[down_one_hot_cols].T.mul(model_df["yd_from_goal"]).T
model_df = model_df.pipe(square_value, ["time_left_in_half"] + down_ytg_one_hot_cols + down_yd_from_goal_cols)
down_ytg_sq_one_hot_cols = ["down_ytg_" + str(x) + "_sq" for x in enc.categories_[0]]
down_yd_from_goal_sq_cols = ["down_yd_from_goal_" + str(x) + "_sq" for x in enc.categories_[0]]

input_names_advanced = [
    # 'time_left_in_game',
    'time_left_in_half',
    'time_left_in_half_sq',
    'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    # 'kick_off',
    'punt',
    # 'point_after_kick',
    # 'two_point_attempt',
    'field_goal_attempt',
    # 'from_scrimmage',
    # 'ytg',
    # 'yd_from_goal',
    # 'down',
    'home_team_has_ball',
] + down_one_hot_cols[2:] + down_ytg_one_hot_cols[1:] + down_yd_from_goal_cols[1:] + down_ytg_sq_one_hot_cols[1:] + down_yd_from_goal_sq_cols[1:]

mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names_advanced+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_advanced, output_name)

# logit_play_outcome_advanced = LogisticRegression(solver='liblinear', random_state=0)
# logit_play_outcome_advanced.fit(X_train, y_train)
# pickle.dump(logit_play_outcome_advanced, open(os.path.join(root_dir, 'models/logit_play_outcome_advanced.p'), 'wb'))

logit_play_outcome_advanced = pickle.load(open(os.path.join(root_dir, "models/logit_play_outcome_advanced.p"), 'rb'))

# import statsmodels.api as sm
# logit_play_outcome_advanced = sm.MNLogit(pd.get_dummies(y_train)[:5000], sm.add_constant(X_train[:5000])).fit()
# X = sm.add_constant(X_train, prepend=False)
# logit_play_outcome_advanced = {}
# for col in pd.get_dummies(y_train).columns:
#     y = pd.get_dummies(y_train)[col]
#     logit_play_outcome_advanced[col] = sm.Logit(y, X)
#     logit_play_outcome_advanced[col+"_fit"] = logit_play_outcome_advanced[col].fit()
#     # logit_play_outcome_advanced[col] = sm.Logit(y[:50000], X[:50000]).fit()
#     print(logit_play_outcome_advanced[col+"_fit"].summary())
# logit_play_outcome_advanced = sm.MNLogit(y_train, X).fit(method='bfgs')
# logit_play_outcome_advanced = sm.MNLogit(y_train[:50000], X[:50000]).fit()


In [None]:
# import statsmodels.api as sm
# X = sm.add_constant(X_train, prepend=False)
# logit_play_outcome_advanced_dict = {}
# for col in pd.get_dummies(y_train).columns:
#     y = pd.get_dummies(y_train)[col]
#     logit_play_outcome_advanced_dict[col] = sm.Logit(y, X)
#     logit_play_outcome_advanced_dict[col+"_fit"] = logit_play_outcome_advanced_dict[col].fit()
#     # logit_play_outcome_advanced[col] = sm.Logit(y[:50000], X[:50000]).fit()
#     print(logit_play_outcome_advanced_dict[col+"_fit"].summary())
# # logit_play_outcome_advanced_dict = sm.MNLogit(y_train, X).fit(method='bfgs')
# # logit_play_outcome_advanced_dict = sm.MNLogit(y_train[:50000], X[:50000]).fit()


In [None]:
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df[input_names_advanced+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)


In [None]:
output_name = "drive_outcome_desc_basic"
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names_advanced+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)

X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_advanced, output_name)

# logit_drive_outcome_advanced = LogisticRegression(solver='liblinear', random_state=0)
# logit_drive_outcome_advanced.fit(X_train, y_train)
# pickle.dump(logit_drive_outcome_advanced, open(os.path.join(root_dir, 'models/logit_drive_outcome_advanced.p'), 'wb'))


logit_drive_outcome_advanced = pickle.load(open(os.path.join(root_dir, "models/logit_drive_outcome_advanced.p"), 'rb'))


In [None]:
# import statsmodels.api as sm
# X = sm.add_constant(X_train, prepend=False)
# logit_drive_outcome_advanced_dict = {}
# for col in pd.get_dummies(y_train).columns:
#     y = pd.get_dummies(y_train)[col]
#     logit_drive_outcome_advanced_dict[col] = sm.Logit(y, X)
#     logit_drive_outcome_advanced_dict[col+"_fit"] = logit_drive_outcome_advanced_dict[col].fit()
#     print(logit_drive_outcome_advanced_dict[col+"_fit"].summary())



In [None]:
from statistics import mean
model_df["yd_from_goal_bin"] = round((model_df["yd_from_goal"])/ 5) * 5
model_df[mask_model&(model_df.field_goal_attempt==1)].groupby(["yd_from_goal_bin"], as_index=False).agg({
    "yd_from_goal": mean, 
    "fieldgoal_made": [mean, "count"]})#[["yd_from_goal_bin", "yd_from_goal", "fieldgoal_made"]]



In [None]:
mask_model

In [None]:
for play_outcome in model_df[mask_model]["play_outcome"].drop_duplicates().sort_values():
    model_df[play_outcome + "_naive"] = pd.DataFrame(model_df[mask_model]["play_outcome"].value_counts())[pd.DataFrame(model_df[mask_model]["play_outcome"].value_counts()).index==play_outcome].values[0][0]/model_df[mask_model]["play_outcome"].count()
for drive_outcome in model_df[mask_model]["drive_outcome_desc_basic"].drop_duplicates().sort_values():
    model_df[drive_outcome + "_naive"] = pd.DataFrame(model_df[mask_model]["drive_outcome_desc_basic"].value_counts())[pd.DataFrame(model_df[mask_model]["drive_outcome_desc_basic"].value_counts()).index==drive_outcome].values[0][0]/model_df[mask_model]["drive_outcome_desc_basic"].count()


rf_play_class_names = ["rf_play_" + x for x in rf_play_outcome.classes_]
search_rf_play_class_names = ["search_rf_play_" + x for x in search_rf_play_outcome.classes_]
search_cb_play_class_names = ["search_cb_play_" + x for x in search_cb_play_outcome.classes_]
mlp_play_class_names = ["mlp_play_" + x for x in mlp_play_outcome.classes_]
rf_drive_class_names = ["rf_drive_" + x for x in rf_drive_outcome.classes_]
mlp_drive_class_names = ["mlp_drive_" + x for x in mlp_drive_outcome.classes_]
logit_play_basic_class_names = ["logit_basic_play_" + x for x in logit_play_outcome_basic.classes_]
logit_drive_basic_class_names = ["logit_basic_drive_" + x for x in logit_drive_outcome_basic.classes_]
logit_play_advanced_class_names = ["logit_advanced_play_" + x for x in logit_play_outcome_advanced.classes_]
logit_drive_advanced_class_names = ["logit_advanced_drive_" + x for x in logit_drive_outcome_advanced.classes_]
naive_play_class_names = [x + "_naive" for x in model_df[mask_model]["play_outcome"].drop_duplicates().sort_values()]
naive_drive_class_names = [x + "_naive" for x in model_df[mask_model]["drive_outcome_desc_basic"].drop_duplicates().sort_values()]
# predict_df = normalize_df(model_df[mask_model][input_names], model_df[mask_model&(model_df.season<2020)])
# predict_advanced_df = normalize_df(model_df[mask_model][input_names_advanced], model_df[mask_model&(model_df.season<2020)])
predict_df = model_df[mask_model][input_names]
predict_advanced_df = model_df[mask_model][input_names_advanced]
predict_cb_df = model_df[mask_model][input_names]
for x in categoricals:
    predict_cb_df[x] = predict_cb_df[x].apply(int)
predict_cb_df
rf_play_outcome.verbose=0
rf_drive_outcome.verbose=0
search_rf_play_outcome.verbose=0
x_view = pd.concat([
    deepcopy(model_df[mask_model].reset_index()), 
    pd.DataFrame(logit_play_outcome_basic.predict_proba(predict_df), columns=logit_play_basic_class_names),
    pd.DataFrame(logit_play_outcome_advanced.predict_proba(predict_advanced_df), columns=logit_play_advanced_class_names),
    pd.DataFrame(mlp_play_outcome.predict_proba(predict_df), columns=mlp_play_class_names),
    pd.DataFrame(rf_play_outcome.predict_proba(predict_df), columns=rf_play_class_names),
    pd.DataFrame(search_rf_play_outcome.predict_proba(predict_df), columns=search_rf_play_class_names),
    # pd.DataFrame(search_cb_play_outcome.predict_proba(predict_cb_df), columns=search_cb_play_class_names),
    pd.DataFrame(logit_drive_outcome_basic.predict_proba(predict_df), columns=logit_drive_basic_class_names),
    pd.DataFrame(logit_drive_outcome_advanced.predict_proba(predict_advanced_df), columns=logit_drive_advanced_class_names),
    pd.DataFrame(mlp_drive_outcome.predict_proba(predict_df), columns=mlp_drive_class_names),
    pd.DataFrame(rf_drive_outcome.predict_proba(predict_df), columns=rf_drive_class_names),

    ], axis=1
)

# os.system('say "done"')



In [None]:
plt.scatter(x_view["search_cb_play_offensive_touchdown"], x_view["search_rf_play_offensive_touchdown"])

In [None]:
model_df[mask_model]["drive_outcome_desc_basic"].drop_duplicates().dropna().sort_values()

In [None]:
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)
for play_outcome in model_df[mask_model]["play_outcome"].drop_duplicates().dropna().sort_values():
    model_df[play_outcome + "_naive"] = pd.DataFrame(model_df[mask_model&(model_df.season<2020)]["play_outcome"].value_counts())[pd.DataFrame(model_df[mask_model&(model_df.season<2020)]["play_outcome"].value_counts()).index==play_outcome].values[0][0]/model_df[mask_model&(model_df.season<2020)]["play_outcome"].count()
for drive_outcome in model_df[mask_model]["drive_outcome_desc_basic"].drop_duplicates().dropna().sort_values():
    model_df[drive_outcome + "_naive"] = pd.DataFrame(model_df[mask_model&(model_df.season<2020)]["drive_outcome_desc_basic"].value_counts())[pd.DataFrame(model_df[mask_model&(model_df.season<2020)]["drive_outcome_desc_basic"].value_counts()).index==drive_outcome].values[0][0]/model_df[mask_model&(model_df.season<2020)]["drive_outcome_desc_basic"].count()
naive_play_class_names = [x + "_naive" for x in model_df[mask_model]["play_outcome"].drop_duplicates().dropna().sort_values()]
naive_drive_class_names = [x + "_naive" for x in model_df[mask_model]["drive_outcome_desc_basic"].drop_duplicates().dropna().sort_values()]


In [None]:

log_loss_scores = {
    "play_outcome": {},
    "drive_outcome": {},
}
output_name = "play_outcome"

x_view = deepcopy(model_df)
mask_model = (
    (x_view.continuation==0)&
    (x_view.down!=0)&
    (x_view.play_counts==1)&
    (x_view.event_id.isin(scrimmage_plays_we_want))&
    (x_view[input_names+[output_name]].notna().all(axis=1))&
    (x_view["from_scrimmage"]==1)&
    (x_view["overtime"]==0)
)
x_view = x_view[mask_model]
log_loss_scores[output_name]["naive"] = {
    "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][naive_play_class_names]),
    "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][naive_play_class_names]),
}
# log_loss_scores[output_name]["logit_basic"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][logit_play_basic_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][logit_play_basic_class_names]),
# }
# log_loss_scores[output_name]["logit_advanced"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][logit_play_advanced_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][logit_play_advanced_class_names]),
# }``
# log_loss_scores[output_name]["mlp"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][mlp_play_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][mlp_play_class_names]),
# }
log_loss_scores[output_name]["search_mlp"] = {
    "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][search_mlp_play_class_names]),
    "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][search_mlp_play_class_names]),
}
# log_loss_scores[output_name]["rf"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][rf_play_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][rf_play_class_names]),
# }
log_loss_scores[output_name]["search_rf"] = {
    "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][search_rf_play_class_names]),
    "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][search_rf_play_class_names]),
}
# log_loss_scores[output_name]["search_cb"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][search_cb_play_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][search_cb_play_class_names]),
# }
output_name = "drive_outcome_desc_basic"
x_view = deepcopy(model_df)
mask_model = (
    (x_view.continuation==0)&
    (x_view.down!=0)&
    (x_view.play_counts==1)&
    (x_view.event_id.isin(scrimmage_plays_we_want))&
    (x_view[input_names+[output_name]].notna().all(axis=1))&
    (x_view["from_scrimmage"]==1)&
    (x_view["overtime"]==0)
)
x_view = x_view[mask_model]
log_loss_scores["drive_outcome"]["naive"] = {
    "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][naive_drive_class_names]),
    "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][naive_drive_class_names]),
}
# log_loss_scores["drive_outcome"]["logit_basic"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][logit_drive_basic_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][logit_drive_basic_class_names]),
# }
# log_loss_scores["drive_outcome"]["logit_advanced"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][logit_drive_advanced_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][logit_drive_advanced_class_names]),
# }
# log_loss_scores["drive_outcome"]["mlp"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][mlp_drive_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][mlp_drive_class_names]),
# }
log_loss_scores["drive_outcome"]["search_mlp"] = {
    "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][search_mlp_drive_class_names]),
    "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][search_mlp_drive_class_names]),
}
# log_loss_scores["drive_outcome"]["rf"] = {
#     "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][rf_drive_class_names]),
#     "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][rf_drive_class_names]),
# }
log_loss_scores["drive_outcome"]["search_rf"] = {
    "train": log_loss(x_view[x_view["season"]<2020][output_name], x_view[x_view["season"]<2020][search_rf_drive_class_names]),
    "val": log_loss(x_view[x_view["season"]==2020][output_name], x_view[x_view["season"]==2020][search_rf_drive_class_names]),
}

log_loss_scores["play_outcomes_individual"] = {
    x: {
        "naive": {
            "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020][x + "_naive"]),
            "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020][x + "_naive"]),
        },
        # "logit_basic": {
        #     "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_basic_play_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_basic_play_" + x]),
        # },
        # "logit_advanced": {
        #     "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_advanced_play_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_advanced_play_" + x]),
        # },
        # "mlp": {
        #     "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["mlp_play_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["mlp_play_" + x]),
        # },
        # "mlp": {
        #     "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["mlp_play_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["mlp_play_" + x]),
        # },
        "search_mlp": {
            "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["search_mlp_play_" + x]),
            "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["search_mlp_play_" + x]),
        },        
        # "rf": {
        #     "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["rf_play_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["rf_play_" + x]),
        # },        
        "search_rf": {
            "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["search_rf_play_" + x]),
            "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["search_rf_play_" + x]),
        },        
        # "search_cb": {
        #     "train": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["search_cb_play_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["search_cb_play_" + x]),
        # },        
    } for x in x_view["play_outcome"].drop_duplicates().sort_values()
}
log_loss_scores["drive_outcomes_individual"] = {
    x: {
        "naive": {
            "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020][x + "_naive"]),
            "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020][x + "_naive"]),
        },
        # "logit_basic": {
        #     "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_basic_drive_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_basic_drive_" + x]),
        # },
        # "logit_advanced": {
        #     "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_advanced_drive_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_advanced_drive_" + x]),
        # },
        # "mlp": {
        #     "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["mlp_drive_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["mlp_drive_" + x]),
        # },
        # "mlp": {
        #     "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["mlp_drive_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["mlp_drive_" + x]),
        # },
        "search_mlp": {
            "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["search_mlp_drive_" + x]),
            "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["search_mlp_drive_" + x]),
        },        
        # "rf": {
        #     "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["rf_drive_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["rf_drive_" + x]),
        # },        
        "search_rf": {
            "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["search_rf_drive_" + x]),
            "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["search_rf_drive_" + x]),
        },        
        # "search_cb": {
        #     "train": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["search_cb_drive_" + x]),
        #     "val": log_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["search_cb_drive_" + x]),
        # },        
    } for x in x_view["drive_outcome_desc_basic"].drop_duplicates().sort_values()
}
with open(os.path.join(data_dir, 'log_loss_scores_play_drive_predictions.json'), 'w') as f:
    json.dump(log_loss_scores, f)


In [None]:
log_loss_scores

In [None]:
brier_scores = {
    "play_outcomes_individual": {},
    "drive_outcomes_individual": {},
}

brier_scores["play_outcomes_individual"] = {
    x: {
        "naive": {
            "train": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020][x + "_naive"]),
            "val": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020][x + "_naive"]),
        },
        "logit_basic": {
            "train": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_basic_play_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_basic_play_" + x]),
        },
        "logit_advanced": {
            "train": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_advanced_play_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_advanced_play_" + x]),
        },
        "mlp": {
            "train": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["mlp_play_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["mlp_play_" + x]),
        },
        "rf": {
            "train": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season<2020][x], x_view[x_view.season<2020]["rf_play_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["play_outcome"])[x_view.season==2020][x], x_view[x_view.season==2020]["rf_play_" + x]),
        },        
    } for x in x_view["play_outcome"].drop_duplicates().sort_values()
}
brier_scores["drive_outcomes_individual"] = {
    x: {
        "naive": {
            "train": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020][x + "_naive"]),
            "val": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020][x + "_naive"]),
        },
        "logit_basic": {
            "train": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_basic_drive_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_basic_drive_" + x]),
        },
        "logit_advanced": {
            "train": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["logit_advanced_drive_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["logit_advanced_drive_" + x]),
        },
        "mlp": {
            "train": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["mlp_drive_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["mlp_drive_" + x]),
        },
        "rf": {
            "train": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season<2020][x], x_view[x_view.season<2020]["rf_drive_" + x]),
            "val": brier_score_loss(pd.get_dummies(x_view["drive_outcome_desc_basic"])[x_view.season==2020][x], x_view[x_view.season==2020]["rf_drive_" + x]),
        },        
    } for x in x_view["drive_outcome_desc_basic"].drop_duplicates().sort_values()
}
with open(os.path.join(data_dir, 'brier_scores_play_drive_predictions.json'), 'w') as f:
    json.dump(brier_scores, f)



In [None]:
from matplotlib.pyplot import figure

figure(figsize=(5, 3), dpi=100)

some_model_keys = [
    "naive", 
    # "logit_basic", 
    # "logit_advanced", 
    # "mlp", 
    # "rf",
    "search_mlp",
    "search_rf",
]
width = 1 / len(some_model_keys) - .01
play_outcome_bar = "offensive_touchdown"
i=1
for key in some_model_keys:
    bar_positions = np.arange(len(log_loss_scores["play_outcome"][key])) + i * width
    plt.bar(bar_positions, log_loss_scores["play_outcome"][key].values(), width=width)
    i+=1
plt.title("Log Loss Play Outcomes")
plt.xticks([width * i / 2, 1 + width * i / 2],["train", "val"])
# plt.size([20, 10])
# plt.xti()

# bar_col+=1

# fig.suptitle("Play Outcomes Log Loss Scores")
plt.legend(some_model_keys, loc="lower center")



In [None]:
log_loss_scores

In [None]:
log_loss_scores["play_outcomes_individual"]["first_down"][key]

In [None]:
from matplotlib.pyplot import figure

figure(figsize=(5, 3), dpi=100)

some_model_keys = [
    "naive", 
    # "logit_basic", 
    # "logit_advanced", 
    # "mlp", 
    # "rf",
    "search_mlp",
    "search_rf",
]
width = 1 / len(some_model_keys) - .01
play_outcome_bar = "offensive_touchdown"
i=1
for key in some_model_keys:
    bar_positions = np.arange(len(log_loss_scores["play_outcomes_individual"]["first_down"][key])) + i * width
    plt.bar(bar_positions, log_loss_scores["play_outcomes_individual"]["first_down"][key].values(), width=width)
    i+=1
    print(key, log_loss_scores["play_outcomes_individual"]["first_down"][key].values())
plt.title("Log Loss First Down Model")
plt.xticks([width * i / 2, 1 + width * i / 2],["train", "val"])
# plt.size([20, 10])
# plt.xti()

# bar_col+=1

# fig.suptitle("Play Outcomes Log Loss Scores")
plt.legend(some_model_keys, loc="lower center")



In [None]:
# some_model_keys = ["naive", "logit_basic", "logit_advanced", "mlp", "rf"]
figure(figsize=(5, 3), dpi=100)
width = 1 / len(some_model_keys) - .01
i=1
for key in some_model_keys:
    bar_positions = np.arange(len(log_loss_scores["drive_outcome"][key])) + i * width
    plt.bar(bar_positions, log_loss_scores["drive_outcome"][key].values(), width=width)
    i+=1
plt.title("Log Loss Drive Outcomes")
plt.xticks([width * i / 2, 1 + width * i / 2],["train", "val"])
# plt.xti()

# bar_col+=1

# fig.suptitle("Play Outcomes Log Loss Scores")
plt.legend(some_model_keys, loc="lower center")



In [None]:
import math

# some_model_keys = ["naive", "logit_basic", "logit_advanced", "mlp", "rf"]
width = 1 / len(some_model_keys) - .01
play_outcome_bar = "offensive_touchdown"
fig, ax = plt.subplots(2, math.ceil(len(log_loss_scores["play_outcomes_individual"])/2), figsize=(20, 10))
bar_col = 0
bar_row = 0
# for play_outcome in log_loss_scores["play_outcomes_individual"].keys():
for play_outcome in log_loss_scores["play_outcomes_individual"].keys():
    i = 1
    if bar_col == math.ceil(len(log_loss_scores["play_outcomes_individual"])/2):
        bar_row+=1
        bar_col=0
    for key in some_model_keys:
        bar_positions = np.arange(len(log_loss_scores["play_outcomes_individual"][play_outcome][key])) + i * width
        ax[bar_row, bar_col].bar(bar_positions, log_loss_scores["play_outcomes_individual"][play_outcome][key].values(), width=width)
        i+=1
        print(play_outcome, key, log_loss_scores["play_outcomes_individual"][play_outcome][key])
    ax[bar_row, bar_col].set_title(play_outcome)
    ax[bar_row, bar_col].set_xticks([width * i / 2, 1 + width * i / 2])
    ax[bar_row, bar_col].set_xticklabels(["train", "val"])

    bar_col+=1
    
fig.suptitle("Play Outcomes Log Loss Scores")
fig.legend(some_model_keys, loc="lower center")



In [None]:
import math

# some_model_keys = ["naive", "mlp", "logit_basic", "logit_advanced", "rf"]
width = 1 / len(some_model_keys) - .01
play_outcome_bar = "offensive_touchdown"
fig, ax = plt.subplots(2, math.ceil(len(brier_scores["play_outcomes_individual"])/2), figsize=(20, 10))
bar_col = 0
bar_row = 0
for play_outcome in brier_scores["play_outcomes_individual"].keys():
    i = 1
    if bar_col == math.ceil(len(brier_scores["play_outcomes_individual"])/2):
        bar_row+=1
        bar_col=0
    for key in some_model_keys:
        bar_positions = np.arange(len(brier_scores["play_outcomes_individual"][play_outcome][key])) + i * width
        ax[bar_row, bar_col].bar(bar_positions, brier_scores["play_outcomes_individual"][play_outcome][key].values(), width=width)
        i+=1
    ax[bar_row, bar_col].set_title(play_outcome)
    ax[bar_row, bar_col].set_xticks([width * i / 2, 1 + width * i / 2])
    ax[bar_row, bar_col].set_xticklabels(["train", "val"])

    bar_col+=1
# ax[0, 0].set_ylim([0.0014115, 0.001414])
fig.suptitle("Play Outcomes Brier Scores")
fig.legend(some_model_keys, loc="lower center")


In [None]:
import math

# some_model_keys = ["naive", "logit_basic", "logit_advanced", "mlp", "rf"]
width = 1 / len(some_model_keys) - .01
drive_outcome_bar = "offensive_touchdown"
fig, ax = plt.subplots(2, math.ceil(len(log_loss_scores["drive_outcomes_individual"])/2), figsize=(20, 10))
bar_col = 0
bar_row = 0
for drive_outcome in log_loss_scores["drive_outcomes_individual"].keys():
    i = 1
    if bar_col == math.ceil(len(log_loss_scores["drive_outcomes_individual"])/2):
        bar_row+=1
        bar_col=0
    for key in some_model_keys:
        bar_positions = np.arange(len(log_loss_scores["drive_outcomes_individual"][drive_outcome][key])) + i * width
        ax[bar_row, bar_col].bar(bar_positions, log_loss_scores["drive_outcomes_individual"][drive_outcome][key].values(), width=width)
        i+=1
    ax[bar_row, bar_col].set_title(drive_outcome)
    ax[bar_row, bar_col].set_xticks([width * i / 2, 1 + width * i / 2])
    ax[bar_row, bar_col].set_xticklabels(["train", "val"])

    bar_col+=1
    
fig.suptitle("Drive Outcomes Log Loss Scores")
fig.legend(some_model_keys, loc="lower center")




In [None]:
import math
print(int(math.ceil(4.2)))
# some_model_keys = ["naive", "logit_basic", "logit_advanced", "mlp", "rf"]
width = 1 / len(some_model_keys) - .01
drive_outcome_bar = "offensive_touchdown"
fig, ax = plt.subplots(2, math.ceil(len(brier_scores["drive_outcomes_individual"])/2), figsize=(20, 10))
bar_col = 0
bar_row = 0
for drive_outcome in brier_scores["drive_outcomes_individual"].keys():
    i = 1
    if bar_col == math.ceil(len(brier_scores["drive_outcomes_individual"])/2):
        bar_row+=1
        bar_col=0
    for key in some_model_keys:
        bar_positions = np.arange(len(brier_scores["drive_outcomes_individual"][drive_outcome][key])) + i * width
        ax[bar_row, bar_col].bar(bar_positions, brier_scores["drive_outcomes_individual"][drive_outcome][key].values(), width=width)
        i+=1
    ax[bar_row, bar_col].set_title(drive_outcome)
    ax[bar_row, bar_col].set_xticks([width * i / 2, 1 + width * i / 2])
    ax[bar_row, bar_col].set_xticklabels(["train", "val"])

    bar_col+=1
    
fig.suptitle("Drive Outcomes Brier Scores")
fig.legend(some_model_keys, loc="lower center")



In [None]:
output_name = "play_outcome"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["rf_play_"+x], n_bins=10, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "play_outcome"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["mlp_play_"+x], n_bins=10, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "drive_outcome_desc_basic"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]<2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["rf_drive_"+x], n_bins=5, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "play_outcome"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["logit_basic_play_"+x], n_bins=10, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "play_outcome"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["logit_advanced_play_"+x], n_bins=10, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "drive_outcome_desc_basic"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["logit_basic_drive_"+x], n_bins=5, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "drive_outcome_desc_basic"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["logit_advanced_drive_"+x], n_bins=5, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
output_name = "drive_outcome_desc_basic"

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

output_one_hot = enc.fit(x_view[[output_name]])

x_view[enc.categories_[0]] = pd.DataFrame(enc.transform(x_view[[output_name]]).toarray(), columns=enc.categories_[0])

calb_df = x_view[x_view["season"]==2020]
for x in enc.categories_[0]:
    calb = calibration.calibration_curve(calb_df[x], calb_df["mlp_drive_"+x], n_bins=5, strategy="quantile")
    plt.plot(calb[1], calb[0], marker = "o", label=x)


plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
rf_drive_outcome.verbose=0
rf_play_outcome.verbose=0

def plot_model_probabilities(model, input_feature, input_feature_advance):
    fig, axs = plt.subplots(1, 1, figsize=(15,5)) 
    pred_output_logit_basic_play = logit_play_outcome_basic.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_logit_advanced_play = logit_play_outcome_advanced.predict_proba(pd.DataFrame(input_feature_advance, columns = input_names_advanced))
    pred_output_logit_basic_drive = logit_drive_outcome_basic.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_logit_advanced_drive = logit_drive_outcome_advanced.predict_proba(pd.DataFrame(input_feature_advance, columns = input_names_advanced))
    pred_output_mlp_play = mlp_play_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_mlp_drive = mlp_drive_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_rf_play = rf_play_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    # pred_output_cb_play = search_cb_play_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_rf_drive = rf_drive_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    # print(pd.DataFrame(input_feature, columns = input_names))
    if model == "mlp_play":
        axs.bar(mlp_play_outcome.classes_, pred_output_mlp_play[0])
    elif model == "rf_play":
        axs.bar(rf_play_outcome.classes_, pred_output_rf_play[0])
    elif model == "cb_play":
        axs.bar(search_cb_play_outcome.classes_, pred_output_cb_play[0])
    elif model == "logit_basic_play":
        axs.bar(logit_play_outcome_basic.classes_, pred_output_logit_basic_play[0])
    elif model == "logit_basic_drive":
        axs.bar(logit_drive_outcome_basic.classes_, pred_output_logit_basic_drive[0])
    elif model == "logit_advanced_play":
        axs.bar(logit_play_outcome_advanced.classes_, pred_output_logit_advanced_play[0])
    elif model == "logit_advanced_drive":
        axs.bar(logit_drive_outcome_advanced.classes_, pred_output_logit_advanced_drive[0])
    elif model == "mlp_drive":
        axs.bar(mlp_drive_outcome.classes_, pred_output_mlp_drive[0])
    elif model == "rf_drive":
        axs.bar(rf_drive_outcome.classes_, pred_output_rf_drive[0])

    play_outcome_matrix = pd.concat([
        pd.DataFrame([[round(x, 6) for x in pred_output_logit_basic_play[0]]], columns=logit_play_outcome_basic.classes_, index=["logit_basic"]).T,
        pd.DataFrame([[round(x, 6) for x in pred_output_logit_advanced_play[0]]], columns=logit_play_outcome_advanced.classes_, index=["logit_advanced"]).T,
        # pd.DataFrame([[round(x, 6) for x in pred_output_cb_play[0]]], columns=mlp_play_outcome.classes_, index=["mlp"]).T,
        pd.DataFrame([[round(x, 6) for x in pred_output_mlp_play[0]]], columns=mlp_play_outcome.classes_, index=["mlp"]).T,
        pd.DataFrame([[round(x, 6) for x in pred_output_rf_play[0]]], columns=rf_play_outcome.classes_, index=["rf"]).T,
    ], axis=1)
    drive_outcome_matrix = pd.concat([
        pd.DataFrame([[round(x, 6) for x in pred_output_logit_basic_drive[0]]], columns=logit_drive_outcome_basic.classes_, index=["logit_basic"]).T,
        pd.DataFrame([[round(x, 6) for x in pred_output_logit_advanced_drive[0]]], columns=logit_drive_outcome_advanced.classes_, index=["logit_advanced"]).T,
        pd.DataFrame([[round(x, 6) for x in pred_output_mlp_drive[0]]], columns=mlp_drive_outcome.classes_, index=["mlp"]).T,
        pd.DataFrame([[round(x, 6) for x in pred_output_rf_drive[0]]], columns=rf_drive_outcome.classes_, index=["rf"]).T,
    ], axis=1)

    print(play_outcome_matrix)
    print(drive_outcome_matrix)
    axs.set_ylim([0, 1])
    axs.set_yticks(np.arange(0, 1.1, .1))
    axs.grid(axis="y")
    plt.savefig("example.png")

style = {'description_width': '200px'}
layout = Layout(width='450px')
@interact(
    time_left_in_half=widgets.IntSlider(min=0, max=30, step=1, value=100, style=style, layout=layout),
    half=widgets.IntSlider(min=1, max=2, step=1, value=1, style=style, layout=layout),
    current_score_diff=widgets.IntSlider(min=-30, max=30, step=1, value=0, style=style, layout=layout),
    current_score_total=widgets.IntSlider(min=0, max=80, step=1, value=0, style=style, layout=layout),
    cur_spread=widgets.IntSlider(min=-20, max=20, step=1, value=-3, style=style, layout=layout),
    cur_over_under=widgets.IntSlider(min=30, max=60, step=1, value=45, style=style, layout=layout),
    home_timeouts_remaining=widgets.IntSlider(min=0, max=3, step=1, value=3, style=style, layout=layout),
    away_timeouts_remaining=widgets.IntSlider(min=0, max=3, step=1, value=3, style=style, layout=layout),
    ytg=widgets.IntSlider(min=1, max=30, step=1, value=10, style=style, layout=layout),
    yd_from_goal=widgets.IntSlider(min=1, max=100, step=1, value=75, style=style, layout=layout),
    down=widgets.IntSlider(min=1, max=4, step=1, value=1, style=style, layout=layout),
    model=widgets.Dropdown(options=["logit_basic_play", "logit_basic_drive", "logit_advanced_play", "logit_advanced_drive", "mlp_play", "mlp_drive", "rf_play", "rf_drive", "cb_play"], value="mlp_play"),
    punt=False,
    field_goal_attempt=False,
    home_team_has_ball=True,
)
def g(
    time_left_in_half,
    half,
    current_score_diff,
    current_score_total,
    cur_spread,
    cur_over_under,
    home_timeouts_remaining,
    away_timeouts_remaining,
    ytg,
    yd_from_goal,
    down,
    punt,
    field_goal_attempt,
    home_team_has_ball,
    model
):
    ytg_adj = np.where(yd_from_goal < ytg, yd_from_goal, ytg)
    input_feature = pd.DataFrame(np.array(
        [
            time_left_in_half * 60,
            half,
            current_score_diff,
            current_score_total,
            cur_spread,
            cur_over_under,
            home_timeouts_remaining,
            away_timeouts_remaining,
            punt,
            field_goal_attempt,
            ytg_adj,
            yd_from_goal,
            down,
            home_team_has_ball,
        ],
    ).reshape(1, -1), columns=input_names)
    if model== "cb_play":
        for x in categoricals:
            input_feature[x] = input_feature[x].apply(int)
    else:
        input_feature = input_feature, model_df[mask_model][input_names]
    print(input_feature)

    input_feature_advance = pd.DataFrame(np.array(
        [
            time_left_in_half * 60,
            (time_left_in_half * 60)**2,
            half,
            current_score_diff,
            current_score_total,
            cur_spread,
            cur_over_under,
            home_timeouts_remaining,
            away_timeouts_remaining,
            punt,
            field_goal_attempt,
            home_team_has_ball,
            np.where(down==2, 1, 0),
            np.where(down==3, 1, 0),
            np.where(down==4, 1, 0),
            np.where(down==1, ytg_adj, 0),
            np.where(down==2, ytg_adj, 0),
            np.where(down==3, ytg_adj, 0),
            np.where(down==4, ytg_adj, 0),
            np.where(down==1, yd_from_goal, 0),
            np.where(down==2, yd_from_goal, 0),
            np.where(down==3, yd_from_goal, 0),
            np.where(down==4, yd_from_goal, 0),
            np.where(down==1, ytg_adj, 0)**2,
            np.where(down==2, ytg_adj, 0)**2,
            np.where(down==3, ytg_adj, 0)**2,
            np.where(down==4, ytg_adj, 0)**2,
            np.where(down==1, yd_from_goal, 0)**2,
            np.where(down==2, yd_from_goal, 0)**2,
            np.where(down==3, yd_from_goal, 0)**2,
            np.where(down==4, yd_from_goal, 0)**2,
        ],
    ).reshape(1, -1), columns=input_names_advanced)
    # input_feature_advance = normalize_df(input_feature_advance, model_df[mask_model][input_names_advanced])
    input_feature_advance = (input_feature_advance)

    # print(pd.DataFrame(input_feature, columns=mlp_play_outcome.feature_names_in_))
    plot_model_probabilities(model, input_feature, input_feature_advance)
    

In [None]:
import itertools as it


play_possibilities = {
    "time_left_in_half": [1800],
    "half": [1],
    "current_score_diff": [0],
    "current_score_total": [0],
    "cur_spread": [-3],
    "cur_over_under": [45],
    "home_timeouts_remaining": [3],
    "away_timeouts_remaining": [3],
    "ytg": range(1, 11),
    "yd_from_goal": range(1, 101),
    "down": range(1, 5),
    "punt": [0],
    "field_goal_attempt": [0],
    "home_team_has_ball": [1],
}

play_possibilities_df = pd.DataFrame(it.product(*(play_possibilities[Name] for Name in play_possibilities)), columns=play_possibilities)

# play_possibilities_df[mlp_play_class_names] = mlp_play_outcome.predict_proba(normalize_df(play_possibilities_df[input_names], model_df[mask_model&(model_df.season<2020)]))
# play_possibilities_df[mlp_drive_class_names] = mlp_drive_outcome.predict_proba(normalize_df(play_possibilities_df[input_names], model_df[mask_model&(model_df.season<2020)]))
# play_possibilities_df[rf_play_class_names] = rf_play_outcome.predict_proba(normalize_df(play_possibilities_df[input_names], model_df[mask_model&(model_df.season<2020)]))
# play_possibilities_df[rf_drive_class_names] = rf_drive_outcome.predict_proba(normalize_df(play_possibilities_df[input_names], model_df[mask_model&(model_df.season<2020)]))
play_possibilities_df[mlp_play_class_names] = mlp_play_outcome.predict_proba((play_possibilities_df[input_names]))
play_possibilities_df[mlp_drive_class_names] = mlp_drive_outcome.predict_proba((play_possibilities_df[input_names]))
play_possibilities_df[rf_play_class_names] = rf_play_outcome.predict_proba((play_possibilities_df[input_names]))
play_possibilities_df[rf_drive_class_names] = rf_drive_outcome.predict_proba((play_possibilities_df[input_names]))
print(mlp_drive_class_names, mlp_play_class_names)
for ytg in range(1, 5):
    plot_input = play_possibilities_df[
        # (play_possibilities_df.ytg == 10) &
        (play_possibilities_df.yd_from_goal == 50)&
        (play_possibilities_df.down == ytg)
    ]
    plt.plot(plot_input["ytg"], plot_input["mlp_play_first_down"], label=ytg)
    plt.ylim([0, 1])
    plt.legend()

In [None]:
list(model_df.columns)

In [None]:
plot_input

In [None]:
plt.hist(event_df["end_of_regulation_score_diff"], bins=108)

In [None]:
plt.hist(model_df["end_of_regulation_score_diff"], bins=108)

In [None]:
all_play_drive_predictions = rf_play_class_names + mlp_play_class_names + rf_drive_class_names + mlp_drive_class_names

In [None]:
game_df = event_df.groupby(["game_code"], as_index=False).max()[["game_code", "quarter"]]
mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df[input_names].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)

# model_df[logit_play_advanced_class_names] = pd.DataFrame(logit_play_outcome_advanced.predict_proba(normalize_df(model_df[mask_model][input_names_advanced], model_df[mask_model&(model_df.season<2020)])), index=model_df[mask_model].index).fillna(0)
# model_df[logit_drive_advanced_class_names] = pd.DataFrame(logit_drive_outcome_advanced.predict_proba(normalize_df(model_df[mask_model][input_names_advanced], model_df[mask_model&(model_df.season<2020)])), index=model_df[mask_model].index).fillna(0)
# model_df[logit_play_advanced_class_names] = model_df[logit_play_advanced_class_names].fillna(0)
# model_df[logit_drive_advanced_class_names] = model_df[logit_drive_advanced_class_names].fillna(0)


# model_df[rf_drive_class_names] = pd.DataFrame(rf_drive_outcome.predict_proba(normalize_df(model_df[mask_model][input_names], model_df[mask_model&(model_df.season<2020)])), index=model_df[mask_model].index).fillna(0)
# model_df[rf_play_class_names] = pd.DataFrame(rf_play_outcome.predict_proba(normalize_df(model_df[mask_model][input_names], model_df[mask_model&(model_df.season<2020)])), index=model_df[mask_model].index).fillna(0)
model_df[rf_drive_class_names] = pd.DataFrame(rf_drive_outcome.predict_proba((model_df[mask_model][input_names])), index=model_df[mask_model].index).fillna(0)
model_df[rf_play_class_names] = pd.DataFrame(rf_play_outcome.predict_proba((model_df[mask_model][input_names])), index=model_df[mask_model].index).fillna(0)
model_df[rf_drive_class_names] = model_df[rf_drive_class_names].fillna(0)
model_df[rf_play_class_names] = model_df[rf_play_class_names].fillna(0)






In [None]:
# event_df["end_of_regulation_score_diff"] = np.where(event_df.merge(game_df, on="game_code", how="left")["quarter_y"]>4, 0, event_df["final_score_diff"])
# model_df["end_of_regulation_score_diff_fix"] = np.where(model_df.merge(game_df, on="game_code", how="left")["quarter_y"]>4, 0, model_df["final_score_diff"])


In [None]:
input_names_score_pred = [
    # 'time_left_in_game',
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    'kick_off',
    # 'punt',
    'point_after_kick',
    'two_point_attempt',
    # 'field_goal_attempt',
    # 'from_scrimmage',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
    'rf_play_first_down',

] + rf_drive_class_names
output_name_score_diff = "end_of_regulation_score_diff_change"
mask_model = (
    (model_df.continuation==0)&
    (model_df[input_names_score_pred+[output_name_score_diff]].notna().all(axis=1))&
    (model_df["overtime"]==0)
)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], input_names_score_pred, output_name_score_diff, normalize=False)
# mlp_score_diff = MLPClassifier(hidden_layer_sizes=[100], verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
# mlp_score_diff.fit(X_train, y_train)
# rf_score_diff = RandomForestClassifier(n_estimators=200, max_depth=15, verbose=100, n_jobs=-1, random_state=1)
# rf_score_diff.fit(X_train, y_train)
# pickle.dump(mlp_score_diff, open(os.path.join(root_dir, 'models/mlp_score_diff.p'), 'wb'))
# pickle.dump(rf_score_diff, open(os.path.join(root_dir, 'models/rf_score_diff.p'), 'wb'))

rf_score_diff = pickle.load(open(os.path.join(root_dir, "models/rf_score_diff.p"), 'rb'))
mlp_score_diff = pickle.load(open(os.path.join(root_dir, "models/mlp_score_diff.p"), 'rb'))

# os.system('say "done"')

# X_train

In [None]:
model_df.end_of_regulation_score_diff_change

In [None]:
os.system('say "done"')
pd.DataFrame(rf_score_diff.feature_importances_, index=rf_score_diff.feature_names_in_)

In [None]:
pd.DataFrame(rf_score_diff.feature_importances_, index=rf_score_diff.feature_names_in_)
# os.system('say "done"')

In [None]:
pd.DataFrame(rf_score_diff.feature_importances_, index=rf_score_diff.feature_names_in_)
# os.system('say "done"')

In [None]:
rf_score_diff.verbose=0
mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(pd.concat([X_train, X_val, X_test])))
rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(pd.concat([X_train, X_val, X_test])))


In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(model_df[["down"]])
down_one_hot_cols = ["down_" + str(x) for x in enc.categories_[0]]
model_df[down_one_hot_cols] = pd.DataFrame(enc.transform(model_df[["down"]]).toarray(), columns=enc.categories_)

drive_outcome_dummy_cols = [x + "_binary" for x in pd.get_dummies(model_df["drive_outcome_desc_basic"]).columns]
end_of_regulation_score_diff_change_dummy_cols = ["end_of_regulation_score_diff_change_" + str(x) for x in pd.get_dummies(model_df["end_of_regulation_score_diff_change"]).columns]
model_df[drive_outcome_dummy_cols] = pd.get_dummies(model_df["drive_outcome_desc_basic"])
model_df[end_of_regulation_score_diff_change_dummy_cols] = pd.get_dummies(model_df["end_of_regulation_score_diff_change"])
model_df["ytg_adj"] = np.where((model_df.punt==1)|(model_df.field_goal_attempt==1), 0, model_df["ytg"])

output_names_chain = drive_outcome_dummy_cols +["end_of_regulation_score_diff_change"]
input_names = [
    'time_left_in_game',
    'time_left_in_half',
    # 'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    'kick_off',
    'punt',
    'point_after_kick',
    'two_point_attempt',
    'field_goal_attempt',
    # 'from_scrimmage',
    'ytg_adj',
    'yd_from_goal',
    # 'down',
    'home_team_has_ball',
] + down_one_hot_cols[1:]
mask_model_score_diff = (
    (model_df.continuation==0)&
    (model_df[input_names+output_names_chain].notna().all(axis=1))&
    (model_df["overtime"]==0)
)
cv=GroupKFold(n_splits=3)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model_score_diff], input_names, output_names_chain)
y_train


os.system('say "done"')

In [None]:
input_names

In [None]:
from sklearn.tree import export_graphviz
# Export as dot file
input_names = [
    # 'time_left_in_game',
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    # 'kick_off',
    'punt',
    # 'point_after_kick',
    # 'two_point_attempt',
    'field_goal_attempt',
    # 'from_scrimmage',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',

]
rf_score_diff = pickle.load(open(os.path.join(root_dir, "models/rf_score_diff.p"), 'rb'))
estimator = rf_score_diff.estimators_[1]
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = list(rf_score_diff.feature_names_in_),
                # max_depth=3,
                # class_names = list(rf_score_diff.classes_),
                rounded = True, proportion = False, 
                precision = 2, filled = True)

import pydot

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tsvg', 'tree.dot', '-o', 'tree.svg', '-Gdpi=600'])

# Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')


In [None]:
input_names

In [None]:
y_train

In [None]:
from sklearn.pipeline import Pipeline
# from utils import TennisLiveScoreMultiClassifier, TennisLiveWinnerClassifierChainEnhanced
from custom_estimators import TennisLiveScoreMultiClassifier, TennisLiveWinnerClassifierChainEnhanced
from sklearn.model_selection import GridSearchCV

winner_parameters = {
    'n_estimators': [100],
    'max_depth': [4]
}
winner_pipeline = Pipeline(steps=[
    ('classifier', TennisLiveWinnerClassifierChainEnhanced(
        GridSearchCV(RandomForestClassifier(verbose = 100), winner_parameters, scoring='neg_log_loss', verbose=4, n_jobs=-1), cv=3,
    ))
])
winner_pipeline.fit(X_train[:10000], y_train[:10000])

In [None]:
test = pd.DataFrame(winner_pipeline.predict_proba(X_train[:10000]))

In [None]:
test

In [None]:
score_diff_change_list = mlp_score_diff.classes_
score_probability_matrix = np.zeros((mlp_score_diff_preds.values.shape))
outcomes_dict = {
    "xhome_win": {},
    "xdraw": {},
    "xaway_win": {},
    "xscore_diff_end_of_regulation": {},
}
model_df_index = model_df[mask_model].index
for x in range(len(mlp_score_diff_preds)):
    if round(x/100000) == x/100000:
        print(x)
    score_probability_matrix[x] = event_df["current_score_diff"][model_df_index[x]] + score_diff_change_list
    outcomes_dict["xhome_win"][x] = np.sum(mlp_score_diff_preds.values[x][score_probability_matrix[x]>0])
    outcomes_dict["xdraw"][x] = np.sum(mlp_score_diff_preds.values[x][score_probability_matrix[x]==0])
    outcomes_dict["xaway_win"][x] = np.sum(mlp_score_diff_preds.values[x][score_probability_matrix[x]<0])
    outcomes_dict["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * mlp_score_diff_preds.values[x])

event_df["xhome_win_score_diff_mlp"] = pd.DataFrame(outcomes_dict["xhome_win"].values(), index=model_df[mask_model].index)
event_df["xdraw_score_diff_mlp"] = pd.DataFrame(outcomes_dict["xdraw"].values(), index=model_df[mask_model].index)
event_df["xaway_win_score_diff_mlp"] = pd.DataFrame(outcomes_dict["xaway_win"].values(), index=model_df[mask_model].index)
event_df["xscore_diff_end_of_regulation_mlp"] = pd.DataFrame(outcomes_dict["xscore_diff_end_of_regulation"].values(), index=model_df[mask_model].index)


In [None]:
score_diff_change_list = rf_score_diff.classes_
score_probability_matrix = np.zeros((rf_score_diff_preds.values.shape))
outcomes_dict = {
    "xhome_win": {},
    "xdraw": {},
    "xaway_win": {},
    "xscore_diff_end_of_regulation": {},
}
model_df_index = model_df[mask_model].index
for x in range(len(rf_score_diff_preds)):
    if round(x/100000) == x/100000:
        print(x)
    score_probability_matrix[x] = event_df["current_score_diff"][model_df_index[x]] + score_diff_change_list
    outcomes_dict["xhome_win"][x] = np.sum(rf_score_diff_preds.values[x][score_probability_matrix[x]>0])
    outcomes_dict["xdraw"][x] = np.sum(rf_score_diff_preds.values[x][score_probability_matrix[x]==0])
    outcomes_dict["xaway_win"][x] = np.sum(rf_score_diff_preds.values[x][score_probability_matrix[x]<0])
    outcomes_dict["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * rf_score_diff_preds.values[x])

event_df["xhome_win_score_diff_rf"] = pd.DataFrame(outcomes_dict["xhome_win"].values(), index=model_df[mask_model].index)
event_df["xdraw_score_diff_rf"] = pd.DataFrame(outcomes_dict["xdraw"].values(), index=model_df[mask_model].index)
event_df["xaway_win_score_diff_rf"] = pd.DataFrame(outcomes_dict["xaway_win"].values(), index=model_df[mask_model].index)
event_df["xscore_diff_end_of_regulation_rf"] = pd.DataFrame(outcomes_dict["xscore_diff_end_of_regulation"].values(), index=model_df[mask_model].index)



In [None]:
output_name = "play_outcome"

calb_df = event_df[(event_df["season"]==2020)&~(event_df.xhome_win_score_diff_rf.isna())]
# calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_score_diff_mlp"], 0, 1), n_bins=10, strategy="quantile")
# plt.plot(calb[1], calb[0], marker="o")
calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_score_diff_rf"], 0, 1), n_bins=5, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")
calb_df = event_df[(event_df["season"]<2020)&~(event_df.xhome_win_score_diff_rf.isna())]
calb = calibration.calibration_curve(calb_df["home_team_win"], np.clip(calb_df["xhome_win_score_diff_rf"], 0, 1), n_bins=5, strategy="quantile")
plt.plot(calb[1], calb[0], marker="o")

plt.xlabel("predicted")
plt.ylabel("actual")
plt.legend()
plt.plot([0, 1], [0, 1])

In [None]:
pd.DataFrame(calb[1], calb[1] - calb[0])

In [None]:
mask_model

In [None]:
rf_score_diff.verbose=0
model_df["play_description"] = event_df["play_description"]
model_df["game_info"] = event_df["game_info"]
fourth_go_for_it = model_df[mask_model]
fourth_go_for_it = fourth_go_for_it[fourth_go_for_it.down == 4].reset_index()
# fourth_go_for_it[input_names_advanced] = 
fourth_go_for_it[["punt", "field_goal_attempt"]] = 0
fourth_down_punt = deepcopy(fourth_go_for_it)
fourth_down_punt[["punt", "field_goal_attempt"]] = [1, 0]
fourth_down_field_goal = deepcopy(fourth_go_for_it)
fourth_down_field_goal[["punt", "field_goal_attempt"]] = [0, 1]

# fourth_go_for_it[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(normalize_df(fourth_go_for_it[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# fourth_down_punt[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(normalize_df(fourth_down_punt[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# fourth_down_field_goal[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(normalize_df(fourth_down_field_goal[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# fourth_go_for_it[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(normalize_df(fourth_go_for_it[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# fourth_down_punt[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(normalize_df(fourth_down_punt[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# fourth_down_field_goal[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(normalize_df(fourth_down_field_goal[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])

# fourth_go_for_it[rf_play_class_names] =rf_play_outcome.predict_proba(normalize_df(fourth_go_for_it[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# fourth_down_punt[rf_play_class_names] =rf_play_outcome.predict_proba(normalize_df(fourth_down_punt[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# fourth_down_field_goal[rf_play_class_names] =rf_play_outcome.predict_proba(normalize_df(fourth_down_field_goal[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# fourth_go_for_it[rf_drive_class_names] =rf_drive_outcome.predict_proba(normalize_df(fourth_go_for_it[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# fourth_down_punt[rf_drive_class_names] =rf_drive_outcome.predict_proba(normalize_df(fourth_down_punt[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# fourth_down_field_goal[rf_drive_class_names] =rf_drive_outcome.predict_proba(normalize_df(fourth_down_field_goal[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])

fourth_go_for_it[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(fourth_go_for_it[input_names_advanced])
fourth_down_punt[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(fourth_down_punt[input_names_advanced])
fourth_down_field_goal[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(fourth_down_field_goal[input_names_advanced])
fourth_go_for_it[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(fourth_go_for_it[input_names_advanced])
fourth_down_punt[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(fourth_down_punt[input_names_advanced])
fourth_down_field_goal[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(fourth_down_field_goal[input_names_advanced])

fourth_go_for_it[rf_play_class_names] =rf_play_outcome.predict_proba(fourth_go_for_it[input_names])
fourth_down_punt[rf_play_class_names] =rf_play_outcome.predict_proba(fourth_down_punt[input_names])
fourth_down_field_goal[rf_play_class_names] =rf_play_outcome.predict_proba(fourth_down_field_goal[input_names])
fourth_go_for_it[rf_drive_class_names] =rf_drive_outcome.predict_proba(fourth_go_for_it[input_names])
fourth_down_punt[rf_drive_class_names] =rf_drive_outcome.predict_proba(fourth_down_punt[input_names])
fourth_down_field_goal[rf_drive_class_names] =rf_drive_outcome.predict_proba(fourth_down_field_goal[input_names])

# mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(pd.concat([X_train[], X_val, X_test])), index=model_df[mask_model&(model_df.down==4)].index)
# rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(pd.concat([X_train, X_val, X_test])), index=model_df[mask_model&(model_df.down==4)].index)
# fourth_go_for_it_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(normalize_df(fourth_go_for_it[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
# fourth_down_punt_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(normalize_df(fourth_down_punt[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
# fourth_down_field_goal_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(normalize_df(fourth_down_field_goal[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
fourth_go_for_it_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(fourth_go_for_it[input_names_score_pred]))
fourth_down_punt_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(fourth_down_punt[input_names_score_pred]))
fourth_down_field_goal_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(fourth_down_field_goal[input_names_score_pred]))



# rf_score_diff.predict_proba(fourth_go_for_it[input_names])


score_probability_matrix = {} 
outcomes_dict = {}
for x in ["fourth_go_for_it", "fourth_down_punt", "fourth_down_field_goal"]: 
    outcomes_dict[x]={
        "xhome_win": {},
        "xdraw": {},
        "xaway_win": {},
        "xscore_diff_end_of_regulation": {}}
score_probability_matrix = np.zeros((fourth_go_for_it_rf_score_diff_preds.values.shape))

model_df_index = model_df[mask_model].index
for x in range(len(fourth_go_for_it_rf_score_diff_preds)):
    if round(x/10000) == x/10000:
        print(x)
    score_probability_matrix[x] = fourth_go_for_it["current_score_diff"][fourth_go_for_it.index[x]] + score_diff_change_list
    for i in ["fourth_go_for_it", "fourth_down_punt", "fourth_down_field_goal"]:

        outcomes_dict[i]["xhome_win"][x] = np.sum(eval(i + "_rf_score_diff_preds").values[x][score_probability_matrix[x]>0])
        outcomes_dict[i]["xdraw"][x] = np.sum(eval(i + "_rf_score_diff_preds").values[x][score_probability_matrix[x]==0])
        outcomes_dict[i]["xaway_win"][x] = np.sum(eval(i + "_rf_score_diff_preds").values[x][score_probability_matrix[x]<0])
        outcomes_dict[i]["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * eval(i + "_rf_score_diff_preds").values[x])

for i in ["fourth_go_for_it", "fourth_down_punt", "fourth_down_field_goal"]:
    eval(i)["xhome_win_score_diff_rf"] = pd.DataFrame(outcomes_dict[i]["xhome_win"].values())
    eval(i)["xdraw_score_diff_rf"] = pd.DataFrame(outcomes_dict[i]["xdraw"].values())
    eval(i)["xaway_win_score_diff_rf"] = pd.DataFrame(outcomes_dict[i]["xaway_win"].values())
    eval(i)["xscore_diff_end_of_regulation_rf"] = pd.DataFrame(outcomes_dict[i]["xscore_diff_end_of_regulation"].values())




In [None]:
# # "xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp"
# fourth_go_for_it_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(normalize_df(fourth_go_for_it[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
# fourth_down_punt_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(normalize_df(fourth_down_punt[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
# fourth_down_field_goal_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(normalize_df(fourth_down_field_goal[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
fourth_go_for_it_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba((fourth_go_for_it[input_names_score_pred])))
fourth_down_punt_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba((fourth_down_punt[input_names_score_pred])))
fourth_down_field_goal_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba((fourth_down_field_goal[input_names_score_pred])))


# mlp_score_diff.predict_proba(fourth_go_for_it[input_names])


score_probability_matrix = {} 
outcomes_dict = {}
for x in ["fourth_go_for_it", "fourth_down_punt", "fourth_down_field_goal"]: 
    outcomes_dict[x]={
        "xhome_win": {},
        "xdraw": {},
        "xaway_win": {},
        "xscore_diff_end_of_regulation": {}}
score_probability_matrix = np.zeros((fourth_go_for_it_mlp_score_diff_preds.values.shape))

model_df_index = model_df[mask_model].index
for x in range(len(fourth_go_for_it_mlp_score_diff_preds)):
    if round(x/10000) == x/10000:
        print(x)
    score_probability_matrix[x] = fourth_go_for_it["current_score_diff"][fourth_go_for_it.index[x]] + score_diff_change_list
    for i in ["fourth_go_for_it", "fourth_down_punt", "fourth_down_field_goal"]:

        outcomes_dict[i]["xhome_win"][x] = np.sum(eval(i + "_mlp_score_diff_preds").values[x][score_probability_matrix[x]>0])
        outcomes_dict[i]["xdraw"][x] = np.sum(eval(i + "_mlp_score_diff_preds").values[x][score_probability_matrix[x]==0])
        outcomes_dict[i]["xaway_win"][x] = np.sum(eval(i + "_mlp_score_diff_preds").values[x][score_probability_matrix[x]<0])
        outcomes_dict[i]["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * eval(i + "_mlp_score_diff_preds").values[x])

for i in ["fourth_go_for_it", "fourth_down_punt", "fourth_down_field_goal"]:
    eval(i)["xhome_win_score_diff_mlp"] = pd.DataFrame(outcomes_dict[i]["xhome_win"].values())
    eval(i)["xdraw_score_diff_mlp"] = pd.DataFrame(outcomes_dict[i]["xdraw"].values())
    eval(i)["xaway_win_score_diff_mlp"] = pd.DataFrame(outcomes_dict[i]["xaway_win"].values())
    eval(i)["xscore_diff_end_of_regulation_mlp"] = pd.DataFrame(outcomes_dict[i]["xscore_diff_end_of_regulation"].values())




In [None]:
model_df["play_description"] = event_df["play_description"]
model_df["game_info"] = event_df["game_info"]
point_after_go_for_two = model_df[mask_model]
point_after_go_for_two = point_after_go_for_two[model_df["point_after_kick"] + model_df["two_point_attempt"] == 1].reset_index()
# point_after_go_for_two[input_names_advanced] = 
point_after_go_for_two[["point_after_kick", "two_point_attempt"]] = [0, 1]
point_after_kick = deepcopy(point_after_go_for_two)
point_after_kick[["point_after_kick", "two_point_attempt"]] = [1, 0]
# point_after_go_for_two[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(normalize_df(point_after_go_for_two[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# point_after_go_for_two[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(normalize_df(point_after_go_for_two[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# point_after_kick[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(normalize_df(point_after_kick[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# point_after_kick[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(normalize_df(point_after_kick[input_names_advanced], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_advanced])
# point_after_go_for_two[rf_play_class_names] =rf_play_outcome.predict_proba(normalize_df(point_after_go_for_two[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# point_after_go_for_two[rf_drive_class_names] =rf_drive_outcome.predict_proba(normalize_df(point_after_go_for_two[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# point_after_kick[rf_play_class_names] =rf_play_outcome.predict_proba(normalize_df(point_after_kick[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
# point_after_kick[rf_drive_class_names] =rf_drive_outcome.predict_proba(normalize_df(point_after_kick[input_names], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names])
point_after_go_for_two[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(point_after_go_for_two[input_names_advanced])
point_after_go_for_two[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(point_after_go_for_two[input_names_advanced])
point_after_kick[logit_play_advanced_class_names] = logit_play_outcome_advanced.predict_proba(point_after_kick[input_names_advanced])
point_after_kick[logit_drive_advanced_class_names] = logit_drive_outcome_advanced.predict_proba(point_after_kick[input_names_advanced])
point_after_go_for_two[rf_play_class_names] =rf_play_outcome.predict_proba(point_after_go_for_two[input_names])
point_after_go_for_two[rf_drive_class_names] =rf_drive_outcome.predict_proba(point_after_go_for_two[input_names])
point_after_kick[rf_play_class_names] =rf_play_outcome.predict_proba(point_after_kick[input_names])
point_after_kick[rf_drive_class_names] =rf_drive_outcome.predict_proba(point_after_kick[input_names])


# # "xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"
# point_after_go_for_two_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(normalize_df(point_after_go_for_two[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
# point_after_kick_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(normalize_df(point_after_kick[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
point_after_go_for_two_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(point_after_go_for_two[input_names_score_pred]))
point_after_kick_rf_score_diff_preds = pd.DataFrame(rf_score_diff.predict_proba(point_after_kick[input_names_score_pred]))

score_probability_matrix = {} 
outcomes_dict = {}
for x in ["point_after_go_for_two", "point_after_kick"]: 
    outcomes_dict[x]={
        "xhome_win": {},
        "xdraw": {},
        "xaway_win": {},
        "xscore_diff_end_of_regulation": {}}
score_probability_matrix = np.zeros((point_after_go_for_two_rf_score_diff_preds.values.shape))

model_df_index = model_df[mask_model].index
for x in range(len(point_after_go_for_two_rf_score_diff_preds)):
    if round(x/10000) == x/10000:
        print(x)
    score_probability_matrix[x] = point_after_go_for_two["current_score_diff"][point_after_go_for_two.index[x]] + score_diff_change_list
    for i in ["point_after_go_for_two", "point_after_kick"]:

        outcomes_dict[i]["xhome_win"][x] = np.sum(eval(i + "_rf_score_diff_preds").values[x][score_probability_matrix[x]>0])
        outcomes_dict[i]["xdraw"][x] = np.sum(eval(i + "_rf_score_diff_preds").values[x][score_probability_matrix[x]==0])
        outcomes_dict[i]["xaway_win"][x] = np.sum(eval(i + "_rf_score_diff_preds").values[x][score_probability_matrix[x]<0])
        outcomes_dict[i]["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * eval(i + "_rf_score_diff_preds").values[x])

for i in ["point_after_go_for_two", "point_after_kick"]:
    eval(i)["xhome_win_score_diff_rf"] = pd.DataFrame(outcomes_dict[i]["xhome_win"].values())
    eval(i)["xdraw_score_diff_rf"] = pd.DataFrame(outcomes_dict[i]["xdraw"].values())
    eval(i)["xaway_win_score_diff_rf"] = pd.DataFrame(outcomes_dict[i]["xaway_win"].values())
    eval(i)["xscore_diff_end_of_regulation_rf"] = pd.DataFrame(outcomes_dict[i]["xaway_win"].values())
# point_after_go_for_two_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(normalize_df(point_after_go_for_two[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
# point_after_kick_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(normalize_df(point_after_kick[input_names_score_pred], model_df[mask_model&(model_df.season<2020)]).fillna(0)[input_names_score_pred]))
point_after_go_for_two_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(point_after_go_for_two[input_names_score_pred]))
point_after_kick_mlp_score_diff_preds = pd.DataFrame(mlp_score_diff.predict_proba(point_after_kick[input_names_score_pred]))

score_probability_matrix = {} 
outcomes_dict = {}
for x in ["point_after_go_for_two", "point_after_kick"]: 
    outcomes_dict[x]={
        "xhome_win": {},
        "xdraw": {},
        "xaway_win": {},
        "xscore_diff_end_of_regulation": {}}
score_probability_matrix = np.zeros((point_after_go_for_two_mlp_score_diff_preds.values.shape))

model_df_index = model_df[mask_model].index
for x in range(len(point_after_go_for_two_mlp_score_diff_preds)):
    if round(x/10000) == x/10000:
        print(x)
    score_probability_matrix[x] = point_after_go_for_two["current_score_diff"][point_after_go_for_two.index[x]] + score_diff_change_list
    for i in ["point_after_go_for_two", "point_after_kick"]:

        outcomes_dict[i]["xhome_win"][x] = np.sum(eval(i + "_mlp_score_diff_preds").values[x][score_probability_matrix[x]>0])
        outcomes_dict[i]["xdraw"][x] = np.sum(eval(i + "_mlp_score_diff_preds").values[x][score_probability_matrix[x]==0])
        outcomes_dict[i]["xaway_win"][x] = np.sum(eval(i + "_mlp_score_diff_preds").values[x][score_probability_matrix[x]<0])
        outcomes_dict[i]["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * eval(i + "_mlp_score_diff_preds").values[x])

for i in ["point_after_go_for_two", "point_after_kick"]:
    eval(i)["xhome_win_score_diff_mlp"] = pd.DataFrame(outcomes_dict[i]["xhome_win"].values())
    eval(i)["xdraw_score_diff_mlp"] = pd.DataFrame(outcomes_dict[i]["xdraw"].values())
    eval(i)["xaway_win_score_diff_mlp"] = pd.DataFrame(outcomes_dict[i]["xaway_win"].values())
    eval(i)["xscore_diff_end_of_regulation_mlp"] = pd.DataFrame(outcomes_dict[i]["xaway_win"].values())



In [None]:
game_info_cols = [
    'game_code',
    'game_date',
    'season',
    'home_team_id',
    'home_team',
    'home_team_abbrev',
    'away_team_id',
    'away_team',
    'away_team_abbrev',
    'home_final_score',
    'away_final_score',
    'final_score_diff',
    'end_of_regulation_score_diff',
    'home_team_outcome',
    'home_team_win',
    'draw',
    'away_team_win',
    'cur_spread',
    'cur_over_under',
 ]
game_df = event_df.groupby(game_info_cols, as_index=False).max()[game_info_cols + ["quarter", "nevent"]]
game_df

In [None]:
beginning_of_game_df = game_df.loc[:, game_info_cols]
beginning_of_game_df[["nevent", "beginning_of_game", "end_of_game"]] = [0, 1, 0]


In [None]:
end_of_game_df = game_df.loc[:, game_info_cols + ["nevent"]]
end_of_game_df[["beginning_of_game", "end_of_game"]] = [0, 1]
beginning_of_game_df[["nevent", "beginning_of_game", "end_of_game"]] = [0, 1, 0]

pd.concat([model_df, beginning_of_game_df])[input_names]


In [None]:
list(event_df.columns)

In [None]:
game_begin_and_end_values = pd.DataFrame()
for game_code in (game_df["game_code"]):
    begin_temp = {
        "game_code":  game_code,
        "nevent":  0,
        "beginning_of_game": 1,
        "end_of_game": 0,
    }
    end_temp = {
        "game_code":  game_code,
        "nevent": game_df[game_df["game_code"]==game_code]["nevent"].values[0]+ 1,
        "beginning_of_game": 0,
        "end_of_game": 1,
    }
    game_begin_and_end_values = pd.concat([game_begin_and_end_values, pd.DataFrame(begin_temp, index=[0]), pd.DataFrame(end_temp, index=[0])])

In [None]:
game_begin_and_end_values[input_names] = np.zeros((len(game_begin_and_end_values), len(input_names)))

In [None]:
pd.concat([game_begin_and_end_values, model_df]).fillna(0)

In [None]:
from sklearn.multioutput import ClassifierChain
drive_outcome_dummy_cols = [x + "_binary" for x in pd.get_dummies(model_df["drive_outcome_desc_basic"]).columns]
end_of_regulation_score_diff_change_dummy_cols = ["end_of_regulation_score_diff_change_" + str(x) for x in pd.get_dummies(model_df["end_of_regulation_score_diff_change"]).columns]
model_df[drive_outcome_dummy_cols] = pd.get_dummies(model_df["drive_outcome_desc_basic"])
model_df[end_of_regulation_score_diff_change_dummy_cols] = pd.get_dummies(model_df["end_of_regulation_score_diff_change"])
model_df["yd_from_goal_adj"] = np.where((model_df.punt==1)|(model_df.field_goal_attempt==1), 0, model_df["yd_from_goal"])
model_df["ytg_adj"] = np.where((model_df.punt==1)|(model_df.field_goal_attempt==1), 0, model_df["ytg"])


down_ytg_adj_one_hot_cols = ["down_ytg_adj_" + str(x) for x in range(5)]
model_df[down_ytg_adj_one_hot_cols] = model_df[down_one_hot_cols].T.mul(model_df["ytg_adj"]).T


output_names_chain = ["drive_outcome_desc_basic", "end_of_regulation_score_diff_change"]
input_names = [
    'time_left_in_game',
    'time_left_in_half',
    # 'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    'kick_off',
    'punt',
    'point_after_kick',
    'two_point_attempt',
    'field_goal_attempt',
    # 'from_scrimmage',
    'ytg_adj',
    'yd_from_goal',
    # 'down',
    'home_team_has_ball',
] + down_one_hot_cols[1:]
mask_model_score_diff = (
    (model_df.continuation==0)&
    (model_df[input_names+output_names_chain].notna().all(axis=1))&
    (model_df["overtime"]==0)
)
cv=GroupKFold(n_splits=3)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model_score_diff], input_names, output_names_chain)
base_rf =RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, verbose=100, n_jobs=-1)
# rf_class_chain = ClassifierChain(base_rf)
base_rf.fit(X_train, y_train)
base_mlp = MLPClassifier(hidden_layer_sizes=[100], verbose=True, early_stopping=True, n_iter_no_change=5, random_state=1)
base_mlp.fit(X_train, y_train)



rf_score_diff_preds_no_drive = pd.DataFrame(base_rf.predict_proba(pd.concat([X_train, X_val, X_test])))



score_diff_change_list = rf_score_diff.classes_
score_probability_matrix = np.zeros((rf_score_diff_preds_no_drive.values.shape))
outcomes_dict = {
    "xhome_win": {},
    "xdraw": {},
    "xaway_win": {},
    "xscore_diff_end_of_regulation": {},
}
model_df_index = model_df[mask_model_score_diff].index
for x in range(len(rf_score_diff_preds_no_drive)):
    if round(x/100000) == x/100000:
        print(x)
    score_probability_matrix[x] = event_df["current_score_diff"][model_df_index[x]] + score_diff_change_list
    outcomes_dict["xhome_win"][x] = np.sum(rf_score_diff_preds_no_drive.values[x][score_probability_matrix[x]>0])
    outcomes_dict["xdraw"][x] = np.sum(rf_score_diff_preds_no_drive.values[x][score_probability_matrix[x]==0])
    outcomes_dict["xaway_win"][x] = np.sum(rf_score_diff_preds_no_drive.values[x][score_probability_matrix[x]<0])
    outcomes_dict["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * rf_score_diff_preds_no_drive.values[x])

event_df["xhome_win_score_diff_rf_no_drive"] = pd.DataFrame(outcomes_dict["xhome_win"].values(), index=model_df[mask_model_score_diff].index)
event_df["xdraw_score_diff_rf_no_drive"] = pd.DataFrame(outcomes_dict["xdraw"].values(), index=model_df[mask_model_score_diff].index)
event_df["xaway_win_score_diff_rf_no_drive"] = pd.DataFrame(outcomes_dict["xaway_win"].values(), index=model_df[mask_model_score_diff].index)
event_df["xscore_diff_end_of_regulation_rf_no_drive"] = pd.DataFrame(outcomes_dict["xscore_diff_end_of_regulation"].values(), index=model_df[mask_model_score_diff].index)


mlp_score_diff_preds_no_drive = pd.DataFrame(base_mlp.predict_proba(pd.concat([X_train, X_val, X_test])))



score_diff_change_list = mlp_score_diff.classes_
score_probability_matrix = np.zeros((mlp_score_diff_preds_no_drive.values.shape))
outcomes_dict = {
    "xhome_win": {},
    "xdraw": {},
    "xaway_win": {},
    "xscore_diff_end_of_regulation": {},
}
model_df_index = model_df[mask_model_score_diff].index
for x in range(len(mlp_score_diff_preds_no_drive)):
    if round(x/100000) == x/100000:
        print(x)
    score_probability_matrix[x] = event_df["current_score_diff"][model_df_index[x]] + score_diff_change_list
    outcomes_dict["xhome_win"][x] = np.sum(mlp_score_diff_preds_no_drive.values[x][score_probability_matrix[x]>0])
    outcomes_dict["xdraw"][x] = np.sum(mlp_score_diff_preds_no_drive.values[x][score_probability_matrix[x]==0])
    outcomes_dict["xaway_win"][x] = np.sum(mlp_score_diff_preds_no_drive.values[x][score_probability_matrix[x]<0])
    outcomes_dict["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * mlp_score_diff_preds_no_drive.values[x])

event_df["xhome_win_score_diff_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xhome_win"].values(), index=model_df[mask_model_score_diff].index)
event_df["xdraw_score_diff_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xdraw"].values(), index=model_df[mask_model_score_diff].index)
event_df["xaway_win_score_diff_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xaway_win"].values(), index=model_df[mask_model_score_diff].index)
event_df["xscore_diff_end_of_regulation_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xscore_diff_end_of_regulation"].values(), index=model_df[mask_model_score_diff].index)


In [None]:
os.system('say "done"')
pd.DataFrame(base_rf.feature_importances_, index=base_rf.feature_names_in_)

In [None]:
os.system('say "done"')
pd.DataFrame(base_rf.feature_importances_, index=base_rf.feature_names_in_)

In [None]:
from sklearn.multioutput import ClassifierChain
drive_outcome_dummy_cols = [x + "_binary" for x in pd.get_dummies(model_df["drive_outcome_desc_basic"]).columns]
end_of_regulation_score_diff_change_dummy_cols = ["end_of_regulation_score_diff_change_" + str(x) for x in pd.get_dummies(model_df["end_of_regulation_score_diff_change"]).columns]
model_df[drive_outcome_dummy_cols] = pd.get_dummies(model_df["drive_outcome_desc_basic"])
model_df[end_of_regulation_score_diff_change_dummy_cols] = pd.get_dummies(model_df["end_of_regulation_score_diff_change"])
output_names_chain = ["end_of_regulation_score_diff_change"]
input_names = [
    # 'time_left_in_game',
    'time_left_in_half',
    'half',
    'current_score_diff',
    'current_score_total',
    'cur_spread',
    'cur_over_under',
    'home_timeouts_remaining',
    'away_timeouts_remaining',
    'kick_off',
    'punt',
    'point_after_kick',
    'two_point_attempt',
    'field_goal_attempt',
    'from_scrimmage',
    'ytg',
    'yd_from_goal',
    'down',
    'home_team_has_ball',
]
mask_model_score_diff = (
    (model_df.continuation==0)&
    (model_df[input_names+output_names_chain].notna().all(axis=1))&
    (model_df["overtime"]==0)
)
cv=GroupKFold(n_splits=3)
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model_score_diff], input_names, output_names_chain)
base_mlp =RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, verbose=100, n_jobs=-1)
# mlp_class_chain = ClassifierChain(base_mlp)
base_mlp.fit(X_train, y_train)



mlp_score_diff_preds_no_drive = pd.DataFrame(base_mlp.predict_proba(pd.concat([X_train, X_val, X_test])))



score_diff_change_list = mlp_score_diff.classes_
score_probability_matrix = np.zeros((mlp_score_diff_preds_no_drive.values.shape))
outcomes_dict = {
    "xhome_win": {},
    "xdraw": {},
    "xaway_win": {},
    "xscore_diff_end_of_regulation": {},
}
model_df_index = model_df[mask_model_score_diff].index
for x in range(len(mlp_score_diff_preds_no_drive)):
    if round(x/100000) == x/100000:
        print(x)
    score_probability_matrix[x] = event_df["current_score_diff"][model_df_index[x]] + score_diff_change_list
    outcomes_dict["xhome_win"][x] = np.sum(mlp_score_diff_preds_no_drive.values[x][score_probability_matrix[x]>0])
    outcomes_dict["xdraw"][x] = np.sum(mlp_score_diff_preds_no_drive.values[x][score_probability_matrix[x]==0])
    outcomes_dict["xaway_win"][x] = np.sum(mlp_score_diff_preds_no_drive.values[x][score_probability_matrix[x]<0])
    outcomes_dict["xscore_diff_end_of_regulation"][x] = np.sum(score_probability_matrix[x] * mlp_score_diff_preds_no_drive.values[x])

event_df["xhome_win_score_diff_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xhome_win"].values(), index=model_df[mask_model_score_diff].index)
event_df["xdraw_score_diff_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xdraw"].values(), index=model_df[mask_model_score_diff].index)
event_df["xaway_win_score_diff_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xaway_win"].values(), index=model_df[mask_model_score_diff].index)
event_df["xscore_diff_end_of_regulation_mlp_no_drive"] = pd.DataFrame(outcomes_dict["xscore_diff_end_of_regulation"].values(), index=model_df[mask_model_score_diff].index)


In [None]:
X_train, y_train, group_train, X_test, y_test, group_test, X_val, y_val, group_val = create_train_test_val_df(model_df[mask_model], output_names_chain, "drive_outcome_desc_basic", normalize=False)
from sklearn.multiclass import OneVsRestClassifier
base_rf =RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, verbose=100, n_jobs=-1)
ovr = OneVsRestClassifier(base_rf)
ovr.fit(X_train, y_train.astype(str))

In [None]:
test = ovr.predict_proba(pd.concat([X_train, X_val, X_test]))

In [None]:
base_rf.feature_importances_

In [None]:
pd.DataFrame(test, columns=ovr.classes_)

In [None]:
df["class1"]

In [None]:
import xdcc
xdcc


In [None]:
np.mean(model_df[mask_model_score_diff&(model_df.season<2020)&(model_df.kick_off==1)][input_names])

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import plotly
import plotly.graph_objects as go
from plotly.offline import iplot
from plotly.subplots import make_subplots
import math


plotly.offline.init_notebook_mode()
event_df["home_team_has_ball_fixed"] = model_df["home_team_has_ball"]
def plot_game_lwp(game_info, model, width):
    game_df = deepcopy(event_df[(event_df["game_info"] == game_info)&(event_df["event_id"].isin([1,2,3,4,5,7,9,12,14,17,18,22,35,41,47,52,53,54,55,56]))&(event_df["continuation"]==0)&(event_df["overtime"]==0)])
    # print(game_df)
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    if model =="mlp":
        y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp"]
        y1="xscore_diff_end_of_regulation_mlp"
    elif model =="rf":
        y=["xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"]
        y1="xscore_diff_end_of_regulation_rf"
    elif model =="mlp_no_drive":
        y=["xhome_win_score_diff_mlp_no_drive", "xaway_win_score_diff_mlp_no_drive", "xdraw_score_diff_mlp_no_drive"]
        y1="xscore_diff_end_of_regulation_mlp_no_drive"
    elif model =="rf_no_drive":
        y=["xhome_win_score_diff_rf_no_drive", "xaway_win_score_diff_rf_no_drive", "xdraw_score_diff_rf_no_drive"]
        y1="xscore_diff_end_of_regulation_rf_no_drive"
    # elif model == "loglin_basic":
    #     y=['xhome_win_basic_loglin', 'xhome_no_win_basic_loglin']
    # elif model == "loglin_by_minute":
    #     y=['xhome_win_loglin_minute', 'xhome_no_win_loglin_minute']
    # elif model == "mlp_basic":
    #     y=['xhome_win_basic_mlp', 'xhome_no_win_basic_mlp']
    # elif model =="rf_old":
    #     y=["xhome_win", "xaway_win", "xdraw"]
    # elif model =="mlp_old":
    #     y=["xhome_team_win_mlp", "xaway_team_win_mlp", "xdraw_mlp"]
    colors = ["darkkhaki", "skyblue", "gray"]
    game_df["yards_description"] = (
        game_df["event_name"]
        + " for "
        + game_df["yards_gained"].fillna(0).apply(int).apply(str)
        + " yards"
    )
    game_df["wpa"] = game_df[y[0]] - game_df[y[0]].shift(1)

    v = pd.DataFrame(game_df[["quarter", "nevent"]]).reset_index(drop=True)
    mask_ticks = v["quarter"][1:].reset_index(drop=True) == v["quarter"][
        :-1
    ].reset_index(drop=True)
    ticks_idx = [min(v["nevent"])] + list(v[:-1][~mask_ticks]["nevent"] + 1)
    if len(ticks_idx) == 4:
        ticks_values = [1, 2, 3, 4]
    else:
        ticks_values = [1, 2, 3, 4, "OT"]
    fig.add_trace(
        go.Scatter(
            x=game_df["nevent"],
            y=game_df[y[0]],
            customdata=game_df[["yards_description", "play_description", "cur_spread", "wpa"]],
            stackgroup="one",
            mode="lines",
            line=dict(width=0.5, color=colors[0]),
            name=game_df["home_team"].tolist()[0],
            hovertemplate="<br>".join(
                [
                    "%{y}",
                    "%{customdata[0]}",
                    "%{customdata[1]}",
                    "%{customdata[2]}",
                    "%{customdata[3]}",
                ]
            ),
        )
    )
    try:
        fig.add_trace(go.Scatter(
            x=game_df["nevent"],
            y=game_df[y[2]],
            # hovertext=game_df[hover_values],
            stackgroup="one",
            mode="lines",
            line=dict(width=0.5, color=colors[2]),
            name="Overtime",
        ))
    except:
        pass
    fig.add_trace(go.Scatter(
        x=game_df["nevent"],
        y=game_df[y[1]],
        # hovertext=game_df[hover_values],
        stackgroup="one",
        mode="lines",
        line=dict(width=0.5, color=colors[1]),
        name=game_df["away_team"].tolist()[0],
    ))
    try:
        fig.add_trace(go.Scatter(
            x=game_df["nevent"],
            y=game_df[y1],
            # hovertext=game_df[hover_values],
            # stackgroup="one",
            mode="lines",
            line=dict(width=0.5, color="black", dash='dash'),
            name="xfinal_score_diff",
        ), secondary_y=True)
    except:
        pass
    
    fig.update_xaxes(range=[1, np.max(game_df["nevent"])])
    fig.update_yaxes(range=[0, 1], secondary_y=False)
    fig.update_yaxes(range=[-20, 20], secondary_y=True, showgrid=False)
    # x_range = [-math.ceil(np.max(abs(game_df["xscore_diff_end_of_regulation"]))/10) * 10, math.ceil(np.max(abs(game_df["xscore_diff_end_of_regulation"]))/10) *10]
    # print(x_range)
    # fig.update_yaxes(range=x_range, secondary_y=True)
    # fig.update_yaxes(range=[-30, 30], secondary_y=True, showgrid=False)

    # iplot([fig1, fig2, fig3])
    
    fig.update_layout(
        title = game_df["game_info"].tolist()[0] + " (" + model + " model)",
        hovermode="x unified",
        width=width,
        xaxis=dict(
            tickvals=ticks_idx, ticktext=ticks_values, gridwidth=2
        ),
        yaxis=dict(tick0=0, dtick=0.25),
        # yaxis=dict(tick0=-30, dtick=30, gridcolor="black", gridwidth=2),
    )
    game_df["score_change"] = game_df["home_score_added"] + game_df["away_score_added"]
    game_df["score_str"] = (
        (game_df["away_score_added"] + game_df["away_start_score"]).apply(str)
        + "-"
        + (game_df["home_score_added"] + game_df["home_start_score"]).apply(str)
    )
    scores_idx_home = game_df[game_df["home_score_added"] >= 3]["nevent"].tolist()
    score_y_home = game_df[game_df["home_score_added"] >= 3][y[0]].tolist()
    score_home_value = game_df[game_df["home_score_added"] >= 3]["home_score_added"]
    score_home_str = np.where(score_home_value == 3, " FG", " TD")
    score_display_home = game_df[game_df["home_score_added"] >= 3]["score_str"].tolist()
    scores_idx_away = game_df[game_df["away_score_added"] >= 3]["nevent"].tolist()
    score_y_away = (game_df[game_df["away_score_added"] >= 3][y[0]]).tolist()
    score_away_value = game_df[game_df["away_score_added"] >= 3]["away_score_added"]
    score_away_str = np.where(score_away_value == 3, " FG", " TD")
    score_display_away = game_df[game_df["away_score_added"] >= 3]["score_str"].tolist()
    mask_poss_change = (game_df["home_team_has_ball_fixed"].shift(-1)!=game_df["home_team_has_ball_fixed"])&(game_df["score_change"]==0)&(game_df["score_change"].shift(-1)==0)
    ball_change_idx = game_df[mask_poss_change]["nevent"].tolist()
    ball_change_y = (game_df[mask_poss_change][y[0]]).tolist()
    # print(ball_change_idx)
    # print(ball_change_y)
    for x in range(len(scores_idx_home)):
        fig.add_annotation(
            x=scores_idx_home[x],
            y=score_y_home[x],
            text=game_df["home_team_abbrev"].tolist()[0]
            + score_home_str[x]
            + " "
            + score_display_home[x],
            showarrow=True,
        )
    for x in range(len(scores_idx_away)):
        fig.add_annotation(
            x=scores_idx_away[x],
            y=score_y_away[x],
            text=game_df["away_team_abbrev"].tolist()[0]
            + score_away_str[x]
            + " "
            + score_display_away[x],
            showarrow=True,
        )
    fig.add_trace(go.Scatter(
        x=ball_change_idx,
        y=ball_change_y,
        mode="markers",
        name="Possession Change",
        textposition="bottom center",
        marker=dict(color="blue")
    ))

    iplot(fig)
interact(plot_game_lwp, 
game_info=widgets.Dropdown(options=list(event_df.sort_values("game_date", ascending=False)["game_info"].drop_duplicates()), value="Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)"),
model=widgets.Dropdown(options=["mlp", "rf", "mlp_no_drive", "rf_no_drive"], value="mlp"),
width=widgets.IntSlider(min=500, max=1500, step=50, value=1200))
    # return ()

In [None]:
# drive_outcome_dummy_cols = [x + "_binary" for x in pd.get_dummies(model_df["drive_outcome_desc_basic"]).columns]
# model_df[drive_outcome_dummy_cols] = pd.get_dummies(model_df["drive_outcome_desc_basic"])

mask_model = (
    (model_df.continuation==0)&
    (model_df.down!=0)&
    (model_df.play_counts==1)&
    (model_df.event_id.isin(scrimmage_plays_we_want))&
    (model_df[input_names+[output_name]].notna().all(axis=1))&
    (model_df["from_scrimmage"]==1)&
    (model_df["overtime"]==0)
)
# play_outcome_dummy_cols = [x + "_binary" for x in pd.get_dummies(model_df["play_outcome"]).columns]
# model_df[play_outcome_dummy_cols] = pd.get_dummies(model_df["play_outcome"])
for x in range(1, 5):
    mask_sit = (mask_model)&(model_df.yd_from_goal<=5)&(model_df.down==x)
    print("down :", x, "n:", len(model_df[mask_sit]))
    print(pd.DataFrame((model_df[mask_sit]["drive_outcome_desc_basic"].value_counts() / len(model_df[mask_sit]))).sort_index(), "\n\n")


# np.sum(model_df[(mask_model)&(model_df.yd_from_goal<=3)&(model_df.down<=3)][drive_outcome_dummy_cols])


In [None]:
drive_outcome_dummy_cols

In [None]:
pd.DataFrame(model_df.drop_duplicates("game_code")["end_of_regulation_score_diff"].value_counts()).to_clipboard()

In [None]:
# rf_score_diff_preds[model_df[mask_model].reset_index()["game_code"]==2337728].to_clipboard()
model_df[["xhome_win_score_diff_rf", "xdraw_score_diff_rf", "xaway_win_score_diff_rf"]] = event_df[["xhome_win_score_diff_rf", "xdraw_score_diff_rf", "xaway_win_score_diff_rf"]]
model_df[mask_model&(model_df.game_code == 2337728)][input_names+["xhome_win_score_diff_rf", "xdraw_score_diff_rf", "xaway_win_score_diff_rf"]].to_clipboard()

In [None]:
from ipywidgets import *
from IPython.display import display

def fourth_down_bot(game_info, play_description, model, width):
    if model =="mlp":
        y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp"]
        y1="xscore_diff_end_of_regulation_mlp"
    elif model =="rf":
        y=["xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"]
        y1="xscore_diff_end_of_regulation_rf"
    
    mask_fourth_down = (fourth_go_for_it.game_info == game_info) & (fourth_go_for_it.play_description == play_description)
    if fourth_go_for_it[mask_fourth_down]["home_team_has_ball"].values == 1:
        fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][[y[0], y1]], fourth_down_punt[mask_fourth_down][[y[0], y1]], fourth_down_field_goal[mask_fourth_down][[y[0], y1]]], keys=["Go For It", "Punt", "Field Goal"])
    else:
        fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][[y[1], y1]], fourth_down_punt[mask_fourth_down][[y[1], y1]], fourth_down_field_goal[mask_fourth_down][[y[1], y1]]], keys=["Go For It", "Punt", "Field Goal"])
    # fourth_down_bot_df = np.where(fourth_go_for_it[mask_fourth_down].home_team_has_ball == 1, pd.d)
    print(fourth_down_bot_df)


w1 = widgets.Dropdown(options=list(fourth_go_for_it.sort_values(["game_date"], ascending=False)["game_info"].drop_duplicates()), value="Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)")
w2 = widgets.Dropdown(options=list(fourth_go_for_it[fourth_go_for_it["game_info"]==w1.value].sort_values("nevent")["play_description"].drop_duplicates()), value="1st Qtr 12:04, Tied, 4th & 20, 74 Yards From Goal, Los Angeles Rams has ball, Off TO: 3, Def TO: 3 (8)")

interact(fourth_down_bot, 
game_info=w1,
play_description=w2,
model=widgets.Dropdown(options=["mlp", "rf"], value="mlp"),
width=widgets.IntSlider(min=500, max=1500, step=50, value=1200))




In [None]:
import ipywidgets as widgets
from IPython.display import display

game_event_dict = {}
for game in list(fourth_go_for_it.sort_values("game_date", ascending=False)["game_info"].drop_duplicates()):
    game_event_dict[game] = fourth_go_for_it[fourth_go_for_it["game_info"] == game]["play_description"]


In [None]:
from IPython.display import display_html
from itertools import chain,cycle
# game_event_dict_sample = {"Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)": game_event_dict["Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)"]}
game_event_dict_sample = game_event_dict

style = {'description_width': '50px'}
layout = Layout(width='1000px')
geoWs = {key: widgets.Select(options=game_event_dict_sample[key], style=style, layout=layout) for key in game_event_dict_sample}


In [None]:

def get_current_play():
    return {'game': i.children[0].value,
            'play': i.children[1].value}

def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2>{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

print("starting")
def print_play(**func_kwargs):
    # print('func_kwargs', func_kwargs)
    # print('i.kwargs', i.kwargs)
    # print('get_current_play', get_current_play())
    # y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp", "xscore_diff_end_of_regulation_mlp", "xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf", "xscore_diff_end_of_regulation_rf"]
    y=["xhome_win_score_diff_mlp", "xdraw_score_diff_mlp", "xhome_win_score_diff_rf", "xdraw_score_diff_rf", "xscore_diff_end_of_regulation_mlp", "xscore_diff_end_of_regulation_rf",
    "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp", "xaway_win_score_diff_rf", "xdraw_score_diff_rf",]
    mask_fourth_down = (fourth_go_for_it.game_info == func_kwargs["game"]) & (fourth_go_for_it.play_description == func_kwargs["play"])
    k = 0
    final_score_diff_index = {}
    change_score_diff_index= {}
    for x in score_diff_change_list:
        final_score_diff_index[k] = (
            x + fourth_go_for_it[mask_fourth_down]["current_score_diff"].values[0]
        )
        change_score_diff_index[k] = np.where(fourth_go_for_it[mask_fourth_down]["home_team_has_ball"].values == 1, x, -x)[0]
        k += 1
    # print(change_score_diff_index)
    if fourth_go_for_it[mask_fourth_down]["home_team_has_ball"].values == 1:
        fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][y[:6]], fourth_down_punt[mask_fourth_down][y[:6]], fourth_down_field_goal[mask_fourth_down][y[:6]]], keys=["Go For It", "Punt", "Field Goal"])
    else:
        fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][y[6:]], fourth_down_punt[mask_fourth_down][y[6:]], fourth_down_field_goal[mask_fourth_down][y[6:]]], keys=["Go For It", "Punt", "Field Goal"])
    if fourth_go_for_it[mask_fourth_down]["yd_from_goal"].values >= 50:
        display(fourth_down_bot_df[:2])
    else:
        display(fourth_down_bot_df.head())
    # display(pd.concat([fourth_go_for_it[mask_fourth_down][logit_play_advanced_class_names], fourth_down_punt[mask_fourth_down][logit_play_advanced_class_names], fourth_down_field_goal[mask_fourth_down][logit_play_advanced_class_names]], keys=["Go For It", "Punt", "Field Goal"]))
    display(pd.concat([fourth_go_for_it[mask_fourth_down][["rf_play_first_down"] + rf_drive_class_names], 
    fourth_down_punt[mask_fourth_down][["rf_play_first_down"] + rf_drive_class_names], 
    fourth_down_field_goal[mask_fourth_down][["rf_play_first_down"] + rf_drive_class_names]], keys=["Go For It", "Punt", "Field Goal"]))
    # display(
    #     pd.DataFrame(
    #         pd.concat(
    #             [
    #                 round(fourth_go_for_it_mlp_score_diff_preds[mask_fourth_down], 5),
    #                 round(fourth_down_punt_mlp_score_diff_preds[mask_fourth_down], 5),
    #                 round(fourth_down_field_goal_mlp_score_diff_preds[mask_fourth_down], 5),
    #             ],
    #             ignore_index=True,
    #         ).T
    #     ).rename(index=final_score_diff_index, columns={0: "Go For It", 1: "Punt", 2: "Field Goal"}).sort_values("Go For It", ascending=False)
    # )
    
    fourth_down_go_for_it_final_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_go_for_it_mlp_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=final_score_diff_index, columns={0: "Go For It"}).sort_values("Go For It", ascending=False)[:10]
    fourth_down_punt_final_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_punt_mlp_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=final_score_diff_index, columns={0: "Punt"}).sort_values("Punt", ascending=False)[:10]
    fourth_down_field_goal_final_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_field_goal_mlp_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=final_score_diff_index, columns={0: "Field Goal"}).sort_values("Field Goal", ascending=False)[:10]

    fourth_down_go_for_it_change_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_go_for_it_mlp_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=change_score_diff_index, columns={0: "Go For It"}).sort_values("Go For It", ascending=False)[:10]
    fourth_down_punt_change_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_punt_mlp_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=change_score_diff_index, columns={0: "Punt"}).sort_values("Punt", ascending=False)[:10]
    fourth_down_field_goal_change_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_field_goal_mlp_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=change_score_diff_index, columns={0: "Field Goal"}).sort_values("Field Goal", ascending=False)[:10]

    df1_style = fourth_down_go_for_it_final_display.style.set_table_attributes("style='display:inline;'")
    df2_style = fourth_down_punt_final_display.style.set_table_attributes("style='display:inline'")
    df3_style = fourth_down_field_goal_final_display.style.set_table_attributes("style='display:inline'")
    # display_html("Final Score Diff (mlp)", df1_style._repr_html_() + df2_style._repr_html_() + df3_style._repr_html_(), raw=True)

    df1_style = fourth_down_go_for_it_change_display.style.set_table_attributes("style='display:inline;'")
    df2_style = fourth_down_punt_change_display.style.set_table_attributes("style='display:inline'")
    df3_style = fourth_down_field_goal_change_display.style.set_table_attributes("style='display:inline'")
    display_html("Change in Score Diff (mlp)", df1_style._repr_html_() + df2_style._repr_html_() + df3_style._repr_html_(), raw=True)
    fourth_down_go_for_it_final_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_go_for_it_rf_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=final_score_diff_index, columns={0: "Go For It"}).sort_values("Go For It", ascending=False)[:10]
    fourth_down_punt_final_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_punt_rf_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=final_score_diff_index, columns={0: "Punt"}).sort_values("Punt", ascending=False)[:10]
    fourth_down_field_goal_final_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_field_goal_rf_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=final_score_diff_index, columns={0: "Field Goal"}).sort_values("Field Goal", ascending=False)[:10]

    fourth_down_go_for_it_change_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_go_for_it_rf_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=change_score_diff_index, columns={0: "Go For It"}).sort_values("Go For It", ascending=False)[:10]
    fourth_down_punt_change_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_punt_rf_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=change_score_diff_index, columns={0: "Punt"}).sort_values("Punt", ascending=False)[:10]
    fourth_down_field_goal_change_display = pd.DataFrame(
        pd.concat(
            [
                round(fourth_down_field_goal_rf_score_diff_preds[mask_fourth_down], 20),
            ],
            ignore_index=True,
        ).T
    ).rename(index=change_score_diff_index, columns={0: "Field Goal"}).sort_values("Field Goal", ascending=False)[:10]

    df1_style = fourth_down_go_for_it_final_display.style.set_table_attributes("style='display:inline;'")
    df2_style = fourth_down_punt_final_display.style.set_table_attributes("style='display:inline'")
    df3_style = fourth_down_field_goal_final_display.style.set_table_attributes("style='display:inline'")
    # display_html("Final Score Diff (rf)", df1_style._repr_html_() + df2_style._repr_html_() + df3_style._repr_html_(), raw=True)

    df1_style = fourth_down_go_for_it_change_display.style.set_table_attributes("style='display:inline;'")
    df2_style = fourth_down_punt_change_display.style.set_table_attributes("style='display:inline'")
    df3_style = fourth_down_field_goal_change_display.style.set_table_attributes("style='display:inline'")
    display_html("Change in Score Diff (rf)", df1_style._repr_html_() + df2_style._repr_html_() + df3_style._repr_html_(), raw=True)

    # display_html(fourth_down_go_for_it_display.head(5), fourth_down_punt_display.head(5), fourth_down_field_goal_display.head(5))


# def fourth_down_bot(game_info, play_description, model, width):
#     if model =="mlp":
#         y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp"]
#         y1="xscore_diff_end_of_regulation_mlp"
#     elif model =="rf":
#         y=["xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"]
#         y1="xscore_diff_end_of_regulation_rf"
    
#     mask_fourth_down = (fourth_go_for_it.game_info == game_info) & (fourth_go_for_it.play_description == play_description)
#     if fourth_go_for_it[mask_fourth_down]["home_team_has_ball"].values == 1:
#         fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][[y[0], y1]], fourth_down_punt[mask_fourth_down][[y[0], y1]], fourth_down_field_goal[mask_fourth_down][[y[0], y1]]], keys=["Go For It", "Punt", "Field Goal"])
#     else:
#         fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][[y[1], y1]], fourth_down_punt[mask_fourth_down][[y[1], y1]], fourth_down_field_goal[mask_fourth_down][[y[1], y1]]], keys=["Go For It", "Punt", "Field Goal"])
#     print(fourth_down_bot_df)


def select_game(game):
    new_i = widgets.interactive(print_play, game=gameW, play=geoWs[game['new']])
    i.children = new_i.children

gameW = widgets.Dropdown(options=list(game_event_dict_sample), style=style, layout=layout)
init = gameW.value
playW = geoWs[init]

gameW.observe(select_game, 'value')

i = widgets.interactive(print_play, game=gameW, play=playW)

display(i)


In [None]:
pd.DataFrame(rf_score_diff.feature_importances_, rf_score_diff.feature_names_in_)

In [None]:
# game_event_dict_point_after_sample = {"Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)": game_event_dict_point_after["Cincinnati Bengals Los Angeles Rams 2022-02-13 2021 (2337728)"]}
game_event_dict_point_after_sample = game_event_dict_point_after

style = {'description_width': '50px'}
layout = Layout(width='1000px')
geoWs = {key: widgets.Select(options=game_event_dict_point_after_sample[key], style=style, layout=layout) for key in game_event_dict_point_after_sample}

def get_current_play():
    return {'game': i.children[0].value,
            'play': i.children[1].value}

def print_play(**func_kwargs):
    # print('func_kwargs', func_kwargs)
    # print('i.kwargs', i.kwargs)
    # print('get_current_play', get_current_play())
    # y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp", "xscore_diff_end_of_regulation_mlp", "xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf", "xscore_diff_end_of_regulation_rf"]
    y=["xhome_win_score_diff_mlp", "xdraw_score_diff_mlp", "xhome_win_score_diff_rf", "xdraw_score_diff_rf", 
    "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"]
    mask_point_after = (point_after_go_for_two.game_info == func_kwargs["game"]) & (point_after_go_for_two.play_description == func_kwargs["play"])
    if point_after_go_for_two[mask_point_after]["home_team_has_ball"].values == 1:
        point_after_bot_df = pd.concat([point_after_go_for_two[mask_point_after][y[:4]], point_after_kick[mask_point_after][y[:4]]], keys=["Go For Two", "PAT"])
    else:
        point_after_bot_df = pd.concat([point_after_go_for_two[mask_point_after][y[4:]], point_after_kick[mask_point_after][y[4:]]], keys=["Go For It", "PAT"])
    if point_after_go_for_two[mask_point_after]["yd_from_goal"].values >= 55:
        display(point_after_bot_df[:2])
    else:
        display(point_after_bot_df.head())
    # display(pd.concat([point_after_go_for_two[mask_point_after][logit_play_advanced_class_names], point_after_kick[mask_point_after][logit_play_advanced_class_names]]))
    # display(pd.concat([point_after_go_for_two[mask_point_after][logit_drive_advanced_class_names], point_after_kick[mask_point_after][logit_drive_advanced_class_names]]))


# def point_after_bot(game_info, play_description, model, width):
#     if model =="mlp":
#         y=["xhome_win_score_diff_mlp", "xaway_win_score_diff_mlp", "xdraw_score_diff_mlp"]
#         y1="xscore_diff_end_of_regulation_mlp"
#     elif model =="rf":
#         y=["xhome_win_score_diff_rf", "xaway_win_score_diff_rf", "xdraw_score_diff_rf"]
#         y1="xscore_diff_end_of_regulation_rf"
    
#     mask_point_after = (point_after_go_for_two.game_info == game_info) & (point_after_go_for_two.play_description == play_description)
#     if point_after_go_for_two[mask_point_after]["home_team_has_ball"].values == 1:
#         point_after_bot_df = pd.concat([point_after_go_for_two[mask_point_after][[y[0], y1]], point_after_kick[mask_point_after][[y[0], y1]], point_after_field_goal[mask_point_after][[y[0], y1]]], keys=["Go For It", "Punt", "Field Goal"])
#     else:
#         point_after_bot_df = pd.concat([point_after_go_for_two[mask_point_after][[y[1], y1]], point_after_kick[mask_point_after][[y[1], y1]], point_after_field_goal[mask_point_after][[y[1], y1]]], keys=["Go For It", "Punt", "Field Goal"])
#     print(point_after_bot_df)


def select_game(game):
    new_i = widgets.interactive(print_play, game=gameW, play=geoWs[game['new']])
    i.children = new_i.children

gameW = widgets.Dropdown(options=list(game_event_dict_point_after_sample), style=style, layout=layout)
init = gameW.value
playW = geoWs[init]

gameW.observe(select_game, 'value')

i = widgets.interactive(print_play, game=gameW, play=playW)

display(i)




In [None]:
pd.DataFrame(fourth_go_for_it_mlp_score_diff_preds[mask_fourth_down], columns=["score_" + str(x) for x in score_diff_change_list])

In [None]:
score_diff_change_list

In [None]:
game_event_dict

In [None]:
y=["xhome_win_score_diff_mlp", "xscore_diff_end_of_regulation_mlp", "xhome_win_score_diff_rf", "xscore_diff_end_of_regulation_rf", "xaway_win_score_diff_mlp", "xscore_diff_end_of_regulation_mlp", "xaway_win_score_diff_rf", "xscore_diff_end_of_regulation_rf"]
# if fourth_go_for_it[mask_fourth_down]["home_team_has_ball"].values == 1:
fourth_down_bot_df = pd.concat([fourth_go_for_it[mask_fourth_down][y[:4]], fourth_down_punt[mask_fourth_down][y[:4]], fourth_down_field_goal[mask_fourth_down][y[:4]]], keys=["Go For It", "Punt", "Field Goal"])
fourth_down_bot_df.iloc[:2]

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
rf_drive_outcome.verbose=0
rf_play_outcome.verbose=0

def plot_model_probabilities(input_feature):
    fig, axs = plt.subplots(1, 3, figsize=(20,5)) 
    pred_output_rf_play = rf_play_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    pred_output_rf_drive = rf_drive_outcome.predict_proba(pd.DataFrame(input_feature, columns = input_names))
    input_feature_go_for_it = deepcopy(input_feature)
    input_feature_punt = deepcopy(input_feature)
    input_feature_field_goal = deepcopy(input_feature)
    input_feature_go_for_it[["kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"]] = 0
    input_feature_punt[["kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"]] = [0, 1, 0, 0, 0]
    input_feature_field_goal[["kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"]] = [0, 0, 1, 0, 0]
    pred_output_rf_play_go_for_it = rf_play_outcome.predict_proba(pd.DataFrame(input_feature_go_for_it, columns = input_names))
    pred_output_rf_drive_go_for_it = rf_drive_outcome.predict_proba(pd.DataFrame(input_feature_go_for_it, columns = input_names))
    pred_output_rf_play_punt = rf_play_outcome.predict_proba(pd.DataFrame(input_feature_punt, columns = input_names))
    pred_output_rf_drive_punt = rf_drive_outcome.predict_proba(pd.DataFrame(input_feature_punt, columns = input_names))
    pred_output_rf_play_field_goal = rf_play_outcome.predict_proba(pd.DataFrame(input_feature_field_goal, columns = input_names))
    pred_output_rf_drive_field_goal = rf_drive_outcome.predict_proba(pd.DataFrame(input_feature_field_goal, columns = input_names))

    # pred_input
    # print(pd.DataFrame(input_feature, columns = input_names))
    input_feature[rf_play_class_names] = pred_output_rf_play
    input_feature[rf_drive_class_names] = pred_output_rf_drive
    input_feature_go_for_it[rf_play_class_names] = pred_output_rf_play_go_for_it
    input_feature_go_for_it[rf_drive_class_names] = pred_output_rf_drive_go_for_it
    input_feature_punt[rf_play_class_names] = pred_output_rf_play_punt
    input_feature_punt[rf_drive_class_names] = pred_output_rf_drive_punt
    input_feature_field_goal[rf_play_class_names] = pred_output_rf_play_field_goal
    input_feature_field_goal[rf_drive_class_names] = pred_output_rf_drive_field_goal

    pred_output_rf = rf_score_diff.predict_proba(input_feature[input_names_score_pred])
    # display(pd.DataFrame(pred_output_rf, columns = rf_score_diff.classes_, index=["rf"]).T.sort_values("rf", ascending=False).head(15))
    pred_output_go_for_it_rf = rf_score_diff.predict_proba(input_feature_go_for_it[input_names_score_pred])
    pred_output_go_for_it_rf_df = (pd.DataFrame(pred_output_go_for_it_rf, columns = rf_score_diff.classes_, index=["rf_go_for_it"]).T.sort_values("rf_go_for_it", ascending=False).head(15))
    pred_output_punt_rf = rf_score_diff.predict_proba(input_feature_punt[input_names_score_pred])
    pred_output_punt_rf_df = (pd.DataFrame(pred_output_punt_rf, columns = rf_score_diff.classes_, index=["rf_punt"]).T.sort_values("rf_punt", ascending=False).head(15))
    pred_output_field_goal_rf = rf_score_diff.predict_proba(input_feature_field_goal[input_names_score_pred])
    pred_output_field_goal_rf_df = (pd.DataFrame(pred_output_field_goal_rf, columns = rf_score_diff.classes_, index=["rf_field_goal"]).T.sort_values("rf_field_goal", ascending=False).head(15))
    df1_style = pred_output_go_for_it_rf_df.style.set_table_attributes("style='display:inline;'")
    df2_style = pred_output_punt_rf_df.style.set_table_attributes("style='display:inline'")
    df3_style = pred_output_field_goal_rf_df.style.set_table_attributes("style='display:inline'")
    display_html("Change in Score Diff (rf)", df1_style._repr_html_() + df2_style._repr_html_() + df3_style._repr_html_(), raw=True)
    axs[0].bar(rf_score_diff.classes_, pred_output_go_for_it_rf[0])
    axs[1].bar(rf_score_diff.classes_, pred_output_punt_rf[0])
    axs[2].bar(rf_score_diff.classes_, pred_output_field_goal_rf[0])
    y_max = np.max([pred_output_go_for_it_rf[0], pred_output_punt_rf[0], pred_output_field_goal_rf[0]])
    for x in range(3):
        axs[x].set_xlim(-10.5, 10.5)
        axs[x].set_xticks(np.arange(-10, 11, 1))
        axs[x].set_ylim(0, y_max*1.05)





style = {'description_width': '200px'}
layout = Layout(width='450px')
@interact(
    time_left_in_half=widgets.IntSlider(min=0, max=30, step=1, value=100, style=style, layout=layout),
    half=widgets.IntSlider(min=1, max=2, step=1, value=1, style=style, layout=layout),
    current_score_diff=widgets.IntSlider(min=-30, max=30, step=1, value=0, style=style, layout=layout),
    current_score_total=widgets.IntSlider(min=0, max=80, step=1, value=0, style=style, layout=layout),
    cur_spread=widgets.IntSlider(min=-20, max=20, step=1, value=-3, style=style, layout=layout),
    cur_over_under=widgets.IntSlider(min=30, max=60, step=1, value=45, style=style, layout=layout),
    home_timeouts_remaining=widgets.IntSlider(min=0, max=3, step=1, value=3, style=style, layout=layout),
    away_timeouts_remaining=widgets.IntSlider(min=0, max=3, step=1, value=3, style=style, layout=layout),
    ytg=widgets.IntSlider(min=1, max=30, step=1, value=10, style=style, layout=layout),
    yd_from_goal=widgets.IntSlider(min=1, max=100, step=1, value=75, style=style, layout=layout),
    down=widgets.IntSlider(min=1, max=4, step=1, value=4, style=style, layout=layout),
    home_team_has_ball=widgets.IntSlider(min=0, max=1, step=1, value=1, style=style, layout=layout),
    play_type=widgets.Dropdown(options=["scrimmage", "kick_off", "punt", "field_goal_attempt", "point_after_kick", "two_point_attempt"], style=style, layout=layout),
)
def g(
    time_left_in_half,
    half,
    current_score_diff,
    current_score_total,
    cur_spread,
    cur_over_under,
    home_timeouts_remaining,
    away_timeouts_remaining,
    ytg,
    yd_from_goal,
    down,
    home_team_has_ball,
    play_type,
):
    punt = np.where(play_type == "punt", 1, 0)
    field_goal_attempt = np.where(play_type == "field_goal_attempt", 1, 0)
    kick_off = np.where(play_type == "kick_off", 1, 0)
    point_after_kick = np.where(play_type == "point_after_kick", 1, 0)
    two_point_attempt = np.where(play_type == "two_point_attempt", 1, 0)
    scrimmage = np.where(play_type == "scrimmage", 1, 0)
    down = np.where((kick_off==1)|(point_after_kick==1)|(two_point_attempt==1), 0, down)
    ytg_adj = np.where(yd_from_goal < ytg, yd_from_goal, ytg)
    # down=4

    input_feature = pd.DataFrame(np.array(
        [
            time_left_in_half * 60,
            half,
            current_score_diff,
            current_score_total,
            cur_spread,
            cur_over_under,
            home_timeouts_remaining,
            away_timeouts_remaining,
            punt,
            field_goal_attempt,
            ytg_adj,
            yd_from_goal,
            down,
            home_team_has_ball,
        ],
    ).reshape(1, -1), columns=input_names)
    input_feature[["kick_off", "point_after_kick", "two_point_attempt"]] = [kick_off, point_after_kick, two_point_attempt]
    input_feature = input_feature
    # input_feature = normalize_df(input_feature, model_df[mask_model])
    # print(input_feature)


    # print(pd.DataFrame(input_feature, columns=mlp_play_outcome.feature_names_in_))
    plot_model_probabilities(input_feature)
    

In [None]:
plt.bar(rf_score_diff.classes_, rf_score_diff_preds.iloc[0, :])

In [None]:
rf_score_diff_preds