In [1]:
import os
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
data_dir = os.path.join(root_dir, 'data')

import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.metrics import log_loss
from IPython.display import display, HTML
import pickle

%matplotlib notebook



# Data
Here we have the initial pull of datasets include:
* game_df: this is game information for all games from 2008 to 2021 including unplayed games
* test_game_codes are going to be the game_codes of games in 2021 that are played (note: game_state_id 11 is finished game, 1 is Pre-Game)
* sim_game_codes are all games (played and unplayed) of 2021
* we split up all the game_codes into "played_game_codes" and "unplayed_game_codes"

#

In [2]:
game_df = pd.read_parquet(os.path.join(data_dir, 'game_data.parquet'))
game_df = game_df.sort_values(by='game_date', ascending=False).reset_index(drop=True)
game_codes = game_df.game_code.values
test_game_codes = game_df.loc[(game_df.season == 2021) & (game_df.game_state_id == 11), 'game_code'].values
sim_game_codes = game_df.loc[game_df.season == 2021, 'game_code'].values
played_game_codes = game_df.loc[game_df.game_state_id == 11, 'game_code'].values
unplayed_game_codes = game_df.loc[game_df.game_state_id != 11, 'game_code'].values
assert game_df.loc[~game_df.game_code.isin(unplayed_game_codes)].isna().sum().sum() == 0, 'Unexpected NaN values found in game data frame'
display(HTML(game_df.head(3).to_html(index=False)))
print('Games in sample: {0:d}'.format(game_codes.size))


game_code,game_date,home_team_id,away_team_id,season,home_team_abbrev,away_team_abbrev,week,home_score,away_score,status,game_state_id
2337728,2022-02-13 15:30:00,327,343,2021,Cin,LAR,23,20,23,Final,11
2337725,2022-01-30 15:30:00,343,359,2021,LAR,SF,21,20,17,Final,11
2337724,2022-01-30 14:00:00,339,327,2021,KC,Cin,21,24,27,Final,11


Games in sample: 3758


# Prior Data
Prior data is pulled from csv file and this will give us the inputs we need for the pre-game match predictions

In [3]:
prior_df = pd.read_csv(os.path.join(data_dir, 'game_priors.csv'))
display(HTML(prior_df.head(3).to_html(index=False)))

game_code,home_team_id,away_team_id,home_team_abbrev,away_team_abbrev,prior_home,prior_away,game_date
887191,329,347,Cle,Min,0.300686,0.69693,2009-09-13 13:00:00
887208,334,347,Det,Min,0.203864,0.795172,2009-09-20 13:00:00
887257,347,327,Min,Cin,0.677353,0.320797,2009-12-13 12:00:00


# Event Data
* Named event_df
* Is the play by play data from all games 2008 to 2021

# Input Features
Now that we have the datasets loaded we can load the features:
* prior_home: estimated probability of the home team winning at t=0
* prior_away: estimated probability of the away team winning at t=0
* home_team_has_ball: binary value for whether home team is in possession of the ball
* home_start_score: the score of the home team at the beginning of each play
* away_start_score: the score of the away team at the beginning of each play
* quarter: the current quarter/period the game is in (1-4 for all games, 5 if they are in the overtime period)
* play_start_time: numeric value of the time remaining in the quarter (900 at the beginning of the quarter, 0 at the end)
* yd_from_goal: the amount of yards between the line of scrimmage and the goal line for the team in possession of the ball
* down: the amount of downs that the team in possession of the ball has accumulate (1-4, down=-1 in plays that are not from scrimmage)
* ytg: the amount of yards between the current line of scrimmage and the first down line. (ytg=-1 in plays that are not from scrimmage)

# Target
remaining_exact_score: this is a numeric value for all the different combinations of remaining score (note: in this value sample, max_away_score=59 and max_home_score=62)
* for example if the current score is 17-24 (away_start_score=17 & home_start_score=24) and the final score is 27-30, then:
    *        remaining_exact_score = (27 - 17) + (62 + 1) * (30 - 24) = 422
* this ensures that all combinations of remaining exact scores are unique values


# Merged Table
full_df: the merged table of events_df and prior_df keeping only the input features and the target

In [4]:
events_df = pd.read_parquet(os.path.join(data_dir, 'event_data.parquet'))
events_df["half"] = round((events_df["quarter"] + 0.01) / 2)
events_df["home_timeout"] = np.where(((events_df["event_id"]==57)&(events_df["home_team_has_ball"]==1))|((events_df["event_id"]==58)&(events_df["home_team_has_ball"]==0)), 1, 0)
events_df["away_timeout"] = np.where(((events_df["event_id"]==57)&(events_df["home_team_has_ball"]==0))|((events_df["event_id"]==58)&(events_df["home_team_has_ball"]==1)), 1, 0)
events_df = events_df.sort_values(["game_code", "nevent"])
events_df["home_timeouts_remaining"] = np.clip(3 - events_df.groupby(["game_code", "half"])["home_timeout"].cumsum(), 0, 3)
events_df["away_timeouts_remaining"] = np.clip(3 - events_df.groupby(["game_code", "half"])["away_timeout"].cumsum(), 0, 3)

max_away_score = np.max(game_df["away_score"])
max_home_score = np.max(game_df["home_score"])

input_names = [
    "prior_home",
    "prior_away",
    "home_team_has_ball",
    "home_start_score",
    "away_start_score",
    "quarter",
    "overtime",
    "play_start_time",
    "yd_from_goal",
    "from_scrimmage",
    "kick_off",
    "punt",
    "point_after_kick",
    "two_point_attempt",
    "field_goal_attempt",
    "down",
    "ytg",
    "home_timeouts_remaining",
    "away_timeouts_remaining",
]
output_name = 'remaining_exact_score'
events_df['remaining_exact_score'] = events_df["away_rest_of_game_score"] + \
                                     (max_away_score + 1) * events_df['home_rest_of_game_score']
events_df['mov_change'] = np.where(events_df["home_team_has_ball"], events_df["home_score_added"], -events_df["away_score_added"])
full_df = events_df.merge(prior_df, on="game_code")
full_df = full_df[full_df[input_names+[output_name]].notna().all(axis=1)]


In [5]:
# Show how the data frame looks like at the beginning and end of a game
sample_game_code = full_df.sample(1).iloc[0]['game_code']
sample_game_info = prior_df.loc[prior_df.game_code == sample_game_code, ['home_team_abbrev', 'away_team_abbrev', 'game_date']].iloc[0].tolist()
print('\nData sample for game: {0} v {1} ({2})'.format(*sample_game_info))
display(HTML(full_df.loc[full_df.game_code == sample_game_code][["game_code", "nevent"] + input_names]
             .iloc[list(range(0, 5)) + list(range(-5, 0))]
             .to_html(index=False)))


Data sample for game: Phi v Ari (2017-10-08 13:00:00)


game_code,nevent,prior_home,prior_away,home_team_has_ball,home_start_score,away_start_score,quarter,overtime,play_start_time,yd_from_goal,from_scrimmage,kick_off,punt,point_after_kick,two_point_attempt,field_goal_attempt,down,ytg,home_timeouts_remaining,away_timeouts_remaining
1744782,1,0.472271,0.525711,1,0,0,1,0,900.0,65,0,1,0,0,0,0,0,-1,3,3
1744782,2,0.472271,0.525711,0,0,0,1,0,900.0,99,0,1,0,0,0,0,0,-1,3,3
1744782,3,0.472271,0.525711,0,0,0,1,0,894.0,82,1,0,0,0,0,0,1,10,3,3
1744782,4,0.472271,0.525711,0,0,0,1,0,859.0,81,1,0,0,0,0,0,2,9,3,3
1744782,5,0.472271,0.525711,0,0,0,1,0,855.0,81,1,0,0,0,0,0,3,9,3,3
1744782,202,0.472271,0.525711,1,34,7,4,0,72.0,100,0,0,0,0,0,0,0,-1,2,2
1744782,203,0.472271,0.525711,1,34,7,4,0,72.0,100,0,1,0,0,0,0,0,-1,2,2
1744782,204,0.472271,0.525711,1,34,7,4,0,65.0,80,1,0,0,0,0,0,1,10,2,2
1744782,205,0.472271,0.525711,1,34,7,4,0,35.0,81,1,0,0,0,0,0,2,11,2,2
1744782,206,0.472271,0.525711,1,34,7,4,0,0.0,82,0,0,0,0,0,0,3,-1,2,2


# Train/Test Data Split
Training and test dataframes are created (2009-2020 are training seasons and 2021 is the test season)
# Model
This is the stored model that predicts the probability of each remaining score combination at each point of the game

In [6]:
test_game_codes = game_df.loc[(game_df.season == 2021) & (game_df.game_state_id == 11), 'game_code'].values
sim_game_codes = game_df.loc[game_df.season == 2021, 'game_code'].values
mask_test = full_df.game_code.isin(test_game_codes)
output_name_score_change = "mov_change"

X_train = full_df.loc[(~mask_test)&(full_df.continuation==0), input_names]
y_train_score_change = full_df.loc[(~mask_test)&(full_df.continuation==0), output_name_score_change]
X_test = full_df.loc[mask_test&(full_df.continuation==0), input_names].values
y_test_score_change = full_df.loc[mask_test&(full_df.continuation==0), output_name_score_change].values


rf_score_change = RandomForestClassifier(n_estimators=100, max_depth=10, verbose=0, n_jobs=-1).fit(X_train, y_train_score_change)

# clf = MLPClassifier(
#     hidden_layer_sizes=[10,5],
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,  # L2 regularization parameter
#     learning_rate_init=0.001,
#     batch_size=128,
#     random_state=1,
#     max_iter=50, #50
#     early_stopping=True,
#     validation_fraction=0.1,
#     n_iter_no_change=5,
#     verbose=True).fit(X_train, y_train)
# pickle.dump(clf, open(os.path.join(root_dir, 'models/game_score_new_4.sav'), 'wb'))

# os.system('say "done"')


In [7]:
score_change_columns = [
    "away_increase_6",
    "away_increase_3",
    "away_increase_2",
    "away_increase_1",
    "no_increase",
    "home_increase_1",
    "home_increase_2",
    "home_increase_3",
    "home_increase_6",
]

full_df[score_change_columns] = rf_score_change.predict_proba(full_df[input_names])

In [12]:
X_train_rf = full_df.loc[~mask_test&(full_df.continuation==0), input_names+score_change_columns]
y_train_rf = full_df.loc[~mask_test&(full_df.continuation==0), output_name]
X_test_rf = full_df.loc[mask_test&(full_df.continuation==0), input_names + score_change_columns].values
y_test_rf = full_df.loc[mask_test&(full_df.continuation==0), output_name].values

# rf = RandomForestClassifier(n_estimators=100, max_depth=10, verbose=100, n_jobs=-1).fit(X_train_rf, y_train_rf)
# pickle.dump(rf, open(os.path.join(root_dir, 'models/game_score_random_forest_100_10_new_features.p'), 'wb'))

In [14]:
pd.DataFrame(rf.feature_importances_, index=rf.feature_names_in_)

Unnamed: 0,0
prior_home,0.024223
prior_away,0.023436
home_team_has_ball,0.010016
home_start_score,0.096227
away_start_score,0.085747
quarter,0.353267
overtime,0.014698
play_start_time,0.120295
yd_from_goal,0.020631
from_scrimmage,0.00375


In [9]:
test_game_codes = game_df.loc[(game_df.season == 2021) & (game_df.game_state_id == 11), 'game_code'].values
sim_game_codes = game_df.loc[game_df.season == 2021, 'game_code'].values
mask_test = full_df.game_code.isin(test_game_codes)

X_train = full_df.loc[~mask_test, input_names]
y_train = full_df.loc[~mask_test, output_name]
X_test = full_df.loc[mask_test, input_names].values
y_test = full_df.loc[mask_test, output_name].values
n_categories = (max_home_score + 1) * (max_away_score + 1)




# clf = MLPClassifier(
#     hidden_layer_sizes=[10,5],
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,  # L2 regularization parameter
#     learning_rate_init=0.001,
#     batch_size=128,
#     random_state=1,
#     max_iter=50, #50
#     early_stopping=True,
#     validation_fraction=0.1,
#     n_iter_no_change=5,
#     verbose=True).fit(X_train, y_train)
# pickle.dump(clf, open(os.path.join(root_dir, 'models/game_score_new_4.sav'), 'wb'))

# os.system('say "done"')


In [15]:
full_df.columns

Index(['game_code', 'game_date_x', 'season', 'home_team_id_x',
       'away_team_id_x', 'home_final_score', 'away_final_score',
       'final_score_diff', 'home_team_outcome', 'nevent', 'quarter',
       'overtime', 'home_rest_of_game_score', 'away_rest_of_game_score',
       'home_team_has_ball', 'kick_off', 'punt', 'point_after_kick',
       'two_point_attempt', 'field_goal_attempt', 'home_score_added',
       'away_score_added', 'current_score_diff', 'current_score_total',
       'home_start_score', 'away_start_score', 'yd_from_goal',
       'from_scrimmage', 'event_name', 'event_id', 'continuation', 'down',
       'ytg', 'play_start_time', 'half', 'home_timeout', 'away_timeout',
       'home_timeouts_remaining', 'away_timeouts_remaining',
       'remaining_exact_score', 'home_team_id_y', 'away_team_id_y',
       'home_team_abbrev', 'away_team_abbrev', 'prior_home', 'prior_away',
       'game_date_y'],
      dtype='object')

In [10]:
clf = pickle.load(open(os.path.join(root_dir, "models/game_score_new_4.sav"), 'rb'))
clf_old = pickle.load(open(os.path.join(root_dir, "models/game_score.sav"), 'rb'))


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [11]:
import time
def get_model_outputs(model, input_data, running_scores):
    start_time = time.time()
    raw_output = model.predict_proba(input_data)
    # Exact score outputs    
    score_probs = np.zeros((input_data.shape[0], n_categories))
    score_probs[:, model.classes_] = raw_output
    # 1X2 prediction & team score outputs
    outcome_probs = np.zeros((input_data.shape[0], 3))
    home_score_probs = np.zeros((input_data.shape[0], max_home_score + 1))
    away_score_probs = np.zeros((input_data.shape[0], max_away_score + 1))
    for home_score in range(max_home_score + 1):
        ft_home_score = home_score + running_scores[:, 0]
        for away_score in range(max_away_score + 1):            
            # print(home_score, "-", away_score)
            ft_away_score = away_score + running_scores[:, 1]
            remaining_prob = score_probs[:, away_score + (max_away_score + 1) * home_score]
            # 1X2 - Home win
            outcome_probs[:, 0] = np.where(ft_home_score > ft_away_score,
                                           outcome_probs[:, 0] + remaining_prob,
                                           outcome_probs[:, 0])
            # 1X2 - Draw
            outcome_probs[:, 1] = np.where(ft_home_score == ft_away_score,
                                           outcome_probs[:, 1] + remaining_prob,
                                           outcome_probs[:, 1])
            # 1X2 - Away win
            outcome_probs[:, 2] = np.where(ft_home_score < ft_away_score,
                                           outcome_probs[:, 2] + remaining_prob,
                                           outcome_probs[:, 2])
            # Team scores
            home_score_probs[:, home_score] += remaining_prob
            away_score_probs[:, away_score] += remaining_prob
    return {
        'remaining_score': score_probs,
        'home_score': home_score_probs,
        'away_score': away_score_probs,
        'ft_outcome': outcome_probs
    }

## Usage example
Below is just a little bit of code that shows how the input and output data looks like for a few samples in a random game:

In [12]:
example_game_code = np.random.choice(test_game_codes)
example_indices = [0, 1, -2, -1]
example_input = full_df.sort_values("nevent").loc[full_df.game_code == example_game_code, input_names].values[example_indices]
full_df = full_df.sort_values(["game_code", "nevent"], ascending=True)
example_running_score = full_df.loc[full_df.game_code == example_game_code,
                                    ['home_start_score', 'away_start_score']].values[example_indices]
example_output = get_model_outputs(clf, example_input, example_running_score)
print('\nExample input data:')
display(HTML(pd.DataFrame(data=example_input, columns=input_names).to_html(index=False)))

# print('\nExample outputs (exact score):')
# print(example_output['remaining_score'][0][0:10])
# print(example_output['remaining_score'][0][10:20])
# print(example_output['remaining_score'][0][20:30])

print('\nExample outputs (home team score):')
display(HTML(pd.DataFrame(data=example_output['home_score'], columns=np.arange(max_home_score + 1)).to_html(index=False)))

print('\nExample outputs (away team score):')
display(HTML(pd.DataFrame(data=example_output['away_score'], columns=np.arange(max_away_score + 1)).to_html(index=False)))

print('\nExample outputs (1X2):')
display(HTML(pd.DataFrame(data=example_output['ft_outcome'], columns=['home win', 'draw', 'away win']).to_html(index=False)))




Example input data:


prior_home,prior_away,home_team_has_ball,home_start_score,away_start_score,quarter,overtime,play_start_time,yd_from_goal,from_scrimmage,kick_off,punt,point_after_kick,two_point_attempt,field_goal_attempt,down,ytg,home_timeouts_remaining,away_timeouts_remaining
0.521872,0.47262,1.0,0.0,0.0,1.0,0.0,900.0,65.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,3.0,3.0
0.521872,0.47262,0.0,0.0,0.0,1.0,0.0,900.0,92.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,3.0,3.0
0.521872,0.47262,0.0,7.0,22.0,4.0,0.0,9.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,3.0
0.521872,0.47262,0.0,7.0,22.0,4.0,0.0,0.0,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,3.0



Example outputs (home team score):


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
0.009299,6e-06,0.000258,0.015628,1.835141e-05,0.00062,0.017926,0.02033,0.002319,0.012087,0.040055,0.004432,0.011235,0.042203,0.029243,0.011038,0.039037,0.067118,0.009894,0.02410061,0.070928,0.037157,0.015781,0.04732962,0.068764,0.010514,0.02944098,0.05490783,0.02867547,0.01236582,0.03304905,0.04746305,0.00804529,0.01555245,0.02788981,0.01861617,0.00801273,0.01387431,0.02585552,0.003479759,0.006610927,0.008659825,0.008514496,0.004648549,0.004130129,0.01111238,0.001771064,0.002447184,0.004677998,0.004822364,0.000269263,0.001717037,0.001782822,0.0005350019,0.0005136974,0.001579461,0.0005157834,0.0001026871,0.0002658023,0.0005487982,2.109816e-10,0.0,0.0002254235
0.00981,9e-06,0.000291,0.016151,2.733538e-05,0.000701,0.018378,0.021203,0.002634,0.012634,0.040303,0.004893,0.011543,0.04189,0.029969,0.011722,0.038689,0.065834,0.010613,0.02416551,0.06816,0.037381,0.016348,0.04577567,0.066226,0.011223,0.02899624,0.05267204,0.02922165,0.01274996,0.03245804,0.04595462,0.008492712,0.01596651,0.027643,0.01902708,0.008308912,0.01433891,0.02574239,0.003777478,0.006890904,0.00899814,0.008758774,0.004884787,0.004384123,0.01120002,0.001906701,0.002520774,0.004895535,0.004980571,0.0002968309,0.001856029,0.001931497,0.0005696206,0.0005540901,0.001668842,0.0005585219,0.0001138381,0.0002816872,0.0005835037,2.178525e-10,0.0,0.0002431598
0.946434,0.000162,0.001977,0.007142,4.367532e-09,2.2e-05,0.005724,0.026306,0.009929,0.000103,0.000621,0.000366,1.4e-05,0.00024,0.000684,0.000161,1.5e-05,2.7e-05,4.3e-05,5.738479e-07,5e-06,1.1e-05,5e-06,8.251067e-07,4e-06,2e-06,5.418387e-08,1.072368e-07,7.211253e-07,1.555196e-07,6.999231e-08,1.000347e-08,9.201624e-08,2.132639e-09,2.943906e-07,2.330418e-08,3.788621e-07,3.992223e-11,1.561512e-09,3.223278e-09,8.7366e-11,6.439774e-12,2.315232e-10,2.961635e-12,1.43992e-12,2.417232e-12,3.611207e-13,5.630024999999999e-19,3.889584e-09,8.886182e-13,5.93746e-17,6.402107e-12,2.353995e-12,2.209313e-18,2.033689e-29,1.173379e-32,8.477665000000001e-18,3.677677e-12,6.963872e-31,2.529445e-12,2.331782e-12,0.0,2.529214e-12
0.943732,0.000172,0.00208,0.007377,4.923178e-09,2.4e-05,0.005978,0.02754,0.010591,0.000112,0.000667,0.000399,1.6e-05,0.000261,0.000742,0.000178,1.7e-05,3e-05,5e-05,6.579554e-07,6e-06,1.3e-05,6e-06,9.479742e-07,5e-06,2e-06,6.383003e-08,1.259578e-07,8.250173e-07,1.806619e-07,8.219993e-08,1.176858e-08,1.082694e-07,2.761505e-09,3.71017e-07,2.857672e-08,4.46089e-07,4.826528e-11,1.869854e-09,3.945163e-09,1.138561e-10,8.063941e-12,2.921284e-10,3.78164e-12,2.040139e-12,2.677955e-12,4.800643e-13,7.734756999999999e-19,4.651162e-09,1.232728e-12,7.834054000000001e-17,7.764537e-12,2.612976e-12,3.024245e-18,3.278345e-29,1.974682e-32,1.3066720000000001e-17,4.452639e-12,1.1891509999999999e-30,2.801165e-12,2.581392e-12,0.0,3.05321e-12



Example outputs (away team score):


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
0.015702,2.3e-05,0.00098,0.021332,7.2e-05,0.001275,0.021486,0.033931,0.002821,0.019587,0.062016,0.003409,0.013751,0.052474,0.040693,0.009391,0.03822,0.086254,0.010737,0.019359,0.068137,0.035822,0.0119651,0.05190182,0.06221648,0.01132105,0.02457042,0.04982841,0.02528144,0.008100008,0.02539562,0.03498096,0.006978288,0.01473612,0.0249708,0.01385221,0.006338212,0.01337724,0.01495906,0.003237243,0.003919894,0.01257504,0.006218789,0.002041133,0.002496812,0.004382605,0.0006273741,0.0005674365,0.00181385,0.001758376,0.000112988,0.001110649,0.0003795071,8.259604e-07,1.200999e-07,0.0001146387,2.251713e-05,0.0,0.0,0.0003731913
0.016236,3.3e-05,0.00104,0.021629,9.6e-05,0.00139,0.0217,0.034266,0.00315,0.019861,0.060519,0.003831,0.014132,0.051363,0.040982,0.009885,0.038102,0.083476,0.011437,0.019768,0.065894,0.03602,0.01257201,0.05021577,0.06020633,0.01197715,0.02471461,0.04872143,0.02560807,0.008703881,0.02549134,0.03469101,0.007464632,0.01520932,0.02520663,0.01429583,0.00672479,0.0138474,0.01544771,0.003423801,0.004145027,0.01296798,0.006548041,0.002217699,0.002689508,0.004665042,0.0006730663,0.0006225168,0.001974604,0.001872607,0.0001260244,0.001204157,0.0004051229,1.246408e-06,1.862333e-07,0.0001270756,2.699066e-05,0.0,0.0,0.0004023391
0.942578,0.003232,0.001064,0.025385,8.4e-05,3.8e-05,0.002356,0.02183,0.000651,0.000118,0.001468,8.1e-05,1.4e-05,0.000122,0.000878,1.7e-05,3e-06,3.8e-05,3e-06,2e-06,2e-06,3.5e-05,1.671276e-07,9.704128e-08,6.519201e-07,1.678015e-08,1.27817e-09,5.185934e-08,2.372131e-07,1.391786e-08,4.089538e-08,2.871949e-08,2.456471e-12,2.638132e-09,8.894486e-10,2.57247e-09,6.163131e-10,4.262853e-12,5.353801e-09,3.084376e-16,4.896958e-15,5.377659e-11,1.840164e-12,2.392468e-15,2.321325e-12,2.670251e-10,4.59187e-12,4.016671e-18,1.582959e-16,4.685958e-18,5.135167999999999e-19,1.120662e-18,2.9671819999999996e-19,1.940822e-18,5.079063e-13,4.310982e-09,4.495452e-24,0.0,0.0,3.836022e-24
0.93991,0.003453,0.001128,0.026354,9.6e-05,4.2e-05,0.002488,0.022822,0.000695,0.000129,0.001587,8.9e-05,1.5e-05,0.000134,0.000948,1.8e-05,3e-06,4.3e-05,3e-06,2e-06,2e-06,3.9e-05,1.934208e-07,1.127947e-07,7.517506e-07,1.970845e-08,1.540056e-09,6.095319e-08,2.722735e-07,1.659334e-08,4.844408e-08,3.383687e-08,3.096776e-12,3.192585e-09,1.084173e-09,3.045734e-09,7.888062e-10,5.378405e-12,6.261026e-09,4.333724e-16,6.33437e-15,6.952052e-11,2.473471e-12,2.972741e-15,2.569772e-12,3.139952e-10,5.083124e-12,5.41918e-18,2.309585e-16,6.335691e-18,7.03397e-19,1.530119e-18,4.3031079999999995e-19,2.654796e-18,6.182712e-13,5.079551e-09,6.722013e-24,0.0,0.0,5.700843e-24



Example outputs (1X2):


home win,draw,away win
0.56227,0.024951,0.41278
0.558119,0.025172,0.416708
3.2e-05,1.4e-05,0.999954
3.5e-05,1.6e-05,0.999949


### Visualization of predictions
This is a simple interactive dashboard that lets the user select any game from the test data set and plots match outcome (1X2) and team score predictions. Just pick a game from the drop-down menu and click the "Plot" button.

In [13]:
game_info_df = game_df[['game_code', 'game_date', 'home_team_id', 'away_team_id', 'season', 'home_team_abbrev', 'away_team_abbrev']]
game_info_df['game_description'] = ['{0} {1} v {2} ({3})'.format(i.game_date, i.home_team_abbrev, i.away_team_abbrev, i.game_code) for _, i in game_info_df.iterrows()]
game_info_df = game_info_df.loc[game_info_df.game_code.isin(test_game_codes), ['game_code', 'game_description']]
game_info_df['ft_score'] = str(game_df.set_index('game_code').loc[game_info_df.game_code.values, 'away_score'].values) + "-" + str(game_df.set_index('game_code').loc[game_info_df.game_code.values, 'home_score'].values)

h_f = None
h_ax = None
h_ax_twin = None

def update_dashboard(change):
    global info_textbox
    info_textbox.value = ''
    if h_ax is not None:
        h_ax.cla()
        h_ax_twin.cla()
    plot()
    
def print_to_textbox(string, textbox_handle, clear_textbox=False):
    if textbox_handle is None:
        print(string)
    else:
        if clear_textbox or (textbox_handle.value == ''):
            textbox_handle.value = string
        else:
            textbox_handle.value += '<br>' + string

def plot():
    global h_f, h_ax, h_ax_twin
    if h_f is None:
        h_f, h_ax = plt.subplots(1, figsize=(9, 4))
        
    plot_game_code = game_info_df.set_index('game_description').loc[match_picker.value, 'game_code']
    plot_time = full_df.loc[full_df.game_code == plot_game_code, ['quarter', 'play_start_time']]
    plot_time = ((900 - plot_time.play_start_time) + \
                 (plot_time.quarter - 1 ) * 900).values
                 
    plot_input = full_df.loc[full_df.game_code == plot_game_code, input_names].values
    plot_running_score = full_df.loc[full_df.game_code == plot_game_code, ['home_start_score', 'away_start_score']].values
    plot_output = get_model_outputs(clf, plot_input, plot_running_score)    
    print_to_textbox('{0:d} prediction samples found'.format(plot_input.shape[0]), info_textbox)
    
    # Main axis (1X2 prediction)
    plot_x = np.append(np.vstack((plot_time[:-1], plot_time[1:])).flatten(order='F'), plot_time[-1])
    plot_y1 = np.vstack((plot_output['ft_outcome'][:, 0], plot_output['ft_outcome'][:, 0])).flatten(order='F')[:-1]
    plot_y2 = np.vstack((plot_output['ft_outcome'][:, 1], plot_output['ft_outcome'][:, 1])).flatten(order='F')[:-1]
    plot_y3 = np.vstack((plot_output['ft_outcome'][:, 2], plot_output['ft_outcome'][:, 2])).flatten(order='F')[:-1]
    h_ax.stackplot(plot_x, plot_y1, plot_y2, plot_y3,
                   labels=['home','draw','away'],
                   colors=['khaki', 'lightgray', 'lightskyblue'],
                   zorder=0)
    h_ax.legend(loc='upper left')
    if np.max(full_df.loc[full_df.game_code == plot_game_code, "quarter"].values) ==5:
        x_tick_pos = (60 * np.arange(0, 61, 15)).tolist()
        x_tick_str = ["Q1", "Q2", "Q3", "Q4", "OT"]
    else:
        x_tick_pos = (60 * np.arange(0, 60, 15)).tolist()
        x_tick_str = ["Q1", "Q2", "Q3", "Q4"]
        
    h_ax.set_xticks(x_tick_pos)
    h_ax.set_xticklabels(x_tick_str)
    h_ax.set_xlabel('Match time')
    h_ax.set_xlim(0, plot_x.max())
    h_ax.set_ylim(0, 1)
    h_ax.set_yticks(np.arange(0, 1.01, 0.25))
    h_ax.set_yticklabels(['{0:.0f}%'.format(100 * i) for i in np.arange(0, 1.01, 0.25)])    
    h_ax.set_ylabel('Probability')
    h_ax.set_title('{0}'.format(*game_info_df.set_index('game_code').loc[plot_game_code, ['game_description']].tolist()))
    
    # Twin axis (score prediction)
    if h_ax_twin is None:
        h_ax_twin = h_ax.twinx()
    # Home
    plot_home_score = plot_running_score[:, 0] + \
        np.sum(plot_output['home_score'] * np.tile(np.arange(max_home_score + 1), (plot_input.shape[0], 1)), axis=1)
    plot_home_score = np.vstack((plot_home_score, plot_home_score)).flatten(order='F')[:-1]
    h_ax_twin.plot(plot_x, plot_home_score, '-k', linewidth=1, zorder=10, color='darkgoldenrod', label='home score')
    # Away
    plot_away_score = plot_running_score[:, 1] + \
        np.sum(plot_output['away_score'] * np.tile(np.arange(max_away_score + 1), (plot_input.shape[0], 1)), axis=1)
    plot_away_score = np.vstack((plot_away_score, plot_away_score)).flatten(order='F')[:-1]
    h_ax_twin.plot(plot_x, plot_away_score, '-k', linewidth=1, zorder=10, color='dodgerblue', label='away score')
    
    max_score = int(np.ceil(np.append(plot_home_score, plot_away_score).max()))
    h_ax_twin.set_ylim(0, max_score)
    h_ax_twin.set_yticks(np.arange(0, max_score + 0.5))
    h_ax_twin.set_ylabel('Predicted score')
    
    h_f.tight_layout()
    
match_picker = widgets.Dropdown(
    options=game_info_df.game_description.values,
    description='Match'
)
run_btn = widgets.Button(
    description='Plot'
)
info_textbox = widgets.HTML(value="")

display(widgets.VBox([
    widgets.HBox([match_picker, run_btn]),
    info_textbox
]))
run_btn.on_click(update_dashboard)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_info_df['game_description'] = ['{0} {1} v {2} ({3})'.format(i.game_date, i.home_team_abbrev, i.away_team_abbrev, i.game_code) for _, i in game_info_df.iterrows()]


VBox(children=(HBox(children=(Dropdown(description='Match', options=('2022-02-13 15:30:00 Cin v LAR (2337728)'…

In [14]:
input_names_old = ['prior_home', 'prior_away','home_team_has_ball', 'home_start_score', 'away_start_score', 'quarter', 'play_start_time', 'yd_from_goal', 'down', 'ytg']
game_info_df = game_df[['game_code', 'game_date', 'home_team_id', 'away_team_id', 'season', 'home_team_abbrev', 'away_team_abbrev']]
game_info_df['game_description'] = ['{0} {1} at {2} ({3})'.format(i.game_date, i.away_team_abbrev, i.home_team_abbrev, i.game_code) for _, i in game_info_df.iterrows()]
game_info_df = game_info_df.loc[game_info_df.game_code.isin(test_game_codes), ['game_code', 'game_description']]
game_info_df['ft_score'] = str(game_df.set_index('game_code').loc[game_info_df.game_code.values, 'away_score'].values) + "-" + str(game_df.set_index('game_code').loc[game_info_df.game_code.values, 'home_score'].values)

h_f = None
h_ax = None
h_ax_twin = None

def update_dashboard(change):
    global info_textbox
    info_textbox.value = ''
    if h_ax is not None:
        h_ax.cla()
        h_ax_twin.cla()
    plot()
    
def print_to_textbox(string, textbox_handle, clear_textbox=False):
    if textbox_handle is None:
        print(string)
    else:
        if clear_textbox or (textbox_handle.value == ''):
            textbox_handle.value = string
        else:
            textbox_handle.value += '<br>' + string

def plot():
    global h_f, h_ax, h_ax_twin
    if h_f is None:
        h_f, h_ax = plt.subplots(1, figsize=(9, 4))
        
    plot_game_code = game_info_df.set_index('game_description').loc[match_picker.value, 'game_code']
    plot_time = full_df.loc[full_df.game_code == plot_game_code, ['quarter', 'play_start_time']]
    plot_time = ((900 - plot_time.play_start_time) + \
                 (plot_time.quarter - 1 ) * 900).values
                 
    plot_input = full_df.loc[full_df.game_code == plot_game_code, input_names_old].values
    plot_running_score = full_df.loc[full_df.game_code == plot_game_code, ['home_start_score', 'away_start_score']].values
    plot_output = get_model_outputs(clf_old, plot_input, plot_running_score)    
    print_to_textbox('{0:d} prediction samples found'.format(plot_input.shape[0]), info_textbox)
    
    # Main axis (1X2 prediction)
    plot_x = np.append(np.vstack((plot_time[:-1], plot_time[1:])).flatten(order='F'), plot_time[-1])
    plot_y1 = np.vstack((plot_output['ft_outcome'][:, 0], plot_output['ft_outcome'][:, 0])).flatten(order='F')[:-1]
    plot_y2 = np.vstack((plot_output['ft_outcome'][:, 1], plot_output['ft_outcome'][:, 1])).flatten(order='F')[:-1]
    plot_y3 = np.vstack((plot_output['ft_outcome'][:, 2], plot_output['ft_outcome'][:, 2])).flatten(order='F')[:-1]
    h_ax.stackplot(plot_x, plot_y1, plot_y2, plot_y3,
                   labels=['home','draw','away'],
                   colors=['khaki', 'lightgray', 'lightskyblue'],
                   zorder=0)
    h_ax.legend(loc='upper left')
    if np.max(full_df.loc[full_df.game_code == plot_game_code, "quarter"].values) ==5:
        x_tick_pos = (60 * np.arange(0, 61, 15)).tolist()
        x_tick_str = ["Q1", "Q2", "Q3", "Q4", "OT"]
    else:
        x_tick_pos = (60 * np.arange(0, 60, 15)).tolist()
        x_tick_str = ["Q1", "Q2", "Q3", "Q4"]
        
    h_ax.set_xticks(x_tick_pos)
    h_ax.set_xticklabels(x_tick_str)
    h_ax.set_xlabel('Match time')
    h_ax.set_xlim(0, plot_x.max())
    h_ax.set_ylim(0, 1)
    h_ax.set_yticks(np.arange(0, 1.01, 0.25))
    h_ax.set_yticklabels(['{0:.0f}%'.format(100 * i) for i in np.arange(0, 1.01, 0.25)])    
    h_ax.set_ylabel('Probability')
    h_ax.set_title('{0}'.format(*game_info_df.set_index('game_code').loc[plot_game_code, ['game_description']].tolist()))
    
    # Twin axis (score prediction)
    if h_ax_twin is None:
        h_ax_twin = h_ax.twinx()
    # Home
    plot_home_score = plot_running_score[:, 0] + \
        np.sum(plot_output['home_score'] * np.tile(np.arange(max_home_score + 1), (plot_input.shape[0], 1)), axis=1)
    plot_home_score = np.vstack((plot_home_score, plot_home_score)).flatten(order='F')[:-1]
    h_ax_twin.plot(plot_x, plot_home_score, '-k', linewidth=1, zorder=10, color='darkgoldenrod', label='home score')
    # Away
    plot_away_score = plot_running_score[:, 1] + \
        np.sum(plot_output['away_score'] * np.tile(np.arange(max_away_score + 1), (plot_input.shape[0], 1)), axis=1)
    plot_away_score = np.vstack((plot_away_score, plot_away_score)).flatten(order='F')[:-1]
    h_ax_twin.plot(plot_x, plot_away_score, '-k', linewidth=1, zorder=10, color='dodgerblue', label='away score')
    
    max_score = int(np.ceil(np.append(plot_home_score, plot_away_score).max()))
    h_ax_twin.set_ylim(0, max_score)
    h_ax_twin.set_yticks(np.arange(0, max_score + 0.5))
    h_ax_twin.set_ylabel('Predicted score')
    
    h_f.tight_layout()
    
match_picker = widgets.Dropdown(
    options=game_info_df.game_description.values,
    description='Match'
)
run_btn = widgets.Button(
    description='Plot'
)
info_textbox = widgets.HTML(value="")

display(widgets.VBox([
    widgets.HBox([match_picker, run_btn]),
    info_textbox
]))
run_btn.on_click(update_dashboard)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_info_df['game_description'] = ['{0} {1} at {2} ({3})'.format(i.game_date, i.away_team_abbrev, i.home_team_abbrev, i.game_code) for _, i in game_info_df.iterrows()]


VBox(children=(HBox(children=(Dropdown(description='Match', options=('2022-02-13 15:30:00 LAR at Cin (2337728)…

## Preparing predictions for 2021 season simulation
This is the final section of this notebook, and its purpose is to use the model we have trained to generate match predictions for all games in the 2021/22 season. These will be used in the next notebook to simulate the outcome of the competition.

Since the purpose of this tutorial is not to be run live while games play, we can make the following simplification: we will assume that all 2021 games are either finished or not yet started, and will use observed results and predictions for them, resepectively. Therefore, we will gather input features and generate pre-game predictions only for unplayed games.

First, let's get results for played games:

In [None]:
results_df = game_df.loc[game_df.season==2021, ["game_code", "home_team_id", "away_team_id", "home_score", "away_score"]]
results_df = results_df.set_index('game_code').loc[np.intersect1d(sim_game_codes, played_game_codes)].reset_index()
team_names = game_df.loc[game_df.season==2021, ["home_team_id", "home_team_abbrev"]]
division_data = pd.read_parquet(os.path.join(data_dir, "division_data.parquet"))
team_names =team_names.rename(columns={'home_team_id': 'id', 'home_team_abbrev': 'name'}).drop_duplicates()
current_division_data = division_data.loc[division_data.season==2021].rename(columns={'team_id': 'id'})
team_names= team_names.merge(current_division_data)
team_names = team_names.to_dict(orient='records')

Now let's deal with unplayed games: get input features and pass to the model to get predictions:

In [None]:
sim_df = game_df.set_index('game_code').loc[unplayed_game_codes, ['home_team_id', 'away_team_id']].reset_index()

# Add prior match outcome probabilities
sim_df = sim_df.merge(
    right=prior_df[['game_code'] + np.intersect1d(input_names, prior_df.columns).tolist()],
    how='left', on='game_code'
)
# input_names = ['prior_home', 'prior_away','home_team_has_ball', 'home_start_score', 'away_start_score', 'quarter', 'overtime', 'play_start_time', 'yd_from_goal', 'from_scrimmage', 'kick_off', 'punt', 'point_after_kick', 'two_point_attempt', 'field_goal_attempt', 'down', 'ytg']

sim_df['prior_home'] = np.where(np.isnan(sim_df["prior_home"]), np.mean(full_df["prior_home"]), sim_df["prior_home"])
sim_df['prior_away'] = np.where(np.isnan(sim_df["prior_away"]), np.mean(full_df["prior_away"]), sim_df["prior_away"])
sim_df['home_team_has_ball'] = 0
sim_df['home_start_score'] = 0
sim_df['away_start_score'] = 0
sim_df['quarter'] = 1
sim_df['overtime'] = 0
sim_df['play_start_time'] = 900
sim_df['yd_from_goal'] = 70
sim_df['from_scrimmage'] = 0
sim_df['kick_off'] = 1
sim_df['punt'] = 0
sim_df['point_after_kick'] = 0
sim_df['two_point_attempt'] = 0
sim_df['field_goal_attempt'] = 0
sim_df['down'] = 0
sim_df['ytg'] = -1
sim_df['home_timeouts_remaining'] = 3
sim_df['away_timeouts_remaining'] = 3
sim_input = sim_df[input_names].values
sim_running_score = sim_df[['home_start_score', 'away_start_score']].values
sim_output = get_model_outputs(clf, sim_input, sim_running_score)


Finally, put results and predictions together and store in a JSON file for the simulation notebook:

In [None]:

# Create predictions object
predictions = []
for ind_game, game_code in enumerate(sim_df.game_code.values):
    predictions.append({
        'game_code': int(game_code),
        'home_team_id': int(sim_df.iloc[ind_game]['home_team_id']),
        'away_team_id': int(sim_df.iloc[ind_game]['away_team_id']),
        'pred_exact_score': sim_output['remaining_score'][ind_game].tolist(),
        'pred_outcome': sim_output['ft_outcome'][ind_game].tolist(),
        'current_score': sim_running_score[ind_game].tolist()
    })
sim_data = {
    'teams': team_names,
    'results': results_df.to_dict(orient='records'),
    'predictions': predictions,
    'prediction_params': {'max_home_score': int(max_home_score), 'max_away_score': int(max_away_score)}
}
with open(os.path.join(data_dir, 'simulation_inputs.json'), 'w') as f:
    json.dump(sim_data, f)
os.system('say "done"')


In [None]:
play_subset = (full_df.game_code == 2337619) & (full_df.down==4)&(full_df.from_scrimmage==1)

play_idx = 134
plot_input = full_df.loc[play_subset, input_names].values
plot_running_score = full_df.loc[play_subset, ['home_start_score', 'away_start_score']].values
plot_output = get_model_outputs(clf, plot_input, plot_running_score)    
plot_input_df = pd.DataFrame(plot_input, columns=input_names)


In [None]:
plot_input_opposite = full_df.loc[play_subset, input_names].values
for x in range(len(plot_input)):
    plot_input_opposite[x, 11] = 1 - plot_input_opposite[x][11]
pd.DataFrame(plot_input, columns=input_names)

In [None]:
plot_output = get_model_outputs(clf, plot_input, plot_running_score)
plot_output_opposite = get_model_outputs(clf, plot_input_opposite, plot_running_score)

In [None]:
pd.DataFrame(plot_output_opposite["ft_outcome"])

In [None]:
pd.DataFrame(plot_output["ft_outcome"])

In [None]:
test_df = full_df.loc[~mask_test, input_names]
scores_df = full_df.loc[~mask_test, ['home_final_score', 'away_final_score']]


In [None]:
# plot_input = full_df.loc[~mask_test, input_names].values
# plot_running_score = full_df.loc[~mask_test, ['home_start_score', 'away_start_score']].values
# plot_output = get_model_outputs(clf, plot_input, plot_running_score)


In [None]:
prior_df[prior_df["game_code"]==2411003]

In [None]:
sim_df['prior_home'] = np.where(np.isnan(sim_df["prior_home"]), np.mean(full_df["prior_home"]), sim_df["prior_home"])
sim_df['prior_away'] = np.where(np.isnan(sim_df["prior_away"]), np.mean(full_df["prior_away"]), sim_df["prior_away"])
sim_df['home_team_has_ball'] = 0
sim_df['home_start_score'] = 0
sim_df['away_start_score'] = 0
sim_df['quarter'] = 1
sim_df['overtime'] = 0
sim_df['play_start_time'] = 900
sim_df['yd_from_goal'] = 70
sim_df['from_scrimmage'] = 0
sim_df['kick_off'] = 1
sim_df['punt'] = 0
sim_df['point_after_kick'] = 0
sim_df['two_point_attempt'] = 0
sim_df['field_goal_attempt'] = 0
sim_df['down'] = 0
sim_df['ytg'] = -1


In [None]:
full_df["home_win"] = np.where(full_df["home_team_outcome"]=="W", 1, 0)
full_df["draw"] = np.where(full_df["home_team_outcome"]=="T", 1, 0)
full_df["away_win"] = np.where(full_df["home_team_outcome"]=="L", 1, 0)


In [None]:
plot_game_code = 2337720

plot_input = full_df.loc[mask_test,input_names].values
plot_running_score = full_df.loc[mask_test, ['home_start_score', 'away_start_score']].values
plot_output = get_model_outputs(clf, plot_input, plot_running_score)
# pd.concat([pd.DataFrame(plot_input, columns=input_names), pd.DataFrame(plot_output["ft_outcome"],columns=["home_win", "draw", "away_win"])], axis=1).to_csv("Bills at Chiefs 01-23-2022 new.csv")
test_df = full_df.loc[mask_test, full_df.columns].reset_index()
scores_df = full_df.loc[mask_test, ['home_final_score', 'away_final_score']]
test_values = pd.concat([test_df, pd.DataFrame(plot_output['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])], axis = 1)
# pd.DataFrame(plot_output['ft_outcome'])


In [None]:
plot_game_code = 2337720

plot_input_old = full_df.loc[mask_test,input_names_old].values
plot_running_score_old = full_df.loc[mask_test, ['home_start_score', 'away_start_score']].values
plot_output_old = get_model_outputs(clf_old, plot_input_old, plot_running_score_old)
# pd.concat([pd.DataFrame(plot_input_old, columns=input_names_old), pd.DataFrame(plot_output_old["ft_outcome"],columns=["home_win", "draw", "away_win"])], axis=1).to_csv("Bills at Chiefs 01-23-2022 new.csv")
full_df["home_win"] = np.where(full_df["home_team_outcome"]=="W", 1, 0)
full_df["draw"] = np.where(full_df["home_team_outcome"]=="T", 1, 0)
full_df["away_win"] = np.where(full_df["home_team_outcome"]=="L", 1, 0)
test_df_old = full_df.loc[mask_test, full_df.columns].reset_index()
scores_df = full_df.loc[mask_test, ['home_final_score', 'away_final_score']]
test_values_old = pd.concat([test_df, pd.DataFrame(plot_output_old['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])], axis = 1)
# pd.DataFrame(plot_output_old['ft_outcome'])


In [None]:
from sklearn.calibration import calibration_curve
calib_home_win = calibration_curve(test_values["home_win"], test_values["xhome_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_home_win[1], calib_home_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
from sklearn.calibration import calibration_curve
calib_home_win = calibration_curve(test_values_old["home_win"], test_values_old["xhome_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_home_win[1], calib_home_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_away_win = calibration_curve(test_values["away_win"], test_values["xaway_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_away_win[1], calib_away_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_df = test_values[test_values["quarter"]<=3]

calib_away_win = calibration_curve(calib_df["away_win"], calib_df["xaway_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_away_win[1], calib_away_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
X_train_rf = full_df.loc[~mask_test&(full_df.continuation==0), input_names]
y_train_rf = full_df.loc[~mask_test&(full_df.continuation==0), output_name]
X_test_rf = full_df.loc[mask_test&(full_df.continuation==0), input_names].values
y_test_rf = full_df.loc[mask_test&(full_df.continuation==0), output_name].values




# rf = RandomForestClassifier(n_estimators=10, max_depth=10, verbose=100).fit(X_train_rf, y_train_rf)
# pickle.dump(rf, open(os.path.join(root_dir, 'models/game_score_random_forest.p'), 'wb'))

rf = pickle.load(open(os.path.join(root_dir, "models/game_score_random_forest.p"), 'rb'))
os.system('say "done"')


In [None]:
rf_output = get_model_outputs(rf, plot_input, plot_running_score)

In [None]:
from catboost import CatBoostClassifier
X_train_cb = full_df.loc[~mask_test&(full_df.continuation==0), input_names]
y_train_cb = full_df.loc[~mask_test&(full_df.continuation==0), "home_team_outcome"]
X_test_cb = full_df.loc[mask_test&(full_df.continuation==0), input_names]
y_test_cb = full_df.loc[mask_test&(full_df.continuation==0), "home_team_outcome"]

categoricals = [
    'home_team_has_ball',
    'overtime',
    'from_scrimmage',
    'kick_off',
    'punt',
    'point_after_kick',
    'two_point_attempt',
    'field_goal_attempt',
    'down',
]
param_spaces={
    'random_strength': 39.38775510204081,
    'early_stopping_rounds': 5,
    'min_child_samples': 1,
    'max_depth': 12,
    'learning_rate': 0.05591836734693878,
    'l2_leaf_reg': 55.146938775510215}
cb = CatBoostClassifier(cat_features=categoricals, verbose=1, **param_spaces).fit(X_train_cb, y_train_cb)
pickle.dump(cb, open(os.path.join(root_dir, 'models/game_outcome_catboost.p'), 'wb'))

In [None]:
full_df = full_df.reset_index()
full_df[["xaway_win", "xdraw", "xhome_win"]] = pd.DataFrame(cb.predict_proba(pd.DataFrame(full_df[input_names])))


In [None]:
full_df

In [None]:
test_game_cb = full_df.loc[full_df.game_code==2337720]
plt.plot(test_game_cb["nevent"], test_game_cb["xhome_win"])

In [None]:
from copy import deepcopy
predict_tool = {
    'prior_home': .38,
    'prior_away': .62,
    'home_team_has_ball': None,
    'home_start_score': None,
    'away_start_score': None,
    'quarter': None,
    'overtime': None,
    'play_start_time': None,
    'yd_from_goal': None,
    'from_scrimmage': 1,
    'kick_off': 0,
    'punt': [0, 1, 0],
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': [0, 0, 1],
    'down': 4,
    'ytg': None,
    'home_timeouts_remaining': None,
    'away_timeouts_remaining': None,
    
}
predict_input = pd.DataFrame(predict_tool, index=["go", "punt", "FGA"])
predict_tool_need = {
    'home_team_has_ball': 1,
    'home_start_score': 14,
    'away_start_score': 14,
    'quarter': 3,
    'overtime': 0,
    'play_start_time': 606,
    'yd_from_goal': 40,
    'ytg': 1,
    'home_timeouts_remaining': 3,
    'away_timeouts_remaining': 3,
}
predict_input[[
    'home_team_has_ball',
    'home_start_score',
    'away_start_score',
    'quarter',
    'overtime',
    'play_start_time',
    'yd_from_goal',
    'ytg',
    'home_timeouts_remaining',
    'away_timeouts_remaining']] = list(predict_tool_need.values())
cb.predict_proba(predict_input)

In [None]:
predict_input

In [None]:

predict_input = pd.DataFrame(predict_tool, index=[0])
predict_input_go_for_it = deepcopy(predict_input)
predict_input_go_for_it["punt"] = 0
predict_input_field_goal = deepcopy(predict_input_go_for_it)
predict_input_field_goal["field_goal_attempt"] = 1

print(np.around(cb.predict_proba(predict_input), 3))
print(np.around(cb.predict_proba(predict_input_go_for_it), 3))
print(np.around(cb.predict_proba(predict_input_field_goal), 3))


In [None]:
calib_df = full_df

calib_away_win = calibration_curve(calib_df["draw"], calib_df["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_away_win[1], calib_away_win[0], marker="o")
plt.plot([0, .04], [0, .04])


In [None]:
full_df[mask_test][-1:].values


In [None]:
rf_df = pd.concat([test_df, pd.DataFrame(rf_output["ft_outcome"], columns=["xhome_win", "xdraw", "xaway_win"]), pd.DataFrame(rf_output["home_score"])], axis=1)

In [None]:
pd.DataFrame(rf_output["home_score"]).drop_duplicates()

In [None]:
test_game_rf = rf_df[rf_df["game_code"]==2337720]
plt.plot(test_game_rf["nevent"], test_game_rf["xhome_win"])


In [None]:
plt.plot(test_game["nevent"], test_game["xhome_win"])

In [None]:
calib_df = test_values[test_values["quarter"]==3]

calib_draw = calibration_curve(calib_df["draw"], calib_df["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_draw[1], calib_draw[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_draw = calibration_curve(test_values["draw"], test_values["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_draw[1], calib_draw[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_draw = calibration_curve(test_values_old["draw"], test_values_old["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_draw[1], calib_draw[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
test_game = test_values[test_values["game_code"]==2337720]
test_game["home_win_no_ties"] = test_game["xhome_win"] / (test_game["xhome_win"] + test_game["xaway_win"])
test_game["away_win_no_ties"] = test_game["xaway_win"] / (test_game["xhome_win"] + test_game["xaway_win"])

In [None]:
h_f, h_ax = plt.subplots(1, figsize=(9, 4))
plot_time = test_game[['quarter', 'play_start_time']]
plot_time = ((900 - plot_time.play_start_time) + \
                (plot_time.quarter - 1 ) * 900).values
plot_x = np.append(np.vstack((plot_time[:-1], plot_time[1:])).flatten(order='F'), plot_time[-1])
plot_y1 = np.vstack((test_game["home_win_no_ties"], test_game["home_win_no_ties"])).flatten(order='F')[:-1]
plot_y2 = np.vstack((test_game["away_win_no_ties"], test_game["away_win_no_ties"])).flatten(order='F')[:-1]
h_ax.stackplot(plot_x, plot_y1, plot_y2,
                labels=['Chiefs','Bills'],
                colors=['#E31837', '#00338D'],
                zorder=0)
h_ax.legend(loc='upper left')
if np.max(full_df.loc[full_df.game_code == plot_game_code, "quarter"].values) ==5:
    x_tick_pos = (60 * np.arange(0, 61, 15)).tolist()
    x_tick_str = ["Q1", "Q2", "Q3", "Q4", "OT"]
else:
    x_tick_pos = (60 * np.arange(0, 60, 15)).tolist()
    x_tick_str = ["Q1", "Q2", "Q3", "Q4"]
    
h_ax.set_xticks(x_tick_pos)
h_ax.set_xticklabels(x_tick_str)
h_ax.set_xlabel('Match time')
h_ax.set_xlim(0, plot_x.max())
h_ax.set_ylim(0, 1)
h_ax.set_yticks(np.arange(0, 1.01, 0.25))
h_ax.set_yticklabels(['{0:.0f}%'.format(100 * i) for i in np.arange(0, 1.01, 0.25)])    
h_ax.set_ylabel('Probability')
h_ax.set_title('{0}'.format(*game_info_df.set_index('game_code').loc[plot_game_code, ['game_description']].tolist()))


In [None]:
plt.plot(plot_time, test_game["home_win_no_ties"], '-k')

In [None]:
from copy import deepcopy
plot_game_code = 2337720
mask_fourth_down_bot = (full_df.game_code.isin(test_game_codes))&(full_df.down==4)&(full_df.from_scrimmage==1)
plot_input_fourth_down_bot = full_df.loc[mask_fourth_down_bot,input_names]
plot_input_fourth_down_bot_go = deepcopy(plot_input_fourth_down_bot)
plot_input_fourth_down_bot_punt = deepcopy(plot_input_fourth_down_bot)
plot_input_fourth_down_bot_go["punt"] = 0
plot_input_fourth_down_bot_punt["punt"] = 1
plot_running_score_fourth_down_bot = full_df.loc[mask_fourth_down_bot, ['home_start_score', 'away_start_score']].values
plot_output_fourth_down_bot = get_model_outputs(clf, plot_input_fourth_down_bot, plot_running_score_fourth_down_bot)
plot_output_fourth_down_bot_go = get_model_outputs(clf, plot_input_fourth_down_bot_go, plot_running_score_fourth_down_bot)
plot_output_fourth_down_bot_punt = get_model_outputs(clf, plot_input_fourth_down_bot_punt, plot_running_score_fourth_down_bot)
test_df_fourth_down_bot = full_df.loc[mask_fourth_down_bot, full_df.columns].reset_index()
test_values_fourth_down_bot = pd.concat([test_df_fourth_down_bot, pd.DataFrame(plot_output_fourth_down_bot['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])], axis = 1)
test_values_fourth_down_bot_go = pd.concat([test_df_fourth_down_bot, pd.DataFrame(plot_output_fourth_down_bot_go['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])], axis = 1)
test_values_fourth_down_bot_punt = pd.concat([test_df_fourth_down_bot, pd.DataFrame(plot_output_fourth_down_bot_punt['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])], axis = 1)


In [None]:
test_values_fourth_down_bot["xhome_win_go"] = test_values_fourth_down_bot_go["xhome_win"]
test_values_fourth_down_bot["xhome_win_punt"] = test_values_fourth_down_bot_punt["xhome_win"]
test_values_fourth_down_bot["xgo_advantage"] = test_values_fourth_down_bot['xhome_win_go'] - test_values_fourth_down_bot['xhome_win_punt']

In [None]:
test_df_fourth_down_bot

In [None]:
min(test_df_fourth_down_bot[test_df_fourth_down_bot["punt"]==1]["yd_from_goal"])

In [None]:
max(test_df_fourth_down_bot[test_df_fourth_down_bot["field_goal_attempt"]==1]["yd_from_goal"])

In [None]:
np.array([24, 17])

In [None]:
predict_tool = {
    'prior_home': .6,
    'prior_away': .3995,
    'home_team_has_ball': 0,
    'home_start_score': 24,
    'away_start_score': 20,
    'quarter': 4,
    'overtime': 0,
    'play_start_time': 200,
    'yd_from_goal': 5,
    'from_scrimmage': 1,
    'kick_off': 0,
    'punt': 1,
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': 0,
    'down': 4,
    'ytg': 1,
}
predict_input = pd.DataFrame(predict_tool, index=[0])
predict_input_go_for_it = deepcopy(predict_input)
predict_input_go_for_it["punt"] = 0
predict_input_go_for_it["punt"] = 0
predict_running_score = predict_input[["home_start_score", "away_start_score"]].values
# full_df.loc[mask_fourth_down_bot, ['home_start_score', 'away_start_score']].values
print(get_model_outputs(clf, predict_input, predict_running_score)["ft_outcome"])
print(get_model_outputs(clf, predict_input_go_for_it, predict_running_score)["ft_outcome"])


In [None]:
predict_tool = {
    'prior_home': .6,
    'prior_away': .4,
    'home_team_has_ball': 0,
    'home_start_score': 24,
    'away_start_score': 20,
    'quarter': 4,
    'overtime': 0,
    'play_start_time': 200,
    'yd_from_goal': 5,
    'from_scrimmage': 1,
    'kick_off': 0,
    'punt': 1,
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': 0,
    'down': 4,
    'ytg': 1,
}
predict_input = pd.DataFrame(predict_tool, index=[0])
predict_input_go_for_it = deepcopy(predict_input)
predict_input_go_for_it["punt"] = 0
predict_input_go_for_it["punt"] = 0
predict_running_score = predict_input[["home_start_score", "away_start_score"]].values
# full_df.loc[mask_fourth_down_bot, ['home_start_score', 'away_start_score']].values
print(get_model_outputs(clf, predict_input, predict_running_score)["ft_outcome"])
print(get_model_outputs(clf, predict_input_go_for_it, predict_running_score)["ft_outcome"])


In [None]:


full_df.loc[mask_fourth_down_bot, ['home_start_score', 'away_start_score']].values
get_model_outputs(clf, predict_input, np.array([24, 17]).values())

In [None]:
full_df["ytg_bucket"] = round((full_df["yd_from_goal"]-5)/10) * 10
fourth_downs = full_df.loc[mask_fourth_down_bot]

fourth_downs[input_names+["ytg_bucket"]].groupby("ytg_bucket").mean()

In [None]:
fourth_downs["yd_from_goal"].drop_duplicates().sort_values()

In [None]:
extra_point = full_df.loc[(mask_test)&(full_df.point_after_kick + full_df.two_point_attempt==1)&(full_df.home_team_has_ball==1), input_names].values
extra_point_all = full_df.loc[(mask_test)&(full_df.point_after_kick + full_df.two_point_attempt==1)&(full_df.home_team_has_ball==1)].values
extra_point_running_score = full_df.loc[(mask_test)&(full_df.point_after_kick + full_df.two_point_attempt==1)&(full_df.home_team_has_ball==1), ["home_start_score", "away_start_score"]].values
extra_point_outputs = get_model_outputs(clf, extra_point, extra_point_running_score)
extra_point_df = pd.concat([pd.DataFrame(extra_point_all, columns=full_df.columns), pd.DataFrame(extra_point_outputs["home_score"])], axis=1)

In [None]:
clf.intercepts_[2]

In [None]:
extra_point_df.to_csv("test.csv")

In [None]:
extra_point

In [None]:
full_df[["event_id", "event_name"]].drop_duplicates().sort_values("event_id")