In [1]:
import os
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
data_dir = os.path.join(root_dir, 'data')

import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import ipywidgets as widgets
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import brier_score_loss, make_scorer, log_loss, mean_squared_error
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.metrics import log_loss
from IPython.display import display, HTML
import pickle
from sklearn.model_selection import GroupKFold, RandomizedSearchCV, cross_val_predict
import scipy
def uniform_distribution(lo, hi):
    """Return a frozen continuous uniform distribution on ``[lo, hi]``.

    ``scipy.stats.uniform`` is parameterised by ``loc`` and ``scale`` (it
    covers ``[loc, loc + scale]``), so the interval width ``hi - lo`` is passed
    as the scale. The returned object exposes ``rvs`` and can therefore be used
    directly as a parameter distribution in ``RandomizedSearchCV``.

    Parameters
    ----------
    lo : float
        Lower bound of the interval.
    hi : float
        Upper bound of the interval.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is available as an attribute.
    from scipy import stats

    return stats.uniform(lo, hi - lo)
def ProbaScoreProxy(y_true, y_probs, proxied_func, **kwargs):
    """Call ``proxied_func(y_true, y_probs, **kwargs)`` and return its result.

    Thin adapter so that a metric function can be supplied as a keyword
    argument (``proxied_func``) when building a scorer.
    """
    score = proxied_func(y_true, y_probs, **kwargs)
    return score

%matplotlib notebook



# Data
Here is the initial data pull, which includes:
* game_df: this is game information for all games from 2008 to 2021 including unplayed games
* test_game_codes are going to be the game_codes of games in 2021 that are played (note: game_state_id 11 is finished game, 1 is Pre-Game)
* sim_game_codes are all games (played and unplayed) of 2021
* we split up all the game_codes into "played_game_codes" and "unplayed_game_codes"

#

In [2]:
# Game table, newest game first; odds table reduced to one row per game.
game_df = (
    pd.read_parquet(os.path.join(data_dir, 'game_data.parquet'))
    .sort_values(by='game_date', ascending=False)
    .reset_index(drop=True)
)
odds_df = pd.read_parquet(os.path.join(data_dir, 'odds_data.parquet')).drop_duplicates("game_code")

# Row selectors shared by the game-code subsets below.
in_2021_season = game_df.season == 2021
is_finished = game_df.game_state_id == 11  # 11 = finished game, 1 = pre-game

game_codes = game_df.game_code.values
test_game_codes = game_df.loc[in_2021_season & is_finished, 'game_code'].values
sim_game_codes = game_df.loc[in_2021_season, 'game_code'].values
played_game_codes = game_df.loc[is_finished, 'game_code'].values
unplayed_game_codes = game_df.loc[~is_finished, 'game_code'].values

# Every game that is not in the unplayed set must be fully populated.
played_rows = game_df.loc[~game_df.game_code.isin(unplayed_game_codes)]
assert played_rows.isna().sum().sum() == 0, 'Unexpected NaN values found in game data frame'

display(HTML(game_df.head(3).to_html(index=False)))
print('Games in sample: {0:d}'.format(game_codes.size))


game_code,game_date,home_team_id,away_team_id,season,home_team_abbrev,away_team_abbrev,week,game_type_id,home_score,away_score,status,game_state_id
2337728,2022-02-13 15:30:00,327,343,2021,Cin,LAR,23,3,20,23,Final,11
2337725,2022-01-30 15:30:00,343,359,2021,LAR,SF,21,7,20,17,Final,11
2337724,2022-01-30 14:00:00,339,327,2021,KC,Cin,21,7,24,27,Final,11


Games in sample: 3758


# Prior Data
Prior data is pulled from csv file and this will give us the inputs we need for the pre-game match predictions

In [3]:
prior_df = pd.read_csv(os.path.join(data_dir, 'game_priors.csv'))
display(HTML(prior_df.head(3).to_html(index=False)))

game_code,home_team_id,away_team_id,home_team_abbrev,away_team_abbrev,prior_home,prior_away,game_date
887191,329,347,Cle,Min,0.300686,0.69693,2009-09-13 13:00:00
887208,334,347,Det,Min,0.203864,0.795172,2009-09-20 13:00:00
887257,347,327,Min,Cin,0.677353,0.320797,2009-12-13 12:00:00


# Event Data
* Named event_df
* Is the play by play data from all games 2008 to 2021

# Input Features
Now that we have the datasets loaded we can load the features:
* prior_home: estimated probability of the home team winning at t=0
* prior_away: estimated probability of the away team winning at t=0
* home_team_has_ball: binary value for whether home team is in possession of the ball
* home_start_score: the score of the home team at the beginning of each play
* away_start_score: the score of the away team at the beginning of each play
* quarter: the current quarter/period the game is in (1-4 for all games, 5 if they are in the overtime period)
* play_start_time: numeric value of the time remaining in the quarter (900 at the beginning of the quarter, 0 at the end)
* yd_from_goal: the amount of yards between the line of scrimmage and the goal line for the team in possession of the ball
* down: the number of downs that the team in possession of the ball has accumulated (1-4, down=-1 in plays that are not from scrimmage)
* ytg: the amount of yards between the current line of scrimmage and the first down line. (ytg=-1 in plays that are not from scrimmage)

# Target
remaining_exact_score: a single integer that encodes each combination of remaining away and home points (note: in this sample, max_away_score=59 and max_home_score=62)
* for example, if the current score is 17-24 (away_start_score=17 & home_start_score=24) and the final score is 27-30, then:
    *        remaining_exact_score = (27 - 17) + (59 + 1) * (30 - 24) = 370
* this ensures that all combinations of remaining exact scores are unique values


# Merged Table
full_df: the merged table of events_df and prior_df keeping only the input features and the target

In [4]:
events_df = pd.read_parquet(os.path.join(data_dir, 'event_data.parquet'))

# Half of the game each play belongs to. The +0.01 nudges exact .5 values
# upward before rounding, so quarters 1-2 -> 1, 3-4 -> 2 and 5 -> 3. The
# timeout counters below restart inside each of these groups.
events_df["half"] = round((events_df["quarter"] + 0.01) / 2)

# Timeout flags. As coded here, event_id 57 is charged to the team that has the
# ball and event_id 58 to the team that does not.
events_df["home_timeout"] = np.where(((events_df["event_id"]==57)&(events_df["home_team_has_ball"]==1))|((events_df["event_id"]==58)&(events_df["home_team_has_ball"]==0)), 1, 0)
events_df["away_timeout"] = np.where(((events_df["event_id"]==57)&(events_df["home_team_has_ball"]==0))|((events_df["event_id"]==58)&(events_df["home_team_has_ball"]==1)), 1, 0)

# Order plays by game and event number so the cumulative sums run in play order.
events_df = events_df.sort_values(["game_code", "nevent"])
# Three timeouts per group, minus the ones used so far, clipped to [0, 3].
events_df["home_timeouts_remaining"] = np.clip(3 - events_df.groupby(["game_code", "half"])["home_timeout"].cumsum(), 0, 3)
events_df["away_timeouts_remaining"] = np.clip(3 - events_df.groupby(["game_code", "half"])["away_timeout"].cumsum(), 0, 3)

# Largest final scores in the sample; they size the exact-score encoding.
max_away_score = np.max(game_df["away_score"])
max_home_score = np.max(game_df["home_score"])

input_names = [
    # "prior_home",
    # "prior_away",
    "vegas_away_prediction",
    "vegas_home_prediction",
    # "cur_spread",
    # "cur_over_under",
    "home_team_has_ball",
    # "home_start_score",
    # "away_start_score",
    "current_score_diff",
    "current_score_total",
    "quarter",
    "overtime",
    "play_start_time",
    "yd_from_goal",
    "from_scrimmage",
    "kick_off",
    "punt",
    "point_after_kick",
    "two_point_attempt",
    "field_goal_attempt",
    "down",
    "ytg",
    "home_timeouts_remaining",
    "away_timeouts_remaining",
]
output_name = 'remaining_exact_score'

# Encode the remaining (away, home) points as one integer label:
#   label = away_remaining + (max_away_score + 1) * home_remaining
events_df['remaining_exact_score'] = (
    events_df["away_rest_of_game_score"]
    + (max_away_score + 1) * events_df['home_rest_of_game_score']
)
# Signed score change of the play: positive for the home team, negative for away.
# NOTE(review): a home gain of exactly 7 is recoded to 1 and there is no matching
# rule for the away side - confirm this is intended.
events_df['mov_change'] = np.where(events_df["home_team_has_ball"], np.where(events_df["home_score_added"]==7, 1, events_df["home_score_added"]), -events_df["away_score_added"])

full_df = events_df.merge(prior_df, on="game_code", how="left").merge(odds_df, on="game_code", how="left")

# Impute missing betting lines with the column mean. The result is assigned back
# because `full_df[col].fillna(..., inplace=True)` operates on a column selection
# and is not guaranteed to update full_df (it is a no-op under copy-on-write).
# NOTE(review): the mean is taken over all seasons, including the test season.
full_df["cur_spread"] = full_df["cur_spread"].fillna(full_df["cur_spread"].mean())
full_df["cur_over_under"] = full_df["cur_over_under"].fillna(full_df["cur_over_under"].mean())

# Split the over/under total into per-team expected points using the spread.
full_df["vegas_away_prediction"] = full_df["cur_over_under"] / 2 + full_df["cur_spread"] * 0.5
full_df["vegas_home_prediction"] = full_df["cur_over_under"] / 2 - full_df["cur_spread"] * 0.5

# Keep only plays where every input feature and the target are present. The
# explicit copy lets later cells add columns without SettingWithCopyWarning.
full_df = full_df[full_df[input_names+[output_name]].notna().all(axis=1)].copy()


In [5]:
# Show what the data frame looks like at the beginning and end of a game.
# One random play is drawn and its game is used; no random_state is passed, so
# the selected game changes from run to run.
sample_game_code = full_df.sample(1).iloc[0]['game_code']
# Team abbreviations and kick-off date are looked up in prior_df.
sample_game_info = prior_df.loc[prior_df.game_code == sample_game_code, ['home_team_abbrev', 'away_team_abbrev', 'game_date']].iloc[0].tolist()
print('\nData sample for game: {0} v {1} ({2})'.format(*sample_game_info))
# First five and last five rows of the selected game, restricted to the model inputs.
display(HTML(full_df.loc[full_df.game_code == sample_game_code][["game_code", "nevent"] + input_names]
             .iloc[list(range(0, 5)) + list(range(-5, 0))]
             .to_html(index=False)))


Data sample for game: NYG v TB (2012-09-16 13:00:00)


game_code,nevent,vegas_away_prediction,vegas_home_prediction,home_team_has_ball,current_score_diff,current_score_total,quarter,overtime,play_start_time,yd_from_goal,from_scrimmage,kick_off,punt,point_after_kick,two_point_attempt,field_goal_attempt,down,ytg,home_timeouts_remaining,away_timeouts_remaining
1204651,1,17.25,26.25,0,0,0,1,0,900.0,65,0,1,0,0,0,0,0,-1,3,3
1204651,2,17.25,26.25,1,0,0,1,0,900.0,103,0,1,0,0,0,0,0,-1,3,3
1204651,3,17.25,26.25,1,0,0,1,0,894.0,70,1,0,0,0,0,0,1,10,3,3
1204651,4,17.25,26.25,1,0,0,1,0,862.0,62,1,0,0,0,0,0,2,2,3,3
1204651,5,17.25,26.25,1,0,0,1,0,817.0,22,1,0,0,0,0,0,1,10,3,3
1204651,228,17.25,26.25,0,7,75,4,0,18.0,45,1,0,0,0,0,0,1,10,2,1
1204651,229,17.25,26.25,0,7,75,4,0,12.0,45,1,0,0,0,0,0,2,10,2,1
1204651,230,17.25,26.25,1,7,75,4,0,12.0,76,0,0,0,0,0,0,0,-1,2,1
1204651,231,17.25,26.25,1,7,75,4,0,5.0,70,1,0,0,0,0,0,1,10,2,1
1204651,232,17.25,26.25,1,7,75,4,0,0.0,71,0,0,0,0,0,0,2,-1,2,1


# Train/Test Data Split
Training and test dataframes are created (2009-2020 are training seasons and 2021 is the test season)
# Model
This is the stored model that predicts the probability of each remaining score combination at each point of the game

In [6]:
# Feature set for the models below: the raw betting lines replace the derived
# vegas_* predictions used in the earlier cell.
input_names = [
    # "prior_home",
    # "prior_away",
    # "vegas_away_prediction",
    # "vegas_home_prediction",
    "cur_spread",
    "cur_over_under",
    "home_team_has_ball",
    # "home_start_score",
    # "away_start_score",
    "current_score_diff",
    "current_score_total",
    "quarter",
    "overtime",
    "play_start_time",
    "yd_from_goal",
    "from_scrimmage",
    "kick_off",
    "punt",
    "point_after_kick",
    "two_point_attempt",
    "field_goal_attempt",
    "down",
    "ytg",
    "home_timeouts_remaining",
    "away_timeouts_remaining",
]

# Season-based split; every subset keeps only non-continuation plays with a
# non-negative target.
mask_train = (full_df.season_x<=2019)&(full_df.continuation==0)&(full_df[output_name]>=0)
mask_test = (full_df.season_x==2021)&(full_df.continuation==0)&(full_df[output_name]>=0)
mask_val = (full_df.season_x==2020)&(full_df.continuation==0)&(full_df[output_name]>=0)

# The random forest trains on every season except the 2021 test season.
# `~mask_test` must not be used for this: negating the whole conjunction also
# selects 2021 plays that merely fail the continuation/target filters, which
# leaks test-season games into the training set.
mask_train_rf = (full_df.season_x!=2021)&(full_df.continuation==0)&(full_df[output_name]>=0)

X_train_rf = full_df.loc[mask_train_rf, input_names]
y_train_rf = full_df.loc[mask_train_rf, output_name]
X_test_rf = full_df.loc[mask_test, input_names]
y_test_rf = full_df.loc[mask_test, output_name]

# # rf = pickle.load(open(os.path.join(root_dir, "models/game_score_random_forest_100_10_new_features.p"), 'rb'))
# rf = RandomForestClassifier(n_estimators=100, max_depth=10, verbose=100, n_jobs=-1).fit(X_train_rf, y_train_rf)
# pickle.dump(rf, open(os.path.join(root_dir, 'models/game_score_random_forest_100_10_vegas_spread.p'), 'wb'))
# rf.score(X_test_rf, y_test_rf)

In [12]:
# Scorer built around brier_score_loss: it receives predicted probabilities and,
# being a loss, is declared lower-is-better.
brier_scorer = make_scorer(
    ProbaScoreProxy,
    proxied_func=brier_score_loss,
    needs_proba=True,
    greater_is_better=False,  # True
)

output_name_score_change = "mov_change"
group_col = "game_code"

# Season-based split restricted to non-continuation plays with a valid target.
usable_rows = (full_df.continuation == 0) & (full_df[output_name] >= 0)
mask_train = (full_df.season_x <= 2019) & usable_rows
mask_test = (full_df.season_x == 2021) & usable_rows
mask_val = (full_df.season_x == 2020) & usable_rows

# Unsplit data (used for out-of-fold predictions).
X_all = full_df[input_names]
y_all_score_change = full_df[output_name_score_change]
group_all_score_change = full_df[group_col]

# Features, target and grouping key for each subset.
split_columns = (input_names, output_name_score_change, group_col)
X_train, y_train_score_change, groups_train = (full_df.loc[mask_train, cols] for cols in split_columns)
X_test, y_test_score_change, groups_test = (full_df.loc[mask_test, cols] for cols in split_columns)
X_val, y_val_score_change, group_val = (full_df.loc[mask_val, cols] for cols in split_columns)

# Inputs that CatBoost should treat as categorical.
categoricals = [
    'home_team_has_ball', 'overtime', 'from_scrimmage',
    'kick_off', 'punt', 'point_after_kick',
    'two_point_attempt', 'field_goal_attempt', 'down',
]

COMPUTE_PARAMS = dict(thread_count=-1)
# COMPUTE_PARAMS = {"task_type":"GPU", "devices": "0:1", "bootstrap_type": "Poisson"}

# Search space sampled by the randomized search.
param_spaces = dict(
    learning_rate=uniform_distribution(0.01, 0.1),
    max_depth=range(5, 16),
    # subsample=uniform_distribution(0.9, 1),
    l2_leaf_reg=uniform_distribution(1, 4),
    boosting_type=["Plain"],
)

# Best parameters recorded from an earlier search run.
search_score_change_best = dict(
    boosting_type='Plain',
    l2_leaf_reg=2.2958350559263474,
    learning_rate=0.036210622617823776,
    max_depth=15,
    subsample=0.9399860971715256,
)

import catboost as cb

# 2020-season validation pool handed to fit() as the evaluation set.
curr_val_pool_score_change = cb.Pool(
    data=X_val,
    label=y_val_score_change,
    cat_features=categoricals,
    feature_names=input_names,
)

CLASSIFIER_PARAMS = dict(
    # n_estimators=10,
    num_trees=1000,
    auto_class_weights="Balanced",
    # loss_function=JITRmseObjective(),
    # eval_metric=JITRmseMetric(),
    **COMPUTE_PARAMS,
)
HPO_PARAMS = dict(n_jobs=1)

# Folds are grouped so that all plays of one game stay in the same fold.
cv = GroupKFold(n_splits=3)



In [8]:
# search_score_change = pickle.load(open(os.path.join(root_dir, "models/score_change_catboost.p"), 'rb'))


In [13]:
# search_score_change = pickle.load(open(os.path.join(root_dir, "models/score_change_catboost.p"), 'rb'))
# Keyword arguments forwarded to the estimator's fit() by the search below.
FIT_PARAMS = {
    "verbose": 1,
    "early_stopping_rounds": 5,
    "use_best_model": True,
}

# Base classifier; early_stopping_rounds is set here as well as in FIT_PARAMS.
clf_score_change=cb.CatBoostClassifier(early_stopping_rounds=5,cat_features=categoricals, **CLASSIFIER_PARAMS)

# Randomized search over param_spaces with game-grouped folds.
# NOTE(review): no `scoring` argument is passed, so brier_scorer defined earlier
# is not used and candidates are ranked by the estimator's default score.
search_score_change = RandomizedSearchCV(clf_score_change, param_spaces,cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=100)
# search_score_change.fit(X_train[:10000],y_train_score_change[:10000], eval_set=curr_val_pool_score_change,groups=groups_train[:10000], **FIT_PARAMS)
# The same 2020 validation pool is used as eval_set for every fold and candidate.
search_score_change.fit(X_train,y_train_score_change, eval_set=curr_val_pool_score_change,groups=groups_train, **FIT_PARAMS)
# pickle.dump(search_score_change, open(os.path.join(root_dir, 'models/score_change_catboost.p'), 'wb'))
# search_score_change = pickle.load(open(os.path.join(root_dir, "models/score_change_catboost.p"), 'rb'))
# import winsound
# frequency = 1000  # Set Frequency To 2500 Hertz
# duration = 500  # Set Duration To 1000 ms == 1 second
# winsound.Beep(frequency, duration)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3; 1/10] START boosting_type=Plain, l2_leaf_reg=2.1236203565420873, learning_rate=0.09556428757689246, max_depth=15


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 1.5550601	test: 1.5579795	best: 1.5579795 (0)	total: 1.92s	remaining: 31m 57s
1:	learn: 1.2759224	test: 1.2856356	best: 1.2856356 (1)	total: 3.04s	remaining: 25m 15s
2:	learn: 1.0777008	test: 1.0916044	best: 1.0916044 (2)	total: 4.82s	remaining: 26m 42s
3:	learn: 0.9325709	test: 0.9495296	best: 0.9495296 (3)	total: 6.44s	remaining: 26m 43s
4:	learn: 0.8239001	test: 0.8437958	best: 0.8437958 (4)	total: 7.05s	remaining: 23m 22s
5:	learn: 0.7411585	test: 0.7621519	best: 0.7621519 (5)	total: 7.6s	remaining: 20m 58s
6:	learn: 0.6691329	test: 0.6926475	best: 0.6926475 (6)	total: 8.54s	remaining: 20m 11s
7:	learn: 0.6114961	test: 0.6371933	best: 0.6371933 (7)	total: 9.13s	remaining: 18m 52s
8:	learn: 0.5581990	test: 0.5856204	best: 0.5856204 (8)	total: 10.3s	remaining: 18m 52s
9:	learn: 0.5097133	test: 0.5357358	best: 0.5357358 (9)	total: 11.3s	remaining: 18m 38s
10:	learn: 0.4726345	test: 0.5003599	best: 0.5003599 (10)	total: 12.1s	remaining: 18m 6s
11:	learn: 0.4351057	test: 0.463

In [None]:
# Out-of-fold score-change predictions for every play, using 5 game-grouped folds.
big_cv = GroupKFold(n_splits=5)
# Rebuild the classifier with the tuned parameters and the tree count of the best estimator.
# NOTE(review): CLASSIFIER_PARAMS (e.g. auto_class_weights) are not passed here,
# unlike clf_score_change - confirm the omission is intentional.
final_model = cb.CatBoostClassifier(n_estimators=search_score_change.best_estimator_.tree_count_,**search_score_change.best_params_,cat_features=categoricals,verbose=1)
# NOTE(review): no `method` argument is given, so this presumably returns
# predicted class labels rather than class probabilities - confirm.
predictions_score_change = cross_val_predict(final_model,X_all, y_all_score_change,groups=group_all_score_change,cv=big_cv,verbose=1,n_jobs=5)


In [None]:
X_test

In [None]:
pd.concat([X_train, y_train_score_change], axis=1).groupby("mov_change").count()

In [None]:
# Exact-score random forest: 100 trees, depth <= 10, max_features=None.
rf_100_none = RandomForestClassifier(
    n_estimators=100, max_depth=10, verbose=100, n_jobs=-1, max_features=None
).fit(X_train_rf, y_train_rf)
# Persist the fitted model; the context manager guarantees the file is flushed
# and closed instead of leaving the handle to the garbage collector.
with open(os.path.join(root_dir, 'models/game_score_random_forest_100_10_no_max_features_vegas_spread.p'), 'wb') as model_file:
    pickle.dump(rf_100_none, model_file)
# Silence per-tree logging before scoring on the held-out season.
rf_100_none.verbose = 0
rf_100_none.score(X_test_rf, y_test_rf)

In [None]:
rf_100_none.verbose = 0
rf_100_none.score(X_test_rf, y_test_rf)

In [None]:
# Exact-score random forest: 50 trees, depth <= 10, max_features=None.
rf_50_none = RandomForestClassifier(
    n_estimators=50, max_depth=10, verbose=100, n_jobs=-1, max_features=None
).fit(X_train_rf, y_train_rf)
# Persist the fitted model; the context manager guarantees the file is flushed
# and closed instead of leaving the handle to the garbage collector.
with open(os.path.join(root_dir, 'models/game_score_random_forest_50_no_max_features_vegas_spread.p'), 'wb') as model_file:
    pickle.dump(rf_50_none, model_file)
# Silence per-tree logging before scoring on the held-out season.
rf_50_none.verbose = 0
rf_50_none.score(X_test_rf, y_test_rf)

In [None]:
# Test set = plays from finished 2021 games; everything else is training data.
test_game_codes = game_df.loc[(game_df.season == 2021) & (game_df.game_state_id == 11), 'game_code'].values
sim_game_codes = game_df.loc[game_df.season == 2021, 'game_code'].values
mask_test = full_df.game_code.isin(test_game_codes)
output_name_score_change = "mov_change"

X_train = full_df.loc[(~mask_test)&(full_df.continuation==0), input_names]
y_train_score_change = full_df.loc[(~mask_test)&(full_df.continuation==0), output_name_score_change]
X_test = full_df.loc[mask_test&(full_df.continuation==0), input_names]
y_test_score_change = full_df.loc[mask_test&(full_df.continuation==0), output_name_score_change]


# rf_score_change = RandomForestClassifier(n_estimators=100, max_depth=10, verbose=10, n_jobs=-1).fit(X_train, y_train_score_change)
# pickle.dump(rf_score_change, open(os.path.join(root_dir, 'models/score_change_random_forest_100_10.p'), 'wb'))
# Load the stored score-change forest. pickle runs code from the file while
# loading, so only open model files from a trusted source. The context manager
# closes the handle deterministically.
with open(os.path.join(root_dir, "models/score_change_random_forest_100_10.p"), 'rb') as model_file:
    rf_score_change = pickle.load(model_file)

# One probability column per score-change class.
# NOTE(review): predict_proba columns follow rf_score_change.classes_; the names
# below assume that order runs from the largest away gain to the largest home
# gain - confirm against the fitted model.
score_change_columns = [
    "away_increase_6",
    "away_increase_3",
    "away_increase_2",
    "away_increase_1",
    "no_increase",
    "home_increase_1",
    "home_increase_2",
    "home_increase_3",
    "home_increase_6",
]
assert len(rf_score_change.classes_) == len(score_change_columns), (
    'Loaded model predicts {0} classes but {1} column names are defined'.format(
        len(rf_score_change.classes_), len(score_change_columns)))

full_df[score_change_columns] = rf_score_change.predict_proba(full_df[input_names])

# clf = MLPClassifier(
#     hidden_layer_sizes=[10,5],
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,  # L2 regularization parameter
#     learning_rate_init=0.001,
#     batch_size=128,
#     random_state=1,
#     max_iter=50, #50
#     early_stopping=True,
#     validation_fraction=0.1,
#     n_iter_no_change=5,
#     verbose=True).fit(X_train, y_train)
# pickle.dump(clf, open(os.path.join(root_dir, 'models/game_score_new_4.sav'), 'wb'))

# os.system('say "done"')


In [None]:
# n_categories = (max_home_score + 1) * (max_away_score + 1)
# param_spaces={
#     "learning_rate": [0.01],
#     "max_depth": range(5, 16),
#     "subsample": uniform_distribution(0.9, 1),
#     "l2_leaf_reg": uniform_distribution(1, 4),
#     "boosting_type": ["Plain"],
#     }
# y_train = full_df.loc[mask_train, output_name]
# y_test = full_df.loc[mask_test, output_name]
# y_val = full_df.loc[mask_val, output_name]
# curr_val_pool_score_combination =cb.Pool(
#         data=X_val,
#         label=y_val,
#         feature_names=input_names
#     )


# clf_score_combinations=cb.CatBoostClassifier(early_stopping_rounds=5, classes_count=n_categories, **CLASSIFIER_PARAMS)

# search_score_combinations = RandomizedSearchCV(clf_score_combinations, param_spaces,cv=cv,random_state=42,n_iter=1,n_jobs=1,verbose=1)
# search_score_combinations.fit(X_train,y_train, eval_set=curr_val_pool_score_combination,groups=groups_train, **FIT_PARAMS)

In [None]:
# Class probabilities from the tuned score-change classifier, one column per class label.
# NOTE(review): the frame gets a fresh 0..n-1 index, while full_df keeps the
# index left over from row filtering - align on position, not on index.
predict_values = pd.DataFrame(search_score_change.predict_proba(full_df[input_names]), columns=search_score_change.classes_)

In [None]:
search_score_change.classes_

In [None]:
search_score_change.best_params_

In [None]:
# Model inputs plus score-change probabilities; the away matrix starts out as an
# independent deep copy of the home matrix.
X_home = full_df[input_names + score_change_columns]
X_away = X_home.copy(deep=True)

In [None]:
test_game_codes = game_df.loc[(game_df.season == 2021) & (game_df.game_state_id == 11), 'game_code'].values
sim_game_codes = game_df.loc[game_df.season == 2021, 'game_code'].values
mask_test = full_df.game_code.isin(test_game_codes)
output_name_home_score = "home_rest_of_game_score"
output_name_away_score = "away_rest_of_game_score"

# Season-based split on non-continuation plays: train <= 2019, validation 2020,
# test 2021. The masks are built once so the subsets cannot drift apart.
score_features = input_names + score_change_columns
train_rows = (full_df.season_x <= 2019) & (full_df.continuation == 0)
val_rows = (full_df.season_x == 2020) & (full_df.continuation == 0)
test_rows = (full_df.season_x == 2021) & (full_df.continuation == 0)

X_train = full_df.loc[train_rows, score_features]
y_train_home_score_prediction = full_df.loc[train_rows, output_name_home_score]
y_train_away_score_prediction = full_df.loc[train_rows, output_name_away_score]
groups_train = full_df.loc[train_rows, "game_code"]
X_test = full_df.loc[test_rows, score_features]
y_test_home_score_prediction = full_df.loc[test_rows, output_name_home_score]
y_test_away_score_prediction = full_df.loc[test_rows, output_name_away_score]
groups_test = full_df.loc[test_rows, "game_code"]
X_val = full_df.loc[val_rows, score_features]
y_val_home_score_prediction = full_df.loc[val_rows, output_name_home_score]
y_val_away_score_prediction = full_df.loc[val_rows, output_name_away_score]
# Fixed: this previously filtered on season 2010, which did not match X_val.
groups_val = full_df.loc[val_rows, "game_code"]

# Inputs that CatBoost should treat as categorical.
categoricals = [
    'home_team_has_ball',
    'overtime',
    'from_scrimmage',
    'kick_off',
    'punt',
    'point_after_kick',
    'two_point_attempt',
    'field_goal_attempt',
    'down',
]
COMPUTE_PARAMS = {"thread_count": -1}
# Broad search space; it is overridden below by the best values already found.
param_spaces={
    "learning_rate":np.linspace(0.01,0.1),
    "min_child_samples":range(1,1000,100),
    "subsample":np.linspace(0.333,1),
    "random_strength":np.linspace(1,100),
    "l2_leaf_reg":np.linspace(0.1,100),
    "max_depth":range(2,16)}

# param_spaces={
#     'random_strength': 39.38775510204081,
#     'early_stopping_rounds': 5,
#     'min_child_samples': 1,
#     'max_depth': 12,
#     'learning_rate': 0.05591836734693878,
#     'l2_leaf_reg': 55.146938775510215,
#     **COMPUTE_PARAMS}
import catboost as cb
# Best parameters from an earlier search. Each value is wrapped in a list because
# RandomizedSearchCV only accepts lists or distributions; bare scalars raise an
# error before any model is fitted. With a single combination the search runs
# one candidate regardless of n_iter.
param_spaces = {
    'subsample': [0.6324693877551021],
    'random_strength': [100.0],
    'min_child_samples': [301],
    'max_depth': [11],
    'learning_rate': [0.02836734693877551],
    'l2_leaf_reg': [49.030612244897966]
 }
# 2020-season validation pool (away target) used for early stopping.
curr_val_pool =cb.Pool(
        data=X_val,
        label=y_val_away_score_prediction,
        cat_features=categoricals,
        feature_names=input_names + score_change_columns
    )

# Folds are grouped so that all plays of one game stay in the same fold.
cv=GroupKFold(n_splits=3)
#param_distributions={"alpha":np.linspace(3200,4500,1000)}
reg=CatBoostRegressor(early_stopping_rounds=5,cat_features=categoricals,n_estimators=10000)#, devices="0:1",bootstrap_type= "Poisson", max_bin=32,thread_count=1,objective="RMSE")
search=RandomizedSearchCV(reg,param_spaces,cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=10)
search.fit(X_train,y_train_away_score_prediction, eval_set=curr_val_pool,groups=groups_train)
# cb_score_prediction_home = CatBoostRegressor(cat_features=categoricals, verbose=1, **param_spaces).fit(X_train, y_train_home_score_prediction)
# cb_score_prediction_away = CatBoostRegressor(cat_features=categoricals, verbose=1, **param_spaces).fit(X_train, y_train_away_score_prediction)


# rf_score_change = RandomForestClassifier(n_estimators=100, max_depth=10, verbose=10, n_jobs=-1).fit(X_train, y_train_score_change)
# rf_score_prediction_home = RandomForestRegressor(n_estimators=50, max_depth=12, verbose=5, n_jobs=-1).fit(X_train, y_train_home_score_prediction)
# rf_score_prediction_away = RandomForestRegressor(n_estimators=50, max_depth=12, verbose=5, n_jobs=-1).fit(X_train, y_train_away_score_prediction)
# cb_score_prediction_home = CatBoostRegressor(n_estimators=50, max_depth=12, verbose=5, n_jobs=-1).fit(X_train, y_train_home_score_prediction)

# pickle.dump(rf_score_change, open(os.path.join(root_dir, 'models/score_change_random_forest_100_10.p'), 'wb'))

# clf = MLPClassifier(
#     hidden_layer_sizes=[10,5],
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,  # L2 regularization parameter
#     learning_rate_init=0.001,
#     batch_size=128,
#     random_state=1,
#     max_iter=50, #50
#     early_stopping=True,
#     validation_fraction=0.1,
#     n_iter_no_change=5,
#     verbose=True).fit(X_train, y_train)
# pickle.dump(clf, open(os.path.join(root_dir, 'models/game_score_new_4.sav'), 'wb'))

os.system('say "done"')


In [None]:
# Fixed parameter set (single-value lists, as RandomizedSearchCV requires).
param_spaces = {
    'subsample': [0.6324693877551021],
    'random_strength': [100.0],
    'min_child_samples': [301],
    'max_depth': [11],
    'learning_rate': [0.02836734693877551],
    'l2_leaf_reg': [49.030612244897966]
 }
# Validation pool for the HOME model. It must carry the home target: the pool
# used previously was labelled with the away target, so early stopping for the
# home regressor was evaluated against the wrong scores. A separate name is used
# so the away pool in `curr_val_pool` is left untouched.
curr_val_pool_home =cb.Pool(
        data=X_val,
        label=y_val_home_score_prediction,
        cat_features=categoricals,
        feature_names=input_names + score_change_columns
    )

# Folds are grouped so that all plays of one game stay in the same fold.
cv=GroupKFold(n_splits=3)
#param_distributions={"alpha":np.linspace(3200,4500,1000)}
reg=CatBoostRegressor(early_stopping_rounds=5,cat_features=categoricals,n_estimators=10000)#, devices="0:1",bootstrap_type= "Poisson", max_bin=32,thread_count=1,objective="RMSE")
search_home=RandomizedSearchCV(reg,param_spaces,cv=cv,random_state=42,n_iter=10,n_jobs=1,verbose=10)
search_home.fit(X_train,y_train_home_score_prediction, eval_set=curr_val_pool_home,groups=groups_train)


In [None]:
# Out-of-fold HOME rest-of-game score predictions, using 5 game-grouped folds.
big_cv = GroupKFold(n_splits=5)
# Rebuild the regressor with the tuned parameters and the tree count of the best estimator.
final_model_home = cb.CatBoostRegressor(n_estimators=search_home.best_estimator_.tree_count_, **search_home.best_params_,cat_features=categoricals,verbose=1)
# Fixed: the target is the home score. It was previously the away score, which
# made this column a second copy of the away prediction.
full_df["xhome_score_rest_of_game_cb"] = cross_val_predict(final_model_home,full_df[input_names+score_change_columns],full_df[output_name_home_score],groups=full_df["game_code"],cv=big_cv,verbose=1,n_jobs=5)

In [None]:
search.best_estimator_.tree_count_

In [None]:
# Out-of-fold AWAY rest-of-game score predictions, using 5 game-grouped folds.
big_cv = GroupKFold(n_splits=5)
# Rebuild the regressor with the tuned parameters and the tree count of the best estimator.
final_model = cb.CatBoostRegressor(n_estimators=search.best_estimator_.tree_count_, **search.best_params_,cat_features=categoricals,verbose=1)
# Each play is predicted by a model fitted on folds that exclude its game (groups=game_code).
full_df["xaway_score_rest_of_game_cb"] = cross_val_predict(final_model,full_df[input_names+score_change_columns],full_df[output_name_away_score],groups=full_df["game_code"],cv=big_cv,verbose=1,n_jobs=5)

In [None]:
search
# pickle.dump(rf_score_change, open(os.path.join(root_dir, 'models/score_change_random_forest_100_10.p'), 'wb'))
# Persist the away-score artefacts. Each file is written inside a context
# manager so it is flushed and closed even if pickling fails part-way.
# NOTE(review): `reg` is the estimator object handed to the search, not the
# refitted best model - confirm it is the object that should be stored.
for model_obj, model_path in (
    (search, 'models/search_cv_away_score.p'),
    (reg, 'models/cb_away_score.p'),
    (final_model, 'models/final_away_score.p'),
):
    with open(os.path.join(root_dir, model_path), 'wb') as model_file:
        pickle.dump(model_obj, model_file)

In [None]:
# rf_score_prediction_away.verbose = 0
# rf_score_prediction_home.verbose = 0
# Rest-of-game score predictions from the tuned CatBoost searches.
# `cb_score_prediction_away` / `cb_score_prediction_home` are only assigned in
# commented-out code, so referencing them raises NameError on a fresh kernel.
# The fitted searches (`search` for away, `search_home` for home) predict with
# their refitted best estimators instead.
score_feature_frame = full_df[input_names+score_change_columns]
full_df["xaway_score_rest_of_game"] = search.predict(score_feature_frame)
full_df["xhome_score_rest_of_game"] = search_home.predict(score_feature_frame)
# Expected final score = points already scored + predicted remaining points.
full_df["xaway_score"] = full_df["xaway_score_rest_of_game"] + full_df["away_start_score"]
full_df["xhome_score"] = full_df["xhome_score_rest_of_game"] + full_df["home_start_score"]
%matplotlib inline


In [None]:
super_bowl = full_df[full_df["game_code"] == 1321887]
super_bowl.to_clipboard()

In [None]:
# One hand-written feature row with 28 values: the 19 entries of input_names
# followed by the 9 score-change probabilities (all mass on the fifth one).
example_score = [[ 13.5,  41.,   1.,   0.,   0.,   1.,   0., 900.,  65.,   0.,   1.,
          0.,   0.,   0.,   0.,   0.,  -1.,   3.,   3.,   0.,   0.,   0.,
          0.,   1.,   0.,   0.,   0.,   0.]]
# NOTE(review): rf_score_prediction_away / rf_score_prediction_home are only
# assigned in commented-out code earlier in this notebook, so these calls fail
# on a fresh kernel unless those models are fitted or loaded first.
print(rf_score_prediction_away.predict(example_score))
print(rf_score_prediction_home.predict(example_score))

In [None]:
%matplotlib inline
beginning_of_game = full_df[full_df["nevent"]==1]
plt.scatter(beginning_of_game["vegas_away_prediction"], beginning_of_game["xaway_score_rest_of_game_cb"])
plt.plot([0, 50], [0, 50])

In [None]:
plt.scatter(beginning_of_game["vegas_away_prediction"], beginning_of_game["away_rest_of_game_score"])
plt.scatter(beginning_of_game["xaway_score_rest_of_game_cb"], beginning_of_game["away_rest_of_game_score"])
plt.plot([0, 50], [0, 50])

In [None]:
beginning_of_game = full_df[full_df["nevent"]==1]
plt.scatter(beginning_of_game["vegas_away_prediction"], beginning_of_game["xaway_score_rest_of_game"])
plt.plot([0, 50], [0, 50])

In [None]:
plt.scatter(beginning_of_game["vegas_home_prediction"], beginning_of_game["xhome_score_rest_of_game"])
plt.plot([0, 50], [0, 50])

In [None]:
plt.scatter(beginning_of_game["vegas_home_prediction"], beginning_of_game["xhome_score_rest_of_game"])
plt.plot([0, 50], [0, 50])

In [None]:
%matplotlib inline
plt.scatter(full_df["away_final_score"], full_df["xaway_score"])

In [None]:
%matplotlib inline
plt.scatter(full_df["away_final_score"], full_df["xaway_score"])

In [None]:
pd.DataFrame(rf_score_prediction_away.feature_importances_, index=rf_score_prediction_away.feature_names_in_)

In [None]:
input_names

In [None]:
# Game codes of the 2021 season: all of them, and the finished ones (state 11).
season_2021_games = game_df[game_df.season == 2021]
sim_game_codes = season_2021_games['game_code'].values
test_game_codes = season_2021_games.loc[season_2021_games.game_state_id == 11, 'game_code'].values

# Plays from finished 2021 games form the test set; all other plays are training data.
mask_test = full_df.game_code.isin(test_game_codes)
is_train_play = ~mask_test

X_train, y_train = full_df.loc[is_train_play, input_names], full_df.loc[is_train_play, output_name]
# The test split is kept as plain numpy arrays.
X_test, y_test = full_df.loc[mask_test, input_names].values, full_df.loc[mask_test, output_name].values

# Number of distinct (home, away) remaining-score combinations in the encoding.
n_categories = (max_away_score + 1) * (max_home_score + 1)




# clf = MLPClassifier(
#     hidden_layer_sizes=[10,5],
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,  # L2 regularization parameter
#     learning_rate_init=0.001,
#     batch_size=128,
#     random_state=1,
#     max_iter=50, #50
#     early_stopping=True,
#     validation_fraction=0.1,
#     n_iter_no_change=5,
#     verbose=True).fit(X_train, y_train)
# pickle.dump(clf, open(os.path.join(root_dir, 'models/game_score_new_4.sav'), 'wb'))

# os.system('say "done"')


In [None]:
# Load the stored exact-score classifiers. pickle runs code from the file while
# loading, so only open model files from a trusted source. The context managers
# close the file handles deterministically.
with open(os.path.join(root_dir, "models/game_score_new_4.sav"), 'rb') as model_file:
    clf = pickle.load(model_file)
with open(os.path.join(root_dir, "models/game_score.sav"), 'rb') as model_file:
    clf_old = pickle.load(model_file)
# Feature names the new model was fitted on. The original `.toli` was a
# truncated attribute name that raised AttributeError.
clf.feature_names_in_.tolist()

In [None]:
%matplotlib inline
plt.scatter(full_df["prior_home"], full_df["cur_spread"])

In [None]:
import time
def get_model_outputs(model, input_data, running_scores, max_home=None, max_away=None):
    """Turn a classifier's remaining-score probabilities into score and outcome distributions.

    The classifier's class index encodes a (home, away) remaining-score pair as
    ``away + (max_away + 1) * home``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    input_data : 2-D array or DataFrame with one row per sample.
    running_scores : array of shape (n_samples, 2); column 0 is the current home
        score and column 1 the current away score.
    max_home, max_away : optional largest remaining score on the grid for each
        team. When omitted, the notebook globals ``max_home_score`` and
        ``max_away_score`` are used.

    Returns
    -------
    dict with keys
        'remaining_score' : (n_samples, n_classes) probabilities over the full grid,
        'home_score'      : (n_samples, max_home + 1) marginal remaining home score,
        'away_score'      : (n_samples, max_away + 1) marginal remaining away score,
        'ft_outcome'      : (n_samples, 3) home win / draw / away win probabilities.
    """
    if max_home is None:
        max_home = max_home_score
    if max_away is None:
        max_away = max_away_score
    n_home = max_home + 1
    n_away = max_away + 1
    n_samples = input_data.shape[0]
    running_scores = np.asarray(running_scores)

    raw_output = model.predict_proba(input_data)
    # The model only outputs classes it saw during fitting; scatter them into the
    # full grid so unseen score pairs keep probability zero.
    score_probs = np.zeros((n_samples, n_home * n_away))
    score_probs[:, model.classes_] = raw_output

    outcome_probs = np.zeros((n_samples, 3))
    home_score_probs = np.zeros((n_samples, n_home))
    away_score_probs = np.zeros((n_samples, n_away))
    for home_score in range(n_home):
        ft_home_score = home_score + running_scores[:, 0]
        for away_score in range(n_away):
            ft_away_score = away_score + running_scores[:, 1]
            remaining_prob = score_probs[:, away_score + n_away * home_score]
            # Add this score pair's probability to the outcome it implies for each row.
            home_wins = ft_home_score > ft_away_score
            draws = ft_home_score == ft_away_score
            away_wins = ft_home_score < ft_away_score
            outcome_probs[home_wins, 0] += remaining_prob[home_wins]
            outcome_probs[draws, 1] += remaining_prob[draws]
            outcome_probs[away_wins, 2] += remaining_prob[away_wins]
            # Marginal remaining score for each team.
            home_score_probs[:, home_score] += remaining_prob
            away_score_probs[:, away_score] += remaining_prob
    return {
        'remaining_score': score_probs,
        'home_score': home_score_probs,
        'away_score': away_score_probs,
        'ft_outcome': outcome_probs
    }

## Usage example
Below is a short piece of code that shows what the input and output data look like for a few samples from a randomly chosen test game:

In [None]:
# Pick a random test game and show the model inputs/outputs for its first two and last two samples.
example_game_code = np.random.choice(test_game_codes)
example_indices = [0, 1, -2, -1]
# Sort once, then take inputs and running scores from the same ordered rows so the
# two arrays are guaranteed to describe the same samples.
full_df = full_df.sort_values(["game_code", "nevent"], ascending=True)
example_rows = full_df.loc[full_df.game_code == example_game_code]
example_input = example_rows[input_names].values[example_indices]
example_running_score = example_rows[['home_start_score', 'away_start_score']].values[example_indices]
example_output = get_model_outputs(clf, example_input, example_running_score)
print('\nExample input data:')
display(HTML(pd.DataFrame(data=example_input, columns=input_names).to_html(index=False)))

# print('\nExample outputs (exact score):')
# print(example_output['remaining_score'][0][0:10])
# print(example_output['remaining_score'][0][10:20])
# print(example_output['remaining_score'][0][20:30])

print('\nExample outputs (home team score):')
display(HTML(pd.DataFrame(data=example_output['home_score'], columns=np.arange(max_home_score + 1)).to_html(index=False)))

print('\nExample outputs (away team score):')
display(HTML(pd.DataFrame(data=example_output['away_score'], columns=np.arange(max_away_score + 1)).to_html(index=False)))

print('\nExample outputs (1X2):')
display(HTML(pd.DataFrame(data=example_output['ft_outcome'], columns=['home win', 'draw', 'away win']).to_html(index=False)))

### Visualization of predictions
This is a simple interactive dashboard that lets the user select any game from the test data set and plots match outcome (1X2) and team score predictions. Just pick a game from the drop-down menu and click the "Plot" button.

In [None]:
# Lookup table behind the match picker: one description per finished 2021 game.
# .copy() so the new columns are added to an independent frame, not to a slice of game_df.
game_info_df = game_df[['game_code', 'game_date', 'home_team_id', 'away_team_id', 'season', 'home_team_abbrev', 'away_team_abbrev']].copy()
game_info_df['game_description'] = [
    '{0} {1} v {2} ({3})'.format(game_date, home_abbrev, away_abbrev, game_code)
    for game_date, home_abbrev, away_abbrev, game_code in zip(
        game_info_df.game_date, game_info_df.home_team_abbrev,
        game_info_df.away_team_abbrev, game_info_df.game_code)
]
game_info_df = game_info_df.loc[game_info_df.game_code.isin(test_game_codes), ['game_code', 'game_description']].copy()
# Build one "away-home" score label per game. (Calling str() on the whole score
# arrays would stamp the same array dump onto every row.)
# NOTE(review): the label is away-home while the description lists the home team first - confirm intended order.
final_scores = game_df.drop_duplicates('game_code').set_index('game_code')
score_labels = final_scores['away_score'].astype(str) + "-" + final_scores['home_score'].astype(str)
game_info_df['ft_score'] = game_info_df.game_code.map(score_labels)

# Figure handles shared between update_dashboard() and plot(); created lazily on first plot.
h_f = None
h_ax = None
h_ax_twin = None

def update_dashboard(change):
    """Click handler for the "Plot" button: reset the info box and axes, then redraw.

    `change` is the argument the widget passes to the handler; it is not used.
    """
    info_textbox.value = ''
    # Clear each axis independently: the twin axis can still be None if an earlier
    # plot() call failed after the main axis had been created.
    if h_ax is not None:
        h_ax.cla()
    if h_ax_twin is not None:
        h_ax_twin.cla()
    plot()
    
def print_to_textbox(string, textbox_handle, clear_textbox=False):
    """Show `string` in a textbox widget, or on stdout when no widget is given.

    The text replaces the widget's current value when `clear_textbox` is true or
    the widget is empty; otherwise it is appended after an HTML line break.
    """
    if textbox_handle is None:
        print(string)
        return
    start_fresh = clear_textbox or textbox_handle.value == ''
    textbox_handle.value = string if start_fresh else textbox_handle.value + '<br>' + string

def plot():
    """Redraw the dashboard figure for the game selected in `match_picker`.

    Main axis: stacked home / draw / away probabilities from
    `get_model_outputs(clf, ...)` against elapsed game time. Twin axis: running
    score plus probability-weighted remaining score for each team. The figure and
    axes are created on the first call and kept in the globals `h_f`, `h_ax` and
    `h_ax_twin` for reuse.
    """
    global h_f, h_ax, h_ax_twin
    if h_f is None:
        h_f, h_ax = plt.subplots(1, figsize=(9, 4))

    def as_steps(values):
        # Duplicate each sample and drop the final copy so that, against plot_x,
        # every value is held flat until the next sample's time.
        return np.repeat(values, 2)[:-1]

    plot_game_code = game_info_df.set_index('game_description').loc[match_picker.value, 'game_code']
    game_rows = full_df.loc[full_df.game_code == plot_game_code]
    if game_rows.empty:
        # Nothing to draw; report it instead of failing on the empty arrays below.
        print_to_textbox('0 prediction samples found', info_textbox)
        return

    # Elapsed seconds (treats play_start_time as seconds left in a 900-second quarter).
    plot_time = ((900 - game_rows.play_start_time) + (game_rows.quarter - 1) * 900).values
    plot_input = game_rows[input_names].values
    plot_running_score = game_rows[['home_start_score', 'away_start_score']].values
    plot_output = get_model_outputs(clf, plot_input, plot_running_score)
    print_to_textbox('{0:d} prediction samples found'.format(plot_input.shape[0]), info_textbox)

    # Main axis (1X2 prediction)
    plot_x = np.append(np.vstack((plot_time[:-1], plot_time[1:])).flatten(order='F'), plot_time[-1])
    outcome_steps = [as_steps(plot_output['ft_outcome'][:, column]) for column in range(3)]
    h_ax.stackplot(plot_x, *outcome_steps,
                   labels=['home', 'draw', 'away'],
                   colors=['khaki', 'lightgray', 'lightskyblue'],
                   zorder=0)
    h_ax.legend(loc='upper left')
    # One tick per quarter start; a fifth tick when the game reached quarter 5 (overtime).
    if np.max(game_rows["quarter"].values) == 5:
        x_tick_str = ["Q1", "Q2", "Q3", "Q4", "OT"]
    else:
        x_tick_str = ["Q1", "Q2", "Q3", "Q4"]
    x_tick_pos = [900 * quarter_index for quarter_index in range(len(x_tick_str))]

    h_ax.set_xticks(x_tick_pos)
    h_ax.set_xticklabels(x_tick_str)
    h_ax.set_xlabel('Match time')
    h_ax.set_xlim(0, plot_x.max())
    h_ax.set_ylim(0, 1)
    y_ticks = np.arange(0, 1.01, 0.25)
    h_ax.set_yticks(y_ticks)
    h_ax.set_yticklabels(['{0:.0f}%'.format(100 * tick) for tick in y_ticks])
    h_ax.set_ylabel('Probability')
    h_ax.set_title('{0}'.format(match_picker.value))

    # Twin axis (score prediction)
    if h_ax_twin is None:
        h_ax_twin = h_ax.twinx()
    # Expected final score = running score + probability-weighted remaining score.
    plot_home_score = as_steps(
        plot_running_score[:, 0] + plot_output['home_score'].dot(np.arange(max_home_score + 1)))
    plot_away_score = as_steps(
        plot_running_score[:, 1] + plot_output['away_score'].dot(np.arange(max_away_score + 1)))
    # fmt is '-' only: a colour letter in fmt would clash with the explicit color keyword.
    h_ax_twin.plot(plot_x, plot_home_score, '-', linewidth=1, zorder=10, color='darkgoldenrod', label='home score')
    h_ax_twin.plot(plot_x, plot_away_score, '-', linewidth=1, zorder=10, color='dodgerblue', label='away score')

    # Keep at least one unit of height so the limits never collapse to (0, 0).
    max_score = max(int(np.ceil(np.append(plot_home_score, plot_away_score).max())), 1)
    h_ax_twin.set_ylim(0, max_score)
    h_ax_twin.set_yticks(np.arange(0, max_score + 0.5))
    h_ax_twin.set_ylabel('Predicted score')

    h_f.tight_layout()
    
# Dashboard controls: a game picker, a button that triggers the redraw, and an HTML status line.
match_picker = widgets.Dropdown(description='Match', options=game_info_df.game_description.values)
run_btn = widgets.Button(description='Plot')
info_textbox = widgets.HTML(value="")

# Picker and button share the top row; the status line sits below them.
controls_row = widgets.HBox([match_picker, run_btn])
display(widgets.VBox([controls_row, info_textbox]))
run_btn.on_click(update_dashboard)

In [None]:
# Feature list passed to `clf_old` by the plot() defined in this cell.
input_names_old = ['prior_home', 'prior_away','home_team_has_ball', 'home_start_score', 'away_start_score', 'quarter', 'play_start_time', 'yd_from_goal', 'down', 'ytg']
# Lookup table behind the match picker: one "away at home" description per finished 2021 game.
# .copy() so the new columns are added to an independent frame, not to a slice of game_df.
game_info_df = game_df[['game_code', 'game_date', 'home_team_id', 'away_team_id', 'season', 'home_team_abbrev', 'away_team_abbrev']].copy()
game_info_df['game_description'] = [
    '{0} {1} at {2} ({3})'.format(game_date, away_abbrev, home_abbrev, game_code)
    for game_date, away_abbrev, home_abbrev, game_code in zip(
        game_info_df.game_date, game_info_df.away_team_abbrev,
        game_info_df.home_team_abbrev, game_info_df.game_code)
]
game_info_df = game_info_df.loc[game_info_df.game_code.isin(test_game_codes), ['game_code', 'game_description']].copy()
# Build one "away-home" score label per game. (Calling str() on the whole score
# arrays would stamp the same array dump onto every row.)
final_scores = game_df.drop_duplicates('game_code').set_index('game_code')
score_labels = final_scores['away_score'].astype(str) + "-" + final_scores['home_score'].astype(str)
game_info_df['ft_score'] = game_info_df.game_code.map(score_labels)

# Figure handles shared between update_dashboard() and plot(); created lazily on first plot.
h_f = None
h_ax = None
h_ax_twin = None

def update_dashboard(change):
    """Click handler for the "Plot" button: reset the info box and axes, then redraw.

    `change` is the argument the widget passes to the handler; it is not used.
    """
    info_textbox.value = ''
    # Clear each axis independently: the twin axis can still be None if an earlier
    # plot() call failed after the main axis had been created.
    if h_ax is not None:
        h_ax.cla()
    if h_ax_twin is not None:
        h_ax_twin.cla()
    plot()
    
def print_to_textbox(string, textbox_handle, clear_textbox=False):
    """Show `string` in a textbox widget, or print it when `textbox_handle` is None.

    The text replaces the widget's value when `clear_textbox` is true or the
    widget is empty; otherwise it is appended after a '<br>' separator.

    NOTE(review): this redefines the identical function from the previous dashboard cell.
    """
    if textbox_handle is None:
        print(string)
    else:
        if clear_textbox or (textbox_handle.value == ''):
            textbox_handle.value = string
        else:
            textbox_handle.value += '<br>' + string

def plot():
    """Redraw the dashboard figure for the selected game using the old model.

    Same layout as the previous dashboard, but predictions come from
    `get_model_outputs(clf_old, ...)` fed with the `input_names_old` features.
    Main axis: stacked home / draw / away probabilities against elapsed game time.
    Twin axis: running score plus probability-weighted remaining score per team.
    The figure and axes are kept in the globals `h_f`, `h_ax` and `h_ax_twin`.
    """
    global h_f, h_ax, h_ax_twin
    if h_f is None:
        h_f, h_ax = plt.subplots(1, figsize=(9, 4))

    def as_steps(values):
        # Duplicate each sample and drop the final copy so that, against plot_x,
        # every value is held flat until the next sample's time.
        return np.repeat(values, 2)[:-1]

    plot_game_code = game_info_df.set_index('game_description').loc[match_picker.value, 'game_code']
    game_rows = full_df.loc[full_df.game_code == plot_game_code]
    if game_rows.empty:
        # Nothing to draw; report it instead of failing on the empty arrays below.
        print_to_textbox('0 prediction samples found', info_textbox)
        return

    # Elapsed seconds (treats play_start_time as seconds left in a 900-second quarter).
    plot_time = ((900 - game_rows.play_start_time) + (game_rows.quarter - 1) * 900).values
    plot_input = game_rows[input_names_old].values
    plot_running_score = game_rows[['home_start_score', 'away_start_score']].values
    plot_output = get_model_outputs(clf_old, plot_input, plot_running_score)
    print_to_textbox('{0:d} prediction samples found'.format(plot_input.shape[0]), info_textbox)

    # Main axis (1X2 prediction)
    plot_x = np.append(np.vstack((plot_time[:-1], plot_time[1:])).flatten(order='F'), plot_time[-1])
    outcome_steps = [as_steps(plot_output['ft_outcome'][:, column]) for column in range(3)]
    h_ax.stackplot(plot_x, *outcome_steps,
                   labels=['home', 'draw', 'away'],
                   colors=['khaki', 'lightgray', 'lightskyblue'],
                   zorder=0)
    h_ax.legend(loc='upper left')
    # One tick per quarter start; a fifth tick when the game reached quarter 5 (overtime).
    if np.max(game_rows["quarter"].values) == 5:
        x_tick_str = ["Q1", "Q2", "Q3", "Q4", "OT"]
    else:
        x_tick_str = ["Q1", "Q2", "Q3", "Q4"]
    x_tick_pos = [900 * quarter_index for quarter_index in range(len(x_tick_str))]

    h_ax.set_xticks(x_tick_pos)
    h_ax.set_xticklabels(x_tick_str)
    h_ax.set_xlabel('Match time')
    h_ax.set_xlim(0, plot_x.max())
    h_ax.set_ylim(0, 1)
    y_ticks = np.arange(0, 1.01, 0.25)
    h_ax.set_yticks(y_ticks)
    h_ax.set_yticklabels(['{0:.0f}%'.format(100 * tick) for tick in y_ticks])
    h_ax.set_ylabel('Probability')
    h_ax.set_title('{0}'.format(match_picker.value))

    # Twin axis (score prediction)
    if h_ax_twin is None:
        h_ax_twin = h_ax.twinx()
    # Expected final score = running score + probability-weighted remaining score.
    plot_home_score = as_steps(
        plot_running_score[:, 0] + plot_output['home_score'].dot(np.arange(max_home_score + 1)))
    plot_away_score = as_steps(
        plot_running_score[:, 1] + plot_output['away_score'].dot(np.arange(max_away_score + 1)))
    # fmt is '-' only: a colour letter in fmt would clash with the explicit color keyword.
    h_ax_twin.plot(plot_x, plot_home_score, '-', linewidth=1, zorder=10, color='darkgoldenrod', label='home score')
    h_ax_twin.plot(plot_x, plot_away_score, '-', linewidth=1, zorder=10, color='dodgerblue', label='away score')

    # Keep at least one unit of height so the limits never collapse to (0, 0).
    max_score = max(int(np.ceil(np.append(plot_home_score, plot_away_score).max())), 1)
    h_ax_twin.set_ylim(0, max_score)
    h_ax_twin.set_yticks(np.arange(0, max_score + 0.5))
    h_ax_twin.set_ylabel('Predicted score')

    h_f.tight_layout()
    
# Dropdown listing every game description in game_info_df.
match_picker = widgets.Dropdown(
    options=game_info_df.game_description.values,
    description='Match'
)
# Button that triggers a redraw of the figure.
run_btn = widgets.Button(
    description='Plot'
)
# HTML status line written to by print_to_textbox().
info_textbox = widgets.HTML(value="")

# Layout: picker and button side by side, status line underneath.
display(widgets.VBox([
    widgets.HBox([match_picker, run_btn]),
    info_textbox
]))
# Redraw on every button click.
run_btn.on_click(update_dashboard)

## Preparing predictions for 2021 season simulation
This is the final section of this notebook, and its purpose is to use the model we have trained to generate match predictions for all games in the 2021/22 season. These will be used in the next notebook to simulate the outcome of the competition.

Since this tutorial is not meant to be run live while games are in progress, we can make the following simplification: we will assume that every 2021 game is either finished or not yet started, and will use observed results for the former and predictions for the latter. Therefore, we will gather input features and generate pre-game predictions only for unplayed games.

First, let's get results for played games:

In [None]:
# Observed results: every 2021 game that has already been played.
season_2021 = game_df.loc[game_df.season == 2021]
played_2021_codes = np.intersect1d(sim_game_codes, played_game_codes)
results_df = (
    season_2021[["game_code", "home_team_id", "away_team_id", "home_score", "away_score"]]
    .set_index('game_code')
    .loc[played_2021_codes]
    .reset_index()
)

# Team list: id and abbreviation from the 2021 home teams, merged with the 2021 division data.
division_data = pd.read_parquet(os.path.join(data_dir, "division_data.parquet"))
current_division_data = division_data.loc[division_data.season == 2021].rename(columns={'team_id': 'id'})
team_names = (
    season_2021[["home_team_id", "home_team_abbrev"]]
    .rename(columns={'home_team_id': 'id', 'home_team_abbrev': 'name'})
    .drop_duplicates()
    .merge(current_division_data)
    .to_dict(orient='records')
)

Now let's deal with unplayed games: get input features and pass to the model to get predictions:

In [None]:
# One row per unplayed game, keyed by game_code.
sim_df = (
    game_df.set_index('game_code')
    .loc[unplayed_game_codes, ['home_team_id', 'away_team_id']]
    .reset_index()
)

# Add prior match outcome probabilities
prior_columns = ['game_code'] + np.intersect1d(input_names, prior_df.columns).tolist()
sim_df = sim_df.merge(right=prior_df[prior_columns], how='left', on='game_code')
# input_names = ['prior_home', 'prior_away','home_team_has_ball', 'home_start_score', 'away_start_score', 'quarter', 'overtime', 'play_start_time', 'yd_from_goal', 'from_scrimmage', 'kick_off', 'punt', 'point_after_kick', 'two_point_attempt', 'field_goal_attempt', 'down', 'ytg']

# Games without a prior fall back to the mean prior over full_df.
for prior_column in ('prior_home', 'prior_away'):
    sim_df[prior_column] = sim_df[prior_column].fillna(np.mean(full_df[prior_column]))

# Constant feature values applied to every unplayed game.
pre_game_state = {
    'home_team_has_ball': 0,
    'home_start_score': 0,
    'away_start_score': 0,
    'quarter': 1,
    'overtime': 0,
    'play_start_time': 900,
    'yd_from_goal': 70,
    'from_scrimmage': 0,
    'kick_off': 1,
    'punt': 0,
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': 0,
    'down': 0,
    'ytg': -1,
    'home_timeouts_remaining': 3,
    'away_timeouts_remaining': 3,
}
for feature_name, feature_value in pre_game_state.items():
    sim_df[feature_name] = feature_value

sim_input = sim_df[input_names].values
sim_running_score = sim_df[['home_start_score', 'away_start_score']].values
sim_output = get_model_outputs(clf, sim_input, sim_running_score)


Finally, put results and predictions together and store in a JSON file for the simulation notebook:

In [None]:

# One prediction record per unplayed game, taken row by row from sim_df / sim_output.
predictions = [
    {
        'game_code': int(game_code),
        'home_team_id': int(sim_df.iloc[row_number]['home_team_id']),
        'away_team_id': int(sim_df.iloc[row_number]['away_team_id']),
        'pred_exact_score': sim_output['remaining_score'][row_number].tolist(),
        'pred_outcome': sim_output['ft_outcome'][row_number].tolist(),
        'current_score': sim_running_score[row_number].tolist()
    }
    for row_number, game_code in enumerate(sim_df.game_code.values)
]

# Bundle teams, observed results, predictions and the score-grid bounds for the simulation notebook.
sim_data = dict(
    teams=team_names,
    results=results_df.to_dict(orient='records'),
    predictions=predictions,
    prediction_params={'max_home_score': int(max_home_score), 'max_away_score': int(max_away_score)},
)
output_path = os.path.join(data_dir, 'simulation_inputs.json')
with open(output_path, 'w') as json_file:
    json.dump(sim_data, json_file)
# Spoken "done" notification via the `say` shell command.
os.system('say "done"')


In [None]:
# Fourth-down scrimmage plays of game 2337619, run through the model.
is_game = full_df.game_code == 2337619
is_fourth_down = full_df.down == 4
is_scrimmage = full_df.from_scrimmage == 1
play_subset = is_game & is_fourth_down & is_scrimmage

play_idx = 134
fourth_down_rows = full_df.loc[play_subset]
plot_input = fourth_down_rows[input_names].values
plot_running_score = fourth_down_rows[['home_start_score', 'away_start_score']].values
plot_output = get_model_outputs(clf, plot_input, plot_running_score)
plot_input_df = pd.DataFrame(data=plot_input, columns=input_names)


In [None]:
# Copy of the fourth-down inputs with the 0/1 flag in column 11 inverted on every row.
# The flip is applied over this array's own rows rather than looping to len(plot_input),
# which is set in another cell and may have a different length.
# NOTE(review): 11 is a positional index into input_names - confirm which feature it refers to.
plot_input_opposite = full_df.loc[play_subset, input_names].values
plot_input_opposite[:, 11] = 1 - plot_input_opposite[:, 11]
# NOTE(review): this displays the unflipped plot_input, not plot_input_opposite - confirm that is intended.
pd.DataFrame(plot_input, columns=input_names)

In [None]:
# Run the model on the original inputs and on the column-flipped copy, using the same running scores for both.
plot_output = get_model_outputs(clf, plot_input, plot_running_score)
plot_output_opposite = get_model_outputs(clf, plot_input_opposite, plot_running_score)

In [None]:
# Outcome probabilities for the flipped inputs (columns: home win, draw, away win).
pd.DataFrame(plot_output_opposite["ft_outcome"])

In [None]:
# Outcome probabilities for the original inputs (columns: home win, draw, away win).
pd.DataFrame(plot_output["ft_outcome"])

In [None]:
# NOTE(review): despite the name, test_df here holds the rows NOT in the test games (~mask_test).
test_df = full_df.loc[~mask_test, input_names]
# Final scores for the same (non-test) rows.
scores_df = full_df.loc[~mask_test, ['home_final_score', 'away_final_score']]


In [None]:
# plot_input = full_df.loc[~mask_test, input_names].values
# plot_running_score = full_df.loc[~mask_test, ['home_start_score', 'away_start_score']].values
# plot_output = get_model_outputs(clf, plot_input, plot_running_score)


In [None]:
# Inspect the prior_df row(s) for game_code 2411003.
prior_df[prior_df["game_code"]==2411003]

In [None]:
# Re-apply the pre-game feature values to sim_df (same values as the cell that first builds sim_df,
# minus the two timeout columns).
for prior_column in ('prior_home', 'prior_away'):
    sim_df[prior_column] = sim_df[prior_column].fillna(np.mean(full_df[prior_column]))

pre_game_state = {
    'home_team_has_ball': 0,
    'home_start_score': 0,
    'away_start_score': 0,
    'quarter': 1,
    'overtime': 0,
    'play_start_time': 900,
    'yd_from_goal': 70,
    'from_scrimmage': 0,
    'kick_off': 1,
    'punt': 0,
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': 0,
    'down': 0,
    'ytg': -1,
}
for feature_name, feature_value in pre_game_state.items():
    sim_df[feature_name] = feature_value


In [None]:
# 0/1 indicator columns derived from the home team's result code ("W", "T" or "L").
for indicator_column, outcome_code in (("home_win", "W"), ("draw", "T"), ("away_win", "L")):
    full_df[indicator_column] = (full_df["home_team_outcome"] == outcome_code).astype(int)


In [None]:
plot_game_code = 2337720

# Score every test-set row with the new model and attach the outcome probabilities.
test_rows = full_df.loc[mask_test]
plot_input = test_rows[input_names].values
plot_running_score = test_rows[['home_start_score', 'away_start_score']].values
plot_output = get_model_outputs(clf, plot_input, plot_running_score)
# pd.concat([pd.DataFrame(plot_input, columns=input_names), pd.DataFrame(plot_output["ft_outcome"],columns=["home_win", "draw", "away_win"])], axis=1).to_csv("Bills at Chiefs 01-23-2022 new.csv")
test_df = test_rows.reset_index()
scores_df = test_rows[['home_final_score', 'away_final_score']]
# Both frames are positionally indexed, so axis=1 concat lines up row by row.
outcome_probs_df = pd.DataFrame(plot_output['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])
test_values = pd.concat([test_df, outcome_probs_df], axis=1)


In [None]:
plot_game_code = 2337720

# Score every test-set row with the old model (clf_old on input_names_old).
plot_input_old = full_df.loc[mask_test,input_names_old].values
plot_running_score_old = full_df.loc[mask_test, ['home_start_score', 'away_start_score']].values
plot_output_old = get_model_outputs(clf_old, plot_input_old, plot_running_score_old)
# pd.concat([pd.DataFrame(plot_input_old, columns=input_names_old), pd.DataFrame(plot_output_old["ft_outcome"],columns=["home_win", "draw", "away_win"])], axis=1).to_csv("Bills at Chiefs 01-23-2022 new.csv")
# Outcome indicators are (re)built here so this cell does not rely on another cell having added them.
full_df["home_win"] = np.where(full_df["home_team_outcome"]=="W", 1, 0)
full_df["draw"] = np.where(full_df["home_team_outcome"]=="T", 1, 0)
full_df["away_win"] = np.where(full_df["home_team_outcome"]=="L", 1, 0)
test_df_old = full_df.loc[mask_test, full_df.columns].reset_index()
scores_df = full_df.loc[mask_test, ['home_final_score', 'away_final_score']]
# Concatenate against test_df_old (built above with the indicator columns) instead of
# test_df, which belongs to a different cell.
test_values_old = pd.concat([test_df_old, pd.DataFrame(plot_output_old['ft_outcome'], columns=["xhome_win", "xdraw", "xaway_win"])], axis = 1)
# pd.DataFrame(plot_output_old['ft_outcome'])


In [None]:
from sklearn.calibration import calibration_curve
calib_home_win = calibration_curve(test_values["home_win"], test_values["xhome_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_home_win[1], calib_home_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
from sklearn.calibration import calibration_curve
calib_home_win = calibration_curve(test_values_old["home_win"], test_values_old["xhome_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_home_win[1], calib_home_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_away_win = calibration_curve(test_values["away_win"], test_values["xaway_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_away_win[1], calib_away_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_df = test_values[test_values["quarter"]<=3]

calib_away_win = calibration_curve(calib_df["away_win"], calib_df["xaway_win"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_away_win[1], calib_away_win[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
# Random-forest data: only rows with continuation == 0, split by the test-game mask.
is_new_play = full_df.continuation == 0
X_train_rf = full_df.loc[~mask_test & is_new_play, input_names]
y_train_rf = full_df.loc[~mask_test & is_new_play, output_name]
X_test_rf = full_df.loc[mask_test & is_new_play, input_names].values
y_test_rf = full_df.loc[mask_test & is_new_play, output_name].values




# rf = RandomForestClassifier(n_estimators=10, max_depth=10, verbose=100).fit(X_train_rf, y_train_rf)
# pickle.dump(rf, open(os.path.join(root_dir, 'models/game_score_random_forest.p'), 'wb'))

# Load the previously trained forest; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load model files you trust.
with open(os.path.join(root_dir, "models/game_score_random_forest.p"), 'rb') as model_file:
    rf = pickle.load(model_file)
os.system('say "done"')


In [None]:
# Score whatever plot_input / plot_running_score currently hold with the random forest.
# NOTE(review): both inputs are set by several earlier cells - confirm which one is meant to run before this.
rf_output = get_model_outputs(rf, plot_input, plot_running_score)

In [None]:
from catboost import CatBoostClassifier
# CatBoost data: rows with continuation == 0; the target is the home team's outcome label.
is_new_play = full_df.continuation == 0
X_train_cb = full_df.loc[~mask_test & is_new_play, input_names]
y_train_cb = full_df.loc[~mask_test & is_new_play, "home_team_outcome"]
X_test_cb = full_df.loc[mask_test & is_new_play, input_names]
y_test_cb = full_df.loc[mask_test & is_new_play, "home_team_outcome"]

# Columns passed to CatBoost as categorical features.
categoricals = [
    'home_team_has_ball',
    'overtime',
    'from_scrimmage',
    'kick_off',
    'punt',
    'point_after_kick',
    'two_point_attempt',
    'field_goal_attempt',
    'down',
]
# Fixed hyper-parameter values passed straight to the classifier.
param_spaces={
    'random_strength': 39.38775510204081,
    'early_stopping_rounds': 5,
    'min_child_samples': 1,
    'max_depth': 12,
    'learning_rate': 0.05591836734693878,
    'l2_leaf_reg': 55.146938775510215}
cb = CatBoostClassifier(cat_features=categoricals, verbose=1, **param_spaces).fit(X_train_cb, y_train_cb)
# Persist the fitted model; the context manager flushes and closes the file.
with open(os.path.join(root_dir, 'models/game_outcome_catboost.p'), 'wb') as model_file:
    pickle.dump(cb, model_file)

In [None]:
# Attach the CatBoost outcome probabilities to every row of full_df.
# The arrays are assigned positionally, so full_df keeps its index: the cell can be
# re-run safely and boolean masks built earlier (e.g. mask_test) stay aligned.
# Each predict_proba column is named from its class label in cb.classes_
# ("L" = home loss = away win, "T" = draw, "W" = home win) instead of assuming an order.
outcome_column = {"L": "xaway_win", "T": "xdraw", "W": "xhome_win"}
cb_proba = cb.predict_proba(full_df[input_names])
for class_position, class_label in enumerate(cb.classes_):
    full_df[outcome_column[class_label]] = cb_proba[:, class_position]


In [None]:
full_df

In [None]:
# CatBoost home-win probability across game 2337720, plotted against the nevent column.
test_game_cb = full_df.loc[full_df.game_code==2337720]
plt.plot(test_game_cb["nevent"], test_game_cb["xhome_win"])

In [None]:
from copy import deepcopy
# Game situation shared by all three options; these values fill the None placeholders below.
predict_tool_need = dict(
    home_team_has_ball=1,
    home_start_score=14,
    away_start_score=14,
    quarter=3,
    overtime=0,
    play_start_time=606,
    yd_from_goal=40,
    ytg=1,
    home_timeouts_remaining=3,
    away_timeouts_remaining=3,
)
# Feature template: scalars are shared by every row, three-element lists give one value
# per row ("go", "punt", "FGA"), and None marks features filled from predict_tool_need.
predict_tool = dict(
    prior_home=.38,
    prior_away=.62,
    home_team_has_ball=None,
    home_start_score=None,
    away_start_score=None,
    quarter=None,
    overtime=None,
    play_start_time=None,
    yd_from_goal=None,
    from_scrimmage=1,
    kick_off=0,
    punt=[0, 1, 0],
    point_after_kick=0,
    two_point_attempt=0,
    field_goal_attempt=[0, 0, 1],
    down=4,
    ytg=None,
    home_timeouts_remaining=None,
    away_timeouts_remaining=None,
)
predict_input = pd.DataFrame(predict_tool, index=["go", "punt", "FGA"])
# Broadcast each situation value down its column.
for feature_name, feature_value in predict_tool_need.items():
    predict_input[feature_name] = feature_value
cb.predict_proba(predict_input)

In [None]:
predict_input

In [None]:

# One-row inputs for the three fourth-down options.
# predict_tool carries three-element lists and None placeholders, so it cannot be turned
# into a one-row frame directly; merge in the game situation from predict_tool_need and
# set the action flags as scalars instead.
# NOTE(review): the baseline row is taken to be the punt option (punt=1) - confirm.
single_play = {**predict_tool, **predict_tool_need, 'punt': 1, 'field_goal_attempt': 0}
predict_input = pd.DataFrame(single_play, index=[0])
predict_input_go_for_it = predict_input.assign(punt=0)
predict_input_field_goal = predict_input_go_for_it.assign(field_goal_attempt=1)

# Class probabilities for: baseline, go for it, field-goal attempt.
print(np.around(cb.predict_proba(predict_input), 3))
print(np.around(cb.predict_proba(predict_input_go_for_it), 3))
print(np.around(cb.predict_proba(predict_input_field_goal), 3))


In [None]:
calib_df = full_df

calib_away_win = calibration_curve(calib_df["draw"], calib_df["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_away_win[1], calib_away_win[0], marker="o")
plt.plot([0, .04], [0, .04])


In [None]:
# Values of the last row selected by mask_test.
# NOTE(review): mask_test was built before full_df.reset_index() in an earlier cell - confirm the mask still lines up with full_df's index.
full_df[mask_test][-1:].values


In [None]:
# Test rows side by side with the random forest's outcome probabilities and remaining-home-score distribution.
rf_outcome_df = pd.DataFrame(rf_output["ft_outcome"], columns=["xhome_win", "xdraw", "xaway_win"])
rf_home_score_df = pd.DataFrame(rf_output["home_score"])
rf_df = pd.concat([test_df, rf_outcome_df, rf_home_score_df], axis=1)

In [None]:
# Distinct remaining-home-score distributions produced by the random forest.
pd.DataFrame(rf_output["home_score"]).drop_duplicates()

In [None]:
# Random-forest home-win probability across game 2337720, plotted against the nevent column.
test_game_rf = rf_df[rf_df["game_code"]==2337720]
plt.plot(test_game_rf["nevent"], test_game_rf["xhome_win"])


In [None]:
# Select the game here so this cell does not depend on a later cell having defined test_game.
test_game = test_values[test_values["game_code"] == 2337720].copy()
plt.plot(test_game["nevent"], test_game["xhome_win"])

In [None]:
calib_df = test_values[test_values["quarter"]==3]

calib_draw = calibration_curve(calib_df["draw"], calib_df["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_draw[1], calib_draw[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_draw = calibration_curve(test_values["draw"], test_values["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_draw[1], calib_draw[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
calib_draw = calibration_curve(test_values_old["draw"], test_values_old["xdraw"], strategy="quantile", n_bins=10)
%matplotlib inline
plt.plot(calib_draw[1], calib_draw[0], marker="o")
plt.plot([0, 1], [0, 1])


In [None]:
# Work on an independent copy so the new columns are not written to a slice of test_values.
test_game = test_values[test_values["game_code"] == 2337720].copy()
# Renormalise the win probabilities after removing the draw mass, so the two sum to 1.
decisive_prob = test_game["xhome_win"] + test_game["xaway_win"]
test_game["home_win_no_ties"] = test_game["xhome_win"] / decisive_prob
test_game["away_win_no_ties"] = test_game["xaway_win"] / decisive_prob

In [None]:
# Stacked tie-free win probabilities for test_game against elapsed game time.
h_f, h_ax = plt.subplots(1, figsize=(9, 4))
# Elapsed seconds (treats play_start_time as seconds left in a 900-second quarter).
elapsed = (900 - test_game['play_start_time']) + (test_game['quarter'] - 1) * 900
plot_time = elapsed.values
# Step-shaped x values: each sample's value is held until the next sample's time.
plot_x = np.append(np.vstack((plot_time[:-1], plot_time[1:])).flatten(order='F'), plot_time[-1])
home_column = test_game["home_win_no_ties"]
away_column = test_game["away_win_no_ties"]
plot_y1 = np.vstack((home_column, home_column)).flatten(order='F')[:-1]
plot_y2 = np.vstack((away_column, away_column)).flatten(order='F')[:-1]
h_ax.stackplot(
    plot_x, plot_y1, plot_y2,
    labels=['Chiefs','Bills'],
    colors=['#E31837', '#00338D'],
    zorder=0,
)
h_ax.legend(loc='upper left')

# A fifth "OT" tick is added when the game selected by plot_game_code reached quarter 5.
went_to_overtime = np.max(full_df.loc[full_df.game_code == plot_game_code, "quarter"].values) == 5
x_tick_str = ["Q1", "Q2", "Q3", "Q4", "OT"] if went_to_overtime else ["Q1", "Q2", "Q3", "Q4"]
x_tick_pos = (60 * np.arange(0, 61 if went_to_overtime else 60, 15)).tolist()

h_ax.set_xticks(x_tick_pos)
h_ax.set_xticklabels(x_tick_str)
h_ax.set_xlabel('Match time')
h_ax.set_xlim(0, plot_x.max())
h_ax.set_ylim(0, 1)
y_ticks = np.arange(0, 1.01, 0.25)
h_ax.set_yticks(y_ticks)
h_ax.set_yticklabels(['{0:.0f}%'.format(100 * tick) for tick in y_ticks])
h_ax.set_ylabel('Probability')
title_parts = game_info_df.set_index('game_code').loc[plot_game_code, ['game_description']].tolist()
h_ax.set_title('{0}'.format(*title_parts))


In [None]:
# Tie-free home-win probability as a plain black line over plot_time.
plt.plot(plot_time, test_game["home_win_no_ties"], '-k')

In [None]:
# Fourth-down counterfactuals: score every fourth-down scrimmage play of the test
# games three ways: as it was recorded, with the `punt` flag forced to 0 ("go"),
# and with the `punt` flag forced to 1 ("punt").
#
# Changes from the earlier version: the redundant mid-notebook `deepcopy` import is
# gone (`DataFrame.assign` already returns a modified copy), the no-op
# `full_df.columns` indexer is dropped, and the three copy-pasted concat calls
# share one helper.
plot_game_code = 2337720

FT_OUTCOME_COLUMNS = ["xhome_win", "xdraw", "xaway_win"]


def _with_ft_outcome(plays, model_outputs):
    """Return `plays` with the model's 'ft_outcome' columns appended.

    Both frames are joined column-wise on their index, so `plays` must carry a
    fresh 0..n-1 index that lines up with the rows of `model_outputs['ft_outcome']`.
    """
    outcome_df = pd.DataFrame(model_outputs['ft_outcome'], columns=FT_OUTCOME_COLUMNS)
    return pd.concat([plays, outcome_df], axis=1)


mask_fourth_down_bot = (
    full_df.game_code.isin(test_game_codes)
    & (full_df.down == 4)
    & (full_df.from_scrimmage == 1)
)

# Model inputs as recorded, plus the two forced-decision variants.
plot_input_fourth_down_bot = full_df.loc[mask_fourth_down_bot, input_names]
plot_input_fourth_down_bot_go = plot_input_fourth_down_bot.assign(punt=0)
plot_input_fourth_down_bot_punt = plot_input_fourth_down_bot.assign(punt=1)

# The running score is identical across the three variants.
plot_running_score_fourth_down_bot = full_df.loc[mask_fourth_down_bot, ['home_start_score', 'away_start_score']].values

plot_output_fourth_down_bot = get_model_outputs(clf, plot_input_fourth_down_bot, plot_running_score_fourth_down_bot)
plot_output_fourth_down_bot_go = get_model_outputs(clf, plot_input_fourth_down_bot_go, plot_running_score_fourth_down_bot)
plot_output_fourth_down_bot_punt = get_model_outputs(clf, plot_input_fourth_down_bot_punt, plot_running_score_fourth_down_bot)

# `reset_index()` keeps the original `full_df` row labels in an `index` column and
# gives the frame the positional index needed to line up with the model outputs.
test_df_fourth_down_bot = full_df.loc[mask_fourth_down_bot].reset_index()
test_values_fourth_down_bot = _with_ft_outcome(test_df_fourth_down_bot, plot_output_fourth_down_bot)
test_values_fourth_down_bot_go = _with_ft_outcome(test_df_fourth_down_bot, plot_output_fourth_down_bot_go)
test_values_fourth_down_bot_punt = _with_ft_outcome(test_df_fourth_down_bot, plot_output_fourth_down_bot_punt)


In [None]:
# Put the two counterfactual home-win probabilities next to the as-recorded
# predictions and store their difference (go minus punt) as `xgo_advantage`.
test_values_fourth_down_bot = test_values_fourth_down_bot.assign(
    xhome_win_go=test_values_fourth_down_bot_go["xhome_win"],
    xhome_win_punt=test_values_fourth_down_bot_punt["xhome_win"],
    # Evaluated after the two columns above have been added.
    xgo_advantage=lambda plays: plays["xhome_win_go"] - plays["xhome_win_punt"],
)

In [None]:
# Display the fourth-down plays selected by `mask_fourth_down_bot` (all `full_df`
# columns plus the original row labels in `index`).
test_df_fourth_down_bot

In [None]:
# Smallest `yd_from_goal` among fourth-down plays flagged as punts.
# `.loc` + `Series.min()` replaces builtin `min` over a chained selection: it skips
# NaN values and yields NaN rather than raising when no row matches.
test_df_fourth_down_bot.loc[test_df_fourth_down_bot["punt"] == 1, "yd_from_goal"].min()

In [None]:
# Largest `yd_from_goal` among fourth-down plays flagged as field-goal attempts.
# `.loc` + `Series.max()` replaces builtin `max` over a chained selection: it skips
# NaN values and yields NaN rather than raising when no row matches.
test_df_fourth_down_bot.loc[test_df_fourth_down_bot["field_goal_attempt"] == 1, "yd_from_goal"].max()

In [None]:
# Scratch cell: builds and displays a two-element array; the result is not assigned.
np.array([24, 17])

In [None]:
# Hand-built single-play scenario: fourth down with 1 yard to go, 5 yards from goal,
# fourth quarter, home side ahead 24-20 and not in possession. The model is scored
# once with the punt flag set and once with it cleared.
# NOTE(review): the two priors sum to 0.9995 here, presumably leaving the
# remainder for a draw; confirm.
predict_tool = {
    'prior_home': .6,
    'prior_away': .3995,
    'home_team_has_ball': 0,
    'home_start_score': 24,
    'away_start_score': 20,
    'quarter': 4,
    'overtime': 0,
    'play_start_time': 200,
    'yd_from_goal': 5,
    'from_scrimmage': 1,
    'kick_off': 0,
    'punt': 1,
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': 0,
    'down': 4,
    'ytg': 1,
}
# NOTE(review): column order follows the dict's insertion order; presumably it has
# to match `input_names`; confirm.
predict_input = pd.DataFrame(predict_tool, index=[0])
# Same situation with the punt flag cleared. `assign` returns a modified copy, so
# neither `deepcopy` nor the previously duplicated assignment is needed.
predict_input_go_for_it = predict_input.assign(punt=0)
# (1, 2) array of [home, away] score handed to the model alongside the features.
predict_running_score = predict_input[["home_start_score", "away_start_score"]].values
print(get_model_outputs(clf, predict_input, predict_running_score)["ft_outcome"])
print(get_model_outputs(clf, predict_input_go_for_it, predict_running_score)["ft_outcome"])


In [None]:
# Hand-built single-play scenario: fourth down with 1 yard to go, 5 yards from goal,
# fourth quarter, home side ahead 24-20 and not in possession, with priors of
# 0.6 / 0.4. The model is scored once with the punt flag set and once with it cleared.
predict_tool = {
    'prior_home': .6,
    'prior_away': .4,
    'home_team_has_ball': 0,
    'home_start_score': 24,
    'away_start_score': 20,
    'quarter': 4,
    'overtime': 0,
    'play_start_time': 200,
    'yd_from_goal': 5,
    'from_scrimmage': 1,
    'kick_off': 0,
    'punt': 1,
    'point_after_kick': 0,
    'two_point_attempt': 0,
    'field_goal_attempt': 0,
    'down': 4,
    'ytg': 1,
}
# NOTE(review): column order follows the dict's insertion order; presumably it has
# to match `input_names`; confirm.
predict_input = pd.DataFrame(predict_tool, index=[0])
# Same situation with the punt flag cleared. `assign` returns a modified copy, so
# neither `deepcopy` nor the previously duplicated assignment is needed.
predict_input_go_for_it = predict_input.assign(punt=0)
# (1, 2) array of [home, away] score handed to the model alongside the features.
predict_running_score = predict_input[["home_start_score", "away_start_score"]].values
print(get_model_outputs(clf, predict_input, predict_running_score)["ft_outcome"])
print(get_model_outputs(clf, predict_input_go_for_it, predict_running_score)["ft_outcome"])


In [None]:


# Score the hand-built `predict_input` against a running score of 24-17.
# The score is passed as a (1, 2) array, the same layout produced by
# `predict_input[["home_start_score", "away_start_score"]].values`. The earlier
# `np.array([24, 17]).values()` raised AttributeError (ndarrays have no `.values`),
# and the bare `full_df.loc[...]` expression before it was evaluated and discarded.
# NOTE(review): the feature row still carries away_start_score = 20 while the
# running score says 17; confirm the mismatch is intended.
get_model_outputs(clf, predict_input, np.array([[24, 17]]))

In [None]:
# Bucket every play by field position in 10-yard bands (0-9 -> 0, 10-19 -> 10, ...)
# and show the mean model inputs of fourth-down plays per band.
#
# Floor division replaces `round((yd - 5) / 10) * 10`: rounding is half-to-even, so
# yard lines that are exact multiples of 10 landed in alternating bands
# (10 -> 0, 20 -> 20, 30 -> 20, 40 -> 40), giving bands of unequal width.
# NOTE(review): despite its name, the column is derived from `yd_from_goal`, not `ytg`.
full_df["ytg_bucket"] = (full_df["yd_from_goal"] // 10) * 10
fourth_downs = full_df.loc[mask_fourth_down_bot]

fourth_downs[input_names + ["ytg_bucket"]].groupby("ytg_bucket").mean()

In [None]:
# Distinct `yd_from_goal` values seen on the selected fourth-down plays, in ascending order.
fourth_downs["yd_from_goal"].drop_duplicates().sort_values()

In [None]:
# Model outputs for home-team conversion attempts (point-after kick or two-point
# try) within the rows selected by `mask_test`.
#
# The row filter is built once instead of three times, and the result frame is
# assembled from the filtered DataFrame rather than from its `.values`, so column
# dtypes survive instead of being coerced through a single NumPy array.
mask_extra_point_home = (
    mask_test
    & (full_df.point_after_kick + full_df.two_point_attempt == 1)
    & (full_df.home_team_has_ball == 1)
)
extra_point_rows = full_df.loc[mask_extra_point_home]

extra_point = extra_point_rows[input_names].values
extra_point_all = extra_point_rows.values
extra_point_running_score = extra_point_rows[["home_start_score", "away_start_score"]].values
extra_point_outputs = get_model_outputs(clf, extra_point, extra_point_running_score)

# Positional index on both sides so the play rows and the 'home_score' output rows
# line up in the column-wise concat.
extra_point_df = pd.concat(
    [extra_point_rows.reset_index(drop=True), pd.DataFrame(extra_point_outputs["home_score"])],
    axis=1,
)

In [None]:
# Inspect the third entry of the fitted model's `intercepts_`.
# NOTE(review): presumably `clf` is an MLPClassifier, where `intercepts_[i]` is the
# bias vector of layer i + 1; confirm.
clf.intercepts_[2]

In [None]:
# Dump the extra-point frame to `test.csv` in the current working directory
# (the DataFrame index is written as the first column).
extra_point_df.to_csv("test.csv")

In [None]:
# Display the raw model-input array used for the extra-point predictions.
extra_point

In [None]:
# Lookup table of the distinct (event_id, event_name) pairs in `full_df`, ordered by event_id.
full_df[["event_id", "event_name"]].drop_duplicates().sort_values("event_id")