In [9]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn as sk
assert sk.__version__ >= "0.20"

# XGBoost Import
import xgboost as xgb
xgb.set_config(verbosity = 0)
# assert xgb.get_config["verbosity"] == 0

# Common imports
import numpy as np
import os
import pandas as pd
import seaborn as sns

# to make this notebook's output stable across runs
np.random.seed(0)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

from IPython.display import display, HTML

pd.set_option('display.max_columns', None)

# Get current directory of file (so filesystem should work regardless of user machine)
cwd = os.getcwd()
print(cwd)

# Read in data from sources
play_by_play_path = os.path.join(cwd, 'Data')
play_by_play_path = os.path.join(play_by_play_path, 'syracuse_gsw_basic_pbp.csv')
play_by_play_df = pd.read_csv(play_by_play_path)

results_path = os.path.join(cwd, 'Data')
results_path = os.path.join(results_path, "syracuse_gsw_basic_results.csv")
game_results_df = pd.read_csv(results_path)


game_data_master_path = os.path.join(cwd, 'game_data_master.csv')
game_data_master = pd.read_csv(game_data_master_path)



# Reset indices
play_by_play_df.set_index('id', inplace= True)
game_results_df.set_index('id', inplace= True)
display(play_by_play_df.head(2))




C:\Users\perso\Documents\GitHub\syracusebasketballchallenge


Unnamed: 0_level_0,season,season_type,nba_game_id,home,away,game_date,quarter,poss_id,off_team,def_team,poss_time,oncourt_id,play_type,off_def,player_type,player_name,player_nba_id,play_clock,sequence_id,play_id,result_id,seq_result_id,play_zone,last_updated_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1219,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,1,LAL,GSW,720,1,Rim Gather,OFF,Ball-Handler,DeAndre Jordan,201599.0,716,1,2,0022100002_1_1,0022100002_1_1,Rim,26:31.2
1227,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,3,LAL,GSW,694,1,Initiation,OFF,Ball-Handler,Russell Westbrook,201566.0,689,3,6,,0022100002_1_3,Mid-Left Above Break 3,26:31.2


In [4]:
game_results_df.head(5)

Unnamed: 0_level_0,season,season_type,nba_game_id,home,away,game_date,quarter,poss_id,off_team,def_team,poss_time,poss_margin,poss_type,poss_press,poss_zone,oncourt_id,result_id,result_num,result_off,result_off_id,result_def1,result_def1_id,result_def2,result_def2_id,result_type,result_pbp_id,result_pbp_error,result_time,result_zone,result_contest,result_tov_reason,result_tov_teammate,result_foul_reason,isAND1,points,self_created,assisted,hcky_assisted,potential_ast,passer_name,passer_nba_id,hcky_passer_name,hcky_passer_nba_id,def1_play,def2_play,orb_win_name,orb_win_nba_id,drb_win_name,drb_win_nba_id,last_updated_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
267,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,3,LAL,GSW,694,0.0,Halfcourt,0,0,1,0022100002_1_3,1,LeBron James,2544.0,Stephen Curry,201939.0,,,TOV,11.0,,687,Mid-Right Short Paint,,Def Forced,,,0.0,0.0,0.0,0.0,0.0,0.0,Russell Westbrook,201566.0,,,Steal,,,,,,26:31.2
275,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,11,GSW,LAL,622,-2.0,Halfcourt,0,0,1,0022100002_1_11,1,Stephen Curry,201939.0,Russell Westbrook,201566.0,Kent Bazemore,203145.0,3+D Pull-Up,26.0,,612,Mid-Right Above Break 3,Average,,,,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,,,,26:31.2
283,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,17,GSW,LAL,532,-1.0,Halfcourt,0,0,1,0022100002_1_19,1,Kevon Looney,1626172.0,DeAndre Jordan,201599.0,,,Off-Lob / Tip,45.0,,524,Rim,Open,,,,0.0,2.0,0.0,1.0,1.0,1.0,Draymond Green,203110.0,Jordan Poole,1629673.0,,,,,,,26:31.2
291,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,24,GSW,LAL,446,0.0,Halfcourt,0,0,1,0022100002_1_26,1,Draymond Green,203110.0,Kent Bazemore,203145.0,,,TOV,61.0,,442,Mid-Left Midrange,,Off Fault,,,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,,,,26:31.2
299,2021,Regular Season,22100002,LAL,GSW,10/19/2021,1,32,LAL,GSW,374,2.0,Halfcourt,0,0,3,0022100002_1_34,1,LeBron James,2544.0,Andre Iguodala,2738.0,,,Off-Movement,82.0,,370,Left Above Break 3,Average,,,,0.0,3.0,1.0,0.0,0.0,0.0,,,,,,,,,,,26:31.2


In [40]:
pts_per_result = []
for types in game_results_df['result_type'].unique():
    pts_per_result.append(game_results_df.loc[(game_results_df['result_type'] == types)]['points'].sum())
    
pts_per_result_df = pd.DataFrame(pts_per_result)
pts_per_result_df.set_index(game_results_df['result_type'].unique())

Unnamed: 0,0
TOV,0.0
3+D Pull-Up,1573.0
Off-Lob / Tip,534.0
Off-Movement,1874.0
Floater,2026.0
1-2D Pull-Up,1505.0
Catch & Shoot,4362.0
Face-Up,322.0
Side Out,0.0
Post Fade,210.0


In [44]:
play_per_pts_regsn = []
for types in game_data_master['result_type'].unique():
    play_per_pts_regsn.append(game_data_master.loc[(game_data_master['result_type'] == types) & (game_data_master['season_type'] == 'Regular Season') ]['points'].sum())
    
play_per_pts_df_regsn = pd.DataFrame(play_per_pts)


In [45]:
play_per_pts_df_regsn.set_index(game_data_master['result_type'].unique())

ValueError: Length mismatch: Expected 17 rows, received array of length 20

In [50]:
game_data_master['points'].fillna(0, inplace= True)

In [51]:
#targets = ['result_type', 'result_zone', 'result_contest']



targets = ['points']
attributes = [  'play_type',
                'player_type',
                'player_name',
                'play_zone',
                'result_type',
                'result_contest',
                'result_zone',
                'result_off'
            ]

X = pd.get_dummies(game_data_master[attributes])
y = pd.get_dummies(game_data_master[targets])



from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 0)


In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from tabulate import tabulate
from sklearn.model_selection import cross_val_score

C_grid = [1E-5,1E-4,1E-3,1E-2,1E-1,1E0]
best_C_l2 = 0
score_C_l2 = -np.inf

for C in C_grid:
    log_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", penalty="l2", C = C, random_state=0, max_iter=10000)
    train_cross_val = cross_val_score(log_reg, X_train, y_train, cv=4, scoring="accuracy") 

    avg = np.average(train_cross_val) # Avg Cross val score

    if avg > score_C_l2:
        score_C_l2 = avg
        best_C_l2 = C 


print('Best C value for L2:', best_C_l2)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best C value for L2: 0.1


In [71]:
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", penalty="l2", C = 0.1, random_state=0, max_iter=10000)

cross_val_score(log_reg, X_train, y_train, cv=4, scoring="f1_weighted") 

#f1_weighted = weighted f1 by class instances

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.6455046 , 0.65165159, 0.66220675, 0.66456309])

In [64]:
sk.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',