In [17]:
import pandas as pd
import numpy as np
import pprint
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, RidgeClassifierCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report

from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.neural_network import MLPRegressor, MLPClassifier

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

pd.set_option("display.max_columns", None)

# How to use this Notebook

Change the `DATASET` and `VAL_ON_END` variables to select the dataset you want to test.
For an explanation of `SEASON` and `TEAM`, read the notebook `3-modeling-prep.ipynb`.

In [None]:
# Which pre-built dataset to load: 'SEASON' or 'TEAM' (selects the CSV pair read in the next cell).
DATASET = 'SEASON'
# DATASET = 'TEAM'
# True  -> the LAST 20% of the (unshuffled) training rows become the validation set.
# False -> the FIRST 20% of the training rows become the validation set.
# NOTE(review): presumably the rows are in chronological order, so "last"/"first"
# correspond to the latest/earliest seasons — confirm against the CSV files.
VAL_ON_END = True

In [18]:
# Load the train/test CSV pair selected by DATASET. The explicit else-branch makes
# an unrecognised value fail right here, instead of leaving `train`/`test` undefined
# (or silently reusing frames left over from an earlier run of this cell).
if DATASET == 'TEAM':
    train = pd.read_csv('../data/train_team.csv')
    test = pd.read_csv('../data/test_team.csv')
    # Alternative test file:
    # test = pd.read_csv('../data/test_team_15-16.csv')
elif DATASET == 'SEASON':
    train = pd.read_csv('../data/train_season.csv')
    test = pd.read_csv('../data/test_season.csv')
else:
    raise ValueError(f"Unknown DATASET {DATASET!r}: expected 'TEAM' or 'SEASON'")

# Carve the validation set out of the training rows. shuffle=False keeps the file
# order, so the validation set is a contiguous slice at one end of the data.
if VAL_ON_END:
    # Validate on the last 20% of the training rows (the final seasons in the file).
    train, validation = train_test_split(train, train_size=0.8, shuffle=False)
else:
    # Validate on the first 20% of the training rows (the starting seasons in the file).
    validation, train = train_test_split(train, train_size=0.2, shuffle=False)

In [19]:
# Name of the label column; split_x_y separates it from the feature columns.
target_column = "shot_made_flag"  # y_column_name

In [20]:
def confusion(true, pred):
    """
    Build, print and return a square confusion matrix.

    Parameters
    ----------
    true : 1-D sequence of actual labels.
    pred : 1-D sequence of predicted labels, aligned position-by-position with `true`.

    Returns
    -------
    pandas.DataFrame
        Rows are actual labels ('target'), columns are predicted labels
        ('predicted'). Both axes carry the same labels in the same order: the
        union of the labels seen in either input. Combinations that never
        occur are filled with 0, so a class that is never predicted (or never
        observed) no longer causes a KeyError.
    """
    # Drop any incoming index so the two series are paired purely by position.
    true = pd.Series(true, name='target').reset_index(drop=True)
    pred = pd.Series(pred, name='predicted').reset_index(drop=True)

    cm = pd.crosstab(true, pred)

    # Make the matrix square over every label that appears on either axis.
    labels = list(cm.index.union(cm.columns))
    cm = cm.reindex(index=labels, columns=labels, fill_value=0)
    cm = cm.rename_axis(index='target', columns='predicted')

    print(cm)
    return cm

In [21]:
def compute_metrics(y_true, y_pred):
    """
    Score binary predictions against the true labels.

    Returns a list in this order:
    [accuracy, F1 with class 1 as positive, F1 with class 0 as positive, macro-averaged F1].
    """
    scores = [accuracy_score(y_true, y_pred)]
    # Per-class F1: first treating label 1 as the positive class, then label 0.
    scores.extend(
        f1_score(y_true, y_pred, average='binary', pos_label=positive)
        for positive in (1, 0)
    )
    scores.append(f1_score(y_true, y_pred, average='macro'))
    return scores

# Empty score tables that later cells fill with one row per model:
# `train_res` receives training-set metrics, `results` validation-set metrics.
# The column order matches the list returned by compute_metrics.
train_res = pd.DataFrame(
    columns=[
        'Accuracy',
        'F1-score (class 1)',
        'F1-score (class 0)',
        'F1-score (macro avg)',
    ]
)
results = pd.DataFrame(columns=train_res.columns.tolist())

In [22]:
def split_x_y(df, target):
    """
    Separate a frame into features and label.

    Returns a tuple ``(x, y)`` where ``x`` is `df` without the `target` column
    and ``y`` is a single-column DataFrame holding only `target`. `df` itself
    is not modified.
    """
    labels = df[[target]]
    features = df.drop(columns=[target])
    return features, labels

def split_regular_playoff(df, season, train=None, test=None, validation=None):
    """
    Split one season's rows into a regular-season part and a playoff part.

    Parameters
    ----------
    df : DataFrame with a ``season`` column and a ``playoffs`` column.
    season : value compared against ``df['season']`` to select the rows.
    train, test, validation : unused; kept only so existing calls keep working.

    Returns
    -------
    (df_train, df_test)
        ``df_train`` holds the season's rows where ``playoffs`` is not 1,
        ``df_test`` the rows where ``playoffs`` equals 1.

    The sizes of both parts and the training share are printed. When the
    season has no rows at all the share is reported as ``nan`` instead of
    raising ZeroDivisionError.
    """
    df_season = df[df['season'] == season]
    is_playoffs = df_season['playoffs'] == 1

    df_train = df_season[~is_playoffs]
    df_test = df_season[is_playoffs]

    n_train, n_test = len(df_train), len(df_test)
    n_total = n_train + n_test
    # Guard the ratio: an unknown season selects zero rows.
    train_share = n_train / n_total if n_total else float('nan')

    print(f"Size of training dataset {n_train}" )
    print(f"Size of test dataset {n_test}" )
    print(f"Size of train vs test ratio {train_share }" )

    return df_train, df_test
#     train = split_x_y(df_train, 'shot_made_flag')
#     test = split_x_y(df_test, 'shot_made_flag')
#     return train[0], train[1], test[0], test[1]
#     return train, test

# x_train, y_train, x_test, y_test = split_regular_playoff(df, '2010-11')
# train, test = split_regular_playoff(df, '2010-11')

## Split X & Y

In [23]:
# Separate the label column from the features for each of the three splits.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    split_x_y(frame, target_column) for frame in (train, validation, test)
)

# Report (rows, feature columns) of every split, one line each.
print(
    f"Shape in training-set: {x_train.shape}",
    f"Validation-set Shape: {x_val.shape}",
    f"Test-set Shape: {x_test.shape}",
    sep="\n",
)

Shape in training-set: (1075, 30)
Validation-set Shape: (269, 30)
Test-set Shape: (428, 30)


In [24]:
# Candidate regularisation strengths, passed as `alphas` to LassoCV and RidgeClassifierCV below.
# NOTE(review): 1e-7 is absent from the otherwise decade-spaced low end (1e-8 jumps to 1e-6)
# — confirm this is intentional.
lambdas = [1e-10, 1e-9, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1, 5, 10, 50,100]

# Linear Regression

In [25]:
# Summary statistics (count, mean, std, quantiles) of every training feature.
x_train.describe()

Unnamed: 0,period,playoffs,shot_distance,shot_zone_basic,shot_zone_range,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month,weekday,action_type_te,opponent_te,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,shot_type_2PT Field Goal,shot_type_3PT Field Goal,shot_zone_area_Back Court(BC),shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),matchup_away,matchup_home
count,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0
mean,2.47907,0.0,12.693953,1.706977,1.296744,319.728372,0.4532,0.787907,9.429767,0.420223,7.132093,3.24186,0.454816,0.455935,0.012093,0.026977,0.008372,0.750698,0.19907,0.002791,0.815814,0.184186,0.00186,0.455814,0.11907,0.141395,0.129302,0.152558,0.474419,0.525581
std,1.155619,0.0,9.060335,1.335205,1.088427,208.606895,0.079559,1.212767,7.396804,0.24207,5.022412,1.949816,0.224158,0.089831,0.109352,0.162091,0.091158,0.432811,0.399486,0.052778,0.387816,0.387816,0.043113,0.498276,0.324021,0.348591,0.33569,0.359728,0.499578,0.499578
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.302326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,5.0,1.0,0.0,142.0,0.42,0.0,4.0,0.29,1.0,1.0,0.291492,0.383333,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,13.0,2.0,1.0,301.0,0.47,0.0,8.0,0.44,11.0,4.0,0.291492,0.439024,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,3.0,0.0,19.0,2.0,2.0,498.5,0.5,1.0,15.0,0.56,12.0,5.0,0.672414,0.533333,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,6.0,0.0,64.0,6.0,4.0,711.0,0.55,10.0,33.0,1.0,12.0,6.0,1.0,0.647059,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Fit ordinary least squares on the training split. `fit` returns the estimator
# itself, so `a` and `reg` refer to the same fitted model.
a = reg = LinearRegression().fit(x_train, y_train)
# Score of the fitted regressor on the very data it was trained on.
reg.score(x_train, y_train)

0.24094914027376502

#### Train and validation data

In [27]:
# The regressor outputs a continuous value; cut it at 0.5 to obtain 0/1 labels.

# --- training split ---
y_train_pred = (reg.predict(x_train) >= 0.5).astype(int)
print(f"Accuracy on training-set: {accuracy_score(y_train.values, y_train_pred)}")
train_res.loc['Linear',:] = compute_metrics(y_train.values, y_train_pred)

# --- validation split ---
y_val_pred = (reg.predict(x_val) >= 0.5).astype(int)
print(f"Accuracy on Validation-set: {accuracy_score(y_val.values, y_val_pred)}")
results.loc['Linear',:] = compute_metrics(y_val.values, y_val_pred)

# Show the validation table collected so far.
results

Accuracy on training-set: 0.7237209302325581
Accuracy on Validation-set: 0.6765799256505576


Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.67658,0.583732,0.735562,0.659647


# Lasso Regression

In [28]:
# LassoCV chooses alpha from `lambdas` using 5-fold cross-validation on the training split.
lasso_cv = LassoCV(alphas=lambdas, cv=5, max_iter=100_000).fit(
    x_train.values, y_train.values.flatten()
)
# Average cross-validated score of the whole LassoCV procedure (estimator's default
# scorer and default fold count), reported below as the R2 score.
lasso_r2 = cross_val_score(lasso_cv, x_train.values, y_train.values.flatten()).mean()
print('Best lambda:', lasso_cv.alpha_, 'R2 score:',lasso_r2)

Best lambda: 0.001 R2 score: 0.17608066331448133


In [29]:
# The lasso output is continuous as well; cut it at 0.5 to obtain 0/1 labels.

# --- training split ---
y_train_pred = (lasso_cv.predict(x_train.values) >= 0.5).astype(int)
print(f"Accuracy on training-set: {accuracy_score(y_train.values, y_train_pred):.5f}")
train_res.loc['Lasso',:] = compute_metrics(y_train.values, y_train_pred)

# --- validation split ---
y_val_pred = (lasso_cv.predict(x_val.values) >= 0.5).astype(int)
print(f"Accuracy on Validation-set: {accuracy_score(y_val.values, y_val_pred)}")
results.loc['Lasso',:] = compute_metrics(y_val.values, y_val_pred)

# Show the validation table collected so far.
results

Accuracy on training-set: 0.72186
Accuracy on Validation-set: 0.6765799256505576


Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.67658,0.583732,0.735562,0.659647
Lasso,0.67658,0.587678,0.733945,0.660811


# Ridge Regression

In [30]:
# Model selection for the ridge classifier.
# RidgeClassifierCV picks alpha from `lambdas` by internal cross-validation; the
# outer loop additionally chooses the number of folds, because the selected alpha
# was observed to depend on it. Candidates are compared by validation accuracy.
best_cv = None
best_ridge = None
max_acc = -10  # sentinel below any achievable accuracy
for cv in [2, 3, 4, 5, 6, 7, 8]:
    candidate = RidgeClassifierCV(alphas=lambdas, cv=cv)
    candidate.fit(x_train.values, y_train.values.flatten())
    # predict() already returns class labels, so they are scored directly —
    # the same way the final evaluation below scores them (no 0.5 threshold).
    val_acc = accuracy_score(y_val.values, candidate.predict(x_val.values))
    if max_acc < val_acc:  # strict '<' keeps the smallest cv on ties
        max_acc = val_acc
        best_cv = cv
        best_ridge = candidate

# Keep the winning, already-fitted model instead of fitting the same configuration again.
ridge = best_ridge
print('Best lambda:', ridge.alpha_, 'score:', ridge.score(x_train.values, y_train.values.flatten()))

y_train_pred = ridge.predict(x_train.values)
print(f"Accuracy on training-set: {accuracy_score(y_train.values, y_train_pred):.5f}")
ry_val_pred = ridge.predict(x_val.values)
print(f"Accuracy on validation-set: {accuracy_score(y_val.values, ry_val_pred)}")

# Record the ridge rows in the training and validation score tables.
train_res.loc['Ridge',:] = compute_metrics(y_train.values, y_train_pred)
results.loc['Ridge',:] = compute_metrics(y_val.values, ry_val_pred)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

Best lambda: 5.0 score: 0.7227906976744186
Accuracy on training-set: 0.72279
Accuracy on validation-set: 0.6802973977695167


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


In [31]:
# Training-set metrics collected so far (one row per model).
train_res

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.723721,0.655852,0.769231,0.712541
Lasso,0.72186,0.654335,0.767315,0.710825
Ridge,0.722791,0.655093,0.768274,0.711683


In [32]:
# Validation-set metrics collected so far (one row per model).
results

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.67658,0.583732,0.735562,0.659647
Lasso,0.67658,0.587678,0.733945,0.660811
Ridge,0.680297,0.590476,0.737805,0.664141


In [53]:
# Test Data: (rows, feature columns) of the held-out test features.
x_test.shape

(428, 30)

In [54]:
# (rows, feature columns) of the validation features, for comparison with the test set.
x_val.shape

(269, 30)

In [55]:
# Final check: score the selected ridge classifier on the held-out test set.
# (A stray `x_test, y_test` expression that had no effect was removed.)
print(f"Rows in test-set: {len(x_test)}")
ry_test_pred = ridge.predict(x_test.values)
print(f"Accuracy on Test-set: {accuracy_score(y_test.values, ry_test_pred)}")

Rows in test-set: 428
Accuracy on Test-set: 0.6635514018691588
