In [63]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report

from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.neural_network import MLPRegressor, MLPClassifier

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option("display.max_columns", None)

In [87]:
# DATASET = 'SEASON'
DATASET = 'TEAM'

if DATASET == 'TEAM':
    train = pd.read_csv('../data/train_team.csv')
    test = pd.read_csv('../data/test_team.csv')

if DATASET == 'SEASON':
    train = pd.read_csv('../data/train_season.csv')
    test = pd.read_csv('../data/test_season.csv')
    
train, validation = train_test_split(train, train_size=0.8, shuffle=False)

In [88]:
target_column = "shot_made_flag"  # y_column_name

In [89]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    pred = pd.Series(pred)
    true = pd.Series(true)
    
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    print(cm)
    cm = cm[cm.index]
    return cm

In [90]:
def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_1 = f1_score(y_true,y_pred,average='binary',pos_label=1)
    f1_score_0 = f1_score(y_true,y_pred,average='binary',pos_label=0)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_1,f1_score_0,f1_score_macro]
    
    

results = pd.DataFrame(columns=['Accuracy', 'F1-score (class 1)', 'F1-score (class 0)', 'F1-score (macro avg)'])

In [91]:
def split_x_y(df, target):
    y = df.loc[:, [target]]
    x = df.drop([target], axis=1)
    return x, y

def split_regular_playoff(df, season, train=None, test=None, validation=None):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_validaiton = None

    df_season = df[df['season'] == season]
    isPlayoffs = df_season.playoffs == 1

    df_train = df_season[~isPlayoffs]
    df_test  = df_season[isPlayoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    return df_train, df_test
#     train = split_x_y(df_train, 'shot_made_flag')
#     test = split_x_y(df_test, 'shot_made_flag')
#     return train[0], train[1], test[0], test[1]
#     return train, test

# x_train, y_train, x_test, y_test = split_regular_playoff(df, '2010-11')
# train, test = split_regular_playoff(df, '2010-11')

In [92]:
# train.describe(include='all')
train.action_type_te.unique()

array([0.40322581, 0.81818182, 1.        , 0.85714286, 0.22222222])

## Split X & Y

In [93]:
x_train, y_train = split_x_y(train, target_column)
x_test, y_test = split_x_y(test, target_column)

x_val, y_val = split_x_y(validation, target_column)

In [94]:
lambdas = [1e-10,1e-5,1e-4,1e-3,1e-2,0.1, 0.5,1,5,10,50,100]

# Linear Regression

In [95]:
x_train.describe()

Unnamed: 0,period,playoffs,shot_distance,shot_zone_basic,shot_zone_range,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month,weekday,action_type_te,opponent_te,combined_shot_type_Dunk,combined_shot_type_Jump Shot,combined_shot_type_Layup,shot_type_2PT Field Goal,shot_type_3PT Field Goal,shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),matchup_away,matchup_home,combined_shot_type_Tip Shot,shot_zone_area_Back Court(BC)
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,2.597222,0.0,14.902778,1.944444,1.527778,297.180556,0.261944,0.486111,11.333333,0.515,1.125,4.486111,0.472081,0.519231,0.027778,0.819444,0.152778,0.791667,0.208333,0.375,0.208333,0.111111,0.138889,0.166667,0.680556,0.319444,0.0,0.0
std,1.182736,0.0,9.314777,1.309649,1.074312,231.39542,0.217829,0.711932,6.83611,0.189082,0.603686,1.609768,0.197161,0.0,0.165489,0.387349,0.362298,0.408966,0.408966,0.48752,0.408966,0.316475,0.348257,0.375293,0.469533,0.469533,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.222222,0.519231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.75,0.0,10.0,1.0,1.0,90.5,0.0,0.0,6.0,0.4275,1.0,2.0,0.403226,0.519231,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,16.0,2.0,2.0,264.0,0.41,0.0,10.0,0.5,1.0,5.0,0.403226,0.519231,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,4.0,0.0,20.25,2.0,2.0,496.75,0.47,1.0,15.0,0.5725,1.0,6.0,0.403226,0.519231,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,4.0,0.0,41.0,5.0,3.0,707.0,0.47,3.0,28.0,1.0,4.0,6.0,1.0,0.519231,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


In [96]:
reg = LinearRegression()
a = reg.fit(x_train, y_train)
reg.score(x_train, y_train)

0.49769580186193696

#### train data

In [97]:
y_train_pred = reg.predict(x_train)
y_train_pred = y_train_pred >= 0.6
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.8055555555555556

#### Validation Data

In [98]:
y_val_pred = reg.predict(x_val)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Linear',:] = compute_metrics(y_val.values, y_val_pred)
results

0.7777777777777778

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.777778,0.6,0.846154,0.723077


# Lasso Regression

In [99]:
lasso_cv = LassoCV(alphas=lambdas, cv=5, max_iter=100_000)
a = lasso_cv.fit(x_train.values, y_train.values.flatten())

lasso_r2 =  np.mean(cross_val_score(lasso_cv, x_train.values, y_train.values.flatten()))

print('Best lambda:', lasso_cv.alpha_, 'R2 score:',lasso_r2)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Best lambda: 5.0 R2 score: 0.06929338938217193


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [100]:
y_train_pred = lasso_cv.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.6666666666666666

In [101]:
y_val_pred = lasso_cv.predict(x_val.values)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Lasso',:] = compute_metrics(y_val.values, y_val_pred)
results

0.5

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.777778,0.6,0.846154,0.723077
Lasso,0.5,0.181818,0.64,0.410909


# Ridge Regression

In [102]:
ridge_cross_val_metrics = pd.DataFrame(columns=['mean MSE', 'mean norm_MSE', 'mean R2'])
# We calculate the cross-validation metrics for each lambda 
for lambda_val in lambdas:
    kf = KFold(n_splits=5)
    i=1
    cv_mse = []
    cv_nmse = []
    cv_r2 = []
    # We compute the metrics for each fold and then perform the mean.
    for train_index, test_index in kf.split(x_train):
        x_train_fold = x_train.values[train_index]
        y_train_fold = y_train.values[train_index]
        x_test_fold = x_train.values[test_index,:]
        y_test_fold = y_train.values[test_index]

        ridge = Ridge(alpha=lambda_val)
        a = ridge.fit(x_train_fold,y_train_fold)
        y_pred_fold = ridge.predict(x_test_fold)
        fold_mse = mean_squared_error(y_test_fold, y_pred_fold)
        fold_nmse =  1-r2_score(y_test_fold, y_pred_fold)
        fold_r2 = r2_score(y_test_fold, y_pred_fold)
        cv_mse.append(fold_mse)
        cv_nmse.append(fold_nmse)
        cv_r2.append(fold_r2)
    ridge_cross_val_metrics.loc['Lambda={}'.format(lambda_val),:] = [np.mean(cv_mse),np.mean(cv_nmse),np.mean(cv_r2)]

ridge_cross_val_metrics.loc['Mean', :] = ridge_cross_val_metrics.mean()
ridge_cv_r2 = ridge_cross_val_metrics.loc["Mean", "mean R2"]

ridge_cross_val_metrics.sort_values(by='mean R2',ascending=False)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Unnamed: 0,mean MSE,mean norm_MSE,mean R2
Lambda=5,0.245636,1.04571,-0.04571
Lambda=10,0.24751,1.054281,-0.054281
Lambda=1,0.260797,1.107458,-0.107458
Lambda=50,0.260764,1.109455,-0.109455
Lambda=100,0.264872,1.126857,-0.126857
Lambda=0.5,0.278727,1.184594,-0.184594
Mean,0.304175,1.300056,-0.300056
Lambda=0.1,0.325114,1.389552,-0.389552
Lambda=0.01,0.350489,1.503676,-0.503676
Lambda=0.001,0.353776,1.518541,-0.518541


In [107]:
ridge = Ridge(alpha=5)

ridge.fit(x_train.values, y_train.values.flatten())

y_train_pred = ridge.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

Ridge(alpha=5)

0.7777777777777778

In [108]:
y_val_pred = ridge.predict(x_val.values)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Ridge',:] = compute_metrics(y_val.values, y_val_pred)
results

0.6666666666666666

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.777778,0.6,0.846154,0.723077
Lasso,0.5,0.181818,0.64,0.410909
Ridge,0.666667,0.25,0.785714,0.517857


# Neural Network

In [83]:
regr = MLPClassifier(hidden_layer_sizes=(12, ), learning_rate_init=0.00001,  max_iter=1_000_000, tol=1e-8)
regr.fit(x_train.values, y_train.values.flatten())
y_train_pred = regr.predict(x_train.values)
regr.score(x_train.values, y_train.values.flatten())

MLPClassifier(hidden_layer_sizes=(12,), learning_rate_init=1e-05,
              max_iter=1000000, tol=1e-08)

1.0

In [84]:
y_train_pred = regr.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

1.0

In [85]:
y_val_pred = regr.predict(x_val.values)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Neural Net',:] = compute_metrics(y_val.values, y_val_pred)
results

0.6666666666666666

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.777778,0.6,0.846154,0.723077
Lasso,0.5,0.181818,0.64,0.410909
Ridge,0.666667,0.25,0.785714,0.517857
Neural Net,0.666667,0.25,0.785714,0.517857


In [86]:
# confusion(y_test.values.flatten(), y_test_pred)
confusion(y_test_pred, y_test.values.flatten())

print( classification_report(y_test.values, y_test_pred))

NameError: name 'y_test_pred' is not defined

In [24]:
# Get numerical feature importances
importances = list(regr.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(x_train.columns), importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

AttributeError: 'MLPClassifier' object has no attribute 'feature_importances_'