In [136]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report

from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.neural_network import MLPRegressor, MLPClassifier

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option("display.max_columns", None)

In [137]:
# DATASET = 'SEASON'
DATASET = 'TEAM'

if DATASET == 'TEAM':
    train = pd.read_csv('../data/train_team.csv')
    test = pd.read_csv('../data/test_team.csv')

if DATASET == 'SEASON':
    train = pd.read_csv('../data/train_season.csv')
    test = pd.read_csv('../data/test_season.csv')
    
train, validation = train_test_split(train, train_size=0.8, shuffle=False)

In [138]:
target_column = "shot_made_flag"  # y_column_name

In [139]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    pred = pd.Series(pred)
    true = pd.Series(true)
    
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    print(cm)
    cm = cm[cm.index]
    return cm

In [140]:
def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_1 = f1_score(y_true,y_pred,average='binary',pos_label=1)
    f1_score_0 = f1_score(y_true,y_pred,average='binary',pos_label=0)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_1,f1_score_0,f1_score_macro]

results = pd.DataFrame(columns=['Accuracy', 'F1-score (class 1)', 'F1-score (class 0)', 'F1-score (macro avg)'])

In [141]:
def split_x_y(df, target):
    y = df.loc[:, [target]]
    x = df.drop([target], axis=1)
    return x, y

def split_regular_playoff(df, season, train=None, test=None, validation=None):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_validaiton = None

    df_season = df[df['season'] == season]
    isPlayoffs = df_season.playoffs == 1

    df_train = df_season[~isPlayoffs]
    df_test  = df_season[isPlayoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    return df_train, df_test
#     train = split_x_y(df_train, 'shot_made_flag')
#     test = split_x_y(df_test, 'shot_made_flag')
#     return train[0], train[1], test[0], test[1]
#     return train, test

# x_train, y_train, x_test, y_test = split_regular_playoff(df, '2010-11')
# train, test = split_regular_playoff(df, '2010-11')

## Split X & Y

In [142]:
x_train, y_train = split_x_y(train, target_column)
x_test, y_test = split_x_y(test, target_column)
x_val, y_val = split_x_y(validation, target_column)

print(f"Rows in training-set: {len(x_train)}")
print(f"Rows in validation-set: {len(x_val)}")

Rows in training-set: 158
Rows in validation-set: 40


In [118]:
lambdas = [1e-10,1e-5,1e-4,1e-3,1e-2,0.1, 0.5,1,5,10,50,100]

# Linear Regression

In [143]:
x_train.describe()

Unnamed: 0,period,playoffs,shot_distance,shot_zone_basic,shot_zone_range,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month,weekday,action_type_te,opponent_te,combined_shot_type_Dunk,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,shot_type_2PT Field Goal,shot_type_3PT Field Goal,shot_zone_area_Back Court(BC),shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),matchup_away,matchup_home
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,2.481013,0.0,13.240506,1.765823,1.310127,309.348101,0.489494,0.797468,9.765823,0.493861,4.170886,3.537975,0.477542,0.469697,0.044304,0.778481,0.164557,0.012658,0.85443,0.14557,0.006329,0.405063,0.139241,0.164557,0.088608,0.196203,0.5,0.5
std,1.11001,0.0,9.890116,1.277775,1.033835,221.207886,0.179155,1.110227,7.266683,0.229491,4.274339,1.55836,0.246353,0.0,0.206423,0.416589,0.371959,0.11215,0.353796,0.353796,0.079556,0.492465,0.347298,0.371959,0.28508,0.398386,0.50159,0.50159
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.469697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,6.25,1.0,0.0,108.0,0.5,0.0,4.0,0.4025,1.0,2.0,0.366412,0.469697,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,14.0,2.0,1.0,284.5,0.52,0.0,9.0,0.5,2.0,4.0,0.366412,0.469697,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5
75%,3.0,0.0,18.75,2.0,2.0,495.5,0.57,1.0,14.0,0.6,4.0,5.0,0.75,0.469697,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,4.0,0.0,70.0,6.0,4.0,707.0,0.65,5.0,28.0,1.0,12.0,6.0,1.0,0.469697,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [144]:
reg = LinearRegression()
a = reg.fit(x_train, y_train)
reg.score(x_train, y_train)

0.3577746307951094

#### train data

In [145]:
y_train_pred = reg.predict(x_train)
y_train_pred = y_train_pred >= 0.6
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.7341772151898734

#### Validation Data

In [146]:
y_val_pred = reg.predict(x_val)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Linear',:] = compute_metrics(y_val.values, y_val_pred)
results

0.825

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.825,0.740741,0.867925,0.804333


# Lasso Regression

In [147]:
lasso_cv = LassoCV(alphas=lambdas, cv=5, max_iter=100_000)
a = lasso_cv.fit(x_train.values, y_train.values.flatten())

lasso_r2 =  np.mean(cross_val_score(lasso_cv, x_train.values, y_train.values.flatten()))

print('Best lambda:', lasso_cv.alpha_, 'R2 score:',lasso_r2)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Best lambda: 0.01 R2 score: 0.1686979568282047


  model = cd_fast.enet_coordinate_descent_gram(


In [148]:
y_train_pred = lasso_cv.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.7151898734177216

In [149]:
y_val_pred = lasso_cv.predict(x_val.values)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Lasso',:] = compute_metrics(y_val.values, y_val_pred)
results

0.775

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.825,0.740741,0.867925,0.804333
Lasso,0.775,0.64,0.836364,0.738182


# Ridge Regression

In [150]:
ridge_cross_val_metrics = pd.DataFrame(columns=['mean MSE', 'mean norm_MSE', 'mean R2'])
# We calculate the cross-validation metrics for each lambda 
for lambda_val in lambdas:
    kf = KFold(n_splits=5)
    i=1
    cv_mse = []
    cv_nmse = []
    cv_r2 = []
    # We compute the metrics for each fold and then perform the mean.
    for train_index, test_index in kf.split(x_train):
        x_train_fold = x_train.values[train_index]
        y_train_fold = y_train.values[train_index]
        x_test_fold = x_train.values[test_index,:]
        y_test_fold = y_train.values[test_index]

        ridge = Ridge(alpha=lambda_val)
        a = ridge.fit(x_train_fold,y_train_fold)
        y_pred_fold = ridge.predict(x_test_fold)
        fold_mse = mean_squared_error(y_test_fold, y_pred_fold)
        fold_nmse =  1-r2_score(y_test_fold, y_pred_fold)
        fold_r2 = r2_score(y_test_fold, y_pred_fold)
        cv_mse.append(fold_mse)
        cv_nmse.append(fold_nmse)
        cv_r2.append(fold_r2)
    ridge_cross_val_metrics.loc['Lambda={}'.format(lambda_val),:] = [np.mean(cv_mse),np.mean(cv_nmse),np.mean(cv_r2)]

ridge_cross_val_metrics.loc['Mean', :] = ridge_cross_val_metrics.mean()
ridge_cv_r2 = ridge_cross_val_metrics.loc["Mean", "mean R2"]

ridge_cross_val_metrics.sort_values(by='mean R2',ascending=False)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Unnamed: 0,mean MSE,mean norm_MSE,mean R2
Lambda=1,0.217403,0.893311,0.106689
Lambda=5,0.21992,0.904081,0.095919
Lambda=0.5,0.220802,0.907144,0.092856
Lambda=10,0.227588,0.935749,0.064251
Lambda=0.1,0.227864,0.936096,0.063904
Mean,0.230403,0.946985,0.053015
Lambda=0.01,0.232627,0.955924,0.044076
Lambda=0.001,0.233484,0.959514,0.040486
Lambda=0.0001,0.233577,0.959906,0.040094
Lambda=1e-05,0.233586,0.959945,0.040055


In [151]:
ridge = Ridge(alpha=5)

ridge.fit(x_train.values, y_train.values.flatten())

y_train_pred = ridge.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

Ridge(alpha=5)

0.7215189873417721

In [152]:
y_val_pred = ridge.predict(x_val.values)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Ridge',:] = compute_metrics(y_val.values, y_val_pred)
results

0.825

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.825,0.740741,0.867925,0.804333
Lasso,0.775,0.64,0.836364,0.738182
Ridge,0.825,0.740741,0.867925,0.804333


# Neural Network

In [153]:
regr = MLPClassifier(hidden_layer_sizes=(12, ), learning_rate_init=0.00001,  max_iter=1_000_000, tol=1e-8)
regr.fit(x_train.values, y_train.values.flatten())
y_train_pred = regr.predict(x_train.values)
regr.score(x_train.values, y_train.values.flatten())

MLPClassifier(hidden_layer_sizes=(12,), learning_rate_init=1e-05,
              max_iter=1000000, tol=1e-08)

0.7974683544303798

In [154]:
y_train_pred = regr.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.7974683544303798

In [155]:
y_val_pred = regr.predict(x_val.values)
y_val_pred = y_val_pred >= 0.5
y_val_pred = np.multiply(y_val_pred, 1)
accuracy_score(y_val.values, y_val_pred)
results.loc['Neural Net',:] = compute_metrics(y_val.values, y_val_pred)
results

0.675

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.825,0.740741,0.867925,0.804333
Lasso,0.775,0.64,0.836364,0.738182
Ridge,0.825,0.740741,0.867925,0.804333
Neural Net,0.675,0.628571,0.711111,0.669841


In [86]:
# confusion(y_test.values.flatten(), y_test_pred)
confusion(y_val_pred, y_test.values.flatten())

print( classification_report(y_test.values, y_test_pred))

NameError: name 'y_test_pred' is not defined

In [24]:
# Get numerical feature importances
importances = list(regr.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(x_train.columns), importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

AttributeError: 'MLPClassifier' object has no attribute 'feature_importances_'