In [92]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

**Preprocess Data**

In [44]:
df_lb = pd.read_csv('lebronstats.csv')
df_lb.reset_index()
df_lb.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22020,1610612747,LAL,Los Angeles Lakers,22000608,2021-03-15,LAL @ GSW,W,30,22,8,13,0.615,2,4,0.5,4,6,0.667,1,9,10,11,0,0,6,3,6.6
1,22020,1610612747,LAL,Los Angeles Lakers,22000585,2021-03-12,LAL vs. IND,W,34,18,5,13,0.385,1,4,0.25,7,11,0.636,0,3,3,10,1,0,3,4,2.2
2,32020,1610616834,LBN,Team LeBron,32000001,2021-03-07,LBN @ DRT,W,13,4,2,7,0.286,0,3,0.0,0,0,,0,2,2,4,0,1,1,0,-0.6
3,22020,1610612747,LAL,Los Angeles Lakers,22000542,2021-03-02,LAL vs. PHX,L,38,38,16,24,0.667,3,8,0.375,3,5,0.6,0,5,5,6,2,1,3,1,0.4
4,22020,1610612747,LAL,Los Angeles Lakers,22000527,2021-02-28,LAL vs. GSW,W,24,19,7,12,0.583,3,5,0.6,2,4,0.5,2,4,6,4,2,2,2,2,3.6


In [49]:
def preprocess_lb(df):
  df = df.dropna()
  # Normalize Data: x - xmin / xmax - xmin
  arr = ['PTS', 'AST', 'REB', 'DREB', 'FGA', 'MIN', 'FGM',	'FGA', 'FG_PCT',
         'FG3M', 'FG3A', 'FG3_PCT',	'FTM', 'FTA', 'FT_PCT', 'STL', 'BLK',	
         'TOV',	'PF', 'PLUS_MINUS']
  for field in arr:
    scaler = MinMaxScaler() 
    df[field] = scaler.fit_transform(df[[field]])
  df = df.replace('W', 1)
  df = df.replace('L', 0)
  df['WL'] = df['WL'].astype('int') 
  return df

In [51]:
df_lb = preprocess_lb(df_lb)

In [52]:
df_lb.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22020,1610612747,LAL,Los Angeles Lakers,22000608,2021-03-15,LAL @ GSW,1,0.431818,0.327586,0.347826,0.264706,0.702857,0.25,0.25,0.5,0.166667,0.185185,0.667,1,0.529412,0.526316,0.578947,0.0,0.0,0.545455,0.5,0.847059
1,22020,1610612747,LAL,Los Angeles Lakers,22000585,2021-03-12,LAL vs. IND,1,0.522727,0.258621,0.217391,0.264706,0.44,0.125,0.25,0.25,0.291667,0.37037,0.636,0,0.176471,0.157895,0.526316,0.142857,0.0,0.272727,0.666667,0.588235
3,22020,1610612747,LAL,Los Angeles Lakers,22000542,2021-03-02,LAL vs. PHX,0,0.613636,0.603448,0.695652,0.588235,0.762286,0.375,0.583333,0.375,0.125,0.148148,0.6,0,0.294118,0.263158,0.315789,0.285714,0.2,0.272727,0.166667,0.482353
4,22020,1610612747,LAL,Los Angeles Lakers,22000527,2021-02-28,LAL vs. GSW,1,0.295455,0.275862,0.304348,0.235294,0.666286,0.375,0.333333,0.6,0.083333,0.111111,0.5,2,0.235294,0.315789,0.210526,0.285714,0.4,0.181818,0.333333,0.670588
5,22020,1610612747,LAL,Los Angeles Lakers,22000512,2021-02-26,LAL vs. POR,1,0.568182,0.431034,0.478261,0.5,0.598857,0.25,0.333333,0.4,0.166667,0.259259,0.5,1,0.588235,0.578947,0.368421,0.571429,0.6,0.181818,0.166667,0.564706


**Random Forest Method**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_lb[['PTS', 'AST', 'REB']], df_lb['WL'], test_size=0.4, random_state=100)

In [38]:
# X_train = pd.Series(X_train).to_numpy()
# X_test = pd.Series(X_test).to_numpy()
# y_train = pd.Series(y_train).to_numpy()
# y_test = pd.Series(y_test).to_numpy()

In [8]:
rforestclf = RandomForestClassifier()

In [9]:
rforestclf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
results = rforestclf.predict(X_test)

In [11]:
total = 0
correct = 0
for i in range(len(results)):
    total += 1
    if results[i] == y_test.ravel()[i]:
        correct += 1
correct / total * 100

61.367249602543716

**Logisitic Regression Method**

In [53]:
df_lb.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22020,1610612747,LAL,Los Angeles Lakers,22000608,2021-03-15,LAL @ GSW,1,0.431818,0.327586,0.347826,0.264706,0.702857,0.25,0.25,0.5,0.166667,0.185185,0.667,1,0.529412,0.526316,0.578947,0.0,0.0,0.545455,0.5,0.847059
1,22020,1610612747,LAL,Los Angeles Lakers,22000585,2021-03-12,LAL vs. IND,1,0.522727,0.258621,0.217391,0.264706,0.44,0.125,0.25,0.25,0.291667,0.37037,0.636,0,0.176471,0.157895,0.526316,0.142857,0.0,0.272727,0.666667,0.588235
3,22020,1610612747,LAL,Los Angeles Lakers,22000542,2021-03-02,LAL vs. PHX,0,0.613636,0.603448,0.695652,0.588235,0.762286,0.375,0.583333,0.375,0.125,0.148148,0.6,0,0.294118,0.263158,0.315789,0.285714,0.2,0.272727,0.166667,0.482353
4,22020,1610612747,LAL,Los Angeles Lakers,22000527,2021-02-28,LAL vs. GSW,1,0.295455,0.275862,0.304348,0.235294,0.666286,0.375,0.333333,0.6,0.083333,0.111111,0.5,2,0.235294,0.315789,0.210526,0.285714,0.4,0.181818,0.333333,0.670588
5,22020,1610612747,LAL,Los Angeles Lakers,22000512,2021-02-26,LAL vs. POR,1,0.568182,0.431034,0.478261,0.5,0.598857,0.25,0.333333,0.4,0.166667,0.259259,0.5,1,0.588235,0.578947,0.368421,0.571429,0.6,0.181818,0.166667,0.564706


In [56]:
X = df_lb[['PTS', 'AST', 'REB', 'DREB', 'FGA', 'MIN', 'FGM',	'FGA', 'FG_PCT',
         'FG3M', 'FG3A', 'FG3_PCT',	'FTM', 'FTA', 'FT_PCT', 'STL', 'BLK',	
         'TOV',	'PF', 'PLUS_MINUS']]
X = sm.add_constant(X)
y = df_lb['WL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=100)

In [57]:
model = sm.Logit(y_train,X_train).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.286195
         Iterations 11


0,1,2,3
Dep. Variable:,WL,No. Observations:,1021.0
Model:,Logit,Df Residuals:,1002.0
Method:,MLE,Df Model:,18.0
Date:,"Fri, 16 Apr 2021",Pseudo R-squ.:,0.5561
Time:,00:50:43,Log-Likelihood:,-292.2
converged:,True,LL-Null:,-658.31
Covariance Type:,nonrobust,LLR p-value:,8.235e-144

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.4959,5.22e+05,-2.01e-05,1.000,-1.02e+06,1.02e+06
PTS,0.3440,1.01e+07,3.41e-08,1.000,-1.98e+07,1.98e+07
AST,1.9463,0.727,2.679,0.007,0.522,3.370
REB,0.1618,1.659,0.098,0.922,-3.089,3.413
DREB,-0.1222,1.680,-0.073,0.942,-3.415,3.170
FGA,0.5379,3.36e+07,1.6e-08,1.000,-6.59e+07,6.59e+07
MIN,-0.2991,1.076,-0.278,0.781,-2.409,1.811
FGM,-2.8211,8e+06,-3.53e-07,1.000,-1.57e+07,1.57e+07
FGA,0.5379,3.36e+07,1.6e-08,1.000,-6.58e+07,6.58e+07


P-Value for RED indicates it does not affect the dependent variable

In [75]:
# Adjust Features
X = df_lb[['PTS', 'AST', 'FG3A', 'FG3_PCT', 'PLUS_MINUS']]
X = sm.add_constant(X)
y = df_lb['WL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=75)

In [76]:
model = sm.Logit(y_train,X_train).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.248227
         Iterations 8


0,1,2,3
Dep. Variable:,WL,No. Observations:,1021.0
Model:,Logit,Df Residuals:,1015.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 16 Apr 2021",Pseudo R-squ.:,0.6123
Time:,00:57:04,Log-Likelihood:,-253.44
converged:,True,LL-Null:,-653.74
Covariance Type:,nonrobust,LLR p-value:,8.596e-171

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-11.8732,0.864,-13.748,0.000,-13.566,-10.181
PTS,1.8336,0.913,2.009,0.044,0.045,3.622
AST,1.5579,0.752,2.071,0.038,0.083,3.033
FG3A,-2.8891,0.660,-4.375,0.000,-4.183,-1.595
FG3_PCT,1.1912,0.539,2.208,0.027,0.134,2.249
PLUS_MINUS,24.4247,1.685,14.495,0.000,21.122,27.727


In [77]:
predictions = model.predict(X_test)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
predictions

94      0.0
1363    1.0
1266    1.0
1194    0.0
1635    0.0
       ... 
925     1.0
998     1.0
1053    1.0
1341    1.0
567     1.0
Length: 550, dtype: float64

In [78]:
y_test

94      0
1363    1
1266    1
1194    0
1635    0
       ..
925     1
998     0
1053    1
1341    1
567     1
Name: WL, Length: 550, dtype: int64

In [79]:
def accuracy(predictions, y_test):
  total = 0
  correct = 0
  for i in range(len(predictions.ravel())):
      pred = predictions.ravel()[i]
      total += 1
      if pred == y_test.ravel()[i]:
          correct += 1
  return correct / total * 100

In [80]:
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
max_acc = 0
thresh = 0
for threshold in thresholds:
  predictions = model.predict(X_test)
  predictions[predictions > threshold] = 1
  predictions[predictions <= threshold] = 0
  acc = accuracy(predictions, y_test)
  if acc > max_acc:
    max_acc = acc
    thresh = threshold
max_acc, thresh

(85.27272727272728, 0.6)

**Classification Metrics**

In [81]:
predictions = model.predict(X_test)
predictions[predictions > 0.4] = 1
predictions[predictions <= 0.4] = 0

In [82]:
#Your code here
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = matrix.ravel()
(tn, fp, fn, tp)

(130, 62, 25, 333)

In [83]:
TPR = tp / (tp + fn)
FPR = fp / (fp + tn)
TPR, FPR

(0.9301675977653632, 0.3229166666666667)

In [None]:
#Your code here
from sklearn.metrics import confusion_matrix

thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
TPR_arr = []
FPR_arr = []

for threshold in thresholds:
  predictions = model.predict(X_test)
  predictions[predictions > threshold] = 1
  predictions[predictions <= threshold] = 0
  tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
  TPR = tp / (tp + fn)
  FPR = fp / (fp + tn)
  TPR_arr.append(TPR)
  FPR_arr.append(FPR)

**K-Folds Testing**

In [118]:
def accuracy_kfold(predictions, y_test):
  total = 0
  correct = 0
  for i in range(len(predictions.ravel())):
      pred = predictions.ravel()[i]
      total += 1
      if pred == y_test.to_numpy()[i]:
          correct += 1
  return correct / total * 100

In [139]:
splits = 50
kf = KFold(n_splits=splits)
X = df_lb[['PTS', 'AST', 'FG3A', 'FG3_PCT', 'PLUS_MINUS']]
X = sm.add_constant(X)
X = X.to_numpy()
y = df_lb['WL']
y = y.to_numpy()
kf.get_n_splits(X)
KFold(n_splits=splits, random_state=75, shuffle=True)
for train_index, test_index in kf.split(X):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  X_train = pd.DataFrame({'PTS': X_train[:, 1], 'AST': X_train[:, 2], 
                          'FG3A': X_train[:, 3], 'FG3_PCT': X_train[:, 4], 
                          'PLUS_MINUS': X_train[:, 5]})
  X_test = pd.DataFrame({'PTS': X_test[:, 1], 'AST': X_test[:, 2], 
                          'FG3A': X_test[:, 3], 'FG3_PCT': X_test[:, 4], 
                          'PLUS_MINUS': X_test[:, 5]})
  y_train = pd.DataFrame({'WL': y_train})
  y_test = pd.DataFrame({'WL': y_test})
  model = sm.Logit(y_train,X_train).fit()
  predictions = model.predict(X_test)
  predictions[predictions > 0.4] = 1
  predictions[predictions <= 0.4] = 0
  acc = accuracy_kfold(predictions, y_test)
  print(acc)

Optimization terminated successfully.
         Current function value: 0.476290
         Iterations 6
84.375
Optimization terminated successfully.
         Current function value: 0.473445
         Iterations 6
71.875
Optimization terminated successfully.
         Current function value: 0.471893
         Iterations 7
78.125
Optimization terminated successfully.
         Current function value: 0.476577
         Iterations 6
84.375
Optimization terminated successfully.
         Current function value: 0.471154
         Iterations 7
65.625
Optimization terminated successfully.
         Current function value: 0.475473
         Iterations 6
65.625
Optimization terminated successfully.
         Current function value: 0.470265
         Iterations 7
81.25
Optimization terminated successfully.
         Current function value: 0.473872
         Iterations 6
78.125
Optimization terminated successfully.
         Current function value: 0.472251
         Iterations 6
62.5
Optimization terminate

**Random State Iteration**

In [140]:
def accuracy(predictions, y_test):
  total = 0
  correct = 0
  for i in range(len(predictions.ravel())):
      pred = predictions.ravel()[i]
      total += 1
      if pred == y_test.ravel()[i]:
          correct += 1
  return correct / total * 100

In [142]:
for i in range(100):
  X = df_lb[['PTS', 'AST', 'FG3A', 'FG3_PCT', 'PLUS_MINUS']]
  X = sm.add_constant(X)
  y = df_lb['WL']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=i)
  model = sm.Logit(y_train,X_train).fit()
  predictions = model.predict(X_test)
  predictions[predictions > 0.4] = 1
  predictions[predictions <= 0.4] = 0
  print(accuracy(predictions, y_test))

Optimization terminated successfully.
         Current function value: 0.284335
         Iterations 8
88.07631160572336
Optimization terminated successfully.
         Current function value: 0.280480
         Iterations 8
88.39427662957074
Optimization terminated successfully.
         Current function value: 0.265118
         Iterations 8
89.0302066772655
Optimization terminated successfully.
         Current function value: 0.275526
         Iterations 8
87.91732909379968
Optimization terminated successfully.
         Current function value: 0.279512
         Iterations 8
89.0302066772655
Optimization terminated successfully.
         Current function value: 0.279597
         Iterations 8
88.39427662957074
Optimization terminated successfully.
         Current function value: 0.297473
         Iterations 8
89.66613672496025
Optimization terminated successfully.
         Current function value: 0.286782
         Iterations 8
88.87122416534181
Optimization terminated successfully.
    