In [50]:
import sys
import os
import json
import pandas as pd
import numpy as np
import optparse
import tensorflow as tf
import keras
import keras.layers as KL
from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV


In [5]:
norm = pd.read_csv('norm.csv')
norm = norm.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
norm.head()

Unnamed: 0,e_id,gp_id,result,x_2539334,y_2539334,s_2539334,a_2539334,dis_2539334,o_2539334,dir_2539334,...,CROSS_0,FLAT_0,GO_0,HITCH_0,IN_0,OUT_0,POST_0,SCREEN_0,SLANT_0,WHEEL_0
0,20180906007536,201809060075,pass_outcome_caught,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,201809060014631,2018090600146,pass_outcome_incomplete,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,201809060016834,2018090600168,pass_outcome_incomplete,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,201809060019039,2018090600190,pass_outcome_caught,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,201809060025631,2018090600256,pass_outcome_incomplete,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
players = pd.read_csv('players.csv')
players = players[players['position']=='QB'] 
players.head(15)

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
8,2532842,78,243,1989-01-20,Arizona,QB,Nick Foles
19,310,76,217,1985-05-17,Boston College,QB,Matt Ryan
51,2558191,74,225,1994-05-04,Pittsburgh,QB,Nathan Peterman
67,382,78,245,1985-01-16,Delaware,QB,Joe Flacco
93,2560955,77,237,1996-05-21,Wyoming,QB,Josh Allen
95,2560757,74,212,1997-01-07,Louisville,QB,Lamar Jackson
100,2506109,77,240,1982-03-02,"Miami, O.",QB,Ben Roethlisberger
124,2495240,73,215,1989-08-03,Virginia Tech,QB,Tyrod Taylor
153,2533031,6-4,240,09/12/1989,Stanford,QB,Andrew Luck
162,2495143,74,220,1987-10-29,Texas Christian,QB,Andy Dalton


In [54]:
qb_norm = norm[norm['x_2506109'] != 0]

qb_norm = qb_norm.loc[(qb_norm.sum(axis=1) != 0), (qb_norm.sum(axis=0) != 0)]

complete = ['pass_outcome_caught', 'pass_outcome_touchdown']

y = qb_norm['result'].apply(lambda x: 1 if x in complete else 0)
X = qb_norm.drop(['e_id', 'gp_id', 'result'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(str(X.shape[0])+' - '+str(X_train.shape[0])+' - '+str(X_test.shape[0]))

697 - 522 - 175


### Random Forest - Default Values

In [28]:
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
t_y_pred_rf = rnd_clf.predict(X_train)


array([[13, 59],
       [ 7, 96]], dtype=int64)

In [35]:
# Classification Report
print(classification_report(y_train, t_y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Confustion matrix
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       171
           1       1.00      1.00      1.00       351

    accuracy                           1.00       522
   macro avg       1.00      1.00      1.00       522
weighted avg       1.00      1.00      1.00       522

              precision    recall  f1-score   support

           0       0.65      0.18      0.28        72
           1       0.62      0.93      0.74       103

    accuracy                           0.62       175
   macro avg       0.63      0.56      0.51       175
weighted avg       0.63      0.62      0.55       175

[[13 59]
 [ 7 96]]


### Random Forest - Grid Search

In [36]:
#create a dictionary of parameters 
param_grid = {'max_depth':[6,8,10,12,14,16]}

# create Random Forest model 
rf_obj = RandomForestClassifier(random_state=42)


# Create gridsearch object with various combinations of parameters
rf_Grid = GridSearchCV(rf_obj, param_grid, cv = 5, scoring = 'roc_auc',refit = True, n_jobs=-1, verbose = 5)
rf_Grid.fit(X_train, y_train)

best_pred = rf_Grid.best_estimator_.predict(X_test)
t_best_pred = rf_Grid.best_estimator_.predict(X_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  30 | elapsed:    3.8s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  21 out of  30 | elapsed:    3.9s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  28 out of  30 | elapsed:    4.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.2s finished


In [37]:
# Classification Report
print(classification_report(y_train, t_y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Confustion matrix
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       171
           1       1.00      1.00      1.00       351

    accuracy                           1.00       522
   macro avg       1.00      1.00      1.00       522
weighted avg       1.00      1.00      1.00       522

              precision    recall  f1-score   support

           0       0.65      0.18      0.28        72
           1       0.62      0.93      0.74       103

    accuracy                           0.62       175
   macro avg       0.63      0.56      0.51       175
weighted avg       0.63      0.62      0.55       175

[[13 59]
 [ 7 96]]


### AdaBoost

In [45]:
# Create simple AdaBoostClassifier
ada_clf = AdaBoostClassifier(random_state=42)
ada_clf.fit(X_train, y_train)

# Parameters
param_grid = {'n_estimators':[100, 200, 300, 400],
              'learning_rate':[0.2, 0.4, 0.6, 0.8, 1.0, 1.2]}

# Grid Search
ab_Grid = GridSearchCV(ada_clf, param_grid, cv = 5, scoring = 'roc_auc',refit = True, n_jobs=-1, verbose = 5)
ab_Grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   47.9s finished


GridSearchCV(cv=5, estimator=AdaBoostClassifier(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0, 1.2],
                         'n_estimators': [100, 200, 300, 400]},
             scoring='roc_auc', verbose=5)

In [46]:
ab_bp = ab_Grid.best_estimator_.predict(X_test)
t_ab_bp = ab_Grid.best_estimator_.predict(X_train)

# Classification report
print(classification_report(y_train, t_ab_bp))
print(classification_report(y_test, ab_bp))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       171
           1       1.00      1.00      1.00       351

    accuracy                           1.00       522
   macro avg       1.00      1.00      1.00       522
weighted avg       1.00      1.00      1.00       522

              precision    recall  f1-score   support

           0       0.53      0.36      0.43        72
           1       0.63      0.78      0.70       103

    accuracy                           0.61       175
   macro avg       0.58      0.57      0.56       175
weighted avg       0.59      0.61      0.59       175



### XGBOOST

In [51]:
# XGBOOOOOOOOOOST 
clf_xgb = xgb.XGBClassifier(random_state=42)

# Parameters 
param = {'n_estimators':np.arange(100, 1001, 50),
         'learning_rate':np.arange(0.1, 1.7, 0.1),
         'max_depth':[1,2],
         'gamma':np.arange(0,5.25,0.25)}

#Randomized Search CV
xgb_rnd = RandomizedSearchCV(clf_xgb, param, cv = 5, scoring = 'roc_auc', refit = True)
xgb_rnd.fit(X_train, y_train) 

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                   param_distributions={'gamma': array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  , 2.25, 2.5 ,
       2.75, 3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75

In [52]:
xgb_bp = xgb_rnd.best_estimator_.predict(X_test)
t_xgb_bp = xgb_rnd.best_estimator_.predict(X_train)

# Classification Report
print(classification_report(y_train, t_xgb_bp))
print(classification_report(y_test, xgb_bp))

              precision    recall  f1-score   support

           0       0.94      0.26      0.41       171
           1       0.73      0.99      0.84       351

    accuracy                           0.75       522
   macro avg       0.84      0.63      0.63       522
weighted avg       0.80      0.75      0.70       522

              precision    recall  f1-score   support

           0       0.67      0.11      0.19        72
           1       0.61      0.96      0.74       103

    accuracy                           0.61       175
   macro avg       0.64      0.54      0.47       175
weighted avg       0.63      0.61      0.52       175

