In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier

import itertools
import pickle

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import log_loss
from sklearn.metrics import fbeta_score
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

In [136]:
cfb_2016 = pd.read_pickle('cfb_2016.pickle')
cfb_2017 = pd.read_pickle('cfb_2017.pickle')
cfb_2018 = pd.read_pickle('cfb_2018.pickle')

In [3]:
df = pd.concat([cfb_2016, cfb_2017])

In [4]:
df.reset_index(drop=True, inplace=True)

In [5]:
df = df.loc[(df['Wt'] != '-')]
df['Wt'] = pd.to_numeric(df['Wt'])

In [137]:
cfb_2018 = cfb_2018.loc[(cfb_2018['Wt'] != '-')]
cfb_2018['Wt'] = pd.to_numeric(cfb_2018['Wt'])

In [6]:
df = df.loc[(df['Ht'] != '-')]
df = df.loc[(df['Ht'] != 0)]

In [138]:
cfb_2018 = cfb_2018.loc[(cfb_2018['Ht'] != '-')]
cfb_2018 = cfb_2018.loc[(cfb_2018['Ht'] != 0)]

In [7]:
height_list = []

for height in df['Ht']:
    
    heights = str(height).split('-')
    height_list.append(int(heights[0]) * 12 + int(heights[-1]))

In [139]:
height_list = []

for height in cfb_2018['Ht']:
    
    heights = str(height).split('-')
    height_list.append(int(heights[0]) * 12 + int(heights[-1]))

In [8]:
df.reset_index(drop=True, inplace=True)
df['Ht'] = pd.Series(height_list)

In [140]:
cfb_2018.reset_index(drop=True, inplace=True)
cfb_2018['Ht'] = pd.Series(height_list)

In [9]:
df = df.loc[(df['Pos'] != '-')]
df.reset_index(drop=True, inplace=True)

In [141]:
cfb_2018 = cfb_2018.loc[(cfb_2018['Pos'] != '-')]
cfb_2018.reset_index(drop=True, inplace=True)

In [10]:
categorized_list = []
defensive_zone = ['CB', 'S']
defensive_line = ['DB', 'DE', 'DL', 'DT',] 
linebackers = ['LB', 'OLB', 'ILB']
receivers = ['WR', 'TE']
running_back = ['RB']
qb = ['QB']
offensive_lineman = ['OL', 'OT']

for position in df['Pos']:
    
    if position in defensive_line:
        categorized_list.append('Defense')
    elif position in receivers:
        categorized_list.append('Receiver')
    elif position in defensive_zone:
        categorized_list.append('Defensive Zone')
    elif position in qb:
        categorized_list.append('Quarterback')
    elif position in running_back:
        categorized_list.append('Running Back')
    elif position in linebackers:
        categorized_list.append('linebackers')
    elif position in offensive_lineman:
        categorized_list.append('Offensive Lineman')
    else:
        categorized_list.append('Other')

In [142]:
categorized_list = []
defensive_zone = ['CB', 'S']
defensive_line = ['DB', 'DE', 'DL', 'DT',] 
linebackers = ['LB', 'OLB', 'ILB']
receivers = ['WR', 'TE']
running_back = ['RB']
qb = ['QB']
offensive_lineman = ['OL', 'OT']

for position in cfb_2018['Pos']:
    
    if position in defensive_line:
        categorized_list.append('Defense')
    elif position in receivers:
        categorized_list.append('Receiver')
    elif position in defensive_zone:
        categorized_list.append('Defensive Zone')
    elif position in qb:
        categorized_list.append('Quarterback')
    elif position in running_back:
        categorized_list.append('Running Back')
    elif position in linebackers:
        categorized_list.append('linebackers')
    elif position in offensive_lineman:
        categorized_list.append('Offensive Lineman')
    else:
        categorized_list.append('Other')

In [143]:
cfb_2018['Category'] = pd.Series(categorized_list)

In [144]:
cfb_2018 = cfb_2018.loc[(cfb_2018['Category'] != 'Other')]


In [145]:
cfb_2018 = cfb_2018.loc[(cfb_2018['Name']!='David Moore')]
cfb_2018 = cfb_2018.loc[(cfb_2018['Name']!='Josh Jackson')]
cfb_2018 = cfb_2018.loc[(cfb_2018['Name']!='Cameron Smith')]
cfb_2018 = cfb_2018.loc[(cfb_2018['Name']!='Josh Allen')]
cfb_2018 = cfb_2018.loc[(cfb_2018['Name']!='Braden Smith')]
cfb_2018.reset_index(drop=True,inplace=True)

In [16]:
dummies = ['Category', 'Yr']

In [146]:
X1 = cfb_2018[['Yr', 'Ht', 'Wt', 'TD', '2XP', 'Points', 'Rush Yards', 
     'Pass Yards', 'Recv.', 'Int.', 'Solo', 'Sacks', 'Att', 'Comp',
     'Passing TD', 'Passing Int', 'Category']]
y1 = cfb_2018['Drafted']

In [147]:
X1= pd.get_dummies(X1, columns=dummies)
X1.columns

Index(['Ht', 'Wt', 'TD', '2XP', 'Points', 'Rush Yards', 'Pass Yards', 'Recv.',
       'Int.', 'Solo', 'Sacks', 'Att', 'Comp', 'Passing TD', 'Passing Int',
       'Category_Defense', 'Category_Defensive Zone',
       'Category_Offensive Lineman', 'Category_Quarterback',
       'Category_Receiver', 'Category_Running Back', 'Category_linebackers',
       'Yr_-', 'Yr_FR', 'Yr_JR', 'Yr_SO', 'Yr_SR'],
      dtype='object')

In [17]:
X = df[['Yr', 'Ht', 'Wt', 'TD', '2XP', 'Points', 'Rush Yards', 
     'Pass Yards', 'Recv.', 'Int.', 'Solo', 'Sacks', 'Att', 'Comp',
     'Passing TD', 'Passing Int', 'Category']]
y = df['Drafted']

In [18]:
X = pd.get_dummies(X, columns=dummies)
X.columns

Index(['Ht', 'Wt', 'TD', '2XP', 'Points', 'Rush Yards', 'Pass Yards', 'Recv.',
       'Int.', 'Solo', 'Sacks', 'Att', 'Comp', 'Passing TD', 'Passing Int',
       'Category_Defense', 'Category_Defensive Zone',
       'Category_Offensive Lineman', 'Category_Quarterback',
       'Category_Receiver', 'Category_Running Back', 'Category_linebackers',
       'Yr_-', 'Yr_FR', 'Yr_JR', 'Yr_SO', 'Yr_SR'],
      dtype='object')

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                            test_size =0.2, random_state=42)

In [23]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X_train, y_train)
X_ros = pd.DataFrame(X_ros, columns = X_test.columns)

In [24]:
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X_train, y_train)
X_ada = pd.DataFrame(X_ada, columns = X_test.columns)

In [25]:
sm = SMOTE()
X_sm, y_sm = sm.fit_resample(X_train, y_train)
X_sm = pd.DataFrame(X_sm, columns = X_test.columns)

In [71]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

In [72]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [78]:
adaforest = RandomForestClassifier(n_estimators=800, max_depth=100).fit(X_ada, y_ada)

In [76]:
adarf_random = RandomizedSearchCV(estimator = adaforest, param_distributions = random_grid,
                               n_iter = 100, cv = 3, verbose=2,
                                  random_state=42, n_jobs = -1).fit(X_ada, y_ada)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 14.5min finished


In [77]:
adarf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [79]:
print(confusion_matrix(y_test, adaforest.predict(X_test)))
print("Precision: " , precision_score(y_test, adaforest.predict(X_test), pos_label='Yes'))
print("Recall: " , recall_score(y_test, adaforest.predict(X_test), pos_label='Yes'))
print("f1_score: " , f1_score(y_test, adaforest.predict(X_test), pos_label='Yes'))

[[2537   51]
 [  43   22]]
Precision:  0.3013698630136986
Recall:  0.3384615384615385
f1_score:  0.31884057971014496


In [26]:
smlog = LogisticRegression(C=10000, solver='lbfgs').fit(X_sm, y_sm)



In [29]:
print(confusion_matrix(y_test, smlog.predict(X_test)))
print("Precision: " , precision_score(y_test, smlog.predict(X_test), pos_label='Yes'))
print("Recall: " , recall_score(y_test, smlog.predict(X_test), pos_label='Yes'))
print("f1_score: " , f1_score(y_test, smlog.predict(X_test), pos_label='Yes'))

[[2272  316]
 [  10   55]]
Precision:  0.14824797843665768
Recall:  0.8461538461538461
f1_score:  0.25229357798165136


In [30]:
adalog = LogisticRegression(C=10000, solver='lbfgs').fit(X_ada, y_ada)



In [31]:
print(confusion_matrix(y_test, adalog.predict(X_test)))
print("Precision: " , precision_score(y_test, adalog.predict(X_test), pos_label='Yes'))
print("Recall: " , recall_score(y_test, adalog.predict(X_test), pos_label='Yes'))
print("f1 score: " , f1_score(y_test, adalog.predict(X_test), pos_label='Yes'))

[[2242  346]
 [   8   57]]
Precision:  0.141439205955335
Recall:  0.8769230769230769
f1 score:  0.24358974358974358


In [130]:
smgradient_boosting = GradientBoostingClassifier(subsample=0.8, max_depth=7, max_features='sqrt',  
                                                 n_estimators=100).fit(X_sm, y_sm)

In [131]:
print(confusion_matrix(y_test, smgradient_boosting.predict(X_test)))
print("Precision: " , precision_score(y_test, 
                                      smgradient_boosting.predict(X_test), pos_label='Yes'))
print("Recall: " , recall_score(y_test, smgradient_boosting.predict(X_test), pos_label='Yes'))
print("f1_score: " , f1_score(y_test, smgradient_boosting.predict(X_test), pos_label='Yes'))

[[2512   76]
 [  31   34]]
Precision:  0.3090909090909091
Recall:  0.5230769230769231
f1_score:  0.38857142857142857


In [61]:
adagradient_boosting = GradientBoostingClassifier(max_depth=6, max_features='sqrt',
                                                  n_estimators=100).fit(X_ada, y_ada)

In [62]:
print(confusion_matrix(y_test, adagradient_boosting.predict(X_test)))
print("Precision: " , precision_score(y_test, 
                                      adagradient_boosting.predict(X_test), pos_label='Yes'))
print("Recall: " , recall_score(y_test, adagradient_boosting.predict(X_test), pos_label='Yes'))
print("f1_score: " , f1_score(y_test, adagradient_boosting.predict(X_test), pos_label='Yes'))

[[2497   91]
 [  27   38]]
Precision:  0.29457364341085274
Recall:  0.5846153846153846
f1_score:  0.3917525773195876


In [86]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
max_depth = [int(x) for x in np.linspace(2, 15, num = 11)]
subsample = [1, 0.8, 0/6, 0.4]

In [87]:
random_grid = {'n_estimators': n_estimators,
               'subsample': subsample,
               'max_depth': max_depth,
              }

In [88]:
adaxgb_random = RandomizedSearchCV(estimator = adaxgboost, param_distributions = random_grid,
                               n_iter = 100, cv = 3, verbose=2,
                                  random_state=42, n_jobs = -1).fit(X_ada, y_ada)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 33.1min finished


In [89]:
adaxgb_random.best_params_

{'subsample': 0.8, 'n_estimators': 1788, 'max_depth': 3}

In [123]:
adaxgboost = XGBClassifier(min_child_weight=5, colsample_bytree=0.7, n_estimators = 100,
                           max_depth=5, subsample=1, learning_rate=0.1).fit(X_ada, y_ada)

In [124]:
print(confusion_matrix(y_test, adaxgboost.predict(X_test)))
print("Precision: " , precision_score(y_test, 
                                      adaxgboost.predict(X_test), pos_label='Yes'))
print("Recall: " , recall_score(y_test, adaxgboost.predict(X_test), pos_label='Yes'))
print("f1_score: " , f1_score(y_test, adaxgboost.predict(X_test), pos_label='Yes'))

[[2513   75]
 [  25   40]]
Precision:  0.34782608695652173
Recall:  0.6153846153846154
f1_score:  0.4444444444444444


In [170]:
cols1 = ['Pass Yards', 'Att', 'Comp', 'Passing TD', 'Points', 'Category_Quarterback', 
         'Yr_JR', 'Yr_SO', 'Yr_SR']
cols2 = ['Ht', 'Wt', 'TD', '2XP', 'Points', 'Rush Yards', 'Pass Yards', 'Recv.', 'Int.', 
         'Solo', 'Sacks', 'Category_Defense', 'Category_Defensive Zone',
       'Category_Offensive Lineman', 'Category_Quarterback', 'Category_Receiver', 
         'Category_Running Back', 'Category_linebackers', 'Yr_JR', 'Yr_SO', 'Yr_SR']

In [173]:
pipe1 = make_pipeline(ColumnSelector(cols=cols1),
                      LogisticRegression(C=10000, solver='lbfgs'))
pipe2 = make_pipeline(ColumnSelector(cols=cols2),
                      XGBClassifier(min_child_weight=5, colsample_bytree=0.7, n_estimators = 100,
                           max_depth=5, subsample=1, learning_rate=0.1))

In [167]:
model_vars = [adaxgboost, adalog, smgradient_boosting]

In [175]:
sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(),
                            random_state=42)
sclf.fit(X_ada, y_ada)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [161]:
print(confusion_matrix(y1, adalog.predict(X1)))
print("Precision: " , precision_score(y1, 
                                      adalog.predict(X1), pos_label='Yes'))
print("Recall: " , recall_score(y1, adalog.predict(X1), pos_label='Yes'))
print("f1_score: " , f1_score(y1, adalog.predict(X1), pos_label='Yes'))

[[5435  852]
 [  21  169]]
Precision:  0.16552399608227228
Recall:  0.8894736842105263
f1_score:  0.2791081750619323


In [165]:
insights = cfb_2018.reset_index()
insights = insights.join(pd.DataFrame(adalog.predict(X1)))
insights['predictions'] = insights[0] 

In [166]:
nope = insights.loc[(insights['predictions']=='No')]
nope.loc[(nope['Drafted']=='Yes')].head(50)

Unnamed: 0,index,Name,Pos,Yr,Ht,Wt,Kickoffs,Punts,TD,FG,...,Solo,Sacks,Att,Comp,Passing TD,Passing Int,Drafted,Category,0,predictions
101,101,Rodney Anderson,RB,JR,73,220.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Running Back,No,No
125,125,Bobby Evans,OL,JR,77,301.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,Yes,Offensive Lineman,No,No
128,128,Cody Ford,OL,JR,76,338.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Offensive Lineman,No,No
513,513,Jonah Williams,OL,JR,77,301.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Offensive Lineman,No,No
1561,1561,Xavier Crawford,DB,JR,73,180.0,0.0,0.0,0.0,0.0,...,18.0,0.0,0.0,0.0,0.0,0.0,Yes,Defense,No,No
1998,1998,Jordan Scarlett,RB,JR,71,210.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Running Back,No,No
2006,2006,Jawaan Taylor,OL,JR,77,328.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Offensive Lineman,No,No
3161,3161,Greedy Williams,CB,SO,75,184.0,0.0,0.0,0.0,0.0,...,23.0,0.0,0.0,0.0,0.0,0.0,Yes,Defensive Zone,No,No
3496,3496,Travis Homer,RB,JR,71,205.0,0.0,0.0,4.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,Yes,Running Back,No,No
3519,3519,Jordan Miller,DL,FR,76,325.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Defense,No,No
