In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Let wide frames render up to 200 columns instead of being truncated.
pd.options.display.max_columns = 200

In [None]:
# Load the raw CSV; every later cell derives its data from this frame.
nflDF = pd.read_csv(
    filepath_or_buffer='data/nflplaybyplay2009to2017//nfl2009_2017.csv',
)

In [None]:
# Keep a row only when it has a recorded down that is not 4th down and a
# recorded PlayTimeDiff.
keep_rows = (
    nflDF.down.notnull()
    & (nflDF.down != 4.0)
    & nflDF.PlayTimeDiff.notnull()
)

# reset_index() without drop=True keeps the previous row labels in a new
# 'index' column, exactly as the in-place variant does.
nflDF = nflDF[keep_rows].reset_index()

In [None]:
# Parse the play description column to flag plays run in No Huddle or Shotgun.
# Each flag is True when the phrase occurs anywhere in `desc`.
for flag_name, phrase in (('NoHuddle', 'No Huddle'), ('Shotgun', 'Shotgun')):
    nflDF[flag_name] = nflDF.desc.str.contains(phrase)

In [None]:
# True when the team in possession matches the side of the field the ball is on.
nflDF['OffenseInOwnHalf'] = nflDF.posteam.eq(nflDF.SideofField)

### Create dataframe with the relevant features

In [None]:
# Keep only pass and run plays, then renumber the rows so that nflDF and the
# label frame derived from it share one identical 0..n-1 index.
# Resetting the index of the labels alone would leave nflDF with a gapped
# index, and any later index-aligned operation (e.g. pd.concat(axis=1))
# would pair each label with the features of a different play.
nflDF = nflDF[nflDF.PlayType.isin(['Pass', 'Run'])].reset_index(drop=True)

# Binary target as a one-column DataFrame named 'PlayType': Pass -> 1, Run -> 0.
# Built as a new object instead of calling replace(inplace=True) on a slice
# of nflDF, which triggers SettingWithCopyWarning.
playType = nflDF['PlayType'].map({'Pass': 1, 'Run': 0}).to_frame()

In [None]:
# Columns kept as model inputs; everything else is discarded from nflDF.
features = [
    'down',
    'qtr',
    'PlayTimeDiff',
    'GoalToGo',
    'ydstogo',
    'ydsnet',
    'ScoreDiff',
    'posteam',
    'NoHuddle',
    'Shotgun',
    'OffenseInOwnHalf',
]
# filter(items=...) silently skips any listed name that is not a column.
nflDF = nflDF.filter(items=features, axis='columns')

In [None]:
# Class balance of the target (1 = Pass, 0 = Run).
playType.PlayType.value_counts()

In [None]:
# Histogram of Run vs Pass counts (two bins: one per class).
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots()
ax.hist(playType['PlayType'], bins=2, edgecolor='k')
ax.set_xlabel('Run(0) or Pass(1)')
ax.set_ylabel('Number of times run')
ax.set_title('Run vs Pass Distribution');

In [None]:
# Show which columns nflDF currently holds.
nflDF.columns

In [None]:
# Distribution of the time between plays, restricted to the 0-100 range.
fig, ax = plt.subplots()
ax.hist(nflDF['PlayTimeDiff'], range=(0, 100), edgecolor='k')
ax.set_xlabel('Time between plays')
ax.set_ylabel('Count')
ax.set_title('PlayTimeDiff');
# not normally distributed, so use Min-Max scaler

In [None]:
# Summary statistics for yards-to-go.
nflDF.ydstogo.describe()

In [None]:
# Distribution of the score difference; bin edges are chosen automatically.
fig, ax = plt.subplots()
ax.hist(nflDF['ScoreDiff'], bins='auto', edgecolor='k')
ax.set_xlabel('ScoreDiff between teams')
ax.set_ylabel('Count')
ax.set_title('ScoreDiff');

In [None]:
# Store the goal-to-go indicator as a boolean column.
# NOTE(review): astype(bool) turns NaN into True, so missing values would be
# hidden from any later dropna() — confirm GoalToGo has no nulls at this point.
nflDF['GoalToGo'] = nflDF['GoalToGo'].astype(bool)

In [None]:
# Inspect column dtypes and non-null counts.
nflDF.info()

In [None]:
# one-hot encode
downDummy = pd.get_dummies(nflDF.down)
qtrDummy = pd.get_dummies(nflDF.qtr)
posTeamDummy = pd.get_dummies(nflDF.posteam)

In [None]:
# Preview the first rows of the feature frame before scaling.
nflDF.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mms = MinMaxScaler()
ss = StandardScaler()

# Scale numeric features: min-max for the three bounded columns and
# standardisation for ScoreDiff.
# NOTE(review): both scalers are fit on every row, i.e. before the
# train/test split further down — confirm this is acceptable.
minmax_columns = ['PlayTimeDiff', 'ydstogo', 'ydsnet']
standard_columns = ['ScoreDiff']
nflDF[minmax_columns] = mms.fit_transform(nflDF[minmax_columns])
nflDF[standard_columns] = ss.fit_transform(nflDF[standard_columns])

In [None]:
# Preview the first rows again to check the scaled columns.
nflDF.head()

In [None]:
# Assemble the modelling table side by side: target, quarter dummies,
# possession-team dummies and the remaining features.
# pd.concat along the column axis aligns rows by index label, so all four
# frames must carry the same index for the rows to line up.
model_parts = [playType, qtrDummy, posTeamDummy, nflDF]
# Rows with any missing value are discarded.
modelDF0 = pd.concat(model_parts, axis='columns').dropna()

In [None]:
# Preview the assembled modelling table.
modelDF0.head()

In [None]:
# # Find all correlations with the score and sort 
# correlations_data = data.corr()['score'].sort_values()

In [None]:
from sklearn.model_selection import train_test_split

# Target: 1 = Pass, 0 = Run.
target = modelDF0['PlayType']
# Drop the target itself and the raw team label (the team is already
# represented by its one-hot columns).
featureMatrix = modelDF0.drop(columns=['PlayType', 'posteam'])

# Hold out 25% of the plays for evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    featureMatrix, target, test_size=0.25, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with default hyper-parameters, trained on the training split.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the logistic regression on the held-out test split.
lr.score(X_test, y_test)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision of the logistic regression for the positive class (label 1)
# on the test split.
lr_test_labels = lr.predict(X_test)
precision_score(y_true=y_test, y_pred=lr_test_labels)

In [None]:
# Worked example of how the `average` argument changes precision_score on a
# 3-class problem. It was pasted as an interactive session, whose echoed
# output lines are not valid Python and stop a full notebook run; it is
# rewritten here as runnable code with the expected results kept as comments.
from sklearn.metrics import precision_score
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
print(precision_score(y_true, y_pred, average='macro'))     # 0.22...
print(precision_score(y_true, y_pred, average='micro'))     # 0.33...
print(precision_score(y_true, y_pred, average='weighted'))  # 0.22...
print(precision_score(y_true, y_pred, average=None))        # array([ 0.66...,  0.        ,  0.        ])







In [None]:
# Predicted class probabilities for every test row (one column per class).
probs = lr.predict_proba(X_test)

In [None]:
# Keep the second column: the probability of the class labelled 1
# (Pass, given the Pass -> 1 / Run -> 0 encoding of PlayType).
preds = probs[:, 1]

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
%matplotlib inline
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
import sklearn.metrics as skm

# ROC AUC must be computed from continuous scores, not from hard 0/1
# predictions: thresholded labels collapse the curve to a single operating
# point and give a different number from the auc(fpr, tpr) of the ROC plot.
# Column 1 of predict_proba is the probability of the class labelled 1.
skm.roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 shallow trees (depth <= 5). The forest is built from random bootstrap
# samples and feature subsets, so random_state is pinned to make the fitted
# model and its scores reproducible across runs.
clf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42)

In [None]:
# Train the classifier currently bound to `clf` on the training split.
clf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the test split for the classifier currently bound to `clf`
# (the random forest when the cells are run in order).
clf.score(X_test, y_test)

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
predsRF = clf.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, predsRF)
roc_auc = auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
%matplotlib inline
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted depth-2 trees; note that this rebinds `clf`, replacing the
# previously fitted classifier.
gb_settings = dict(n_estimators=100, learning_rate=1.0, max_depth=2)
clf = GradientBoostingClassifier(**gb_settings).fit(X_train, y_train)

In [None]:
# Mean accuracy on the test split for the classifier currently bound to `clf`
# (the gradient-boosting model when the cells are run in order).
clf.score(X_test, y_test)

In [None]:
# Compare Algorithms
from sklearn import model_selection
# from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.svm import SVC

# prepare models: (short label, unfitted estimator) pairs
models = [
    ('LR', LogisticRegression()),
    # ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    # ('SVM', SVC()),
]

# evaluate each model in turn with 10-fold cross-validation on the full
# feature matrix; the unshuffled splitter yields the same folds for every model
results = []
names = []
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10)
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, featureMatrix, target, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison: one box of fold scores per model
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()