In [36]:
#Imports
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.cross_validation import train_test_split

In [21]:
#Merging data
years = [2014]
years= [str(i) for i in years]
data={}

for i in years:
    data[i] = pd.read_csv(i+'plays.csv')
    data[i]['year'] = int(i)

fulldf = pd.concat(data.values())
fulldf = fulldf.reset_index(drop=True)
fullplays = len(fulldf)

In [22]:
# create columns for half and if the team with the ball is the home team, as well as time left in half
fulldf['is1stHalf'] = fulldf['quarter'] < 3
fulldf['is2ndHalf'] = (fulldf['quarter'] > 2) & (fulldf['quarter'] < 5)
fulldf['half'] = 2 - (fulldf['is1stHalf'])
for i,row in fulldf.iterrows():
    if row['quarter'] == 5:
        fulldf.loc[i,'half'] = 'OT'

fulldf['isHome'] = fulldf['tm'] == fulldf['home']
fulldf['secsElapsedInHalf'] = np.remainder(fulldf['secsElapsedInGame'],1800)
fulldf['isLast3minHalf'] = fulldf['secsElapsedInHalf'] >= 1620

In [23]:
# create columns for home and away timeouts remaining in the half
for x in ['hm_TO_left','aw_TO_left']:
    fulldf[x] = [3 for i in xrange(fullplays)]
for i,row in fulldf.iterrows():
    if row['half'] == 'OT':
        fulldf.loc[i,'hm_TO_left'] = 2
        fulldf.loc[i,'aw_TO_left'] = 2
    
for i,row in fulldf.iterrows():
    if row['isTimeout']:
        r = fulldf[(fulldf['bsID']==row['bsID']) & (fulldf['half']==row['half'])]
        l = range(i+1,r.index[-1])
        if row['timeoutTeam'] == row['home']:
            fulldf.loc[l,'hm_TO_left'] = r.loc[l,'hm_TO_left'] - 1
        elif row['timeoutTeam'] == row['away']:
            fulldf.loc[l,'aw_TO_left'] = r.loc[l,'aw_TO_left'] - 1

In [24]:
# only keeep run and pass plays from fulldf in the dataframe df
dfrunpass = fulldf[(fulldf['isPass']== True) | (fulldf['isRun'] == True)]
df = dfrunpass.reset_index(drop=True)

In [25]:
# Create feature columns based on given home/away information (tm = team with ball, opp = opponent)
plays = len(df)
for x in ['tm_score','opp_score','tm_winprob','tm_TO_left','opp_TO_left']:
    df[x] = [0 for i in xrange(plays)]
    
for i,row in df.iterrows():
    if row['isHome']:
        df.loc[i,'tm_score'] = row['pbp_score_hm']
        df.loc[i,'opp_score'] = row['pbp_score_aw']
        df.loc[i,'tm_winprob'] = row['home_wp']
        df.loc[i,'tm_TO_left'] = row['hm_TO_left']
        df.loc[i,'opp_TO_left'] = row['aw_TO_left']
    else:
        df.loc[i,'tm_score'] = row['pbp_score_aw']
        df.loc[i,'opp_score'] = row['pbp_score_hm']
        df.loc[i,'tm_winprob'] = 100.0 - row['home_wp']
        df.loc[i,'tm_TO_left'] = row['aw_TO_left']
        df.loc[i,'opp_TO_left'] = row['hm_TO_left']

# create scoring marging column (team with ball - opponent) and indicators if the team with the ball is in FG Range and/or RedZone
df['margin'] = df['tm_score'] - df['opp_score']
df['isFGRange'] = df['distToGoal'] <= 38
df['isRedZone'] = df['distToGoal'] <= 20

In [153]:
df = df[~np.isnan(df.down)].reset_index(drop=True)
df[df['year']<2009].to_csv('0208plays.csv', index=False, encoding = 'utf-8')
df[df['year']>2008].to_csv('0914plays.csv', index=False, encoding = 'utf-8')

In [154]:
df = pd.concat([pd.read_csv('0208plays.csv'),pd.read_csv('0914plays.csv')])

In [155]:
tms = ['pit', 'cle', 'mia', 'kan', 'sdg', 'stl', 'ari', 'min', 'buf',
       'det', 'nor', 'jax', 'cin', 'bal', 'tam', 'ind', 'hou', 'oak',
       'phi', 'gnb', 'ten', 'nyj', 'dal', 'sfo', 'was', 'atl', 'nyg',
       'chi', 'den', 'sea', 'car', 'nwe']
df = df[df.tm.isin(tms)].reset_index(drop=True)

In [156]:
df.shape

(33790, 127)

In [157]:
df.head(2)

Unnamed: 0,away,bsID,challengeUpheld,challenger,detail,distToGoal,down,exp_pts_after,exp_pts_before,fairCatcher,fgBlockRecoverer,fgBlockRetYds,fgBlocker,fgDist,fgGood,fgKicker,fieldside,fumbForcer,fumbRecFieldside,fumbRecYdLine,fumbRecoverer,fumbRetYds,fumbler,home,home_wp,intFieldside,intRetYds,intYdLine,interceptor,isBlocked,isChallenged,isComplete,isFairCatch,isFieldGoal,isFumble,isInt,isKickoff,isKneel,isLateral,isMuffedCatch,isPass,isPenalty,isPresnapPenalty,isPunt,isRun,isSpike,isTD,isTimeout,isTouchback,isTwoPoint,...,punter,qtr_time_remain,quarter,rushDir,rusher,sackYds,sacker1,sacker2,secsElapsedInGame,spikeQB,tackler1,tackler2,target,timeoutNum,timeoutTeam,tm,twoPointSuccess,xpGood,xpKicker,ydLine,yds,yds_to_go,year,is1stHalf,is2ndHalf,half,isHome,secsElapsedInHalf,isLast3minHalf,hm_TO_left,aw_TO_left,tm_score,opp_score,tm_winprob,tm_TO_left,opp_TO_left,margin,isFGRange,isRedZone,RESP,inFGRange,inRedZone,inLast3minHalf,inDown1,inDown2,inDown3,inQuarter1,inQuarter2,inQuarter3,inQuarter4
0,pit,201410120cle,False,,RoetBe00 pass complete short right to WheaMa00...,86,1,0.15,-0.28,,,,,,False,,pit,,,,,,,cle,54.0,,,,,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,,14:56,1,,,,,,4,,SkriBu00,,WheaMa00,,,pit,False,False,,14,6,10,2014,True,False,1,False,4,False,3,3,0,0,46.0,3,3,0,False,False,1,0,0,0,1,0,0,1,0,0,0
1,pit,201410120cle,False,,BellLe00 right guard for 3 yards (tackle by Ki...,80,2,-0.32,0.15,,,,,,False,,pit,,,,,,,cle,55.4,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,...,,14:16,1,RG,BellLe00,,,,44,,KitcIs00,,,,,pit,False,False,,20,3,4,2014,True,False,1,False,44,False,3,3,0,0,44.6,3,3,0,False,False,0,0,0,0,0,1,0,1,0,0,0


In [158]:
#Split into train and test
itrain, itest = train_test_split(xrange(df.shape[0]), train_size=0.7)

In [159]:
mask=np.ones(df.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [160]:
mask.shape, mask.sum()

((33790,), 23653)

In [161]:
df.columns.tolist()

['away',
 'bsID',
 'challengeUpheld',
 'challenger',
 'detail',
 'distToGoal',
 'down',
 'exp_pts_after',
 'exp_pts_before',
 'fairCatcher',
 'fgBlockRecoverer',
 'fgBlockRetYds',
 'fgBlocker',
 'fgDist',
 'fgGood',
 'fgKicker',
 'fieldside',
 'fumbForcer',
 'fumbRecFieldside',
 'fumbRecYdLine',
 'fumbRecoverer',
 'fumbRetYds',
 'fumbler',
 'home',
 'home_wp',
 'intFieldside',
 'intRetYds',
 'intYdLine',
 'interceptor',
 'isBlocked',
 'isChallenged',
 'isComplete',
 'isFairCatch',
 'isFieldGoal',
 'isFumble',
 'isInt',
 'isKickoff',
 'isKneel',
 'isLateral',
 'isMuffedCatch',
 'isPass',
 'isPenalty',
 'isPresnapPenalty',
 'isPunt',
 'isRun',
 'isSpike',
 'isTD',
 'isTimeout',
 'isTouchback',
 'isTwoPoint',
 'isXP',
 'kneelQB',
 'koKicker',
 'koRetYds',
 'koReturner',
 'koYds',
 'location',
 'muffRecoverer',
 'muffRetYds',
 'muffedBy',
 'muffedCatch',
 'oob',
 'opp',
 'passLoc',
 'passer',
 'pbp_score_aw',
 'pbp_score_hm',
 'penDeclined',
 'penOn',
 'penYds',
 'penalty',
 'puntBlockReco

In [162]:
df.quarter.unique()

array([ 1.,  2.,  3.,  4.,  5.])

In [218]:
df["RESP"] = df["isPass"]*1.0
df["inFGRange"] = df["isFGRange"]*1.0
df["inRedZone"] = df["isRedZone"]*1.0
df["inLast3minHalf"] = df["isLast3minHalf"]*1.0
df["inDown1"] = (df["down"] == 1.)*1.0
df["inDown2"] = (df["down"] == 2.)*1.0
df["inDown3"] = (df["down"] == 3.)*1.0
df["inQuarter1"] = (df["quarter"] == 1.)*1.0
df["inQuarter2"] = (df["quarter"] == 2.)*1.0
df["inQuarter3"] = (df["quarter"] == 3.)*1.0
df["inQuarter4"] = (df["quarter"] == 4.)*1.0
lcols = ['distToGoal',"inDown3","inDown2","inDown1",'inQuarter4',"inQuarter3","inQuarter2","inQuarter1",'secsElapsedInHalf',"margin","yds_to_go","tm_TO_left","opp_TO_left","inFGRange", "inRedZone", "tm_winprob","inLast3minHalf", "tm_priorPass"]

In [219]:
df[["RESP"]] = df[["RESP"]].astype(int)
df.RESP.unique()

array([1, 0])

In [186]:
from sklearn.svm import LinearSVC, SVC

In [178]:
clfsvm_lin=LinearSVC(loss="hinge")
Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0] # try fewer if doesnt finish up
Xmatrix=df[lcols].values
Yresp=df['RESP'].values

In [179]:
Xmatrix_train=Xmatrix[mask]
Xmatrix_test=Xmatrix[~mask]
Yresp_train=Yresp[mask]
Yresp_test=Yresp[~mask]

In [180]:
from sklearn.grid_search import GridSearchCV
gs=GridSearchCV(clfsvm_lin, param_grid={'C':Cs}, cv=5)
gs.fit(Xmatrix_train, Yresp_train)
print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_

BEST {'C': 1.0} 0.610535661438 [mean: 0.58081, std: 0.05072, params: {'C': 0.001}, mean: 0.59193, std: 0.08620, params: {'C': 0.01}, mean: 0.59329, std: 0.05580, params: {'C': 0.1}, mean: 0.61054, std: 0.00925, params: {'C': 1.0}, mean: 0.54839, std: 0.07411, params: {'C': 10.0}, mean: 0.49732, std: 0.04691, params: {'C': 100.0}]


In [181]:
best = gs.best_estimator_
best.fit(Xmatrix_train, Yresp_train)
best.score(Xmatrix_test, Yresp_test)

0.62020321594160011

In [187]:
clfsvm_orig = SVC()
gs_orig = GridSearchCV(clfsvm_orig, param_grid={'C':Cs}, cv=5)
gs_orig.fit(Xmatrix_train, Yresp_train)
print "BEST", gs_orig.best_params_, gs_orig.best_score_, gs_orig.grid_scores_

BEST {'C': 1.0} 0.599416564495 [mean: 0.59870, std: 0.00005, params: {'C': 0.001}, mean: 0.59870, std: 0.00005, params: {'C': 0.01}, mean: 0.59870, std: 0.00005, params: {'C': 0.1}, mean: 0.59942, std: 0.00175, params: {'C': 1.0}, mean: 0.59646, std: 0.00266, params: {'C': 10.0}, mean: 0.59646, std: 0.00238, params: {'C': 100.0}]


In [188]:
best_orig = gs_orig.best_estimator_
best_orig.fit(Xmatrix_train, Yresp_train)
best_orig.score(Xmatrix_test, Yresp_test)

0.58932623064022882

In [220]:
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best

In [221]:
from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.6f" % (training_accuracy)
    print "Accuracy on test data:     %0.6f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [234]:
# create prior pass pct for each team lagged by year
def dist1(y):
    if y < 4:
        return 'short'
    elif y < 8:
        return 'medium'
    elif y < 12:
        return 'long'
    else:
        return 'super long'
df['dist_to_1st'] = [dist1(y) for y in df.yds_to_go.values]
df['tmdowndist_to_1st'] = [str(t) + str(d) + str(d1) for t,d,d1 in zip(df['tm'],df['down'],df['dist_to_1st'])]
res = (df.groupby('tmdowndist_to_1st')['RESP'].mean())
df['tm_priorPass'] = [res[t] for t in df['tmdowndist_to_1st'].values]

In [235]:
from sklearn.linear_model import LogisticRegression
clflog,_,_,_,_  = do_classify(LogisticRegression(penalty="l1"), {"C": [0.001, 0.01, 0.1, 1, 10, 100]}, df, lcols, u'RESP', 1, mask=mask)

using mask
BEST {'C': 1} 0.669851604448 [mean: 0.59595, std: 0.00512, params: {'C': 0.001}, mean: 0.66529, std: 0.00719, params: {'C': 0.01}, mean: 0.66951, std: 0.00958, params: {'C': 0.1}, mean: 0.66985, std: 0.00946, params: {'C': 1}, mean: 0.66930, std: 0.01009, params: {'C': 10}, mean: 0.66934, std: 0.00971, params: {'C': 100}]
############# based on standard predict ################
Accuracy on training data: 0.670824
Accuracy on test data:     0.669626
[[2192 1939]
 [1410 4596]]
########################################################




In [None]:
def cv_optimize2(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best

In [None]:
def do_classify2(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5, n_jobs=1):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize2(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [None]:
#Random forest classifier
from sklearn.ensemble import RandomForestClassifier

clfForest = RandomForestClassifier()

parameters = {"n_estimators": range(1,100)}
clfForest, Xtrain, ytrain, Xtest, ytest = do_classify2(clfForest, parameters, df, lcols, 'RESP', 1, mask=mask, score_func='f1', n_jobs=4) # could add njobs to the mix to run them in parallel

In [241]:
range(1,19)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]