In [40]:
from src.football_db import FootballDB
import numpy as np
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor as RFR
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

In [33]:
# function to perform K-fold Cross Validation
def DoKFold(model,X,y,k,random_state=123):

    # assign kfold function to kf
    kf = KFold(n_splits=k,shuffle=True,random_state=random_state)

    # create empty lists for the training and testing scores and MSE
    train_scores=[]
    test_scores=[]
    train_mse=[]
    test_mse=[]

    # use kfold to split into training and testing data
    for idxTrain,idxTest in kf.split(X):
        Xtrain = X.iloc[idxTrain, :]
        Xtest = X.iloc[idxTest, :]
        ytrain = y.iloc[idxTrain]
        ytest = y.iloc[idxTest]

        # fit the user-provided model
        model.fit(Xtrain,ytrain)

        # append model scores to their respective lists
        train_scores.append(model.score(Xtrain,ytrain))
        test_scores.append(model.score(Xtest, ytest))

        # Compute the mean squared errors
        #    and append them to their respective lists
        ytrain_pred = model.predict(Xtrain)
        ytest_pred = model.predict(Xtest)
        train_mse.append(np.mean((ytrain-ytrain_pred)**2))
        test_mse.append(np.mean((ytest-ytest_pred)**2))

    # return the lists
    return train_scores,test_scores,train_mse,test_mse


In [35]:
space = {'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

def objective(space):
    model = RFR(max_depth = space['max_depth'],
                max_features = space['max_features'],
                min_samples_leaf = space['min_samples_leaf'],
                min_samples_split = space['min_samples_split'],
                n_estimators = space['n_estimators'])
    
    accuracy = cross_val_score(model, X, y, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

# Pass Model

In [36]:
pass_df = FootballDB().get_tPass()
pass_df = pass_df[['clock_min', 'clock_sec', 'quarter', 'down', 'distance', 'fieldpos', 'yards_gained']]
pass_df

Unnamed: 0,clock_min,clock_sec,quarter,down,distance,fieldpos,yards_gained
0,15,0,1,1,10,-20,-1
1,13,38,1,3,8,-22,11
2,12,25,1,3,9,-24,13
3,10,31,1,2,10,-34,11
4,10,3,1,3,6,-38,35
...,...,...,...,...,...,...,...
123584,6,7,4,3,4,-42,17
123585,5,45,4,1,10,47,50
123586,4,38,4,2,7,-28,15
123587,4,11,4,2,8,-40,10


In [37]:
X = pass_df.drop('yards_gained', axis=1)
y = pass_df['yards_gained']


In [38]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best


100%|██████████| 80/80 [2:27:02<00:00, 110.28s/trial, best loss: -0.035430487502412976]


{'max_depth': 400.0,
 'max_features': 3,
 'min_samples_leaf': 0.028120909623998293,
 'min_samples_split': 0.1244338442358785,
 'n_estimators': 4}

In [44]:
# establish random forest regressor with optimal hyperparameters
rfr = RFR(random_state=440, 
          n_estimators=200, 
          min_samples_split=10,
          min_samples_leaf=15,
          max_features=0.9,
          max_depth=60)

# perform k-fold cross validation
tr,te,tr_mse,te_mse = DoKFold(rfr,X,y,k=10)

# print average model results
print('Avg. Training R^2: ' + format(np.mean(tr),'.3f'))
print('Avg. Testing R^2: ' + format(np.mean(te), '.3f'))
print('Avg. Training MSE: ' + format(np.mean(tr_mse),'.3f'))
print('Avg. Testing MSE: ' + format(np.mean(te_mse), '.3f'))


Avg. Training R^2: 0.172
Avg. Testing R^2: 0.030
Avg. Training MSE: 191.984
Avg. Testing MSE: 224.996


# Rush Model

In [22]:
rush_df = FootballDB().get_tRush()
rush_df = rush_df[['clock_min', 'clock_sec', 'quarter', 'down', 'distance', 'fieldpos', 'yards_gained']]
rush_df

Unnamed: 0,clock_min,clock_sec,quarter,down,distance,fieldpos,yards_gained
0,14,14,1,2,11,-19,5
1,13,5,1,2,10,-23,1
2,11,58,1,2,10,-34,8
3,11,9,1,1,10,-20,17
4,9,26,1,1,10,33,10
...,...,...,...,...,...,...,...
138846,3,38,4,3,1,-47,12
138847,2,7,4,1,10,17,5
138848,1,48,4,1,10,11,9
138849,1,36,4,2,1,2,-5


In [10]:
FootballDB().get_tRunConcept()

Unnamed: 0,play_id,run_concept_1,run_concept_2,run_concept_3
0,685252,Power,,
1,685256,,Cutback,Base
2,685259,,Base,
3,685262,,Lead,
4,685266,,Lead,
...,...,...,...,...
138846,4990351,Pull Lead,Lead,
138847,4990354,Man,,
138848,4990364,Counter,,
138849,4990365,Undefined,,
