In [1]:
import numpy as np 
import pandas as pd 

from scipy import stats
import math

from string import punctuation
import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

pd.set_option('max_columns', 200)
pd.set_option('max_rows', 80)

In [13]:
def cv_score(df_train, y_train, kfolds, pipeline, imp_coef=False):
    oof = np.zeros(len(df_train))
    train = df_train.copy()
    
    feat_df = pd.DataFrame()
    
    for n_fold, (train_index, test_index) in enumerate(kfolds.split(train.values)):
            
        trn_data = train.iloc[train_index][:]
        val_data = train.iloc[test_index][:]
        
        trn_target = y_train.iloc[train_index].values.ravel()
        val_target = y_train.iloc[test_index].values.ravel()
        
        pipeline.fit(trn_data, trn_target)

        oof[test_index] = pipeline.predict(val_data).ravel()

        if imp_coef:
            try:
                fold_df = get_coef(pipeline)
            except AttributeError:
                fold_df = get_feature_importance(pipeline)
                
            fold_df['fold'] = n_fold + 1
            feat_df = pd.concat([feat_df, fold_df], axis=0)
       
    if imp_coef:
        feat_df = feat_df.groupby('feat')['score'].agg(['mean', 'std'])
        feat_df['abs_sco'] = (abs(feat_df['mean']))
        feat_df = feat_df.sort_values(by=['abs_sco'],ascending=False)
        del feat_df['abs_sco']
        return oof, feat_df
    else:    
        return oof
    

def get_coef(pipe):
    imp = pipe.steps[-1][1].coef_.tolist()
    feats = pipe.steps[-2][1].get_feature_names()
    result = pd.DataFrame({'feat':feats,'score':imp})
    result['abs_res'] = abs(result['score'])
    result = result.sort_values(by=['abs_res'],ascending=False)
    del result['abs_res']
    return result


def get_feature_importance(pipe):
    imp = pipe.steps[-1][1].feature_importances_.tolist() #it's a pipeline
    feats = pipe.steps[-2][1].get_feature_names()
    result = pd.DataFrame({'feat':feats,'score':imp})
    result = result.sort_values(by=['score'],ascending=False)
    return result

In [2]:
def defense_formation(l):
    dl = 0
    lb = 0
    db = 0
    other = 0

    for position in l:
        sub_string = position.split(' ')
        if sub_string[1] == 'DL':
            dl += int(sub_string[0])
        elif sub_string[1] in ['LB','OL']:
            lb += int(sub_string[0])
        else:
            db += int(sub_string[0])

    counts = (dl,lb,db,other)

    return counts


def offense_formation(l):
    qb = 0
    rb = 0
    wr = 0
    te = 0
    ol = 0

    sub_total = 0
    qb_listed = False
    for position in l:
        sub_string = position.split(' ')
        pos = sub_string[1]
        cnt = int(sub_string[0])

        if pos == 'QB':
            qb += cnt
            sub_total += cnt
            qb_listed = True
        # Assuming LB is a line backer lined up as full back
        elif pos in ['RB','LB']:
            rb += cnt
            sub_total += cnt
        # Assuming DB is a defensive back and lined up as WR
        elif pos in ['WR','DB']:
            wr += cnt
            sub_total += cnt
        elif pos == 'TE':
            te += cnt
            sub_total += cnt
        # Assuming DL is a defensive lineman lined up as an additional line man
        else:
            ol += cnt
            sub_total += cnt

    # If not all 11 players were noted at given positions we need to make some assumptions
    # I will assume if a QB is not listed then there was 1 QB on the play
    # If a QB is listed then I'm going to assume the rest of the positions are at OL
    # This might be flawed but it looks like RB, TE and WR are always listed in the personnel
    if sub_total < 11:
        diff = 11 - sub_total
        if not qb_listed:
            qb += 1
            diff -= 1
        ol += diff

    counts = (qb,rb,wr,te,ol)

    return counts

In [3]:
class transformation(TransformerMixin, BaseEstimator):
    def __init__(self, mean_weight=10):
        self.columns = None
        self.mean_weight = mean_weight
        self.smooth_team = {}
    
    
    def smooth_te(self, data, target, col):
        tmp_data = data.copy()
        tmp_data['target'] = target
        mean_tot = tmp_data['target'].mean()
        means = tmp_data.groupby(col)['target'].mean()
        counts = tmp_data.groupby(col)['target'].count()

        smooth = ((counts * means + self.mean_weight * mean_tot) / 
                       (counts + self.mean_weight))
        return mean_tot, smooth

        
    def fit(self, X, y=None):
        return self
    
    
    def stats_by_play(self, data):
        avg_by_play = data.groupby(['PlayId', 
                                    'Team', 
                                    'offense_team'], as_index=False)[['PlayerHeight', 
                                                                      'PlayerWeight',
                                                                      'age',
                                                                      'S', 'A']].mean()
        spread = data.groupby(['PlayId', 
                               'Team', 
                               'offense_team'])[['X', 'Y']].std().reset_index()
        tot_momentum = data.groupby(['PlayId', 
                                     'Team', 
                                     'offense_team'], as_index=False)[['X_speed', 'Y_speed',
                                                                       'PlayerWeight',
                                                                       'X_acceleration','Y_acceleration']].sum()
        
        tot_momentum['x_momentum'] = tot_momentum['X_speed'] * tot_momentum['PlayerWeight']
        tot_momentum['y_momentum'] = tot_momentum['Y_speed'] * tot_momentum['PlayerWeight']
        tot_momentum['x_force'] = tot_momentum['X_acceleration'] * tot_momentum['PlayerWeight']
        tot_momentum['y_force'] = tot_momentum['Y_acceleration'] * tot_momentum['PlayerWeight']
        tot_momentum.drop(['X_speed', 'Y_speed',
                           'PlayerWeight',  
                           'X_acceleration','Y_acceleration'], axis=1, inplace=True)

        avg_by_play = pd.merge(avg_by_play, tot_momentum, on=['PlayId', 'Team', 'offense_team'])
        avg_by_play = pd.merge(avg_by_play, spread, on=['PlayId', 'Team', 'offense_team'])

        poss_team = avg_by_play[avg_by_play.Team == avg_by_play.offense_team].copy()
        def_team = avg_by_play[avg_by_play.Team != avg_by_play.offense_team].copy()

        poss_team.rename(columns={'PlayerHeight': 'poss_avg_height', 
                                  'PlayerWeight': 'poss_avg_weight',
                                  'age': 'poss_avg_age',
                                  'X': 'poss_std_X',
                                  'Y': 'poss_std_Y',
                                  'S': 'poss_avg_S', 
                                  'A': 'poss_avg_A', 
                                  'x_momentum': 'poss_x_momentum', 
                                  'y_momentum': 'poss_y_momentum', 
                                  'x_force': 'poss_x_force', 
                                  'y_force': 'poss_y_force'}, inplace=True)
        def_team.rename(columns={'PlayerHeight': 'def_avg_height', 
                                  'PlayerWeight': 'def_avg_weight', 
                                  'age': 'def_avg_age',
                                  'X': 'def_std_X',
                                  'Y': 'def_std_Y',
                                  'S': 'def_avg_S', 
                                  'A': 'def_avg_A',
                                  'x_momentum': 'def_x_momentum', 
                                  'y_momentum': 'def_y_momentum', 
                                  'x_force': 'def_x_force', 
                                  'y_force': 'def_y_force'}, inplace=True)

        avg_by_play = pd.merge(poss_team.drop('Team', axis=1), 
                               def_team.drop('Team', axis=1), on=['PlayId', 'offense_team'])
        
        avg_by_play['tot_x_momenumt'] = avg_by_play['poss_x_momentum'] - avg_by_play['def_x_momentum']
        avg_by_play['tot_x_force'] = avg_by_play['poss_x_force'] - avg_by_play['def_x_force']
        avg_by_play['height_diff'] = avg_by_play['poss_avg_height'] - avg_by_play['def_avg_height']
        avg_by_play['weight_diff'] = avg_by_play['poss_avg_weight'] - avg_by_play['def_avg_weight']
        avg_by_play['age_diff'] = avg_by_play['poss_avg_age'] - avg_by_play['def_avg_age']
        avg_by_play['X_diff'] = avg_by_play['poss_std_X'] - avg_by_play['def_std_X']
        avg_by_play['Y_diff'] = avg_by_play['poss_std_Y'] - avg_by_play['def_std_Y']

        return avg_by_play
    
    
    def personnel_features(self, X):
        personnel = X[['PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()
        personnel['DefensePersonnel'] = personnel['DefensePersonnel'].str.split(', ')
        personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: defense_formation(x))
        personnel['num_DL'] = personnel['DefensePersonnel'].apply(lambda x: x[0])
        personnel['num_LB'] = personnel['DefensePersonnel'].apply(lambda x: x[1])
        personnel['num_DB'] = personnel['DefensePersonnel'].apply(lambda x: x[2])

        personnel['OffensePersonnel'] = personnel['OffensePersonnel'].str.split(', ')
        personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: offense_formation(x))
        personnel['num_QB'] = personnel['OffensePersonnel'].apply(lambda x: x[0])
        personnel['num_RB'] = personnel['OffensePersonnel'].apply(lambda x: x[1])
        personnel['num_WR'] = personnel['OffensePersonnel'].apply(lambda x: x[2])
        personnel['num_TE'] = personnel['OffensePersonnel'].apply(lambda x: x[3])
        personnel['num_OL'] = personnel['OffensePersonnel'].apply(lambda x: x[4])

        # Let's create some features to specify if the OL is covered
        personnel['OL_diff'] = personnel['num_OL'] - personnel['num_DL']
        personnel['OL_TE_diff'] = (personnel['num_OL'] + personnel['num_TE']) - personnel['num_DL']
        # Let's create a feature to specify if the defense is preventing the run
        # Let's just assume 7 or more DL and LB is run prevention
        personnel['run_def'] = (personnel['num_DL'] + personnel['num_LB'] > 6).astype(int)

        personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1, inplace=True)

        return personnel
    
    
    def process_play(self, X):
        cols_by_play = ['GameId', 'PlayId', 'YardLine', 
                'Quarter', 'GameClock', 'Down', 'Distance',
                'OffenseFormation', 'DefendersInTheBox',  
                'Yards','Location', 'StadiumType', 'Turf', 
                'GameWeather','Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 
                'PlayDirection', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay']
        train_play = X[cols_by_play].drop_duplicates()
        avg_by_play = self.stats_by_play(X)
        personnel = self.personnel_features(X)
        train_play = pd.merge(train_play, avg_by_play.drop('offense_team', axis=1), on=['PlayId'])
        train_play = pd.merge(train_play, personnel, on=['PlayId'])

        return train_play
    
    
    def transform(self, X, y=None):
        train_play = self.process_play(X)
        carriers = X[X.has_ball].copy()

        to_drop = ['GameId', 'NflId', 'Team', 'Orientation','YardLine', 'Quarter', 'GameClock', 'PossessionTeam',
           'Down', 'FieldPosition', 'HomeScoreBeforePlay',
           'VisitorScoreBeforePlay', 'NflIdRusher', 'OffensePersonnel','DefensePersonnel',
               'PlayDirection', 'Yards', 'Position', 'HomeTeamAbbr',
           'VisitorTeamAbbr', 'Location', 'StadiumType', 'GameWeather',
           'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'to_left',
           'has_ball', 'offense_team', 'Distance',
           'OffenseFormation', 'DefendersInTheBox', 'Turf']

        carriers.drop(to_drop, axis=1, inplace=True)

        full_train = pd.merge(carriers, train_play, on='PlayId')

        full_train.drop(['GameId', 'WindDirection', 'WindSpeed', 'GameWeather', 
                         'PlayDirection', 'StadiumType', 'Turf', 'Location', 'GameClock'], axis=1, inplace=True)
        
        self.columns = full_train.columns

        return full_train
    
    
    def get_features_name(self):
        return self.columns

In [4]:
class df_imputer(TransformerMixin, BaseEstimator):
    '''
    Just a wrapper for the SimpleImputer that keeps the dataframe structure
    '''
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        self.imp = SimpleImputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Ximp = self.imp.transform(X)
        Xfilled = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        return Xfilled

    
class df_scaler(TransformerMixin, BaseEstimator):
    '''
    Wrapper of StandardScaler or RobustScaler
    '''
    def __init__(self, method='standard'):
        self.scl = None
        self.scale_ = None
        self.method = method
        if self.method == 'sdandard':
            self.mean_ = None
        elif method == 'robust':
            self.center_ = None
        self.columns = None  # this is useful when it is the last step of a pipeline before the model

    def fit(self, X, y=None):
        if self.method == 'standard':
            self.scl = StandardScaler()
            self.scl.fit(X)
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        elif self.method == 'robust':
            self.scl = RobustScaler()
            self.scl.fit(X)
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xscl = self.scl.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        self.columns = X.columns
        return Xscaled

    def get_feature_names(self):
        return list(self.columns)


class dummify(TransformerMixin, BaseEstimator):
    '''
    Wrapper for get dummies
    Via match_cols, it is possible to ask the transformer to make sure that all the dummies are there
    Missing dummies are introduced with a column of 0's
    Extra dummies are dropped
    '''
    def __init__(self, drop_first=False, match_cols=True):
        self.drop_first = drop_first
        self.columns = []  # useful to well behave with FeatureUnion
        self.match_cols = match_cols

    def fit(self, X, y=None):
        return self
    
    def match_columns(self, X):
        miss_train = list(set(X.columns) - set(self.columns))
        miss_test = list(set(self.columns) - set(X.columns))
        
        err = 0
        
        if len(miss_test) > 0:
            for col in miss_test:
                X[col] = 0  # insert a column for the missing dummy
                err += 1
        if len(miss_train) > 0:
            for col in miss_train:
                del X[col]  # delete the column of the extra dummy
                err += 1
                
        if err > 0:
            warnings.warn('The dummies in this set do not match the ones in the train set, we corrected the issue.',
                         UserWarning)
        return X
        
    def transform(self, X):
        X = pd.get_dummies(X, drop_first=self.drop_first)
        if (len(self.columns) > 0):
            if self.match_cols:
                X = self.match_columns(X)
        else:
            self.columns = X.columns
        return X
    
    def get_features_name(self):
        return list(self.columns)

In [8]:
def create_targets(data):
    unique_plays = data[['PlayId', 'Yards']].drop_duplicates()
    simple = unique_plays['Yards'].reset_index(drop=True)
    # As total distance of the rusher
    rushers = data[data.has_ball].copy().reset_index(drop=True)
    tot_dist = rushers['Yards'] + rushers['from_yardline']
    # As percentage of Yards remaining to be gained
    unique_plays = data[['PlayId', 'YardLine', 'Yards']].drop_duplicates().reset_index(drop=True)
    perc_gained = unique_plays['Yards'] / (110 - unique_plays['YardLine'])
    # As both
    perc_dist = tot_dist / (110 - rushers['X'])
    
    return simple, tot_dist, perc_gained, perc_dist

In [5]:
df_train = pd.read_csv('../data_processed/train_processed.csv', dtype={'WindSpeed': 'object'})

df_train.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,Yards,PlayerHeight,PlayerWeight,Position,HomeTeamAbbr,VisitorTeamAbbr,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,to_left,has_ball,offense_team,from_yardline,X_speed,Y_speed,X_acceleration,Y_acceleration,age,distance_from_ball,closest_opponent,opponents_in_6,teammates_in_6
0,2017090700,20170907000118,away,46.09,18.493333,1.69,1.13,0.4,81.99,1.620015,496723,45,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,72,212,SS,NE,KC,"Foxborough, MA",Outdoor,0,Clear and warm,63.0,77.0,8,SW,True,False,home,1.09,1.687953,-0.083145,1.128632,-0.055594,10480,6.480872,4.59331,3.0,7.0
1,2017090700,20170907000118,away,45.33,20.693333,0.42,1.35,0.01,27.61,1.24442,2495116,45,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,75,288,DE,NE,KC,"Foxborough, MA",Outdoor,0,Clear and warm,63.0,77.0,8,SW,True,False,home,0.33,0.397828,0.134657,1.278734,0.432828,10394,4.59331,4.59331,3.0,7.0
2,2017090700,20170907000118,away,46.0,20.133333,1.22,0.59,0.31,3.01,1.174083,2495493,45,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,75,270,DE,NE,KC,"Foxborough, MA",Outdoor,0,Clear and warm,63.0,77.0,8,SW,True,False,home,1.0,1.12525,0.471395,0.544178,0.22797,10457,5.448982,4.59331,3.0,7.0
3,2017090700,20170907000118,away,48.54,25.633333,0.42,0.54,0.02,359.77,2.868623,2506353,45,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,75,245,ILB,NE,KC,"Foxborough, MA",Outdoor,0,Clear and warm,63.0,77.0,8,SW,True,False,home,3.54,0.113229,-0.404449,0.14558,-0.520006,12709,7.820038,4.59331,3.0,7.0
4,2017090700,20170907000118,away,50.68,17.913333,1.82,2.43,0.16,12.63,1.844638,2530794,45,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,72,206,FS,NE,KC,"Foxborough, MA",Outdoor,0,Clear and warm,63.0,77.0,8,SW,True,False,home,5.68,1.752185,-0.492187,2.339456,-0.657151,10980,10.622476,4.59331,3.0,7.0


In [31]:
folds = KFold(5, shuffle=True, random_state=541)

plays = df_train[df_train.has_ball][['PlayId', 
                                     'YardLine', 
                                     'from_yardline', 
                                     'X']].drop_duplicates().reset_index(drop=True)


y1, y2, y3, y4 = create_targets(df_train)

In [32]:
transf_pipe = Pipeline([('trsf', transformation()), 
                        ('dummifier', dummify(drop_first=True)),
                        ('Imputer', df_imputer()),
                        ('scl', df_scaler(method='standard'))])

full_train = transf_pipe.fit_transform(df_train)

In [34]:
def get_crps(train, inf_pred, y):
    y_pred = np.zeros((len(train),199))
    y_true = np.zeros((len(train),199))

    for i,p in enumerate(inf_preds):
        p += 99
        for j in range(199):
            if j >= p + 5:
                y_pred[i][j] = 1.0
            elif j >= p - 5:
                y_pred[i][j] = (j + 5 - p) * 0.05

    for i,p in enumerate(y):
        p += 99
        for j in range(199):
            if j >= p:
                y_true[i][j]=1.0
                
    return np.sum(np.power(y_pred - y_true, 2)) / (199 * (len(train)))

In [36]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('forest', RandomForestRegressor(n_estimators=400, n_jobs=-1)), 
          ('xtree', ExtraTreesRegressor(n_estimators=400, n_jobs=-1)), 
          ('svr', SVR(gamma='auto')),
          ('xgb', xgb.XGBRegressor(n_estimators=400, objective='reg:squarederror')), 
          ('lgb', lgb.LGBMRegressor(n_estimators=400))]

mod_name = []
rmse_train_simple = []
mae_train_simple = []
crps_train_simple = []

rmse_train_tot = []
mae_train_tot = []
crps_train_tot = []

mae_train_prcgained = []
rmse_train_prcgained = []
crps_train_prcgained = []

rmse_train_prctot = []
mae_train_prctot = []
crps_train_prctot = []

for model in models:
    
    train = full_train.copy()
    mod_play = plays.copy()
    del train['PlayId']
    print(model[0])
    mod_name.append(model[0])
    
    model_pipe = Pipeline([model])
            
    inf_preds = cv_score(train, y1, folds, model_pipe)
    
    mod_play['simple'] = inf_preds
    mod_play.loc[mod_play.simple > (110 - mod_play.YardLine), 'simple'] = (110 - mod_play.YardLine)
    inf_preds = mod_play['simple']
    
    avg_preds = inf_preds / 4
    
    crps = get_crps(train, inf_preds, y1)

    rmse_train_simple.append(np.sqrt(mean_squared_error(y1, inf_preds)))
    mae_train_simple.append(mean_absolute_error(y1, inf_preds))
    crps_train_simple.append(crps)
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y1, inf_preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(y1, inf_preds), 4)}')
    print(f'\tTrain set CRPS: {round(crps, 4)}')
    
    print('- -'*20)
    
    inf_preds = cv_score(train, y2, folds, model_pipe)
    
    mod_play['total'] = inf_preds
    mod_play['total'] = mod_play['total'] - mod_play['from_yardline']
    inf_preds = mod_play['total']
    
    avg_preds += inf_preds / 4
    
    crps = get_crps(train, inf_preds, y1)
    
    rmse_train_tot.append(np.sqrt(mean_squared_error(y1, inf_preds)))
    mae_train_tot.append(mean_absolute_error(y1, inf_preds))
    crps_train_tot.append(crps)
    
    print(f'\tTrain set RMSE full distance: {round(np.sqrt(mean_squared_error(y1, inf_preds)), 4)}')
    print(f'\tTrain set MAE full distance: {round(mean_absolute_error(y1, inf_preds), 4)}')
    print(f'\tTrain set CRPS: {round(crps, 4)}')
    
    print('- -'*20)
    
    inf_preds = cv_score(train, y3, folds, model_pipe)
    
    mod_play['per_gain'] = inf_preds
    mod_play['per_gain'] = mod_play['per_gain'] * (110 - mod_play['YardLine'])
    inf_preds = mod_play['per_gain']
    
    avg_preds += inf_preds / 4
    
    crps = get_crps(train, inf_preds, y1)
    
    rmse_train_prcgained.append(np.sqrt(mean_squared_error(y1, inf_preds)))
    mae_train_prcgained.append(mean_absolute_error(y1, inf_preds))
    crps_train_prcgained.append(crps)
    
    print(f'\tTrain set RMSE percentage gained: {round(np.sqrt(mean_squared_error(y1, inf_preds)), 4)}')
    print(f'\tTrain set MAE percentage gained: {round(mean_absolute_error(y1, inf_preds), 4)}')
    print(f'\tTrain set CRPS: {round(crps, 4)}')
    
    print('- -'*20)
    
    inf_preds = cv_score(train, y4, folds, model_pipe)
    
    mod_play['per_tot'] = inf_preds
    mod_play['per_tot'] = mod_play['per_tot'] * (110 - mod_play['X']) - mod_play['from_yardline']
    inf_preds = mod_play['per_tot']
    
    avg_preds += inf_preds / 4
    
    crps = get_crps(train, inf_preds, y1)
    
    rmse_train_prctot.append(np.sqrt(mean_squared_error(y1, inf_preds)))
    mae_train_prctot.append(mean_absolute_error(y1, inf_preds))
    crps_train_prctot.append(crps)
    
    print(f'\tTrain set RMSE full distance percentage: {round(np.sqrt(mean_squared_error(y1, inf_preds)), 4)}')
    print(f'\tTrain set MAE full distance percentage: {round(mean_absolute_error(y1, inf_preds), 4)}')
    print(f'\tTrain set CRPS: {round(crps, 4)}')
    
    print('- -'*20)
    
    print(f'\tTrain set RMSE average: {round(np.sqrt(mean_squared_error(y1, avg_preds)), 4)}')
    print(f'\tTrain set MAE average: {round(mean_absolute_error(y1, avg_preds), 4)}')
    print(f'\tTrain set CRPS: {round(crps, 4)}')

    print('_'*60)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train_simple, 
                        'mae_train': mae_train_simple, 
                        'crps_train': crps_train_simple, 
                        'rmse_tot_dist': rmse_train_tot, 
                        'mae_tot_dist': mae_train_tot, 
                        'crps_tot_dist': crps_train_tot,
                        'rmse_perc_gained': rmse_train_prcgained, 
                        'mae_perc_gained': mae_train_prcgained, 
                        'crps_perc_gained': crps_train_prcgained,
                        'rmse_perc_tot': rmse_train_prctot, 
                        'mae_perc_tot': mae_train_prctot, 
                        'crps_perc_tot': crps_train_prctot,})

results

lasso
	Train set RMSE: 0.01
	Train set MAE: 0.006
	Train set CRPS: 0.0119
- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
	Train set RMSE full distance: 0.0142
	Train set MAE full distance: 0.0099
	Train set CRPS: 0.0118
- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
	Train set RMSE percentage gained: 3.2889
	Train set MAE percentage gained: 2.1693
	Train set CRPS: 0.0147
- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
	Train set RMSE full distance percentage: 4.1143
	Train set MAE full distance percentage: 3.0949
	Train set CRPS: 0.0172
- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
	Train set RMSE average: 1.7714
	Train set MAE average: 1.2861
	Train set CRPS: 0.0172
____________________________________________________________


ridge
	Train set RMSE: 0.0004
	Train set MAE: 0.0002
	Train set CRPS: 0.0119
- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
	Train set RMSE full distance: 0.0004
	Train set MAE full distance:

Unnamed: 0,model_name,rmse_train,mae_train,crps_train,rmse_tot_dist,mae_tot_dist,crps_tot_dist,rmse_perc_gained,mae_perc_gained,crps_perc_gained,rmse_perc_tot,mae_perc_tot,crps_perc_tot
0,lasso,0.010018,0.005963,0.011935,0.01416,0.009944,0.011834,3.288887,2.169294,0.014721,4.114261,3.094865,0.017205
1,ridge,0.000364,0.000212,0.011905,0.000381,0.00023,0.01189,4.176989,2.691543,0.015673,4.807894,3.420485,0.01806
2,sgd,0.000717,0.000454,0.011841,0.000791,0.0005,0.011839,4.384909,2.94366,0.016705,4.889113,3.441565,0.018365
3,forest,0.231499,0.005598,0.011134,0.268746,0.027802,0.011731,0.508991,0.077852,0.01173,0.578537,0.191168,0.011861
4,xtree,0.145241,0.003464,0.011142,0.204849,0.0259,0.011768,0.321685,0.048881,0.011749,0.392959,0.146135,0.011827
5,svr,2.383352,0.628349,0.012401,2.40723,0.641232,0.012432,3.597162,2.61531,0.015258,3.69759,2.709276,0.016603
6,xgb,0.093867,0.002745,0.01187,0.134865,0.041887,0.011762,0.453332,0.256643,0.011869,0.439346,0.256732,0.011853
7,lgb,0.371074,0.019572,0.011804,0.378079,0.058841,0.011786,0.412972,0.136068,0.011816,0.402993,0.158206,0.011822


In [37]:
results.corr()

Unnamed: 0,rmse_train,mae_train,crps_train,rmse_tot_dist,mae_tot_dist,crps_tot_dist,rmse_perc_gained,mae_perc_gained,crps_perc_gained,rmse_perc_tot,mae_perc_tot,crps_perc_tot
rmse_train,1.0,0.990554,0.557889,0.999622,0.995753,0.938885,0.192217,0.254162,0.180016,0.117894,0.132937,0.122635
mae_train,0.990554,1.0,0.617099,0.988902,0.997345,0.972399,0.301282,0.363083,0.287729,0.229822,0.246267,0.234885
crps_train,0.557889,0.617099,1.0,0.540492,0.601581,0.721137,0.631297,0.680558,0.619059,0.595286,0.620666,0.604315
rmse_tot_dist,0.999622,0.988902,0.540492,1.0,0.995168,0.932662,0.171813,0.234125,0.159919,0.097256,0.112362,0.102019
mae_tot_dist,0.995753,0.997345,0.601581,0.995168,1.0,0.956244,0.237188,0.301288,0.224816,0.163919,0.181157,0.16935
crps_tot_dist,0.938885,0.972399,0.721137,0.932662,0.956244,1.0,0.497916,0.553314,0.479786,0.43261,0.448137,0.437413
rmse_perc_gained,0.192217,0.301282,0.631297,0.171813,0.237188,0.497916,1.0,0.996588,0.992803,0.995043,0.992503,0.995036
mae_perc_gained,0.254162,0.363083,0.680558,0.234125,0.301288,0.553314,0.996588,1.0,0.991342,0.986576,0.98676,0.988055
crps_perc_gained,0.180016,0.287729,0.619059,0.159919,0.224816,0.479786,0.992803,0.991342,1.0,0.983303,0.978498,0.985309
rmse_perc_tot,0.117894,0.229822,0.595286,0.097256,0.163919,0.43261,0.995043,0.986576,0.983303,1.0,0.998566,0.999575


In [38]:
results.sort_values(by='rmse_train')

Unnamed: 0,model_name,rmse_train,mae_train,crps_train,rmse_tot_dist,mae_tot_dist,crps_tot_dist,rmse_perc_gained,mae_perc_gained,crps_perc_gained,rmse_perc_tot,mae_perc_tot,crps_perc_tot
1,ridge,0.000364,0.000212,0.011905,0.000381,0.00023,0.01189,4.176989,2.691543,0.015673,4.807894,3.420485,0.01806
2,sgd,0.000717,0.000454,0.011841,0.000791,0.0005,0.011839,4.384909,2.94366,0.016705,4.889113,3.441565,0.018365
0,lasso,0.010018,0.005963,0.011935,0.01416,0.009944,0.011834,3.288887,2.169294,0.014721,4.114261,3.094865,0.017205
6,xgb,0.093867,0.002745,0.01187,0.134865,0.041887,0.011762,0.453332,0.256643,0.011869,0.439346,0.256732,0.011853
4,xtree,0.145241,0.003464,0.011142,0.204849,0.0259,0.011768,0.321685,0.048881,0.011749,0.392959,0.146135,0.011827
3,forest,0.231499,0.005598,0.011134,0.268746,0.027802,0.011731,0.508991,0.077852,0.01173,0.578537,0.191168,0.011861
7,lgb,0.371074,0.019572,0.011804,0.378079,0.058841,0.011786,0.412972,0.136068,0.011816,0.402993,0.158206,0.011822
5,svr,2.383352,0.628349,0.012401,2.40723,0.641232,0.012432,3.597162,2.61531,0.015258,3.69759,2.709276,0.016603


In [23]:
results.sort_values(by='mae_train')

Unnamed: 0,model_name,rmse_train,mae_train,crps_train,rmse_tot_dist,mae_tot_dist,rmse_perc_gained,mae_perc_gained,rmse_perc_tot,mae_perc_tot
1,ridge,0.000364,0.000213,0.01782,0.000381,0.00023,0.19123,0.086287,0.108813,0.074956
2,sgd,0.000718,0.000432,0.017806,0.000898,0.000578,0.194437,0.090137,0.110326,0.076666
6,xgb,0.095897,0.003226,0.017798,0.134865,0.041887,0.021575,0.008637,0.012777,0.006363
4,xtree,0.149173,0.003776,0.017054,0.19776,0.02555,0.024771,0.002428,0.011971,0.003888
0,lasso,0.010032,0.006026,0.017859,0.01416,0.009944,0.192432,0.080831,0.112837,0.074184
3,forest,0.231953,0.006344,0.017048,0.270173,0.027919,0.029629,0.003671,0.015328,0.005301
7,lgb,0.375767,0.020782,0.017733,0.378079,0.058841,0.027702,0.005049,0.010414,0.00393
5,svr,2.385005,0.632088,0.018692,2.40723,0.641232,0.165842,0.07533,0.07856,0.055893


In [25]:
results.sort_values(by='crps_train')

Unnamed: 0,model_name,rmse_train,mae_train,crps_train,rmse_tot_dist,mae_tot_dist,rmse_perc_gained,mae_perc_gained,rmse_perc_tot,mae_perc_tot
3,forest,0.231953,0.006344,0.017048,0.270173,0.027919,0.029629,0.003671,0.015328,0.005301
4,xtree,0.149173,0.003776,0.017054,0.19776,0.02555,0.024771,0.002428,0.011971,0.003888
7,lgb,0.375767,0.020782,0.017733,0.378079,0.058841,0.027702,0.005049,0.010414,0.00393
6,xgb,0.095897,0.003226,0.017798,0.134865,0.041887,0.021575,0.008637,0.012777,0.006363
2,sgd,0.000718,0.000432,0.017806,0.000898,0.000578,0.194437,0.090137,0.110326,0.076666
1,ridge,0.000364,0.000213,0.01782,0.000381,0.00023,0.19123,0.086287,0.108813,0.074956
0,lasso,0.010032,0.006026,0.017859,0.01416,0.009944,0.192432,0.080831,0.112837,0.074184
5,svr,2.385005,0.632088,0.018692,2.40723,0.641232,0.165842,0.07533,0.07856,0.055893


In [26]:
results.sort_values(by='rmse_tot_dist')

Unnamed: 0,model_name,rmse_train,mae_train,crps_train,rmse_tot_dist,mae_tot_dist,rmse_perc_gained,mae_perc_gained,rmse_perc_tot,mae_perc_tot
1,ridge,0.000364,0.000213,0.01782,0.000381,0.00023,0.19123,0.086287,0.108813,0.074956
2,sgd,0.000718,0.000432,0.017806,0.000898,0.000578,0.194437,0.090137,0.110326,0.076666
0,lasso,0.010032,0.006026,0.017859,0.01416,0.009944,0.192432,0.080831,0.112837,0.074184
6,xgb,0.095897,0.003226,0.017798,0.134865,0.041887,0.021575,0.008637,0.012777,0.006363
4,xtree,0.149173,0.003776,0.017054,0.19776,0.02555,0.024771,0.002428,0.011971,0.003888
3,forest,0.231953,0.006344,0.017048,0.270173,0.027919,0.029629,0.003671,0.015328,0.005301
7,lgb,0.375767,0.020782,0.017733,0.378079,0.058841,0.027702,0.005049,0.010414,0.00393
5,svr,2.385005,0.632088,0.018692,2.40723,0.641232,0.165842,0.07533,0.07856,0.055893
