In [1]:
import inspect
import warnings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import xgboost as xgb
import lightgbm as lgb

# Display limits for wide/long frames. The fully-qualified option names are used
# because the short forms rely on pandas' pattern matching, which raises as soon
# as more than one registered option matches the pattern.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 80)

In [2]:
def cv_score(df_train, y_train, kfolds, pipeline, imp_coef=False):
    '''
    Out-of-fold predictions of a pipeline over the given folds.

    df_train : DataFrame of features; rows are selected by position
    y_train  : targets aligned row by row with df_train, 1-D or 2-D,
               numpy array or pandas object (it is converted to an array)
    kfolds   : splitter whose split(X) yields (train, test) positions
    pipeline : object with fit(X, y) and predict(X); it is refitted on every fold
    imp_coef : if True, also collect the coefficients (or, when the model has
               no coef_, the feature importances) of every fold

    Returns the out-of-fold predictions, with the same shape as y_train.
    When imp_coef is True it returns (oof, feat_df), where feat_df is indexed
    by feature and holds mean and std of the scores across folds, sorted by
    decreasing absolute mean.
    '''
    # positional indexing must work whatever the container / index of the target
    target = np.asarray(y_train)
    oof = np.zeros(target.shape)

    train = df_train.copy()

    fold_scores = []

    for n_fold, (train_index, test_index) in enumerate(kfolds.split(train.values)):
        trn_data = train.iloc[train_index]
        val_data = train.iloc[test_index]

        pipeline.fit(trn_data, target[train_index])

        oof[test_index] = pipeline.predict(val_data)

        if imp_coef:
            try:
                fold_df = get_coef(pipeline)
            except AttributeError:
                # the model exposes no coef_, fall back to feature_importances_
                fold_df = get_feature_importance(pipeline)

            fold_df['fold'] = n_fold + 1
            fold_scores.append(fold_df)

    if not imp_coef:
        return oof

    # one concat at the end instead of growing a frame inside the loop
    feat_df = pd.concat(fold_scores, axis=0)
    feat_df = feat_df.groupby('feat')['score'].agg(['mean', 'std'])
    feat_df['abs_sco'] = feat_df['mean'].abs()
    feat_df = feat_df.sort_values(by=['abs_sco'], ascending=False)
    del feat_df['abs_sco']
    return oof, feat_df

In [3]:
def get_coef(pipe):
    '''
    Coefficients of the last step of a pipeline, labelled with the feature
    names given by the step right before it.

    Returns a DataFrame with columns feat and score, sorted by decreasing
    absolute value of the score.
    '''
    model = pipe.steps[-1][1]
    namer = pipe.steps[-2][1]

    weights = model.coef_.tolist()
    names = namer.get_feature_names()

    table = pd.DataFrame({'feat': names, 'score': weights})
    # rank on the magnitude, then discard the helper column
    ranked = table.assign(abs_res=table['score'].abs()).sort_values(by=['abs_res'], ascending=False)
    return ranked.drop('abs_res', axis=1)


def get_feature_importance(pipe):
    '''
    Feature importances of the last step of a pipeline, labelled with the
    feature names given by the step right before it.

    Returns a DataFrame with columns feat and score, sorted by decreasing score.
    '''
    model = pipe.steps[-1][1]
    namer = pipe.steps[-2][1]

    importances = model.feature_importances_.tolist()
    names = namer.get_feature_names()

    table = pd.DataFrame({'feat': names, 'score': importances})
    return table.sort_values(by=['score'], ascending=False)

def _plot_diagonal(ax):
    xmin, xmax = ax.get_xlim()
    ymin, ymax = ax.get_ylim()
    low = min(xmin, xmax)
    high = max(xmin, xmax)
    scl = (high - low) / 100
    
    line = pd.DataFrame({'x': np.arange(low, high ,scl), # small hack for a diagonal line
                         'y': np.arange(low, high ,scl)})
    ax.plot(line.x, line.y, color='black', linestyle='--')
    
    return ax


def plot_predictions(data, true_label, pred_label, feature=None, hue=None, legend=False, savename='test.png'):
    '''
    Scatter the predictions (left panel) and the residuals (right panel)
    against the true label or, when feature is given, against that column of data.
    With a feature, the true label is drawn as well on the left panel.
    The figure is saved as ../plots/<savename> (a .png suffix is appended when
    missing) and then closed.
    '''
    frame = data.copy()
    frame['Prediction'] = pred_label
    frame['True Label'] = true_label
    frame['Residual'] = frame['True Label'] - frame['Prediction']

    fig, ax = plt.subplots(1, 2, figsize=(15, 6))

    against_truth = feature is None
    if against_truth:
        feature = 'True Label'
        pred_style = {'label': '', 'alpha': 0.7}
    else:
        # NOTE(review): the legend argument of the caller is overridden here
        legend = 'full'
        sns.scatterplot(x=feature, y='True Label', data=frame, ax=ax[0], label='True',
                        hue=hue, legend=legend, alpha=0.7)
        # predictions are drawn more transparent on top of the true values
        pred_style = {'label': 'Predicted', 'alpha': 0.4}

    sns.scatterplot(x=feature, y='Prediction', data=frame, ax=ax[0],
                    hue=hue, legend=legend, **pred_style)
    if against_truth:
        ax[0] = _plot_diagonal(ax[0])

    sns.scatterplot(x=feature, y='Residual', data=frame, ax=ax[1],
                    hue=hue, legend=legend, alpha=0.7)
    ax[1].axhline(y=0, color='r', linestyle='--')

    ax[0].set_title(f'{feature} vs Predictions')
    ax[1].set_title(f'{feature} vs Residuals')

    filename = savename if savename.endswith('.png') else savename + '.png'
    plt.savefig('../plots/' + filename)
    plt.close()

In [4]:
class df_imputer(TransformerMixin, BaseEstimator):
    '''
    SimpleImputer wrapper that gives back a DataFrame with the index and
    the columns of its input instead of a bare array

    strategy : passed as is to SimpleImputer
    '''
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        # a new imputer at every fit; its statistics are exposed per column
        self.imp = SimpleImputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # X has to be a DataFrame: its index and columns label the output
        return pd.DataFrame(self.imp.transform(X), index=X.index, columns=X.columns)

    
class df_scaler(TransformerMixin, BaseEstimator):
    '''
    Wrapper of StandardScaler or RobustScaler that keeps the DataFrame structure

    method : 'standard' for StandardScaler, 'robust' for RobustScaler

    After fit, scale_ (and mean_ or center_, depending on the method) are
    Series indexed by the columns of the fitted data.
    '''
    def __init__(self, method='standard'):
        self.scl = None
        self.scale_ = None
        self.method = method
        if self.method == 'standard':
            self.mean_ = None
        elif self.method == 'robust':
            self.center_ = None
        self.columns = None  # this is useful when it is the last step of a pipeline before the model

    def fit(self, X, y=None):
        if self.method == 'standard':
            self.scl = StandardScaler()
            self.scl.fit(X)
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        elif self.method == 'robust':
            self.scl = RobustScaler()
            self.scl.fit(X)
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        else:
            # without this, an unknown method would either fail on None or
            # silently reuse the scaler of a previous fit
            raise ValueError(f"method must be 'standard' or 'robust', got {self.method!r}")
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        # known as soon as the scaler is fitted, not only after a transform
        self.columns = X.columns
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xscl = self.scl.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        self.columns = X.columns
        return Xscaled

    def get_feature_names(self):
        '''Names of the columns seen by the last fit or transform, as a list'''
        return list(self.columns)


class dummify(TransformerMixin, BaseEstimator):
    '''
    Wrapper for get dummies
    The reference dummies are the ones of the data passed to fit
    Via match_cols, it is possible to ask the transformer to make sure that all the dummies are there
    Missing dummies are introduced with a column of 0's
    Extra dummies are dropped

    drop_first : passed as is to pd.get_dummies
    match_cols : if True, transform aligns its output to the reference dummies
    '''
    def __init__(self, drop_first=False, match_cols=True):
        self.drop_first = drop_first
        self.columns = []  # useful to well behave with FeatureUnion
        self.match_cols = match_cols

    def fit(self, X, y=None):
        # learn the reference dummies here, so that fitting again on other data
        # (e.g. another fold) refreshes them instead of keeping the ones of
        # the very first transform
        self.columns = pd.get_dummies(X, drop_first=self.drop_first).columns
        return self

    def match_columns(self, X):
        '''
        Align the columns of X to the reference dummies, warning when they differ
        '''
        expected = list(self.columns)
        extra = set(X.columns) - set(expected)
        missing = set(expected) - set(X.columns)

        if extra or missing:
            warnings.warn('The dummies in this set do not match the ones in the train set, we corrected the issue.',
                         UserWarning)

        # missing dummies come back as columns of 0's, extra ones are left out,
        # and the order is the one of the reference
        return X.reindex(columns=expected, fill_value=0)

    def transform(self, X):
        X = pd.get_dummies(X, drop_first=self.drop_first)
        if len(self.columns) == 0:
            # no reference yet (transform called without fit): this call defines it
            self.columns = X.columns
        elif self.match_cols:
            X = self.match_columns(X)
        return X

    def get_features_name(self):
        return list(self.columns)

    def get_feature_names(self):
        '''Same as get_features_name, under the name used by df_scaler'''
        return list(self.columns)

In [5]:
class transformation(TransformerMixin, BaseEstimator):
    '''
    From one row per player and play to one row per play: the features of the
    ball carrier (rows with has_ball) merged with the play-level columns and
    with team-level aggregates of the team in possession and of the defense.

    mean_weight : stored as an attribute only; no method of this class reads it
    '''
    def __init__(self, mean_weight=10):
        # every constructor argument has to be stored under its own name,
        # BaseEstimator reads it back for get_params / clone / repr
        self.mean_weight = mean_weight
        self.columns = None

    def fit(self, X, y=None):
        return self

    def stats_by_play(self, data):
        '''
        Aggregates by play for the possession team (poss_ columns) and for
        the other team (def_ columns), plus their differences.
        Returns one row per PlayId and offense_team.
        '''
        keys = ['PlayId', 'Team', 'offense_team']

        avg_by_play = data.groupby(keys, as_index=False)[['PlayerHeight',
                                                          'PlayerWeight',
                                                          'age',
                                                          'S', 'A']].mean()
        spread = data.groupby(keys)[['X', 'Y']].std().reset_index()
        tot_momentum = data.groupby(keys, as_index=False)[['X_speed', 'Y_speed',
                                                           'PlayerWeight',
                                                           'X_acceleration', 'Y_acceleration']].sum()

        # NOTE(review): these are products of the team totals (sum of speeds times
        # sum of weights), not sums of the per-player products - confirm it is intended
        tot_momentum['x_momentum'] = tot_momentum['X_speed'] * tot_momentum['PlayerWeight']
        tot_momentum['y_momentum'] = tot_momentum['Y_speed'] * tot_momentum['PlayerWeight']
        tot_momentum['x_force'] = tot_momentum['X_acceleration'] * tot_momentum['PlayerWeight']
        tot_momentum['y_force'] = tot_momentum['Y_acceleration'] * tot_momentum['PlayerWeight']
        tot_momentum = tot_momentum.drop(['X_speed', 'Y_speed',
                                          'PlayerWeight',
                                          'X_acceleration', 'Y_acceleration'], axis=1)

        avg_by_play = pd.merge(avg_by_play, tot_momentum, on=keys)
        avg_by_play = pd.merge(avg_by_play, spread, on=keys)

        # same renaming for the two teams, only the prefix changes
        new_names = {'PlayerHeight': 'avg_height',
                     'PlayerWeight': 'avg_weight',
                     'age': 'avg_age',
                     'X': 'std_X',
                     'Y': 'std_Y',
                     'S': 'avg_S',
                     'A': 'avg_A',
                     'x_momentum': 'x_momentum',
                     'y_momentum': 'y_momentum',
                     'x_force': 'x_force',
                     'y_force': 'y_force'}

        in_possession = avg_by_play.Team == avg_by_play.offense_team
        poss_team = avg_by_play[in_possession].rename(
            columns={old: 'poss_' + new for old, new in new_names.items()})
        def_team = avg_by_play[~in_possession].rename(
            columns={old: 'def_' + new for old, new in new_names.items()})

        avg_by_play = pd.merge(poss_team.drop('Team', axis=1),
                               def_team.drop('Team', axis=1), on=['PlayId', 'offense_team'])

        # 'tot_x_momenumt' (sic): the name is kept as it is, renaming it would
        # change the columns of the output
        avg_by_play['tot_x_momenumt'] = avg_by_play['poss_x_momentum'] - avg_by_play['def_x_momentum']
        avg_by_play['tot_x_force'] = avg_by_play['poss_x_force'] - avg_by_play['def_x_force']
        avg_by_play['height_diff'] = avg_by_play['poss_avg_height'] - avg_by_play['def_avg_height']
        avg_by_play['weight_diff'] = avg_by_play['poss_avg_weight'] - avg_by_play['def_avg_weight']
        avg_by_play['age_diff'] = avg_by_play['poss_avg_age'] - avg_by_play['def_avg_age']
        avg_by_play['X_diff'] = avg_by_play['poss_std_X'] - avg_by_play['def_std_X']
        avg_by_play['Y_diff'] = avg_by_play['poss_std_Y'] - avg_by_play['def_std_Y']

        return avg_by_play

    def process_play(self, X):
        '''
        Play-level columns (duplicates dropped) merged on PlayId with the
        aggregates of stats_by_play
        '''
        cols_by_play = ['GameId', 'PlayId', 'YardLine',
                'Quarter', 'GameClock', 'Down', 'Distance',
                'OffenseFormation', 'DefendersInTheBox',
                'Location', 'StadiumType', 'Turf',
                'GameWeather','Temperature', 'Humidity', 'WindSpeed', 'WindDirection',
                'PlayDirection', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay']
        train_play = X[cols_by_play].drop_duplicates()
        avg_by_play = self.stats_by_play(X)
        train_play = pd.merge(train_play, avg_by_play.drop('offense_team', axis=1), on=['PlayId'])

        return train_play

    def dropper(self, X):
        '''
        Copy of X without the columns listed below; the ones that are not
        in X are ignored
        '''
        to_drop = ['GameId', 'WindDirection', 'WindSpeed', 'GameWeather',
                    'PlayDirection', 'StadiumType', 'Location',
                   'GameClock', 'distance_from_ball', 'Quarter', 'Down',
                   'OffenseFormation', 'Temperature', 'Humidity',
                   'HomeScoreBeforePlay','VisitorScoreBeforePlay',
                   'Dis', 'Dir', 'Yards', 'Distance', 'PlayId', 'X', 'Y']

        return X.drop(columns=to_drop, errors='ignore')

    def transform(self, X, y=None):
        '''
        Returns one row per play; the resulting columns are stored in self.columns
        '''
        train_play = self.process_play(X)

        # these columns are either play-level (they come back with train_play)
        # or not used; all of them must be in X, a missing one raises
        to_drop = ['GameId', 'NflId', 'Team', 'Orientation','YardLine', 'Quarter', 'GameClock', 'PossessionTeam',
           'Down', 'FieldPosition', 'HomeScoreBeforePlay',
           'VisitorScoreBeforePlay', 'NflIdRusher', 'OffensePersonnel','DefensePersonnel',
               'PlayDirection', 'Position', 'HomeTeamAbbr',
           'VisitorTeamAbbr', 'Location', 'StadiumType', 'GameWeather',
           'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'to_left',
           'has_ball', 'offense_team', 'Distance',
           'OffenseFormation', 'DefendersInTheBox', 'Turf']

        carriers = X[X.has_ball].drop(to_drop, axis=1)

        full_train = pd.merge(carriers, train_play, on='PlayId')

        full_train = self.dropper(full_train)

        self.columns = full_train.columns

        return full_train

    def get_features_name(self):
        return self.columns

    def get_feature_names(self):
        '''Columns of the last transform as a list, under the name used by df_scaler'''
        return list(self.columns)

In [6]:
# Processed training set; WindSpeed is read as text (object dtype) rather than parsed as a number
df_train = pd.read_csv('../data_processed/train_processed.csv', dtype={'WindSpeed': 'object'})

In [7]:
# Feature engineering: the transformation step alone, wrapped in a pipeline
transf_pipe = Pipeline([('trsf', transformation())])

# full_train is the engineered frame used as input by all the models below
full_train = transf_pipe.fit_transform(df_train)

full_train.head()

Unnamed: 0,S,A,PlayerHeight,PlayerWeight,from_yardline,X_speed,Y_speed,X_acceleration,Y_acceleration,age,closest_opponent,opponents_in_6,teammates_in_6,YardLine,DefendersInTheBox,Turf,poss_avg_height,poss_avg_weight,poss_avg_age,poss_avg_S,poss_avg_A,poss_x_momentum,poss_y_momentum,poss_x_force,poss_y_force,poss_std_X,poss_std_Y,def_avg_height,def_avg_weight,def_avg_age,def_avg_S,def_avg_A,def_x_momentum,def_y_momentum,def_x_force,def_y_force,def_std_X,def_std_Y,tot_x_momenumt,tot_x_force,height_diff,weight_diff,age_diff,X_diff,Y_diff
0,3.63,3.35,70,205,3.75,1.491487,3.309436,1.376441,3.054163,9349,4.59331,3.0,7.0,45,6.0,0,74.727273,259.181818,10374.454545,2.106364,1.358182,30394.065885,42824.323203,14808.310917,29092.727988,1.772665,5.855606,73.636364,233.545455,10101.272727,1.314545,1.025455,21967.405111,4867.00364,11406.041625,-783.13754,5.294079,7.014714,8426.660774,3402.269293,1.090909,25.636364,273.181818,-3.521414,-1.159107
1,3.06,2.41,70,205,4.07,-2.055465,2.266862,-1.618847,1.785339,9349,4.287773,3.0,7.0,53,6.0,0,74.727273,259.181818,10374.454545,2.094545,1.541818,5361.669172,36553.979512,1503.78838,27039.089576,2.00138,6.932502,73.636364,233.545455,10101.272727,1.639091,1.592727,-3627.89183,6256.141431,-2706.292685,4533.856018,5.406292,7.190716,8989.561002,4210.081065,1.090909,25.636364,273.181818,-3.404912,-0.258215
2,5.77,2.42,70,205,3.66,4.29064,3.857889,1.799541,1.61804,9349,4.22167,2.0,6.0,75,7.0,0,74.727273,259.181818,10374.454545,3.682727,1.419091,86277.195695,48107.583525,33664.932926,23232.912519,2.115286,6.044208,73.636364,233.545455,10101.272727,3.244545,2.092727,82977.152392,12860.033647,52837.531067,8496.045083,4.720893,6.57762,3300.043303,-19172.598141,1.090909,25.636364,273.181818,-2.605606,-0.533412
3,4.45,3.2,71,210,3.53,-0.421875,4.429957,-0.303371,3.185587,9808,4.528002,7.0,9.0,108,9.0,0,76.181818,282.545455,10320.636364,2.141818,0.880909,-39812.029305,39225.794933,-11636.814745,16186.365888,1.898686,3.154869,73.727273,257.454545,10390.181818,1.555455,1.293636,-37637.593645,-4639.830146,-26960.759277,-9328.539063,0.962418,4.505029,-2174.43566,15323.944532,2.454545,25.090909,-69.545455,0.936268,-1.35016
4,3.9,2.53,71,216,5.01,-3.613974,1.466013,-2.34445,0.951029,8069,4.288088,3.0,6.0,35,7.0,0,76.909091,268.454545,9732.818182,2.644545,1.62,-31427.161922,20004.068984,-13921.201825,13577.076191,2.056883,7.423977,73.181818,242.454545,9967.181818,2.322727,2.121818,-54049.085285,-11151.98538,-50459.233839,-5737.78304,5.391251,7.864325,22621.923362,36538.032014,3.727273,26.0,-234.363636,-3.334368,-0.440348


In [8]:
def create_targets(data):
    '''
    Targets by play, from the distinct (PlayId, Yards) pairs of data.

    Returns
    yards : Series of the Yards values, with a fresh 0..n-1 index
    y     : array of shape (n, 199), all 0's but a 1 in column 99 + yards
    '''
    per_play = data[['PlayId', 'Yards']].drop_duplicates()
    yards = per_play['Yards'].reset_index(drop=True)

    n_plays = yards.shape[0]
    y = np.zeros((n_plays, 199))
    # one cell per row: row i gets its 1 in column 99 + yards[i]
    y[np.arange(n_plays), 99 + yards.values] = 1

    return yards, y


def crps_score(y_prediction, y_valid, shape):
    '''
    Squared distance between the cumulated (and clipped to [0, 1]) predictions
    and targets, summed over all the cells and divided by 199 * shape.

    y_prediction, y_valid : 2-D, cumulated along axis 1
    shape                 : number used in the denominator together with 199

    Returns the score rounded to 6 decimals.
    '''
    cdf_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
    cdf_pred = np.clip(np.cumsum(y_prediction, axis=1), 0, 1)

    squared_gap = np.square(cdf_true - cdf_pred)
    total = squared_gap.sum(axis=1).sum(axis=0)

    return np.round(total / (199 * shape), 6)

In [9]:
def grid_search(data, target, estimator, param_grid, scoring, cv, random=False):
    '''
    Calls a grid or a randomized search over a parameter grid

    data, target : passed to the fit of the search (data is copied first)
    estimator    : estimator or pipeline to tune
    param_grid   : parameter grid (used as param_distributions when random)
    scoring, cv  : passed as they are to the search
    random       : False for the full grid, otherwise the number of
                   configurations to sample (it becomes n_iter)

    Returns a dataframe with the results for each configuration
    Returns a dictionary with the best parameters
    Returns the best (fitted) estimator
    '''

    if random:
        search_args = dict(estimator=estimator, param_distributions=param_grid, cv=cv,
                           scoring=scoring, n_iter=random, n_jobs=-1, random_state=434)
        # iid is accepted only by some scikit-learn releases: it is passed where
        # the constructor still has it and left out where it would be a TypeError
        if 'iid' in inspect.signature(RandomizedSearchCV).parameters:
            search_args['iid'] = False
        grid = RandomizedSearchCV(**search_args)
    else:
        grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                            cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)

    # the pandas warning is silenced only during the fit; the context manager
    # puts back whatever was set before, also when the fit raises
    with pd.option_context('mode.chained_assignment', None):
        grid = grid.fit(data.copy(), target)

    result = pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score',
                                                        ascending=False).reset_index()

    times = [col for col in result.columns if col.endswith('_time')]
    params = [col for col in result.columns if col.startswith('param_')]

    # only the parameters, the test score and the timings are kept
    result = result[params + ['mean_test_score', 'std_test_score'] + times]

    return result, grid.best_params_, grid.best_estimator_

In [13]:
# yards: one value per play; y: the same target as a one-hot row of 199 columns
yards, y = create_targets(df_train)

In [14]:
# Shuffled 5-fold split with a fixed seed, shared by the searches and by cv_score
folds = KFold(5, shuffle=True, random_state=541)

# Distinct rows of the ball carriers for these four columns.
# NOTE(review): plays is not used by any of the cells below in this view - confirm it is still needed
plays = df_train[df_train.has_ball][['PlayId', 
                                     'YardLine', 
                                     'from_yardline', 
                                     'X']].drop_duplicates().reset_index(drop=True)

In [28]:
# Ridge: dummies -> imputation -> scaling -> model
ridge_pipe = Pipeline([('dummifier', dummify(drop_first=True)),
                           ('Imputer', df_imputer()),
                           ('scl', df_scaler(method='standard')),
                           ('ridge', Ridge())])

# The grid overrides the scaler of the pipeline: only 'robust' is tried
param_grid = {'scl__method': ['robust'], 
              'ridge__alpha': np.arange(3, 10, 0.1), 
              'ridge__solver': ['auto', 'cholesky', 'svd', 'sparse_cg', 'lsqr']}

In [29]:
# Work on a copy, full_train stays as it came out of the transformation
train = full_train.copy()

# Full grid search of the ridge pipeline on the yards, scored by negative MAE
summary, bp, be = grid_search(train, yards, ridge_pipe, 
                              param_grid=param_grid, 
                              scoring='neg_mean_absolute_error', 
                              cv=folds)

summary.head(10)

Unnamed: 0,param_ridge__alpha,param_ridge__solver,param_scl__method,mean_test_score,std_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time
0,9.9,sparse_cg,robust,-3.633046,0.057711,0.358942,0.01453,0.080647,0.008895
1,9.8,sparse_cg,robust,-3.633048,0.057711,0.297045,0.036175,0.081973,0.016973
2,9.7,sparse_cg,robust,-3.63305,0.057712,0.300793,0.052399,0.093738,0.016099
3,9.6,sparse_cg,robust,-3.633051,0.057712,0.250866,0.081153,0.102592,0.014926
4,9.5,sparse_cg,robust,-3.633053,0.057713,0.25611,0.073815,0.109731,0.037428
5,9.4,sparse_cg,robust,-3.633055,0.057714,0.268777,0.077371,0.08673,0.009315
6,9.3,sparse_cg,robust,-3.633056,0.057714,0.326334,0.082696,0.070803,0.020525
7,9.2,sparse_cg,robust,-3.633058,0.057715,0.299985,0.072146,0.079112,0.02373
8,9.1,sparse_cg,robust,-3.63306,0.057715,0.296208,0.058883,0.101481,0.012605
9,9.0,sparse_cg,robust,-3.633061,0.057716,0.254254,0.117683,0.086324,0.0347


In [30]:
# Best parameters of the ridge search
bp

{'ridge__alpha': 9.900000000000006,
 'ridge__solver': 'sparse_cg',
 'scl__method': 'robust'}

In [36]:
# Ridge refitted on the one-hot targets.
# NOTE(review): alpha=10 is just above the grid searched above, whose best value (9.9) was its last point
ridge_pipe = Pipeline([('dummifier', dummify(drop_first=True)),
                           ('Imputer', df_imputer()),
                           ('scl', df_scaler(method='robust')),
                           ('ridge', Ridge(alpha=10, solver='sparse_cg'))])

# Out-of-fold predictions, one row of 199 values per play
%time inf_preds= cv_score(train, y, folds, ridge_pipe)

crps = crps_score(inf_preds, y, train.shape[0])
    
# Back to yards: column with the highest prediction, shifted by the offset used in create_targets
inf_preds = inf_preds.argmax(axis=1) - 99

mae = mean_absolute_error(yards, inf_preds)

print(round(crps, 5), round(mae, 5))


CPU times: user 13.7 s, sys: 3.28 s, total: 17 s
Wall time: 2.94 s
0.01303 3.61568


In [37]:
# ExtraTrees: same preprocessing as the ridge pipeline.
# NOTE(review): no random_state is set on the forest, so the fits are not repeatable
extra_pipe = Pipeline([('dummifier', dummify(drop_first=True)),
                           ('Imputer', df_imputer()),
                           ('scl', df_scaler(method='standard')),
                           ('forest', ExtraTreesRegressor(n_estimators=200, n_jobs=-1,  
                                               criterion='mse', max_features='auto'))])

# Values sampled by the randomized search below
param_grid = {'scl__method': ['standard', 'robust'], 
              'forest__max_features': np.arange(0.1, 1, 0.1), 
              'forest__max_depth': [4, 10, None], 
              'forest__bootstrap': [True, False], 
              'forest__min_samples_leaf': [1, 2, 4, 8, 16, 32, 64], 
              'forest__min_samples_split': [2, 4, 8, 16, 32, 64], 
              'forest__min_impurity_decrease': np.arange(0.1, 1, 0.1)}

In [40]:
train = full_train.copy()

# Randomized search: random=1000 is the number of sampled configurations
summary, bp, be = grid_search(train, yards, extra_pipe, 
                              param_grid=param_grid, 
                              scoring='neg_mean_absolute_error', 
                              cv=folds, 
                              random=1000)

summary.head(10)

Unnamed: 0,param_scl__method,param_forest__min_samples_split,param_forest__min_samples_leaf,param_forest__min_impurity_decrease,param_forest__max_features,param_forest__max_depth,param_forest__bootstrap,mean_test_score,std_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time
0,robust,2,1,0.1,0.9,,True,-3.711066,0.052409,0.997601,0.111097,0.159272,0.028322
1,robust,32,64,0.1,0.9,10.0,True,-3.711793,0.04607,1.067869,0.124909,0.170724,0.026913
2,robust,4,1,0.1,0.9,10.0,True,-3.71397,0.049129,1.060666,0.154601,0.148135,0.022535
3,robust,64,1,0.1,0.9,,True,-3.714956,0.046132,1.122275,0.087244,0.190561,0.054279
4,standard,32,4,0.1,0.9,,True,-3.715516,0.050749,0.974533,0.090793,0.162325,0.032308
5,robust,2,64,0.1,0.9,,True,-3.716493,0.045697,1.089322,0.101026,0.179763,0.025588
6,robust,4,32,0.1,0.9,10.0,True,-3.716951,0.047078,1.142275,0.148382,0.16849,0.048493
7,standard,64,64,0.1,0.8,,True,-3.720427,0.04917,0.957,0.020011,0.159035,0.017687
8,standard,64,32,0.1,0.9,4.0,True,-3.72139,0.046819,1.043952,0.101399,0.172925,0.050399
9,standard,64,8,0.1,0.9,4.0,True,-3.721438,0.044282,1.002098,0.045759,0.162087,0.036107


In [41]:
# Best parameters of the ExtraTrees search
bp

{'scl__method': 'robust',
 'forest__min_samples_split': 2,
 'forest__min_samples_leaf': 1,
 'forest__min_impurity_decrease': 0.1,
 'forest__max_features': 0.9,
 'forest__max_depth': None,
 'forest__bootstrap': True}

In [42]:
# ExtraTrees refitted on the one-hot targets.
# NOTE(review): only the scaler follows the best parameters shown above, the forest
# keeps the settings it had before the search - confirm it is intended
extra_pipe = Pipeline([('dummifier', dummify(drop_first=True)),
                           ('Imputer', df_imputer()),
                           ('scl', df_scaler(method='robust')),
                           ('forest', ExtraTreesRegressor(n_estimators=200, n_jobs=-1,  
                                               criterion='mse', max_features='auto'))])

# Out-of-fold predictions plus mean and std of the feature importances across folds
%time inf_preds, imps= cv_score(train, y, folds, extra_pipe, imp_coef=True)

crps = crps_score(inf_preds, y, train.shape[0])
    
# Back to yards: column with the highest prediction, shifted by the offset used in create_targets
inf_preds = inf_preds.argmax(axis=1) - 99

mae = mean_absolute_error(yards, inf_preds)

print(round(crps, 5), round(mae, 5))

imps

CPU times: user 17min 10s, sys: 15.1 s, total: 17min 25s
Wall time: 3min 10s
0.01288 3.58491


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
YardLine,0.027775,0.00025
A,0.027294,0.00016
Y_acceleration,0.026622,0.000363
def_y_momentum,0.025383,0.000144
Y_speed,0.025336,0.000188
from_yardline,0.023961,0.000114
poss_y_momentum,0.023799,0.000116
closest_opponent,0.023646,0.000189
def_avg_A,0.023486,0.000105
def_y_force,0.02344,0.000119


In [12]:
# RandomForest: same preprocessing as the other pipelines.
# NOTE(review): no random_state is set on the forest, so the fits are not repeatable
forest_pipe = Pipeline([('dummifier', dummify(drop_first=True)),
                           ('Imputer', df_imputer()),
                           ('scl', df_scaler(method='standard')),
                           ('forest', RandomForestRegressor(n_estimators=100, n_jobs=-1,  
                                               criterion='mse', max_features='auto'))])

# Values sampled by the randomized search below (same grid as for ExtraTrees)
param_grid = {'scl__method': ['standard', 'robust'], 
              'forest__max_features': np.arange(0.1, 1, 0.1), 
              'forest__max_depth': [4, 10, None], 
              'forest__bootstrap': [True, False], 
              'forest__min_samples_leaf': [1, 2, 4, 8, 16, 32, 64], 
              'forest__min_samples_split': [2, 4, 8, 16, 32, 64], 
              'forest__min_impurity_decrease': np.arange(0.1, 1, 0.1)}

In [21]:
train = full_train.copy()

# Randomized search: random=5000 is the number of sampled configurations
summary, bp, be = grid_search(train, yards, forest_pipe, 
                              param_grid=param_grid, 
                              scoring='neg_mean_absolute_error', 
                              cv=folds, 
                              random=5000)

summary.head(10)

Unnamed: 0,param_scl__method,param_forest__min_samples_split,param_forest__min_samples_leaf,param_forest__min_impurity_decrease,param_forest__max_features,param_forest__max_depth,param_forest__bootstrap,mean_test_score,std_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time
0,robust,16,32,0.1,0.5,10.0,True,-3.639049,0.051882,7.870081,0.274354,0.250575,0.023501
1,robust,16,16,0.1,0.5,,True,-3.639721,0.055772,8.332916,0.395772,0.237786,0.016521
2,standard,64,16,0.1,0.6,10.0,True,-3.639903,0.055559,10.292828,0.302351,0.264055,0.023338
3,standard,8,16,0.1,0.5,,True,-3.640166,0.055021,8.21639,0.164932,0.239493,0.032475
4,standard,2,32,0.1,0.6,,True,-3.640173,0.051329,10.302243,0.14003,0.233498,0.012105
5,robust,2,16,0.1,0.5,10.0,True,-3.640404,0.0552,8.383902,0.229677,0.259515,0.012065
6,standard,4,32,0.1,0.4,,True,-3.64053,0.054848,6.114658,0.171485,0.241348,0.042843
7,standard,16,16,0.1,0.5,,True,-3.640647,0.056973,8.574978,0.220814,0.253685,0.013287
8,standard,4,32,0.1,0.6,10.0,True,-3.640702,0.053838,10.187792,0.09884,0.236761,0.015771
9,robust,2,32,0.1,0.6,,True,-3.640909,0.051881,10.180414,0.185039,0.233302,0.005836


In [22]:
# Best parameters of the RandomForest search
bp

{'scl__method': 'robust',
 'forest__min_samples_split': 16,
 'forest__min_samples_leaf': 32,
 'forest__min_impurity_decrease': 0.1,
 'forest__max_features': 0.5,
 'forest__max_depth': 10,
 'forest__bootstrap': True}

In [25]:
# RandomForest refitted on the one-hot targets.
# NOTE(review): min_samples_leaf, max_depth and n_estimators differ from the best
# configuration shown above (32, 10 and the 100 trees of the search), and
# min_impurity_decrease is not set - confirm it is intended
forest_pipe = Pipeline([('dummifier', dummify(drop_first=True)),
                       ('Imputer', df_imputer()),
                       ('scl', df_scaler(method='robust')),
                       ('forest', RandomForestRegressor(n_estimators=200, n_jobs=-1,  
                                           criterion='mse', max_features=0.5, 
                                                        min_samples_split=16, 
                                                        min_samples_leaf=16, 
                                                        max_depth=None))])

# Out-of-fold predictions plus mean and std of the feature importances across folds
%time inf_preds, imps= cv_score(train, y, folds, forest_pipe, imp_coef=True)

crps = crps_score(inf_preds, y, train.shape[0])
    
# Back to yards: column with the highest prediction, shifted by the offset used in create_targets
inf_preds = inf_preds.argmax(axis=1) - 99

mae = mean_absolute_error(yards, inf_preds)

print(round(crps, 5), round(mae, 5))

imps

CPU times: user 23min 5s, sys: 1.81 s, total: 23min 6s
Wall time: 4min 2s
0.0129 3.54758


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
Y_acceleration,0.049885,0.000842
YardLine,0.048639,0.001712
A,0.04708,0.001101
Y_speed,0.036996,0.000951
def_y_momentum,0.035024,0.000621
from_yardline,0.033431,0.000299
def_std_X,0.032624,0.001051
poss_std_X,0.028572,0.000865
S,0.028129,0.000419
def_avg_S,0.02764,0.000883
