In [1]:
from kaggle.competitions import nflrush

import io
import re
import gc
import os
import math
import random
import numpy as np
import pandas as pd
import datetime
import time
from pprint import pprint
import multiprocessing
from tqdm import tqdm_notebook

import keras
import torch
import tensorflow as tf
from keras import backend as F
from keras import initializers
from keras import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.engine.saving import load_model
from keras.layers import Dense, Activation, Embedding, Input, Lambda, BatchNormalization, Dropout, Add, Concatenate
from keras.models import Model 
from keras.optimizers import SGD, Adam,adam
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import preprocessing
import lightgbm as lgb
from scipy.spatial import Voronoi
from shapely.geometry import Polygon
from scipy.spatial import ConvexHull

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
def fix_seeds(seed):
    """Seed every random number generator used in this notebook.

    Sets ``PYTHONHASHSEED``, seeds ``random``, NumPy, PyTorch and TensorFlow,
    switches cuDNN to deterministic mode, and installs a TensorFlow session
    limited to one intra-op and one inter-op thread as the Keras session.

    # Arguments
        seed: value passed unchanged to every seeding function.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library, applied in a fixed order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, tf.set_random_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Single-threaded TF session registered with the Keras backend.
    single_thread_conf = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    F.set_session(tf.Session(graph=tf.get_default_graph(), config=single_thread_conf))


fix_seeds(2019)

In [3]:
from keras import backend as K

__all__ = ['RAdam']

class RAdam(keras.optimizers.Optimizer):
    """RAdam optimizer.
    # Arguments
        learning_rate: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        weight_decay: float >= 0. Weight decay for each param.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and
            Beyond".
        total_steps: int >= 0. Total number of training steps. Enable warmup by setting a positive value.
        warmup_proportion: 0 < warmup_proportion < 1. The proportion of increasing steps.
        min_lr: float >= 0. Minimum learning rate after warmup.
    # References
        - [Adam - A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980v8)
        - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ)
        - [On The Variance Of The Adaptive Learning Rate And Beyond](https://arxiv.org/pdf/1908.03265v1.pdf)
    """

    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=None, decay=0., weight_decay=0., amsgrad=False,
                 total_steps=0, warmup_proportion=0.1, min_lr=0., **kwargs):
        """Create the backend variables holding the hyper-parameters.

        ``lr`` is accepted in ``kwargs`` as an alias for ``learning_rate``.
        """
        # An explicit `lr=` keyword overrides `learning_rate`.
        learning_rate = kwargs.pop('lr', learning_rate)
        super(RAdam, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.learning_rate = K.variable(learning_rate, name='learning_rate')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            self.weight_decay = K.variable(weight_decay, name='weight_decay')
            self.total_steps = K.variable(total_steps, name='total_steps')
            self.warmup_proportion = K.variable(warmup_proportion, name='warmup_proportion')
            self.min_lr = K.variable(min_lr, name='min_lr')
        if epsilon is None:
            epsilon = K.epsilon()
        self.epsilon = epsilon
        # Plain Python copies of the constructor values; get_updates branches on
        # these rather than on the backend variables above.
        self.initial_decay = decay
        self.initial_weight_decay = weight_decay
        self.initial_total_steps = total_steps
        self.amsgrad = amsgrad

    def get_updates(self, loss, params):
        """Build and return the list of update ops for one optimization step.

        # Arguments
            loss: loss tensor passed to `get_gradients`.
            params: list of variables to update.
        # Returns
            `self.updates`: the iteration counter increment followed by the
            moment and parameter updates for every entry of `params`.
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr

        # Inverse-time learning-rate decay, only when `decay` was given as > 0.
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

        # 1-based step counter as a float.
        t = K.cast(self.iterations, K.floatx()) + 1

        # Warmup schedule, only when `total_steps` was given as > 0:
        # lr is scaled by t / warmup_steps while t <= warmup_steps, afterwards it
        # moves linearly from lr towards min_lr over `decay_steps` steps.
        if self.initial_total_steps > 0:
            warmup_steps = self.total_steps * self.warmup_proportion
            decay_steps = K.maximum(self.total_steps - warmup_steps, 1)
            decay_rate = (self.min_lr - lr) / decay_steps
            lr = K.switch(
                t <= warmup_steps,
                lr * (t / warmup_steps),
                lr + decay_rate * K.minimum(t - warmup_steps, decay_steps),
            )

        # Per-parameter first (m) and second (v) moment accumulators.
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i)) for (i, p) in enumerate(params)]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i)) for (i, p) in enumerate(params)]

        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i)) for (i, p) in enumerate(params)]
        else:
            # One-element placeholders; they are never updated in this branch.
            vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]

        self.weights = [self.iterations] + ms + vs + vhats

        beta_1_t = K.pow(self.beta_1, t)
        beta_2_t = K.pow(self.beta_2, t)

        # sma_inf depends only on beta_2; sma_t additionally depends on the step.
        # sma_t selects between the two update forms below.
        sma_inf = 2.0 / (1.0 - self.beta_2) - 1.0
        sma_t = sma_inf - 2.0 * t * beta_2_t / (1.0 - beta_2_t)

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # Exponential moving averages of the gradient and its square.
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            # Bias-corrected moments.
            m_corr_t = m_t / (1.0 - beta_1_t)
            if self.amsgrad:
                # AMSGrad: use the running maximum of the second moment.
                vhat_t = K.maximum(vhat, v_t)
                v_corr_t = K.sqrt(vhat_t / (1.0 - beta_2_t))
                self.updates.append(K.update(vhat, vhat_t))
            else:
                v_corr_t = K.sqrt(v_t / (1.0 - beta_2_t))

            # Scaling factor applied to the adaptive step when sma_t >= 5.
            r_t = K.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                         (sma_t - 2.0) / (sma_inf - 2.0) *
                         sma_inf / sma_t)

            # sma_t >= 5: scaled adaptive step; otherwise bias-corrected momentum only.
            p_t = K.switch(sma_t >= 5, r_t * m_corr_t / (v_corr_t + self.epsilon), m_corr_t)

            # Weight decay is added to the step, only when `weight_decay` was > 0.
            if self.initial_weight_decay > 0:
                p_t += self.weight_decay * p

            p_t = p - lr * p_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    @property
    def lr(self):
        """Alias for `learning_rate`."""
        return self.learning_rate

    @lr.setter
    def lr(self, learning_rate):
        self.learning_rate = learning_rate

    def get_config(self):
        """Return the base optimizer config extended with this optimizer's hyper-parameters."""
        config = {
            'learning_rate': float(K.get_value(self.learning_rate)),
            'beta_1': float(K.get_value(self.beta_1)),
            'beta_2': float(K.get_value(self.beta_2)),
            'decay': float(K.get_value(self.decay)),
            'weight_decay': float(K.get_value(self.weight_decay)),
            'epsilon': self.epsilon,
            'amsgrad': self.amsgrad,
            'total_steps': float(K.get_value(self.total_steps)),
            'warmup_proportion': float(K.get_value(self.warmup_proportion)),
            'min_lr': float(K.get_value(self.min_lr)),
        }
        base_config = super(RAdam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [4]:
class Metric(Callback):
    """Callback that scores the model on train/validation data after every epoch.

    After each epoch the model predicts on both datasets, the score is stored
    in ``logs`` under ``'tr_CRPS'`` and ``'val_CRPS'``, and the same ``logs``
    dict is forwarded to the wrapped callbacks, so they can monitor those keys.

    # Arguments
        model: model used for the per-epoch predictions.
        callbacks: iterable of callbacks whose ``on_train_begin``,
            ``on_train_end`` and ``on_epoch_end`` hooks are driven by this one.
        data: ``[(X_train, y_train), (X_valid, y_valid)]``; targets and
            predictions are per-class probabilities over 199 columns.
    """

    def __init__(self, model, callbacks, data):
        super().__init__()
        self.model = model
        self.callbacks = callbacks
        self.data = data

    def on_train_begin(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_begin(logs)

    def on_train_end(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_end(logs)

    def _crps(self, X, y):
        """Mean squared difference between the clipped cumulative sums of `y` and the prediction."""
        y_true = np.clip(np.cumsum(y, axis=1), 0, 1)
        y_pred = np.clip(np.cumsum(self.model.predict(X), axis=1), 0, 1)
        # 199 is the fixed number of output columns used throughout this notebook.
        return ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X.shape[0])

    def on_epoch_end(self, batch, logs=None):
        # The default `logs=None` used to raise TypeError on item assignment;
        # fall back to a fresh dict so the hook is callable without logs.
        if logs is None:
            logs = {}

        (X_train, y_train), (X_valid, y_valid) = self.data[0], self.data[1]

        tr_s = self._crps(X_train, y_train)
        logs['tr_CRPS'] = tr_s

        val_s = self._crps(X_valid, y_valid)
        logs['val_CRPS'] = val_s
        print('tr CRPS', tr_s, 'val CRPS', val_s)

        # Forward the enriched logs so wrapped callbacks can monitor the new keys.
        for callback in self.callbacks:
            callback.on_epoch_end(batch, logs)

class Gambler():
    """Holds one Keras network and one feature scaler per fold.

    # Arguments
        input_size: number of input features of every network.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        self.models = {}
        self.scalers = {}

    def construct_model(self):
        """Build and compile a fresh 512-128-64-199 dense softmax network."""
        # opm = Adam(learning_rate=0.0001)
        # NOTE: total_steps keeps its default of 0, so RAdam.get_updates skips the
        # warmup/min_lr schedule and these two arguments do not affect training.
        opm = RAdam(warmup_proportion=0.1, min_lr=1e-4)
        my_init = initializers.glorot_uniform(seed=2019)

        # (units, extra Dense kwargs) for each hidden stage.
        hidden_spec = (
            (512, {'input_shape': (self.input_size,)}),
            (128, {'input_shape': (self.input_size,)}),
            (64, {}),
        )
        stack = []
        for units, dense_kwargs in hidden_spec:
            stack.append(Dense(units, kernel_initializer=my_init, **dense_kwargs))
            stack.append(keras.layers.LeakyReLU(0.25))
            stack.append(keras.layers.Dropout(0.50, seed=2019))
        stack.append(Dense(199, kernel_initializer=my_init))
        stack.append(Activation('softmax'))

        model = Sequential(stack)
        model.compile(loss='categorical_crossentropy', optimizer=opm, metrics=[])
        return model

    def train(self, X_train, X_valid, y_train, y_valid, fold):
        """Fit a new scaler and network for `fold`, early-stopping on `val_CRPS`."""
        net = self.construct_model()
        self.models[fold] = net

        scaler = StandardScaler()
        self.scalers[fold] = scaler
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)

        # EarlyStopping is driven through Metric, which adds 'val_CRPS' to the logs.
        es = EarlyStopping(monitor='val_CRPS', mode='min', restore_best_weights=True, verbose=1, patience=11)
        es.set_model(net)
        metric = Metric(net, [es], [(X_train, y_train), (X_valid, y_valid)])
        net.fit(X_train, y_train, verbose=0, callbacks=[metric], epochs=1000, batch_size=128)

    def predict(self, X, fold):
        """Scale `X` with the scaler of `fold` and return that fold's model predictions."""
        scaled = self.scalers[fold].transform(X)
        return self.models[fold].predict(scaled)

    def predict_final(self, X):
        """Return the mean of all fold predictions, or None when no model is stored."""
        n_models = len(self.models.keys())
        final = None
        for fold in self.models:
            share = self.predict(X, fold) / n_models
            if final is None:
                final = share
            else:
                final += share
        return final

def _yards_to_one_hot(yards, num_classes=199, offset=99):
    """One-hot encode integer yardages: row i gets a 1 in column `offset + yards[i]`."""
    yards = np.asarray(yards, dtype=int)
    one_hot = np.zeros((yards.shape[0], num_classes))
    one_hot[np.arange(yards.shape[0]), offset + yards] = 1
    return one_hot


def _to_cdf(pdf):
    """Row-wise cumulative sum clipped to [0, 1]."""
    return np.clip(np.cumsum(pdf, axis=1), 0, 1)


def _crps(pred_cdf, true_cdf):
    """Squared difference of two cumulative arrays averaged over rows and the 199 columns."""
    return ((pred_cdf - true_cdf) ** 2).sum(axis=1).sum(axis=0) / (199 * true_cdf.shape[0])


def _fit_holdout(gambler, dataset_train, dataset_valid, useful_raw_features):
    """Train fold 0 on `dataset_train`, score on `dataset_valid`, return ids/predictions/targets."""
    X_train = dataset_train[useful_raw_features].copy().fillna(-10)
    X_valid = dataset_valid[useful_raw_features].copy().fillna(-10)

    y_train = _yards_to_one_hot(dataset_train['Yards'])
    y_valid = _yards_to_one_hot(dataset_valid['Yards'])

    gambler.train(X_train, X_valid, y_train, y_valid, 0)
    oof_pred = _to_cdf(gambler.predict(X_valid, 0))
    oof_predictions = oof_pred.copy()

    oof_ids = dataset_valid['PlayId'].values
    oof_targets = _to_cdf(y_valid)

    print('OOF Score', _crps(oof_pred, oof_targets))
    return oof_ids, oof_predictions, oof_targets


def train_loop(gambler, df, fake_df, num_folds,useful_raw_features,cv="cv"):
    """Train `gambler` and return out-of-fold ids, cumulative predictions and cumulative targets.

    # Arguments
        gambler: object exposing `train(X_train, X_valid, y_train, y_valid, fold)`
            and `predict(X, fold)`.
        df: frame with the feature columns plus `Yards` and `PlayId`
            (and `GameId`/`PossessionTeam` for the game-based split).
        fake_df: extra rows; in "cv" mode the rows whose `PlayId` is in the
            training fold are appended to that fold. Unused in the other modes.
        num_folds: number of KFold splits (only used when `cv == "cv"`).
        useful_raw_features: list of feature column names; NaNs are filled with -10.
        cv: "cv" for shuffled K-fold, "split" for a random 90/10 hold-out,
            anything else for a hold-out made of each team's latest 5 games.
    # Returns
        `(oof_ids, oof_predictions, oof_targets)`. In "cv" mode the arrays cover
        every row of `df` in positional order; otherwise only the validation rows.
    """
    if cv == "cv":
        n_rows = df.shape[0]
        spliter = KFold(n_splits=num_folds, random_state=2019, shuffle = True)
        oof_predictions = np.zeros((n_rows, 199))
        oof_targets = np.zeros((n_rows, 199))
        oof_ids = np.zeros(n_rows)

        for fold, (train_index, valid_index) in enumerate(spliter.split(df)):
            print('### Fold', fold+1, '###')
            # KFold yields positional indices, so select with .iloc; .loc only
            # happened to work when df carried a default 0..n-1 index.
            dataset_train = df.iloc[train_index].copy()
            dataset_valid = df.iloc[valid_index].copy()

            # Augment the training fold with the extra rows of its own plays only.
            dataset_train = pd.concat([dataset_train,fake_df[fake_df.PlayId.isin(dataset_train.PlayId)]],axis=0)
            X_train = dataset_train[useful_raw_features].copy().fillna(-10)
            X_valid = dataset_valid[useful_raw_features].copy().fillna(-10)

            y_train = _yards_to_one_hot(dataset_train['Yards'])
            y_valid = _yards_to_one_hot(dataset_valid['Yards'])

            gambler.train(X_train, X_valid, y_train, y_valid, fold)

            oof_pred = _to_cdf(gambler.predict(X_valid, fold))
            oof_predictions[valid_index] = oof_pred

            y_valid = _to_cdf(y_valid)
            oof_targets[valid_index] = y_valid
            oof_ids[valid_index] = dataset_valid['PlayId'].values

            print(f'Fold {fold+1} score', _crps(oof_pred, y_valid))

        print('OOF Score', _crps(oof_predictions, oof_targets))
        return oof_ids, oof_predictions, oof_targets

    if cv == "split":
        dataset_train, dataset_valid = train_test_split(df, test_size=0.1, random_state=2019)
        return _fit_holdout(gambler, dataset_train, dataset_valid, useful_raw_features)

    games = df[['GameId', 'PossessionTeam']].drop_duplicates()

    # Sort so the latest games are first and label the games with cumulative counter
    games = games.sort_values(['PossessionTeam', 'GameId'], ascending=[True, False])
    games['row_number'] = games.groupby(['PossessionTeam']).cumcount() + 1

    # Use the last 5 games of each team as validation. The per-team selections
    # can overlap because one GameId can be listed under two possession teams.
    game_set = set([1, 2, 3, 4, 5])

    # Set of unique game ids
    game_ids = set(games[games['row_number'].isin(game_set)]['GameId'].unique().tolist())

    in_validation = df['GameId'].isin(game_ids)
    return _fit_holdout(gambler, df[~in_validation], df[in_validation], useful_raw_features)

def encode_ohe(df, features):
    """One-hot encode `features`, treating NaN as the category 'missing'.

    The NaN replacement is written back into `df`; the returned frame is the
    result of `pd.get_dummies` with the first level of each feature dropped.
    """
    filled = df[features].fillna('missing')
    df[features] = filled
    return pd.get_dummies(df, columns=features, drop_first=True)

def generate_categorical_encoders(train, features):
    """Fit one LabelEncoder per feature and return `{feature: {class: code}}`.

    NaNs in each feature column of `train` are replaced in place by 'missing'
    before fitting, so 'missing' becomes a regular class.
    """
    encoders = {}
    for name in features:
        train[name] = train[name].fillna('missing')
        fitted = LabelEncoder()
        fitted.fit(train[name].values)
        classes = fitted.classes_
        codes = fitted.transform(classes)
        encoders[name] = {label: code for label, code in zip(classes, codes)}
    return encoders

def encode_categorical_features(df, features, encoders):
    """Replace each feature column of `df` in place by its encoded values.

    NaNs are first turned into 'missing'; values absent from `encoders[feature]`
    become NaN through `Series.map`. Nothing is returned.
    """
    for name in features:
        lookup = encoders[name]
        df[name] = df[name].fillna('missing').map(lookup)

def aggreate_by_play(df, configs):
    """Aggregate player-level rows into one row per `PlayId`.

    # Arguments
        df: frame with a `PlayId` column (and `IsOnOffense` when any config
            requests per-team aggregation).
        configs: iterable of sequences where `config[0]` is the feature name
            and `config[2]` the list of aggregation functions. Including
            `'team'` in that list aggregates per (`PlayId`, `IsOnOffense`)
            and emits `_def` / `_off` columns instead of one per play.
            Configs for `PlayId` or for columns missing from `df` are skipped.
    # Returns
        Frame sorted by `PlayId` with one column per feature/aggregation.
        The `'first'` aggregation keeps the bare feature name; every other
        aggregation appends `_<agg_func>`.
    """
    df = df.sort_values('PlayId')
    agg_df = pd.DataFrame({'PlayId': list(df['PlayId'].unique())}).sort_values('PlayId')
    # TODO aggregate with a sliding window

    # The groupings do not depend on the feature, so build them once instead of
    # once per config; the per-team grouping is created on first use.
    by_play = df.groupby('PlayId')
    by_play_side = None

    for config in configs:
        feature = config[0]
        if feature == 'PlayId' or feature not in df.columns:
            continue
        agg_funcs = config[2]
        if 'team' in agg_funcs:
            if by_play_side is None:
                by_play_side = df.groupby(['PlayId', 'IsOnOffense'])
            for agg_func in agg_funcs:
                if agg_func == 'team':
                    continue
                # Aggregate once and split by position: even rows are
                # IsOnOffense == 0, odd rows IsOnOffense == 1. This relies on
                # every play having rows for both values.
                per_side = by_play_side[feature].agg(agg_func)
                prefix = feature if agg_func == 'first' else f'{feature}_{agg_func}'
                agg_df[prefix + '_def'] = per_side[::2].values
                agg_df[prefix + '_off'] = per_side[1::2].values
        else:
            for agg_func in agg_funcs:
                column = feature if agg_func == 'first' else f'{feature}_{agg_func}'
                agg_df[column] = by_play[feature].agg(agg_func).values
    return agg_df

In [5]:
class GoldenTimer:
    """Lap timer: each `time()` call reports the seconds since the previous lap.

    # Arguments
        show: when False, laps are measured but nothing is printed.
    """

    def __init__(self, show=True):
        self.show = show
        self.start_time = time.time()

    def time(self, print_str):
        """Print `print_str` with the elapsed seconds (if enabled) and start a new lap."""
        elapsed = time.time() - self.start_time
        if self.show:
            print(print_str, elapsed)
        # The next lap starts after printing.
        self.start_time = time.time()


class FeatureFactory:
    def __init__(self, cat_dict):
        """Store `cat_dict`, the `{column: mapping}` dict that `do_cat` applies via `Series.map`."""
        self.cat_dict = cat_dict
    
    def make_feats(self, df_, show=False):
        """Run the feature pipeline and return one row per rusher.

        Steps, in order: `adjust_sa` on the input, `get_emil_feats` on a copy
        of the adjusted frame, then `get_future_pos` and `get_dist` on the
        adjusted frame, selection of the rows where `NflIdRusher == NflId`,
        `get_closest_feat`, `get_free_width`, and finally an inner merge with
        the `get_emil_feats` output on `PlayId`.

        Args:
            df_: player-level frame; `adjust_sa` writes into it in place.
            show: passed to `GoldenTimer`; the timer is created but `time()`
                is never called here, so nothing is printed.

        Returns:
            The rusher rows merged with the `get_emil_feats` result.
        """
        timer = GoldenTimer(show)
        df = self.adjust_sa(df_)
        # NOTE(review): adjust_sa already rescales S and A for Season >= 2018 and
        # prep_df (called inside get_emil_feats) applies the same rescaling again
        # to this copy, so the emil features see S/A rescaled twice - confirm
        # whether that is intended.
        emil_df = self.get_emil_feats(df.copy())
        df = self.get_future_pos(df)
        df = self.get_dist(df)
        # Keep only the ball carrier's row of each play.
        rush_df = df[df["NflIdRusher"] == df["NflId"]].copy()
        rush_df = self.get_closest_feat(df, rush_df)
        rush_df = self.get_free_width(df, rush_df)
        #rush_df = self.get_proper_field_position(rush_df)
        #rush_df = self.get_proper_yard(rush_df)
        #rush_df = self.get_direction(rush_df)
        #rush_df = self.do_cat(rush_df)
        rush_df = pd.merge(rush_df, emil_df, on="PlayId", how="inner")
        return rush_df
    
    def get_emil_feats(self, df):
        df = self.prep_df(df)
        df = self.make_emil_feats(df)
        return df
    
    def adjust_sa(self, df):
        """Rescale S/A for seasons >= 2018 and fill missing values.

        `S` and `A` of rows with `Season >= 2018` are mapped as
        `(value - mean_new) / std_new * std_old + mean_old`. Missing values
        are then filled with the per-play mean (play-level columns) or the
        per-play-and-team mean (tracking columns); anything still missing
        becomes 0.

        The rescaling and group-mean fills are written into the passed frame;
        the returned frame is a new one produced by the final `fillna(0)`.
        """
        is_new_season = df["Season"] >= 2018
        # (column, mean_old, mean_new, std_old, std_new)
        rescale_params = (
            ("S", 2.435519556913685, 2.7570316419451517, 1.2929623410155855, 1.4551321358655551),
            ("A", 1.5895792207792045, 1.7819953460610594, 0.8795106467756848, 1.060305722313926),
        )
        for name, mean_old, mean_new, std_old, std_new in rescale_params:
            df[name] = np.where(is_new_season, (df[name]-mean_new) / std_new * std_old + mean_old, df[name])

        # Grouping keys must not be NaN for the group means below.
        for key in ("PlayId", "Team"):
            df[key] = df[key].fillna(0)

        play_level = ['Season', 'YardLine', 'Quarter', 'Down','HomeScoreBeforePlay', 'VisitorScoreBeforePlay','NflIdRusher',
                      'Week','Temperature', 'Humidity','DefendersInTheBox', 'PlayerWeight']
        team_level = ['X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir','Distance']
        for group_keys, columns in ((["PlayId"], play_level), (["PlayId","Team"], team_level)):
            for col in columns:
                # "temp" is a scratch column holding the group mean of `col`.
                df["temp"] = df.groupby(group_keys)[col].transform("mean")
                df[col] = df[col].fillna(df["temp"])

        del df["temp"]
        return df.fillna(0)

    def get_future_pos(self, df):
        """Add projected positions along `Dir` to `df` (in place) and return it.

        Adds `dxdy_dir` (`(540 - (Dir + 90)) % 360`), its sine/cosine as
        `dy`/`dx`, and the positions reached after moving `S` and `2 * S`
        along that direction (`nextX`/`nextY`, `nextX2`/`nextY2`).
        """
        df["dxdy_dir"] = (540 - (df["Dir"] + 90)) % 360
        heading = np.deg2rad(df["dxdy_dir"])
        df["dy"] = np.sin(heading)
        df["dx"] = np.cos(heading)

        # Column suffix -> travelled distance used for the projection.
        for suffix, travelled in (("", df["S"]), ("2", df["S"]*2)):
            df["nextX" + suffix] = df["X"] + (df["dx"] * travelled)
            df["nextY" + suffix] = df["Y"] + (df["dy"] * travelled)
        return df

    def get_dist(self, df):
        """Attach rusher-relative distances and per-play distance summaries.

        The rusher's row of each play (`NflIdRusher == NflId`) is merged back
        onto every row as `rush_*` columns, four Euclidean distances to the
        rusher are computed, and per-play aggregates from `_get_dist_feat`
        and `_get_mark_dist` are merged in on `PlayId`.

        Expects the `next*` columns created by `get_future_pos`.
        """
        # One row per play holding the rusher's team and (projected) positions.
        merge_rush = df[df["NflIdRusher"] == df["NflId"]].copy()
        merge_col = [
            "PlayId", "Team", "X", "Y", "nextX", "nextY", "nextX2", "nextY2"
        ]
        merge_rush = merge_rush[merge_col]
        merge_rush.columns = [
            "PlayId", "rush_team", "rush_X", "rush_Y",
            "rush_nextX", "rush_nextY", "rush_nextX2", "rush_nextY2"
        ]
        ret_df = pd.merge(df, merge_rush, on="PlayId", how="left")
        
        # dist: current position vs. rusher's current position.
        ret_df["dist"] = (ret_df["X"] - ret_df["rush_X"]) ** 2 + (ret_df["Y"] - ret_df["rush_Y"]) ** 2
        ret_df["dist"] = ret_df["dist"] ** 0.5
        # dist2: current position vs. rusher's first projected position.
        ret_df["dist2"] = (ret_df["X"] - ret_df["rush_nextX"]) ** 2 + (ret_df["Y"] - ret_df["rush_nextY"]) ** 2
        ret_df["dist2"] = ret_df["dist2"] ** 0.5
        # dist3: both at their first projected position.
        ret_df["dist3"] = (ret_df["nextX"] - ret_df["rush_nextX"]) ** 2 + (ret_df["nextY"] - ret_df["rush_nextY"]) ** 2
        ret_df["dist3"] = ret_df["dist3"] ** 0.5        
        # dist4: both at their second projected position.
        ret_df["dist4"] = (ret_df["nextX2"] - ret_df["rush_nextX2"]) ** 2 + (ret_df["nextY2"] - ret_df["rush_nextY2"]) ** 2
        ret_df["dist4"] = ret_df["dist4"] ** 0.5

        # Rows of the team the rusher does not belong to.
        defense_team_df = ret_df[ret_df["Team"] != ret_df["rush_team"]]
        dist_keys = ["dist", "dist2", "dist3", "dist4"]
        temp_df = self._get_dist_feat(defense_team_df, dist_keys)
        # _get_mark_dist returns the row-level frame with extra columns plus a
        # per-play summary; both summaries are merged onto every row.
        ret_df, temp_df1 = self._get_mark_dist(ret_df)
        temp_df = pd.merge(temp_df, temp_df1, on="PlayId", how="left")
        ret_df = pd.merge(ret_df, temp_df, on="PlayId", how="left")
        return ret_df
    
    def _get_dist_feat(self, defense_team_df, dist_keys):
        aggs = dict()
        for k in dist_keys:
            aggs[k] = ["min", "mean", "std"]
        temp_df = defense_team_df.groupby("PlayId").agg(aggs).reset_index()
        
        cols = ["PlayId"]
        for k in dist_keys:
            cols += [f"min_df_{k}", f"mean_df_{k}", f"std_df_{k}"]
        temp_df.columns = cols
        return temp_df
    
    def _get_mark_dist(self, df):
        """Flag players of the other team as blocked/unblocked and summarise per play.

        Expects the `rush_*`, `dist`, `dist3` and `dist4` columns added by
        `get_dist`. Adds helper columns to `df` (written into the passed frame
        until the first merge creates a new one).

        Returns:
            `(df, temp_df)`: the row-level frame with the added columns, and a
            per-play frame with `min_unblocked_dist3`, `min_unblocked_dist4`
            and `min_blocked_dist3`.
        """
        # Number the rows within each (PlayId, Team) by the rank of their index.
        df["index"] = df.index
        df["player_idx"] = df.groupby(["PlayId", "Team"])["index"].agg(["rank"])
        temp_df = None
        # Build one wide row per play with the X/Y of players 1..11 of the rusher's team.
        for i in range(1, 12):
            mask = (df["Team"] == df["rush_team"]) & (df["player_idx"] == i)
            temp = df[mask]
            temp = temp[["PlayId", "X", "Y"]]
            temp.columns = ["PlayId", f"of{i}_X", f"of{i}_Y"]
            if temp_df is None:
                temp_df = temp
            else:
                temp_df = pd.merge(temp_df, temp, on="PlayId", how="left")
        df = pd.merge(df, temp_df, on=["PlayId"], how="left")
        # Distance from every row's player to each of those 11 players.
        for i in range(1, 12):
            df[f"of{i}_dist"] = ((df["X"] - df[f"of{i}_X"])**2 + (df["Y"] - df[f"of{i}_Y"])**2) ** 0.5
        agg_col = [f"of{i}_dist" for i in range(1, 12)]
        df["closest_of_dist"] = np.min(df[agg_col], axis=1)
        # 1-based number of the nearest of those players.
        df["closest_of_index"] = np.argmin(df[agg_col].values, axis=1) + 1
        df["closest_of_X"] = 0
        df["closest_of_Y"] = 0
        # Look up the position of that nearest player.
        for i in range(1, 12):
            df["closest_of_X"] = np.where(df["closest_of_index"] == i, df[f"of{i}_X"], df["closest_of_X"])
            df["closest_of_Y"] = np.where(df["closest_of_index"] == i, df[f"of{i}_Y"], df["closest_of_Y"])
        # mark_dist: distance from that nearest player to the rusher. A row is
        # flagged blocked when this is smaller than the row's own `dist`.
        df["mark_dist"] = ((df["closest_of_X"]-df["rush_X"])**2 + (df["closest_of_Y"]-df["rush_Y"])**2)**0.5
        df["is_blocked"] = np.where(df["mark_dist"] < df["dist"], 1, 0)
        
        # Per-play minimum dist3/dist4 over unblocked players of the other team.
        mask = (df["Team"] != df["rush_team"]) & (df["is_blocked"] == 0)
        unblocked_defense_team_df = df[mask]
        aggs = {
            "dist3": ["min"],
            "dist4": ["min"],
        }
        temp_df = unblocked_defense_team_df.groupby("PlayId").agg(aggs).reset_index()
        cols = ["PlayId", "min_unblocked_dist3", "min_unblocked_dist4",]
        temp_df.columns = cols

        # Per-play minimum dist3 over blocked players of the other team.
        mask = (df["Team"] != df["rush_team"]) & (df["is_blocked"] == 1)
        blocked_defense_team_df = df[mask]
        aggs = {
            "dist3": ["min"],
        }
        temp_df2 = blocked_defense_team_df.groupby("PlayId").agg(aggs).reset_index()
        cols = ["PlayId", "min_blocked_dist3"]
        temp_df2.columns = cols
        # Left merge: plays without any unblocked player are absent from the summary.
        temp_df = pd.merge(temp_df, temp_df2, on="PlayId", how="left")
        return df, temp_df
    
    def get_closest_feat(self, df, rush_df):
        """Attach features of the rows whose distance equals the per-play minimum.

        Two left merges onto `rush_df` by `PlayId`:
        - rows with `dist == min_df_dist` give `closest_S`, `closest_A`,
          `closest_mark_dist` (from `closest_of_dist`) and `closest_next_dist`
          (from `dist3`);
        - rows with `dist3 == min_df_dist3` give `closest3_S`, `closest3_A`
          and `closest3_prev_dist` (from `dist`).
        When several rows of a play match, the first one is kept.
        """
        def first_match(mask, source_cols, output_cols):
            # First matching row per play, restricted and renamed.
            picked = df.loc[mask, source_cols].drop_duplicates(subset="PlayId")
            picked.columns = output_cols
            return picked

        nearest_now = first_match(
            df["min_df_dist"] == df["dist"],
            ["PlayId", "S", "A", "closest_of_dist", "dist3"],
            ["PlayId", "closest_S", "closest_A", "closest_mark_dist", "closest_next_dist"],
        )
        nearest_next = first_match(
            df["min_df_dist3"] == df["dist3"],
            ["PlayId", "S", "A", "dist"],
            ["PlayId", "closest3_S", "closest3_A", "closest3_prev_dist"],
        )

        merged = pd.merge(rush_df, nearest_now, on="PlayId", how="left")
        return pd.merge(merged, nearest_next, on="PlayId", how="left")
    
    def get_free_width(self, df, rush_df):
        temp = df.copy()
        temp["Y"] = temp["nextY"]
        temp["X"] = temp["nextX"]
        temp_rush3 = self._get_free_width(temp)
        temp_rush3.columns = ["PlayId", "max_df_cnt3", "min_df_cnt3", "free_width3"]
        rush_df = pd.merge(rush_df, temp_rush3, on="PlayId", how="left")
        return rush_df
    
    
    def _get_free_width(self, temp):
        temp["std_X"] = np.where(temp["PlayDirection"] == "left", temp["X"]-10, 110-temp["X"])
        temp["std_rush_X"] = np.where(temp["PlayDirection"] == "left", temp["rush_X"]-10, 110-temp["rush_X"])
        temp["x_in_box"] = np.where((temp["std_X"] <= temp["std_rush_X"]+5) & (temp["std_rush_X"] <= temp["std_X"]+10), 1, 0)
        temp = temp.sort_values(by=["PlayId", "Y"])
        temp["in_box_sum"] = temp.groupby(["PlayId"])["x_in_box"].cumsum()
        temp["in_box_r"] = temp.groupby(["PlayId", "Team"])["in_box_sum"].shift(1)
        temp["in_box_l"] = temp.groupby(["PlayId", "Team"])["in_box_sum"].shift(-1)
        temp["df_cnt_r"] = temp["in_box_sum"] - temp["in_box_r"]
        temp["df_cnt_l"] = temp["in_box_l"] - temp["in_box_sum"]
        temp["max_df_cnt0"] = np.max(temp[["df_cnt_r", "df_cnt_l"]], axis=1)
        temp["min_df_cnt0"] = np.min(temp[["df_cnt_r", "df_cnt_l"]], axis=1)
        temp["next_y_in_field"] = temp.groupby("PlayId")["Y"].shift(-1).fillna(temp["Y"])
        temp["prev_y_in_field"] = temp.groupby("PlayId")["Y"].shift(1).fillna(temp["Y"])
        temp["free_width"] = np.where(temp["max_df_cnt0"] == 1, temp["next_y_in_field"]-temp["prev_y_in_field"], 0)
        temp_rush = temp[temp["NflIdRusher"] == temp["NflId"]]
        return temp_rush[["PlayId", "max_df_cnt0", "min_df_cnt0", "free_width"]]
    
    def get_proper_field_position(self, df):
        """Map `FieldPosition` and `PossessionTeam` to one abbreviation per team.

        The four alternative spellings ARZ/BLT/CLV/HST are mapped to
        ARI/BAL/CLE/HOU; the listed regular abbreviations map to themselves.
        Any value outside the table becomes NaN (behaviour of `Series.map`).
        Both columns are overwritten in `df`, which is returned.
        """
        regular = [
            'NE', 'BUF', 'CHI', 'CIN', 'CLE', 'DET', 'HOU', 'TEN',
            'WAS', 'LA', 'GB', 'SF', 'DAL', 'MIN',
            'DEN', 'BAL', 'CAR', 'IND', 'JAX', 'KC',
            'NO', 'PIT', 'TB', 'LAC', 'OAK', 'SEA',
            'ATL', 'NYG', 'NYJ', 'PHI', 'ARI', 'MIA',
        ]
        map_abbr = {'ARZ': 'ARI', 'BLT': 'BAL', 'CLV': 'CLE', 'HST': 'HOU'}
        map_abbr.update({abbr: abbr for abbr in regular})

        for column in ('FieldPosition', 'PossessionTeam'):
            df[column] = df[column].map(map_abbr)
        return df
    
    def get_proper_yard(self, df):
        """Add yard-line derived columns to `df` (in place) and return it.

        Adds, in this order: `ball_is_in_own_half` and `playing_to_left`
        (+1/-1 flags), their product `ball_is_on_right`, `remaining_yards`,
        `yard_to_X` (yard line converted to the X axis, offset by 10),
        `rush_yardline_dist` (signed by play direction) and `std_Y`
        (absolute Y distance from 26.666).
        """
        own_half = np.where(df["FieldPosition"] == df["PossessionTeam"], 1, -1)
        df["ball_is_in_own_half"] = own_half
        df["playing_to_left"] = np.where(df["PlayDirection"] == "left", 1, -1)
        df["ball_is_on_right"] = df["playing_to_left"] * df["ball_is_in_own_half"]

        mirrored_line = 100-df["YardLine"]
        df["remaining_yards"] =  np.where(df["ball_is_in_own_half"]==1, mirrored_line, df["YardLine"])

        df["yard_to_X"] = np.where(df["ball_is_on_right"] == 1, mirrored_line, df["YardLine"])
        df["yard_to_X"] = df["yard_to_X"] + 10

        offset = df["yard_to_X"] - df["X"]
        df["rush_yardline_dist"] = offset
        df["rush_yardline_dist"] = np.where(df["PlayDirection"] == "right", offset, -offset)

        mid = 26.666
        df["std_Y"] = np.abs(mid - df["Y"])
        return df
    
    def get_direction(self, df):
        """Add `rounded_dir` and `rounded_orient` to `df` (in place) and return it.

        Each angle is rotated by 180 degrees for plays going left, then folded:
        values >= 180 become `|270 - angle| + 180`, the rest `|90 - angle|`.
        """
        playing_left = df["PlayDirection"] == "left"
        for source, target in (("Dir", "rounded_dir"), ("Orientation", "rounded_orient")):
            df[target] = df[source]
            df[target] = np.where(playing_left, (df[target] + 180) % 360, df[target])
            in_upper_half = df[target] >= 180
            df[target] = np.where(in_upper_half, np.abs(270 - df[target])+180, np.abs(90 - df[target]))
        return df

    def do_cat(self, df):
        for c in self.cat_dict.keys():
            df[c] = df[c].map(self.cat_dict[c])
        return df
    
    
    def prep_df(self, df):
        """Normalise team abbreviations and orient every play left-to-right.

        Works on `df` in place and returns it:
        - rescales `S`/`A` for `Season >= 2018`;
        - rewrites ARI/BAL/CLE/HOU to ARZ/BLT/CLV/HST in the four team columns;
        - turns `FieldPosition` into a 0/1 flag (equal to `PossessionTeam`);
        - adds `TeamOnOffense`, `IsOnOffense`, `ToLeft`, `YardsFromOwnGoal`;
        - mirrors `X`, `Y`, `Dir`, `Orientation` for plays going left.
        """
        is_new_season = df["Season"] >= 2018
        # (column, mean_old, mean_new, std_old, std_new)
        for name, mean_old, mean_new, std_old, std_new in (
            ("S", 2.435519556913685, 2.7570316419451517, 1.2929623410155855, 1.4551321358655551),
            ("A", 1.5895792207792045, 1.7819953460610594, 0.8795106467756848, 1.060305722313926),
        ):
            df[name] = np.where(is_new_season, (df[name]-mean_new) / std_new * std_old + mean_old, df[name])

        # The replacements are independent per column and never chain.
        renames = (("ARI", "ARZ"), ("BAL", "BLT"), ("CLE", "CLV"), ("HOU", "HST"))
        for column in ("VisitorTeamAbbr", "HomeTeamAbbr", "PossessionTeam", "FieldPosition"):
            for old_abbr, new_abbr in renames:
                df[column] = np.where(df[column] == old_abbr, new_abbr, df[column])

        # df["DefenderTeam"] = np.where(df["PossessionTeam"] == df["HomeTeamAbbr"], df["VisitorTeamAbbr"],df["HomeTeamAbbr"])
        df["FieldPosition"] = (df["FieldPosition"] == df["PossessionTeam"]).astype(int)

        df["TeamOnOffense"] = np.where(df["PossessionTeam"] == df["HomeTeamAbbr"], "home", "away")
        df["IsOnOffense"] = np.where(df["Team"] == df["TeamOnOffense"], 1, 0)

        df["ToLeft"] = np.where(df["PlayDirection"] == "left", 1, 0)
        df["YardsFromOwnGoal"] = np.where(df["FieldPosition"] == 1, df["YardLine"],
                                                50 + (50 - df["YardLine"]))
        df["YardsFromOwnGoal"] = np.where(df["YardLine"] == 50, 50, df["YardsFromOwnGoal"])

        going_left = df["ToLeft"] == 1
        df["X"] = np.where(going_left, 120 - df["X"], df["X"]) - 10  ## Standardizes X
        df["Y"] = np.where(going_left, 160 / 3 - df["Y"], df["Y"])  ## Standardized Y
        # Rotate both angles by half a turn for plays going left.
        for angle in ("Dir", "Orientation"):
            df[angle] = np.where(going_left, (df[angle] + 180) % 360, df[angle])
        return df
        
    def make_emil_feats(self, df):
        """Build play-level geometric features relative to the rusher.

        ``df`` is the player-level frame; it is modified in place (many helper
        columns are added) and a new play-level frame restricted to the
        hard-coded ``useful_raw_features`` columns is returned.

        Assumptions baked into the code (not checked here):
          * every play contributes exactly 22 consecutive rows and exactly one
            row with ``NflId == NflIdRusher`` -- per-play values are broadcast
            with ``np.repeat(..., 22)`` and assigned positionally;
          * ``IsOnOffense`` is 0 for defence and 1 for offence, which is how the
            ``_def`` / ``_off`` suffixes are attached after ``unstack()``;
          * the per-play frames produced here and by ``aggreate_by_play`` share
            the same row order, since they are concatenated side by side.
        """
        # Unit direction components, computed once and reused below.
        dir_cos = df["Dir"].apply(lambda x: math.cos((90-x)*np.pi/180))
        dir_sin = df["Dir"].apply(lambda x: math.sin((90-x)*np.pi/180))

        # Position reached after moving with speed S along Dir for one time unit.
        df["X_end"] = df["S"]*dir_cos + df["X"]
        df["Y_end"] = df["S"]*dir_sin + df["Y"]

        df["S_horizontal"] = df["S"]*dir_cos
        df["S_vertical"] = df["S"]*dir_sin
        df["A_horizontal"] = df["A"]*dir_cos

        is_rusher = df["NflId"]==df["NflIdRusher"]
        not_rusher = df["NflId"]!=df["NflIdRusher"]

        def broadcast_per_play(row_mask, col):
            # One selected row per play, repeated for each of that play's 22 rows.
            temp = np.repeat(df.loc[row_mask, col],22)
            return temp.values

        def team_stats(rows, col, funcs):
            # Per (play, side) statistics of `col`, flattened to one row per play
            # with columns named "<col>_<func>_def" / "<col>_<func>_off".
            stats = rows.groupby(["PlayId","IsOnOffense"])[col].agg(funcs).unstack().reset_index(drop=True)
            stats.columns = [f"{col}_{func}_{side}" for func in funcs for side in ("def","off")]
            return stats

        for col in ["X","Y","X_end","Y_end","S","S_horizontal","S_vertical","A","A_horizontal"]: # PlayerHeight
            df[col+"_rusher"] = broadcast_per_play(is_rusher, col)

        # Distances to the rusher, now and at the projected end positions.
        df["dist_rusher"] = ((df["X_rusher"]-df["X"])**2+(df["Y_rusher"]-df["Y"])**2)**0.5
        dist_rusher = team_stats(df[not_rusher], "dist_rusher", ["mean","std","min","max"])

        df["dist_rusher_end"] = ((df["X_end_rusher"]-df["X_end"])**2+(df["Y_end_rusher"]-df["Y_end"])**2)**0.5
        dist_rusher_end = team_stats(df[not_rusher], "dist_rusher_end", ["mean","std","min","max"])

        df["dist_rusher_travel"] = ((df["X_end_rusher"]-df["X_rusher"])**2+(df["Y_end_rusher"]-df["Y_rusher"])**2)**0.5

        df["dist_X"] = (df["X"] - df["X_rusher"])
        dist_x = team_stats(df[not_rusher], "dist_X", ["mean","min","max"])

        df["dist_Y_end"] = np.abs(df["Y_end"] - df["Y_end_rusher"])
        dist_y_end = team_stats(df[not_rusher], "dist_Y_end", ["mean"])

        # Rank players by distance to the rusher within each (play, side). On
        # offence the rusher has distance 0, so ranks are shifted down by one.
        # NOTE(review): rank() uses its default tie handling; a tie for the
        # closest defender would leave no row with rank == 1 for that play and
        # break the 22-row broadcast below -- confirm ties cannot occur.
        df["dist_rusher_rank"] = df.groupby(["PlayId","IsOnOffense"])["dist_rusher"].rank()
        df["dist_rusher_rank"] = np.where(df["IsOnOffense"]==1,df["dist_rusher_rank"]-1,df["dist_rusher_rank"])

        df["dist_rusher_end_rank"] = df.groupby(["PlayId","IsOnOffense"])["dist_rusher_end"].rank()
        df["dist_rusher_end_rank"] = np.where(df["IsOnOffense"]==1,df["dist_rusher_end_rank"]-1,df["dist_rusher_end_rank"])

        # Distances to the defender closest to the rusher (current positions).
        closest_def = (df["dist_rusher_rank"] == 1)&(df["IsOnOffense"] == 0)
        for col in ["X","Y"]:
            df[col+"_close"] = broadcast_per_play(closest_def, col)

        df["dist_close"] = ((df["X_close"]-df["X"])**2+(df["Y_close"]-df["Y"])**2)**0.5
        dist_close = team_stats(df[~closest_def], "dist_close", ["min"])

        # Same, using the defender closest to the rusher at the end positions.
        closest_def_end = (df["dist_rusher_end_rank"] == 1)&(df["IsOnOffense"] == 0)
        for col in ["X_end","Y_end"]:
            df[col+"_close"] = broadcast_per_play(closest_def_end, col)

        # NOTE(review): this measures from each player's *current* X/Y to the
        # closest defender's *end* position (not X_end/Y_end); kept as-is
        # because the downstream model was built on it -- confirm it is intended.
        df["dist_close_end"] = ((df["X_end_close"]-df["X"])**2+(df["Y_end_close"]-df["Y"])**2)**0.5
        dist_close_end = team_stats(df[~closest_def_end], "dist_close_end", ["mean"])

        def angle_fun(x_diff,y_diff):
            return math.degrees(math.atan2(y_diff,x_diff))
        angle_fun = np.vectorize(angle_fun)

        # Angle of each player as seen from the rusher; the rusher's own row gets
        # its heading (90 - Dir), lifted by 360 when Dir is in [270, 360].
        rusher_wraps = is_rusher&(df["Dir"].between(270,360))

        y_diff = (df["Y"]-df["Y_rusher"]).values
        x_diff = (df["X"]-df["X_rusher"]).values
        df["angle"] =  angle_fun(x_diff,y_diff)
        df["angle"] = np.where(is_rusher, (90-df["Dir"]), df["angle"])
        df["angle"] = np.where(rusher_wraps, df["angle"]+360, df["angle"])
        angle = team_stats(df[not_rusher], "angle", ["std","min","max"])

        y_diff = (df["Y_end"]-df["Y_end_rusher"]).values
        x_diff = (df["X_end"]-df["X_end_rusher"]).values
        df["angle_end"] = angle_fun(x_diff,y_diff)
        df["angle_end"] = np.where(is_rusher, (90-df["Dir"]),df["angle_end"])
        df["angle_end"] = np.where(rusher_wraps, df["angle_end"]+360, df["angle_end"])
        angle_end = team_stats(df[not_rusher], "angle_end", ["min","max"])

        df["Old"] = (df["Season"] == 2017).astype(int)

        df["dist_yard"] = ((df["YardsFromOwnGoal"]-df["X"])**2+(df["Y_rusher"]-df["Y"])**2)**0.5
        dist_yard = team_stats(df[not_rusher], "dist_yard", ["mean","min"])

        # (feature, flag, aggregations). Flag 2 marks id/target columns that are
        # excluded from categorical encoding below; 'team' in the aggregation
        # list requests separate _def/_off columns.
        raw_feature_configs = [
            ('GameId', 2, ['first']),
            ('PlayId', 2, ['first']),
            ('Yards', 2, ['first']),

            ('PossessionTeam', 1, ['first']),
            ('X', 0, ['team','min','max', 'mean', 'std']),
            ('Y', 0, ['team','min','max', 'mean', 'std']),
            ('S', 0, ['team','max', 'std']), # 'min','mean', 
            ('A', 0, ['team', 'std']), # ,'min','max', 'mean'
            ('Dir', 0, ['team','mean']),
            ('PlayerWeight', 0, ['team','mean']), # , 'max', 'min', 'std'
            ('YardsFromOwnGoal', 0, ['first']),
            ('X_end', 0, ['team','min','max', 'mean', 'std']),
            ('Y_end', 0, ['team','min','max', 'mean', 'std']),
            ('S_horizontal', 0, ['team', 'mean', 'std']),
            ('S_vertical', 0, ['team', 'std']), # mean
            ('X_end_rusher', 2, ['first']),
            ('Y_end_rusher', 2, ['first']),
            ('dist_rusher_travel', 0, ['first']),
            ('A_rusher', 0, ['first']),
            ('S_rusher', 0, ['first']),
            ('S_horizontal_rusher', 0, ['first']),
            ('S_vertical_rusher', 0, ['first']),
            ('A_horizontal_rusher', 0, ['first']),
            ('Old', 0, ['first']),
        ]

        # NOTE(review): the three helpers below are resolved as module-level
        # names (no `self.`), exactly as in the original code -- confirm that
        # module-level definitions with these names exist.
        play_df = aggreate_by_play(df, raw_feature_configs)
        unuse_features = [f[0] for f in raw_feature_configs if f[1] == 2]
        categorical_features = [f for f in play_df.columns if str(play_df[f].dtype) == 'object' and f not in unuse_features]
        encoders = generate_categorical_encoders(play_df, categorical_features)
        encode_categorical_features(play_df, categorical_features, encoders)

        concat_list = [
            play_df, dist_rusher, dist_rusher_end, dist_x, dist_y_end,
            dist_close, dist_close_end, angle, angle_end, dist_yard
        ]
        play_df = pd.concat(concat_list, axis=1)

        play_df["dist_rusher_end_def_mean"] = ((play_df["X_end_mean_def"]-play_df["X_end_rusher"])**2+(play_df["Y_end_mean_def"]-play_df["Y_end_rusher"])**2)**0.5
        play_df["dist_center"] = ((play_df["X_mean_def"]-play_df["X_mean_off"])**2+(play_df["Y_mean_def"]-play_df["Y_mean_off"])**2)**0.5

        play_df["X_span_def"] = (play_df["X_max_def"]-play_df["X_min_def"])
        play_df["X_span_off"] = (play_df["X_max_off"]-play_df["X_min_off"])
        play_df["Y_span_off"] = (play_df["Y_max_off"]-play_df["Y_min_off"])
        play_df["rectangle_off"] = play_df["X_span_off"]*play_df["Y_span_off"]

        useful_raw_features = ['PlayId','YardsFromOwnGoal','angle_max_def', 'X_end_min_def', 'dist_close_end_mean_off','X_std_def', 
                       'S_vertical_std_def','Y_mean_off', 'S_vertical_std_off', 'X_min_off', 'dist_Y_end_mean_off','dist_X_max_off',
                       'Dir_mean_def', 'dist_rusher_end_min_off', 'S_max_def', 'dist_X_min_off',
                       'S_rusher', 'S_std_def', 'A_std_off', 'dist_rusher_end_std_def',
                       'dist_close_min_off', 'PlayerWeight_mean_def', 'dist_rusher_min_def', 'dist_X_max_def', 'angle_min_def',
                       'dist_X_mean_off', 'X_end_std_def', 'angle_end_min_def', 'dist_rusher_end_def_mean',
                       'angle_std_off', 'S_horizontal_mean_off', 'dist_X_min_def', 'dist_close_end_mean_def',
                       'angle_end_max_def', 'dist_rusher_travel', 'dist_rusher_end_mean_def', 'dist_Y_end_mean_def',
                       'S_horizontal_mean_def', 'S_horizontal_rusher', 'A_rusher', 'A_horizontal_rusher', 'dist_rusher_end_min_def',
                       'dist_center', 'X_span_def','X_span_off','rectangle_off',
                       'dist_yard_mean_def', 'dist_yard_min_def', 'dist_yard_min_off', 'Old']
        return play_df[useful_raw_features]

    def aggreate_by_play(self, df, configs):
        """Aggregate player-level rows into one row per ``PlayId``.

        Parameters
        ----------
        df : DataFrame with at least ``PlayId`` and ``IsOnOffense`` columns.
        configs : iterable of ``(feature, flag, agg_funcs)`` tuples. Only the
            first and third items are read here. Features named ``'PlayId'`` or
            absent from ``df`` are skipped.

        Returns
        -------
        DataFrame sorted by ``PlayId`` with one column per requested
        aggregation:
          * without ``'team'`` in ``agg_funcs``: ``feature`` for ``'first'``,
            otherwise ``f'{feature}_{agg_func}'``;
          * with ``'team'``: the same stem with ``_def`` / ``_off`` appended,
            taken from the even / odd positions of the
            ``(PlayId, IsOnOffense)`` aggregation. This relies on every play
            having exactly the two groups IsOnOffense == 0 and == 1.
        """
        df = df.sort_values('PlayId')
        agg_df = pd.DataFrame({'PlayId': list(df['PlayId'].unique())}).sort_values('PlayId')
        # The groupers do not depend on the feature, so build them once rather
        # than on every loop iteration.
        gy = df.groupby('PlayId')
        gy_team = df.groupby(['PlayId','IsOnOffense'])
        # TODO aggregate with a sliding window
        for config in configs:
            feature, agg_funcs = config[0], config[2]
            if feature == 'PlayId' or feature not in df.columns:
                continue
            split_by_team = 'team' in agg_funcs
            for agg_func in agg_funcs:
                if split_by_team and agg_func == 'team':
                    # 'team' is a marker, not an aggregation.
                    continue
                stem = feature if agg_func == 'first' else f'{feature}_{agg_func}'
                if split_by_team:
                    # Aggregate once, then de-interleave defence (even) and offence (odd).
                    team_values = gy_team[feature].agg(agg_func).values
                    agg_df[stem+"_def"] = team_values[::2]
                    agg_df[stem+"_off"] = team_values[1::2]
                else:
                    agg_df[stem] = gy[feature].agg(agg_func).values
        return agg_df

In [6]:
def neg_enhancement(df):
    """Mirror a tracking frame top-to-bottom, in place, and return it.

    ``Y`` is reflected about ``Y = 80 / 3`` (i.e. ``160 / 3 - Y``) and the
    ``Dir`` / ``Orientation`` angles are mapped through
    ``(-(a - 90) % 360 + 90) % 360``.
    """
    df['Y'] = (160 / 3) - df['Y']
    for angle_col in ('Dir', 'Orientation'):
        angles = df[angle_col]
        df[angle_col] = (-(angles - 90) % 360 + 90) % 360
    return df

# Train and Validation

In [7]:
# Load the raw tracking data and derive a second, Y-mirrored copy of it via
# neg_enhancement (which edits the copy in place).
train_df = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
train_fake = train_df.copy()
train_fake = neg_enhancement(train_fake)

timer = GoldenTimer()
categorical_features = ["OffenseFormation","Location"]
# Play-level features for the original frame.
encoders = generate_categorical_encoders(train_df, categorical_features)
feature_factory = FeatureFactory(encoders)
play_emin = feature_factory.make_feats(train_df, show=True)

timer.time("done play")

# Same pipeline on the mirrored frame, with encoders built from that frame.
fake_encoders = generate_categorical_encoders(train_fake, categorical_features)
fake_factory = FeatureFactory(fake_encoders)
fake_emin = fake_factory.make_feats(train_fake, show=True)

timer.time("done fake")

done play 53.260546922683716
done fake 50.87688136100769


In [8]:
# The 59 play-level columns used as model inputs: this list is used to select
# and impute columns of play_emin / fake_emin and is passed to
# Gambler / train_loop in the NN cell.
useful_raw_features = ['angle_max_def', 'X_end_min_def', 'dist_close_end_mean_off','X_std_def', 'S_vertical_std_def',
                       'Y_mean_off', 'S_vertical_std_off', 'X_min_off', 'dist_Y_end_mean_off','dist_X_max_off',
                       'Dir_mean_def', 'dist_rusher_end_min_off', 'S_max_def', 'dist_X_min_off',
                       'S_rusher', 'DefendersInTheBox', 'S_std_def', 'A_std_off', 'dist_rusher_end_std_def',
                       'dist_close_min_off', 'PlayerWeight_mean_def', 'dist_rusher_min_def', 'dist_X_max_def', 'angle_min_def',
                       'dist_X_mean_off', 'X_end_std_def', 'angle_end_min_def', 'dist_rusher_end_def_mean',
                       'angle_std_off', 'Distance', 'S_horizontal_mean_off', 'dist_X_min_def', 'dist_close_end_mean_def',
                       'angle_end_max_def', 'YardsFromOwnGoal', 'dist_rusher_travel', 'dist_rusher_end_mean_def', 'dist_Y_end_mean_def',
                       'S_horizontal_mean_def', 'S_horizontal_rusher', 'A_rusher', 'A_horizontal_rusher', 'dist_rusher_end_min_def','Old',
                       'dist_center', 'X_span_def','X_span_off','rectangle_off',
                       'dist_yard_mean_def', 'dist_yard_min_def', 'dist_yard_min_off',
                       'closest_A','closest3_A','min_unblocked_dist3','max_df_cnt3','min_df_cnt3','free_width3',
                       "closest_next_dist", "closest3_prev_dist"]

In [9]:
# Impute missing model inputs, one frame at a time (the frames are independent).
for feats in (play_emin, fake_emin):
    # A missing min_unblocked_dist3 becomes the sentinel 99; this must happen
    # before the mean pass because the column is part of useful_raw_features.
    feats["min_unblocked_dist3"] = feats["min_unblocked_dist3"].fillna(99)
    column_means = feats[useful_raw_features].mean()
    feats[useful_raw_features] = feats[useful_raw_features].fillna(column_means)
timer.time("done fill")

done fill 0.2860713005065918


In [10]:
# Sanity check: shape and first rows of the model inputs built from the original frame.
print(play_emin[useful_raw_features].shape)
play_emin[useful_raw_features].head()

(23171, 59)


Unnamed: 0,angle_max_def,X_end_min_def,dist_close_end_mean_off,X_std_def,S_vertical_std_def,Y_mean_off,S_vertical_std_off,X_min_off,dist_Y_end_mean_off,dist_X_max_off,Dir_mean_def,dist_rusher_end_min_off,S_max_def,dist_X_min_off,S_rusher,DefendersInTheBox,S_std_def,A_std_off,dist_rusher_end_std_def,dist_close_min_off,PlayerWeight_mean_def,dist_rusher_min_def,dist_X_max_def,angle_min_def,dist_X_mean_off,X_end_std_def,angle_end_min_def,dist_rusher_end_def_mean,angle_std_off,Distance,S_horizontal_mean_off,dist_X_min_def,dist_close_end_mean_def,angle_end_max_def,YardsFromOwnGoal,dist_rusher_travel,dist_rusher_end_mean_def,dist_Y_end_mean_def,S_horizontal_mean_def,S_horizontal_rusher,A_rusher,A_horizontal_rusher,dist_rusher_end_min_def,Old,dist_center,X_span_def,X_span_off,rectangle_off,dist_yard_mean_def,dist_yard_min_def,dist_yard_min_off,closest_A,closest3_A,min_unblocked_dist3,max_df_cnt3,min_df_cnt3,free_width3,closest_next_dist,closest3_prev_dist
0,71.527935,33.933859,5.497838,5.294079,1.465928,24.797879,1.495569,30.24,3.79194,4.17,170.59,1.786392,4.55,-1.01,3.63,6.0,1.186578,0.738631,4.946979,0.580517,233.545455,4.59331,22.12,-56.951875,3.088,5.165283,-74.221902,3.903068,62.814056,2,1.365528,3.69,7.865433,96.95701,35,3.63,6.991214,4.616856,0.172228,3.309436,3.35,3.054163,2.010548,1,4.137117,18.43,5.18,100.233,7.531487,1.838314,1.408013,1.35,0.73,3.329092,2.0,1.0,0.0,3.329092,4.880256
1,71.448793,42.687928,5.60103,5.406292,1.49138,28.570606,1.644804,38.21,6.035394,4.88,169.080909,1.548273,2.79,-0.72,3.06,6.0,0.630436,0.602741,6.989089,0.603738,233.545455,4.287773,23.02,-51.527841,3.231,5.191388,-49.734085,6.985689,72.521277,10,1.165587,4.27,8.713621,75.289672,43,3.06,9.112459,5.196817,0.221386,2.266862,2.41,1.785339,1.599975,1,4.696345,18.75,5.6,123.76,7.625625,0.438292,0.742428,0.55,0.55,11.970149,3.0,2.0,0.0,1.599975,4.287773
2,58.943926,64.476474,9.003005,4.720893,1.244774,31.882424,1.225667,60.49,5.568518,5.5,146.707273,1.538625,4.35,-0.85,5.77,7.0,0.855726,1.013888,5.285258,0.903383,233.545455,4.22167,20.42,-65.122113,3.859,4.575924,-95.139498,6.065421,54.064127,10,1.533994,3.91,11.734751,67.621387,65,5.77,7.875559,6.057012,0.455077,3.857889,2.42,1.61804,1.379718,1,3.723536,16.51,6.35,142.24,7.709866,1.365613,1.028591,1.13,2.15,99.0,2.0,2.0,0.0,3.180789,4.98255
3,49.577496,96.688536,3.306709,0.962418,0.995038,27.533333,1.051438,93.7,3.024926,5.24,197.353636,1.68307,3.28,-0.77,4.45,9.0,0.7367,0.869787,2.358279,0.44,257.454545,4.528002,6.33,-57.851951,3.574,1.296919,-89.998517,1.189177,59.482461,2,1.147356,3.32,3.981284,137.832325,98,4.45,3.889928,3.637683,-0.148942,4.429957,3.2,3.185587,0.994415,1,1.593201,3.01,6.01,70.918,4.088511,1.589277,1.194362,0.83,2.16,2.473951,2.0,1.0,0.0,3.96227,5.704849
4,57.258252,22.872706,6.037462,5.391251,1.332185,27.061818,2.517887,19.51,7.122583,6.2,198.156364,2.253578,3.33,-0.48,3.9,7.0,0.6735,0.565544,6.481055,0.91351,242.454545,4.288088,19.42,-62.631067,3.46,5.9936,-70.21819,6.692979,74.517985,10,0.615832,3.96,9.3395,69.28026,25,3.9,9.595146,6.089184,-0.380134,1.466013,2.53,0.951029,2.669379,1,5.299553,15.46,6.68,177.02,8.080746,0.895879,0.723395,2.04,2.04,10.731683,2.0,1.0,0.0,2.669379,4.288088


In [11]:
# Sanity check: the same view for the features built from the Y-mirrored frame.
print(fake_emin[useful_raw_features].shape)
fake_emin[useful_raw_features].head()

(23171, 59)


Unnamed: 0,angle_max_def,X_end_min_def,dist_close_end_mean_off,X_std_def,S_vertical_std_def,Y_mean_off,S_vertical_std_off,X_min_off,dist_Y_end_mean_off,dist_X_max_off,Dir_mean_def,dist_rusher_end_min_off,S_max_def,dist_X_min_off,S_rusher,DefendersInTheBox,S_std_def,A_std_off,dist_rusher_end_std_def,dist_close_min_off,PlayerWeight_mean_def,dist_rusher_min_def,dist_X_max_def,angle_min_def,dist_X_mean_off,X_end_std_def,angle_end_min_def,dist_rusher_end_def_mean,angle_std_off,Distance,S_horizontal_mean_off,dist_X_min_def,dist_close_end_mean_def,angle_end_max_def,YardsFromOwnGoal,dist_rusher_travel,dist_rusher_end_mean_def,dist_Y_end_mean_def,S_horizontal_mean_def,S_horizontal_rusher,A_rusher,A_horizontal_rusher,dist_rusher_end_min_def,Old,dist_center,X_span_def,X_span_off,rectangle_off,dist_yard_mean_def,dist_yard_min_def,dist_yard_min_off,closest_A,closest3_A,min_unblocked_dist3,max_df_cnt3,min_df_cnt3,free_width3,closest_next_dist,closest3_prev_dist
0,56.951875,33.933859,5.497838,5.294079,1.465928,28.535455,1.495569,30.24,3.79194,4.17,173.046364,1.786392,4.55,-1.01,3.63,6.0,1.186578,0.738631,4.946979,0.580517,233.545455,4.59331,22.12,-71.527935,3.088,5.165283,-96.95701,3.903068,62.814056,2,1.365528,3.69,7.865433,74.221902,35,3.63,6.991214,4.616856,0.172228,3.309436,3.35,3.054163,2.010548,1,4.137117,18.43,5.18,100.233,7.531487,1.838314,1.408013,1.35,0.73,3.329092,2.0,1.0,0.0,3.329092,4.880256
1,51.527841,42.687928,5.60103,5.406292,1.49138,24.762727,1.644804,38.21,6.035394,4.88,207.282727,1.548273,2.79,-0.72,3.06,6.0,0.630436,0.602741,6.989089,0.603738,233.545455,4.287773,23.02,-71.448793,3.231,5.191388,-75.289672,6.985689,72.521277,10,1.165587,4.27,8.713621,49.734085,43,3.06,9.112459,5.196817,0.221386,2.266862,2.41,1.785339,1.599975,1,4.696345,18.75,5.6,123.76,7.625625,0.438292,0.742428,0.55,0.55,11.970149,3.0,2.0,0.0,1.599975,4.287773
2,65.122113,64.476474,9.003005,4.720893,1.244774,21.450909,1.225667,60.49,5.568518,5.5,164.201818,1.538625,4.35,-0.85,5.77,7.0,0.855726,1.013888,5.285258,0.903383,233.545455,4.22167,20.42,-58.943926,3.859,4.575924,-67.621387,6.065421,54.064127,10,1.533994,3.91,11.734751,95.139498,65,5.77,7.875559,6.057012,0.455077,3.857889,2.42,1.61804,1.379718,1,3.723536,16.51,6.35,142.24,7.709866,1.365613,1.028591,1.13,2.15,99.0,2.0,2.0,0.0,3.180789,4.98255
3,57.851951,96.688536,3.306709,0.962418,0.995038,25.8,1.051438,93.7,3.024926,5.24,146.282727,1.68307,3.28,-0.77,4.45,9.0,0.7367,0.869787,2.358279,0.44,257.454545,4.528002,6.33,-49.577496,3.574,1.296919,-137.832325,1.189177,59.482461,2,1.147356,3.32,3.981284,89.998517,98,4.45,3.889928,3.637683,-0.148942,4.429957,3.2,3.185587,0.994415,1,1.593201,3.01,6.01,70.918,4.088511,1.589277,1.194362,0.83,2.16,2.473951,2.0,1.0,0.0,3.96227,5.704849
4,62.631067,22.872706,6.037462,5.391251,1.332185,26.271515,2.517887,19.51,7.122583,6.2,210.934545,2.253578,3.33,-0.48,3.9,7.0,0.6735,0.565544,6.481055,0.91351,242.454545,4.288088,19.42,-57.258252,3.46,5.9936,-69.28026,6.692979,74.517985,10,0.615832,3.96,9.3395,70.21819,25,3.9,9.595146,6.089184,-0.380134,1.466013,2.53,0.951029,2.669379,1,5.299553,15.46,6.68,177.02,8.080746,0.895879,0.723395,2.04,2.04,10.731683,2.0,1.0,0.0,2.669379,4.288088


# NN

In [12]:
# Train the NN ("Gambler") with cross-validation and score its predictions.
final_score = 0
gamblers = []
scores = []
folds = 5

# In "cv" mode only a single iteration is run; any other mode keeps 5.
iterations = 5
cv = "cv"
if cv == "cv":
    iterations = 1

for it in range(iterations):
    print(f'Iteration {it+1}')    
    gambler = Gambler(len(useful_raw_features))
    nn_ids, nn_predictions, nn_targets = train_loop(gambler, play_emin, fake_emin, folds, useful_raw_features, cv=cv)
    # Keep an untouched copy; the loop below edits nn_predictions row by row.
    nn_raw = nn_predictions.copy()
    
    # Clamp every prediction row with the play's YardsFromOwnGoal: the last
    # `temp_yard` entries are forced to 1 and the first `99 - temp_yard` entries
    # to 0. Presumably each row is a cumulative distribution over the 199 yard
    # outcomes -99..+99 -- TODO confirm against train_loop.
    # NOTE(review): `[-temp_yard:]` would select the whole row if temp_yard were
    # 0, and the lookup re-scans play_emin for every prediction; a PlayId-indexed
    # lookup built once would avoid that if nn_ids holds hashable scalars.
    for i in range(len(nn_predictions)):
        temp_yard = int(play_emin.loc[play_emin["PlayId"]==nn_ids[i],"YardsFromOwnGoal"].iloc[0])
        nn_predictions[i][-temp_yard:] = 1
        nn_predictions[i][:99-temp_yard] = 0

    # Mean squared difference over all rows and the 199 columns.
    nn_score = ((nn_predictions - nn_targets) ** 2).sum(axis=1).sum(axis=0) / (199 * nn_targets.shape[0])
    scores.append(nn_score)
    print(f'{it+1} Iteration score', nn_score)    
    
    gamblers.append(gambler)
    # Running average of the per-iteration scores.
    final_score+= nn_score/iterations

print(f'NN Score', final_score)

timer.time("done nn training")

Iteration 1
### Fold 1 ###
tr CRPS 0.014510701523477807 val CRPS 0.01470571788064951
tr CRPS 0.01298908102015213 val CRPS 0.013169605138158822
tr CRPS 0.012575982431812786 val CRPS 0.012780205358036116
tr CRPS 0.012391406951541485 val CRPS 0.012634036501715292
tr CRPS 0.012263648763492537 val CRPS 0.012522870632817082
tr CRPS 0.012194728527817864 val CRPS 0.01247463850239634
tr CRPS 0.012131400029855272 val CRPS 0.012416295107316495
tr CRPS 0.01208757472272294 val CRPS 0.01238627615376649
tr CRPS 0.012082914916983264 val CRPS 0.012418599013811937
tr CRPS 0.012032322895608667 val CRPS 0.012383188465605382
tr CRPS 0.011980588032423469 val CRPS 0.012331943450989839
tr CRPS 0.01197892991105688 val CRPS 0.012337561542637725
tr CRPS 0.011931158998672524 val CRPS 0.01233850471328079
tr CRPS 0.011923266667900759 val CRPS 0.012291480729080028
tr CRPS 0.011898981496310426 val CRPS 0.012292470849536231
tr CRPS 0.011863865908442578 val CRPS 0.01227183134111366
tr CRPS 0.011853531467303545 val CRPS

In [13]:
# NN Score 0.012072372546225158
# done nn training 694.745926618576

# Triple Part

In [14]:
# Force a garbage-collection pass before starting the next stage.
gc.collect()

33

In [15]:
# Re-read the raw data: the second feature pipeline (fe below) starts from the
# untouched CSV rather than from the frame processed in the earlier cells.
train_df = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2020/train.csv")
# Column-name lists. The exclude_* lists are only defined here; NOTE(review):
# their consumers are outside this part of the notebook.
exclude_global_feature = ['Week','Humidity', 'Stadium', 'Location', 'Turf', 'GameWeather',
                          'Temperature',  'WindSpeed', 'WindDirection', 'Season',
                           'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'YardLine_std','YardLine']#
exclude_personal_feature = [ 'NflId',"player's name",'NflIdRusher', 'Orientation', 'Dir', 'PlayerWeight', 'PlayerHeight']
exclude_feature = ['Quarter','Yards', 'PlayId', 'GameId'] + exclude_global_feature + exclude_personal_feature

category_features = ['DefendersInTheBox', 'Position', 'OffenseFormation', 'DefensePersonnel']# 'Quarter', 'Down',,
#player_continuous_feature = ['speed_sin_Dir', 'speed_cos_Dir', 'acc_sin_Dir', 'acc_cos_Dir', 'X', 'Y', 'dx', 'dy']
# Per-player columns that fe() emits for every other player; fe() stores them
# relative to the rusher unless the column is also listed in
# absolute_player_continuous_feature (currently empty).
player_continuous_feature = ['speed_sin_Dir', 'speed_cos_Dir', 'acc_sin_Dir', 'acc_cos_Dir', 'X', 'Y', "x1", "y1", "x2", "y2"]
absolute_player_continuous_feature = []

# TODO: age
player_category_feature = ['Position','JerseyNumber']
# TODO: JerseyNumber

# team features
# Wide-format column names: 10 "friend" slots (teammates of the rusher) and
# 11 "ravel" slots (opponents; presumably a spelling of "rival"). The names are
# generated feature-major (all slots of one feature, then the next feature),
# which is the order in which fe() appends values to each row.
friend_continuous_features = ['friend' + str(i) + '_cont_' + cont_feature for cont_feature in player_continuous_feature for i in range(10)]
friend_category_features = ['friend' + str(i) + '_cat#' + cont_feature for cont_feature in player_category_feature for i in range(10)]
ravel_continuous_features = ['ravel' + str(i) + '_cont_' + cont_feature for cont_feature in player_continuous_feature for i in range(11)]
ravel_category_features = ['ravel' + str(i) + '_cat#' + cont_feature for cont_feature in player_category_feature for i in range(11)]
friends_position = ["friend" + str(i) + "_position" for i in range(10)]
ravel_position = ["ravel" + str(i) + "_position" for i in range(11)]

# Schema (after NflId and PlayId) of the per-play rows built in fe().
team_player_feature = friend_continuous_features + friend_category_features + ravel_continuous_features + ravel_category_features


class GroupDataWrap(object):
    """Column-name access on top of a 2-D numpy array.

    ``wrap["name"]`` returns the column ``np_arr[:, i]`` where ``i`` is the
    position of ``name`` in ``columns``; indexing with anything that is not a
    ``str`` (e.g. a boolean mask) selects rows and returns a new wrapper over
    ``np_arr[col, :]`` with the same columns.

    Parameters
    ----------
    np_arr : 2-D array holding the data.
    columns : sequence of column names, one per array column.
    column_map : optional ``{name: position}`` dict; built from ``columns``
        when omitted.
    """

    def __init__(self, np_arr, columns, column_map=None):
        self.np_arr = np_arr
        self.columns = columns
        if column_map is None:
            column_map = {name: position for position, name in enumerate(columns)}
        self.column_map = column_map

    def __getitem__(self, col):
        if isinstance(col, str):
            return self.np_arr[:, self.column_map[col]]
        # Row selection keeps the columns unchanged, so hand the existing
        # name -> position map to the new wrapper instead of rebuilding it on
        # every slice.
        return GroupDataWrap(self.np_arr[col, :], self.columns, self.column_map)

def fe(df):
    """Turn the raw player-level frame into one wide row per play (rusher row).

    Steps, in order:
      1. Re-standardise ``S`` / ``A`` for ``Season >= 2018`` and fill missing
         values (per-play mean for play-level columns, per-(play, team) mean
         for tracking columns, 0 for everything left).
      2. Rewrite four team abbreviations so the team columns are comparable and
         derive ``team_abbr`` / ``reval_abbr`` (own / opposing team of a row).
      3. Flip plays with ``PlayDirection == 'left'`` (X, Y, Orientation, Dir).
      4. Add velocity/acceleration components, projected positions
         ``x1, y1`` (S) and ``x2, y2`` (2*S), ``is_old``, ``YardLine_std``,
         ``YardLine_std_diff`` and ``Grass``.
      5. For every play, collect the columns in ``player_continuous_feature``
         (relative to the rusher unless listed in
         ``absolute_player_continuous_feature``) and
         ``player_category_feature`` for the rusher's teammates and for the
         opponents, laid out according to ``team_player_feature``.
      6. Merge those wide rows onto the rusher rows on ``NflId`` / ``PlayId``.

    The statements before the global ``fillna(0)`` write into the frame that
    was passed in, so the caller's frame is partially modified.

    The wide schema expects exactly 10 teammates and 11 opponents per play;
    other counts make the final ``DataFrame`` construction fail.
    """
    mean_old, mean_new, std_old, std_new = 2.435519556913685, 2.7570316419451517, 1.2929623410155855, 1.4551321358655551
    df["S"] = np.where(df["Season"] >= 2018, (df["S"]-mean_new) / std_new * std_old + mean_old, df["S"])
    mean_old, mean_new, std_old, std_new = 1.5895792207792045, 1.7819953460610594, 0.8795106467756848, 1.060305722313926
    df["A"] = np.where(df["Season"] >= 2018, (df["A"]-mean_new) / std_new * std_old + mean_old, df["A"])

    df["PlayId"] = df["PlayId"].fillna(0)
    df["Team"] = df["Team"].fillna(0)
    # Play-level columns: fill gaps with the mean of the same play.
    for col in ['Season', 'YardLine', 'Quarter', 'Down','HomeScoreBeforePlay', 'VisitorScoreBeforePlay','NflIdRusher',
     'Week','Temperature', 'Humidity','DefendersInTheBox', 'PlayerWeight']:
        df["temp"] = df.groupby(["PlayId"])[col].transform("mean")
        df[col] = df[col].fillna(df["temp"])

    # Tracking columns: fill gaps with the mean of the same team in the same play.
    for col in ['X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir','Distance']:
        df["temp"] = df.groupby(["PlayId","Team"])[col].transform("mean")
        df[col] = df[col].fillna(df["temp"])

    del df["temp"]

    df = df.fillna(0)

    # None of the replacement codes is itself a key, so the order of the
    # substitutions does not matter.
    abbr_fixes = (("ARI", "ARZ"), ("BAL", "BLT"), ("CLE", "CLV"), ("HOU", "HST"))
    for col in ("VisitorTeamAbbr", "HomeTeamAbbr", "PossessionTeam", "FieldPosition"):
        for old_abbr, new_abbr in abbr_fixes:
            df[col] = np.where(df[col] == old_abbr, new_abbr, df[col])

    # Abbreviation of the row's own team and of the team it plays against.
    df["team_abbr"] = np.where(df["Team"] == "home", df["HomeTeamAbbr"], df["VisitorTeamAbbr"])
    df["reval_abbr"] = np.where(df["Team"] == "away", df["HomeTeamAbbr"], df["VisitorTeamAbbr"])

    df.loc[df['PlayDirection'] == 'left', 'X'] = 120 - df.loc[df['PlayDirection'] == 'left', 'X']
    df.loc[df['PlayDirection'] == 'left', 'Y'] = (160 / 3) - df.loc[df['PlayDirection'] == 'left', 'Y']
    df.loc[df['PlayDirection'] == 'left', 'Orientation'] = np.mod(180 + df.loc[df['PlayDirection'] == 'left', 'Orientation'], 360)
    df.loc[df['PlayDirection'] == 'left', 'Dir'] = np.mod(180 + df.loc[df['PlayDirection'] == 'left', 'Dir'], 360)

    df['speed_sin_Dir'] = df['S'] * np.sin(df['Dir'] / 180 * np.pi)
    df['speed_cos_Dir'] = df['S'] * np.cos(df['Dir'] / 180 * np.pi)

    df['acc_sin_Dir'] = df['A'] * np.sin(df['Dir'] / 180 * np.pi)
    df['acc_cos_Dir'] = df['A'] * np.cos(df['Dir'] / 180 * np.pi)

    # Unit direction vector (dx, dy), then positions projected with S and 2*S.
    df["dxdy_dir"] = (540 - (df["Dir"] + 90)) % 360
    df["dy"] = np.sin(np.deg2rad(df["dxdy_dir"]))
    df["dx"] = np.cos(np.deg2rad(df["dxdy_dir"]))
    df["x1"] = df["X"] + (df["dx"] * (df["S"]))
    df["y1"] = df["Y"] + (df["dy"] * (df["S"]))
    df["x2"] = df["X"] + (df["dx"] * (df["S"]*2))
    df["y2"] = df["Y"] + (df["dy"] * (df["S"]*2))
    df.drop(['dxdy_dir'], inplace=True, axis=1)

    df["is_old"] = np.where(df["Season"]<2018, 1, 0)

    df["YardLine_std"] = np.where(df["PossessionTeam"] == df["FieldPosition"], df["YardLine"]+10, 110-df["YardLine"])
    df['YardLine_std_diff'] = df['X'] - df['YardLine_std']

    # Fix: the labels used to be wrapped as `isin([grass_labels])`, which
    # compares every Turf string against a single tuple and therefore marked
    # no row as grass. Pass the labels themselves.
    grass_labels = ['grass', 'natural grass', 'natural', 'naturall grass']
    df['Grass'] = np.where(df.Turf.str.lower().isin(grass_labels), 1, 0)

    rusher_df = df[df['NflId'] == df['NflIdRusher']]

    def player_fields(players, runner):
        # Values for one group of players: every continuous feature (relative
        # to the rusher unless declared absolute), then every categorical one.
        fields = []
        for cont_feature in player_continuous_feature:
            if cont_feature in absolute_player_continuous_feature:
                fields += players[cont_feature].tolist()
            else:
                fields += (players[cont_feature] - runner[cont_feature][0]).tolist()
        for cat_feature in player_category_feature:
            fields += players[cat_feature].tolist()
        return fields

    # team features
    # fields: [rusher nfl id, PlayId] [friends fields] [reval fields]
    val_rows = []
    val_schema = ['NflId', 'PlayId'] + team_player_feature

    for _, group_df in (df.groupby(['PlayId'])):
        group = GroupDataWrap(group_df.to_numpy(), group_df.columns)

        runner = group[group['NflId'] == group['NflIdRusher']]
        # Teammates: rows of the possessing team other than the rusher.
        friends = group[(group['PossessionTeam'] == group['team_abbr']) & (group['NflId'] != group['NflIdRusher'])]
        # Opponents: rows whose opposing team has possession.
        ravel = group[group['PossessionTeam'] == group['reval_abbr']]

        row_fields = [runner['NflId'][0], runner['PlayId'][0]]
        row_fields += player_fields(friends, runner)
        row_fields += player_fields(ravel, runner)
        val_rows.append(row_fields)

    team_feature_df = pd.DataFrame(val_rows, columns=val_schema)

    df = rusher_df.merge(team_feature_df, on=['NflId', 'PlayId'])

    return df

# Run the second feature pipeline; train_df is rebound to the one-row-per-play
# result, and any NaN left after the merge is replaced by 0.
timer = GoldenTimer()
train_df = fe(train_df)
train_df.fillna(0, inplace=True)
print(train_df.shape)
timer.time("done fe")

(23171, 317)
done fe 78.04456448554993


In [16]:
# Force a garbage-collection pass after feature engineering.
gc.collect()

0

In [17]:
# NOTE(review): the consumers of these constants are not visible near this
# cell; the meanings below are assumptions to confirm.
Y_MAX = 54  # presumably the field width in yards (160 / 3 = 53.3) rounded up
X_MAX = 120  # presumably the field length in yards including end zones (cf. `120 - X` in fe)
X_FORWARD = 20

RESOLUTION = 2

def DropoutDense(units, activation="linear", name=None, dropout_ratio=None):
    """Create a Dense layer, optionally followed by a Dropout layer.

    # Arguments
        units: output width of the Dense layer.
        activation: activation passed to the Dense layer.
        name: layer name. It is given to the Dense layer when no dropout is
            requested, and to the Dropout layer (the last one applied) otherwise.
        dropout_ratio: dropout rate; `None` disables dropout entirely.

    # Returns
        A callable mapping an input tensor to an output tensor: the Dense layer
        itself when `dropout_ratio` is `None`, otherwise a function that applies
        Dense and then Dropout.
    """
    if dropout_ratio is not None:
        dense_layer = Dense(units, activation=activation)
        dropout_layer = Dropout(dropout_ratio, name=name)

        def dense_then_dropout(x):
            return dropout_layer(dense_layer(x))

        return dense_then_dropout

    return Dense(units, name=name, activation=activation)


class NLF_NN(object):
    """Softmax classifier over 199 yardage buckets with pooled per-player embeddings.

    The network consumes one continuous and one categorical input for the play
    itself, plus one continuous and one categorical input per teammate
    ("friend", 10 of them) and per opponent ("ravel", 11 of them). Bucket `i`
    of the 199-way output corresponds to a label value of `i - 99`.

    `train_model` and `predict` read the module-level column lists
    `friend_continuous_features`, `friend_category_features`,
    `ravel_continuous_features` and `ravel_category_features`, which must be
    defined before they are called.

    # Arguments
        continuous_features: continuous column names (the label is filtered out).
        category_features: categorical column names fed to embeddings.
        label: name of the target column.
        epoch: maximum number of training epochs.
        batch_size: training batch size.
        common_embedding_size: default embedding width for categorical columns.
        specified_embedding_map: optional per-column embedding width overrides.
        learning_rate: optimizer learning rate.
        player_continuous_feature: per-player continuous feature names; only
            their count is used, to size the per-player inputs.
        player_category_feature: per-player categorical feature names; only
            their count is used, to size the per-player inputs.
    """

    def __init__(self, continuous_features, category_features, label, epoch, batch_size=2000,
                 common_embedding_size=8, specified_embedding_map=None, learning_rate=0.005,
                 player_continuous_feature=None, player_category_feature=None):
        self.continuous_features = [f for f in continuous_features if f != label]
        self.category_features = category_features
        self.common_embedding_size = common_embedding_size
        self.specified_embedding_map = specified_embedding_map if specified_embedding_map is not None else dict()
        self.learning_rate = learning_rate
        self.label = label
        self.epoch = epoch
        self.batch_size = batch_size
        self.player_continuous_feature = player_continuous_feature
        self.player_category_feature = player_category_feature

    def crps_loss_by_label_tensor(self, y_true, y_pred):
        """Keras metric: mean squared difference between the two cumulative distributions."""
        y_true = K.clip(K.cumsum(y_true, axis=-1), 0, 1)
        y_pred = K.clip(K.cumsum(y_pred, axis=-1), 0, 1)
        return K.mean(K.square(y_true - y_pred))

    def crps_loss_by_label_np(self, y_true, y_pred):
        """NumPy counterpart of the metric; both inputs must already be cumulative."""
        return np.mean(np.square(y_true - y_pred))

    def make_category_embedding(self, cat_input, category_list, cat_size_map, name=None, embedding_size=None):
        """Embed every column of `cat_input` and concatenate the embeddings.

        The width of each embedding is `embedding_size` when given, otherwise the
        per-column override from `specified_embedding_map`, otherwise
        `common_embedding_size`. Vocabulary sizes come from `cat_size_map`.

        NOTE(review): the Embedding layers are instantiated inside a Lambda body.
        Keras does not track weights of layers created inside a Lambda, so these
        embeddings are presumably excluded from `model.trainable_weights` —
        confirm before relying on them being trained or saved.
        """
        def _inner_make_category_embedding(cat_input):
            cat_emb_res = []
            for i in range(len(category_list)):
                cat_field_name = category_list[i]
                emb_size = embedding_size if embedding_size is not None else self.specified_embedding_map.get(cat_field_name, self.common_embedding_size)
                cat_size = cat_size_map[cat_field_name]
                emb_tensor = Embedding(cat_size, emb_size)(cat_input[:,i])
                cat_emb_res.append(emb_tensor)

            return K.concatenate(cat_emb_res, axis=-1)

        return Lambda(_inner_make_category_embedding, name=name)(cat_input)

    def make_team_embedding(self, cont_features, cat_features, cat_size_map):
        """Map each player's continuous input to a 64-wide embedding with a new axis 1.

        The same three layers (Dense 256 -> Dense 128 + BatchNorm -> Dense 64) are
        shared by every player in `cont_features`. `cat_features` and
        `cat_size_map` are currently unused: the categorical path is commented out.

        # Returns
            A list with one tensor per player, each expanded on axis 1 so the
            list can be concatenated along that axis.
        """
        # l1:256 cont dense
        # l2:combined cont and cat emb
        # l3:128 dense

        cont_dense = Dense(256, activation='relu')
        comb_dense_ = Dense(128, activation='relu')
        comb_bn = BatchNormalization()
        comb_dense = lambda x: comb_bn(comb_dense_(x))
        # No dropout_ratio is passed, so this is a plain Dense layer.
        out_dense = DropoutDense(64, activation='relu')

        team_embeddings = []
        for i in range(len(cont_features)):
            hidden1 = cont_dense(cont_features[i])
#            cat_emb = self.make_category_embedding(cat_features[i], self.player_category_feature, cat_size_map, embedding_size=8)
#            combined_feature = Lambda(lambda xs: K.concatenate(xs, axis=-1))([hidden1, cat_emb])
#            hidden2 = comb_dense(combined_feature)
            hidden2 = comb_dense(hidden1)
            hidden3 = out_dense(hidden2)
            team_embeddings.append(Lambda(lambda x: K.expand_dims(x, 1))(hidden3))

        return team_embeddings

    def make_mix_team_embedding(self, cont_friend, cat_friend, cont_ravel, cat_ravel, cat_size_map):
        """Pool the friend and ravel player embeddings into one vector per team.

        Each player gets a scalar score from a sigmoid Dense layer applied to the
        concatenation of a global context (friend mean + ravel mean embedding) and
        the player's own embedding. Scores are exponentiated and normalized within
        each team, then used as weights for a weighted sum of that team's
        embeddings.

        # Returns
            `(friend_res, ravel_res)`: one pooled embedding tensor per team.
        """
        friend_embeddings = self.make_team_embedding(cont_friend, cat_friend, cat_size_map)
        ravel_embeddings = self.make_team_embedding(cont_ravel, cat_ravel, cat_size_map)

        friend_weight_dense = Dense(1, activation='sigmoid')
        ravel_weight_dense = Dense(1, activation='sigmoid')

        friends_weights = []
        ravels_weights = []

        # Global context: sum of the two per-team mean embeddings.
        friend_mean_embedding = Lambda(lambda xs: K.mean(xs, axis=1))(Concatenate(axis=1)(friend_embeddings))
        ravel_mean_embedding = Lambda(lambda xs: K.mean(xs, axis=1))(Concatenate(axis=1)(ravel_embeddings))
        mean_embedding = Add()([friend_mean_embedding, ravel_mean_embedding])

        for palyer_emb in friend_embeddings:
            ca = Lambda(lambda xs: K.concatenate([xs[0], K.squeeze(xs[1], axis=1)]))([mean_embedding, palyer_emb])
            friends_weights.append(friend_weight_dense(ca))

        for palyer_emb in ravel_embeddings:
            ca = Lambda(lambda xs: K.concatenate([xs[0], K.squeeze(xs[1], axis=1)]))([mean_embedding, palyer_emb])
            ravels_weights.append(ravel_weight_dense(ca))

        # exp(...) / sum(exp(...)): normalize the per-player scores within each team.
        friend_concated_weights = Lambda(lambda x: K.exp(x))(Concatenate(axis=-1)(friends_weights))
        friend_concated_weights = Lambda(lambda x: x / K.sum(x, axis=-1, keepdims=True))(friend_concated_weights)

        ravel_concated_weights = Lambda(lambda x: K.exp(x))(Concatenate(axis=-1)(ravels_weights))
        ravel_concated_weights = Lambda(lambda x: x / K.sum(x, axis=-1, keepdims=True))(ravel_concated_weights)

        friend_concated_team_embeddings = Concatenate(axis=1)(friend_embeddings)
        ravel_concated_team_embeddings = Concatenate(axis=1)(ravel_embeddings)

        # Weighted sum via a batched matmul; the mean over the singleton axis 1 just removes it.
        friend_res = Lambda(lambda ps: K.mean(tf.matmul(K.expand_dims(ps[0], axis=1), ps[1]), axis=1))([friend_concated_weights, friend_concated_team_embeddings])
        ravel_res = Lambda(lambda ps: K.mean(tf.matmul(K.expand_dims(ps[0], axis=1), ps[1]), axis=1))([ravel_concated_weights, ravel_concated_team_embeddings])

        return friend_res, ravel_res

    def build_muti_class_model(self, cat_size_map):
        """Assemble the network body and return the clipped 199-way softmax output.

        Requires the input tensors created by `build_model` to exist on `self`.
        """
        # build nn structure of discrete feature
        cat_embedding = self.make_category_embedding(self.category_input, self.category_features, cat_size_map, name="multiclass_cat_embedding")

        friend_team_embedding, ravel_team_embedding = self.make_mix_team_embedding(self.continuous_friend_input, self.category_friend_input, self.continuous_ravel_input, self.category_ravel_input, cat_size_map)

        # The layer names say "sigmoid" but the activations are relu; the names are kept as-is.
        hidden_layer1 = Dense(256, name="sigmoid_hidden1", activation='relu')(self.continuous_input)
        combined_feature = Lambda(lambda xs: K.concatenate(xs, axis=-1), name="combine_cont_dis")([hidden_layer1, friend_team_embedding, ravel_team_embedding, cat_embedding])
        hidden_layer2 = DropoutDense(256, name="sigmoid_hidden2", activation='relu')(combined_feature)

        # final output
        hidden_layer4 = DropoutDense(128, name="sigmoid_hidden4", dropout_ratio=0.5, activation='relu')(hidden_layer2)
        final_output = Dense(199, name="output_dense1", activation='softmax')(hidden_layer4)
        # Keep probabilities strictly positive so the cross-entropy log stays finite.
        final_output = Lambda(lambda x: K.clip(x, 1e-8, 1), name="output_dense1_clip")(final_output)

        return final_output

    def build_model(self, cat_size_map):
        """Create the input tensors, build the network and compile `self.model`.

        Input order is: continuous, categorical, 10 friend continuous, 10 friend
        categorical, 11 ravel continuous, 11 ravel categorical. `_build_inputs`
        produces data in the same order.
        """
        self.continuous_input = Input((len(self.continuous_features), ), dtype=tf.float32, name="continuous_input")
        self.category_input = Input((len(self.category_features), ), dtype=tf.int32, name="category_input")

        # team friend features
        self.continuous_friend_input = [Input((len(self.player_continuous_feature), ),  dtype=tf.float32, name="friend_continuous_input" + str(i)) for i in range(10)]
        self.category_friend_input = [Input((len(self.player_category_feature), ),  dtype=tf.int32, name="friend_category_input" + str(i)) for i in range(10)]

        # team ravel features
        self.continuous_ravel_input = [Input((len(self.player_continuous_feature), ),  dtype=tf.float32, name="ravel_continuous_input" + str(i)) for i in range(11)]
        self.category_ravel_input = [Input((len(self.player_category_feature), ),  dtype=tf.int32, name="ravel_category_input" + str(i)) for i in range(11)]

        # clip value avoid overflow
        self.output = self.build_muti_class_model(cat_size_map)

        self.model = Model(inputs=[self.continuous_input, self.category_input] +
                           self.continuous_friend_input + self.category_friend_input +
                           self.continuous_ravel_input + self.category_ravel_input,
                           outputs=self.output)

        optimizer = adam(lr=self.learning_rate, clipvalue=0.005)

        # compile model
        self.model.compile(optimizer, loss="categorical_crossentropy", metrics=[self.crps_loss_by_label_tensor])

    def make_y_vec(self, y_list):
        """One-hot encode labels into 199 buckets (bucket index = round(label) + 99).

        Labels that fall outside the representable range are clamped to the
        first or last bucket. Without the clamp a value below -99 would produce a
        negative index and silently wrap around to the high end of the vector,
        and a value above 99 would raise IndexError.

        # Returns
            Array of shape `(len(y_list), 199)` with exactly one 1 per row.
        """
        bucket_index = np.asarray(np.round(y_list).astype('int32')) + 99
        bucket_index = np.clip(bucket_index, 0, 198)

        target = np.zeros((len(bucket_index), 199))
        target[np.arange(len(bucket_index)), bucket_index] = 1

        return target

    def _build_inputs(self, df):
        """Slice `df` into the list of model inputs, in the order `build_model` declares them.

        Per-player columns are selected by prefix (`friend<i>_`, `ravel<i>_`) from
        the module-level feature lists.
        """
        def per_player(columns, prefix, n_players):
            return [df[[f for f in columns if f.startswith(prefix + str(i) + "_")]] for i in range(n_players)]

        return ([df[self.continuous_features], df[self.category_features]]
                + per_player(friend_continuous_features, "friend", 10)
                + per_player(friend_category_features, "friend", 10)
                + per_player(ravel_continuous_features, "ravel", 11)
                + per_player(ravel_category_features, "ravel", 11))

    def train_model(self, train_df, validate_df=None):
        """Fit `self.model` on `train_df`.

        # Arguments
            train_df: training frame containing all feature columns and the label.
            validate_df: optional validation frame. When given, training stops
                early after 10 epochs without improvement of the validation CRPS
                metric and the best weights are restored. When `None`, the model
                is trained for the full number of epochs with no validation
                (previously this case crashed on the `None` frame).
        """
        xs = self._build_inputs(train_df)
        y_vector = self.make_y_vec(train_df[self.label])

        callbacks = []
        validation_data = None
        if validate_df is not None:
            validation_data = (self._build_inputs(validate_df), self.make_y_vec(validate_df[self.label]))
            callbacks.append(EarlyStopping(monitor='val_crps_loss_by_label_tensor',
                                           mode='min',
                                           verbose=2,
                                           restore_best_weights=True,
                                           patience=10))

        self.model.fit(xs, y_vector, epochs=self.epoch, batch_size=self.batch_size,
                       callbacks=callbacks, validation_data=validation_data)

    def score(self, validate_df):
        """Predict on `validate_df` and score against the cumulative one-hot labels.

        # Returns
            `(predicted_cdf, target_cdf, crps)`.
        """
        y = self.predict(validate_df)
        y_vector = np.cumsum(self.make_y_vec(validate_df[self.label]), axis=-1)

        return y, y_vector, self.crps_loss_by_label_np(y, y_vector)

    def predict(self, test_df):
        """Return the predicted cumulative distribution, one row of 199 values per input row.

        The softmax output is cumulatively summed and clipped to [0, 1]. The
        result is then overwritten using `test_df['YardLine_std']`: with
        `max_to_go = int(110 - YardLine_std + 0.5)`, every bucket whose label
        value is `>= max_to_go` is set to 1 and every bucket whose label value is
        `<= max_to_go - 110` is set to 0.
        """
        y = self.model.predict(self._build_inputs(test_df))
        y = np.clip(np.cumsum(y, axis=-1), 0, 1)

        # modify by YardLine_std
        max_to_go = np.array(110 - test_df['YardLine_std'] + 0.5).astype(np.int32)
        target_index = np.array([np.arange(199)] * len(max_to_go))
        # target_index - 99 is the label value of each bucket; max_to_go is broadcast per row.
        y[target_index-99 >= np.array([max_to_go]).T] = 1
        y[target_index-99 <= np.array([max_to_go]).T - 110] = 0

        return y


class KFoldByKey(object):
    """K-fold splitter that keeps all rows sharing a key value in the same fold.

    # Arguments
        nfold: number of folds.
        shuffle: when True, the distinct key values are shuffled with the global
            NumPy random state before being cut into folds. When False (the
            default) they are used in the iteration order of `set(df_col)`.
    """

    def __init__(self, nfold, shuffle=False):
        self.nfold = nfold
        self.shuffle = shuffle

    def split(self, df_col):
        """Yield `(train_positions, test_positions)` once per fold.

        # Arguments
            df_col: pandas Series holding the grouping key of every row.

        # Yields
            A pair of `np.where` results (1-tuples of positional index arrays),
            usable with both `DataFrame.iloc` and NumPy indexing.
        """
        key_values = list(set(df_col))
        if self.shuffle:
            # The flag used to be stored but ignored; honor it here.
            np.random.shuffle(key_values)

        # Cut the key list into nfold contiguous, near-equal slices.
        split_step = float(len(key_values)) / self.nfold
        range_point = [int(i * split_step) for i in range(self.nfold)] + [len(key_values)]
        key_splits = [key_values[range_point[i]:range_point[i + 1]] for i in range(self.nfold)]

        for i in range(self.nfold):
            test_keys = key_splits[i]
            train_keys = [key for j, keys in enumerate(key_splits) if j != i for key in keys]

            yield (np.where(df_col.isin(train_keys)), np.where(df_col.isin(test_keys)))


class CategoryFeatureEncoder(object):
    """Integer-encode categorical columns, sharing one vocabulary per column suffix.

    The suffix of a column is the text after its last '#' (the whole name when
    there is no '#'). All columns with the same suffix are encoded with the same
    vocabulary. Known values are mapped to codes starting at 2; values not seen
    during `fit` are mapped to 1.
    """

    def __init__(self, category_features):
        self.category_features = category_features
        self.label_encoders = dict()
        self.cat_size = dict()

    def fit(self, df):
        """Learn one vocabulary per suffix from every column of `df` that shares it."""
        for column in self.category_features:
            suffix = column.split('#')[-1]
            if suffix in self.label_encoders:
                # Vocabulary for this suffix was already built from an earlier column.
                continue

            pooled_values = []
            for other in self.category_features:
                if other.split("#")[-1] == suffix:
                    pooled_values.extend(set(df[other]))
            pooled_values = list(set(pooled_values))

            _, indexer = pd.DataFrame({"x": pooled_values})['x'].factorize()
            # +2 leaves room for the two codes below the first known value.
            self.cat_size[suffix] = indexer.size + 2
            self.label_encoders[suffix] = indexer

    def transform(self, df):
        """Return a copy of `df` with every categorical column replaced by its codes."""
        encoded = df.copy()
        for column in self.category_features:
            indexer = self.label_encoders[column.split('#')[-1]]
            # get_indexer yields -1 for unknown values, which becomes code 1.
            encoded[column] = indexer.get_indexer(encoded[column]) + 2

        return encoded

    def fit_transform(self, df):
        """Fit on `df`, then return its encoded copy."""
        self.fit(df)
        return self.transform(df)

    def get_cat_size_map(self):
        """Return the mapping from suffix to vocabulary size (known values + 2)."""
        return self.cat_size



class DataNegEnhancement(object):
    """Data augmentation that appends a converted copy of every row.

    # Arguments
        f_convert: function taking a DataFrame copy and returning the converted
            frame. Defaults to `neg_enhancement`.
    """

    def __init__(self, f_convert=None):
        if f_convert is None:
            f_convert = self.neg_enhancement
        self.f_convert = f_convert

    def transform(self, df):
        """Return `df` followed by its converted copy; `df` itself is not modified."""
        converted = self.f_convert(df.copy())
        return pd.concat([df, converted])

    @staticmethod
    def neg_enhancement(df):
        """Flip the Y-related columns of `df` in place and return it.

        - `Y`, `y1`, `y2` become `160 / 3 - value`.
        - Columns ending in `_cont_Y`, `_cont_y1`, `_cont_y2` are negated.
        - `Dir` and `Orientation` are mapped through `(-(x - 90) % 360 + 90) % 360`.
        - Every column whose name contains one of the four `*_cos_*` markers is
          negated (once per marker it contains).
        """
        for coord in ('Y', 'y1', 'y2'):
            df[coord] = (160 / 3) - df[coord]

        relative_suffixes = ('_cont_Y', '_cont_y1', '_cont_y2')
        for f in df.columns:
            if f.endswith(relative_suffixes):
                df[f] = -df[f]

        def flip_angle(x):
            return (-(x - 90) % 360 + 90) % 360

        for angle_col in ('Dir', 'Orientation'):
            df[angle_col] = df[angle_col].apply(flip_angle)

        for marker in ['speed_cos_Dir', 'acc_cos_Dir', 'speed_cos_Ori', 'acc_cos_Ori']:
            matching = [f for f in df.columns if marker in f]
            for f in matching:
                df[f] = -df[f]

        return df

In [18]:
def train_base_model(train_df, continuous_features, category_features, label, cat_size_map, epoch=100, batch_size=1024,
                     common_embedding_size=8, specified_embedding_map=None, learning_rate=0.001, nfold=5, last_only=False):
    """Train one `NLF_NN` per fold (folds grouped by `GameId`) and collect out-of-fold predictions.

    Relies on module-level state: the fitted scaler `sc`, and the lists
    `player_continuous_feature` / `player_category_feature`.

    # Arguments
        train_df: encoded training frame; must contain `GameId`, the features and `label`.
        continuous_features: continuous columns, scaled with `sc` inside each fold.
        category_features: categorical columns; those missing from `cat_size_map` are dropped.
        label: target column name.
        cat_size_map: vocabulary size per categorical column.
        epoch, batch_size, common_embedding_size, specified_embedding_map,
        learning_rate: forwarded to `NLF_NN`.
        nfold: number of folds.
        last_only: when True, only the last fold is actually trained.

    # Returns
        `(clfs, oof_predictions, oof_targets)`. `clfs` holds only the models whose
        fold score is below 0.015 and not NaN. The two arrays have one row per
        row of `train_df`; rows belonging to folds that were not trained stay zero.
    """
    errs = []
    clfs = []

    n_rows = train_df.shape[0]
    oof_predictions = np.zeros((n_rows, 199))
    oof_targets = np.zeros((n_rows, 199))
    # Rows that actually received an out-of-fold prediction. With last_only=True
    # most rows never do, and must not be counted in the OOF score.
    scored_rows = np.zeros(n_rows, dtype=bool)

    category_features = [f for f in category_features if f in cat_size_map]

    # fold data
    kf = KFoldByKey(nfold, shuffle=False)
    for cnt, (train_index, test_index) in enumerate(kf.split(train_df["GameId"]), start=1):
        print(f"Training for FOLD {cnt}")
        gc.collect()
        if last_only and cnt != nfold:
            continue

        X_train, X_test = train_df.iloc[train_index].copy(), train_df.iloc[test_index].copy()

        # Augment the training part only; the validation part stays untouched.
        data_enhancement = DataNegEnhancement()
        X_train = data_enhancement.transform(X_train)

        X_train[continuous_features] = sc.transform(X_train[continuous_features])
        X_test[continuous_features] = sc.transform(X_test[continuous_features])

        model = NLF_NN(continuous_features, category_features, label, epoch, batch_size,
                       common_embedding_size=common_embedding_size, specified_embedding_map=specified_embedding_map,
                       learning_rate=learning_rate,
                       player_continuous_feature=player_continuous_feature, player_category_feature=player_category_feature)

        model.build_model(cat_size_map)
        model.train_model(X_train, validate_df=X_test)

        y_pred, y_target, score = model.score(X_test)

        oof_predictions[test_index] = y_pred
        oof_targets[test_index] = y_target
        scored_rows[test_index] = True

        print(f"FOLD {cnt} score:", score)

        # Folds that diverged (NaN) or scored 0.015 or worse are not kept for inference.
        if score < 0.015 and not math.isnan(score):
            errs.append(score)
            clfs.append(model)

    n_scored = int(scored_rows.sum())
    if n_scored:
        squared_error = (oof_predictions[scored_rows] - oof_targets[scored_rows]) ** 2
        oof_score = squared_error.sum(axis=1).sum(axis=0) / (199 * n_scored)
    else:
        oof_score = float('nan')
    mean_score = np.mean(errs) if errs else float('nan')
    std_score = np.std(errs) if errs else float('nan')
    print(f"OOF Score {oof_score}, Mean Score {mean_score}, Std Score {std_score}")
    return clfs, oof_predictions, oof_targets


In [19]:
def is_team_feature(f):
    """Return True when column name `f` starts with 'friend' or 'ravel'."""
    return f.startswith(('friend', 'ravel'))

# Continuous inputs of the play-level network, chosen by hand. (An automatically
# derived list used to be computed here and was overwritten immediately by this
# one; that dead assignment has been removed.)
continuous_features = [
    'X', 'Y', 'S', 'A', 'Dis', 'Distance', #'PlayerWeight',
    'Dir',
    'speed_sin_Dir', 'speed_cos_Dir', 'acc_sin_Dir', 'acc_cos_Dir',
    "x1", "y1", "x2", "y2",
    'YardLine_std_diff', #'Grass'
    "is_old", #'YardLine_std',
]

print(category_features)
print(continuous_features)

# Fit the scaler on the whole training frame; it is reused inside each fold and at inference.
train_df.fillna(0, inplace=True)
sc = StandardScaler()
sc.fit(train_df[continuous_features])

# One shared encoder for the play-level and the per-player categorical columns.
cfe = CategoryFeatureEncoder(category_features+friend_category_features+ravel_category_features)
train_encode_df = cfe.fit_transform(train_df)

timer = GoldenTimer()
clfs, triple_predictions, triple_targets = train_base_model(train_encode_df, continuous_features, category_features, 
                        'Yards', cfe.get_cat_size_map(), 
                        common_embedding_size = 10, specified_embedding_map={"NflId": 256}, last_only=False)
timer.time("done training")

['DefendersInTheBox', 'Position', 'OffenseFormation', 'DefensePersonnel']
['X', 'Y', 'S', 'A', 'Dis', 'Distance', 'Dir', 'speed_sin_Dir', 'speed_cos_Dir', 'acc_sin_Dir', 'acc_cos_Dir', 'x1', 'y1', 'x2', 'y2', 'YardLine_std_diff', 'is_old']
Training for FOLD 1
Train on 36960 samples, validate on 4691 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Restoring model weights from the end of the best epoch
Epoch 00035: early stopping
FOLD 1 score: 0.01134906865170375
Training for FOLD 2
Train on 37264 samples, validate on 4539 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

In [20]:
# Blend the two out-of-fold CDF predictions (33% nn, 67% triple) and report the
# CRPS-style score of each model and of the blend.
avg_pred = (nn_predictions * 33 + triple_predictions * 67) / 100

for _score_name, _preds, _targets in (
    ('nn score', nn_predictions, nn_targets),
    ('triple score', triple_predictions, triple_targets),
    ('avg score', avg_pred, nn_targets),
):
    oof_score = ((_preds - _targets) ** 2).sum(axis=1).sum(axis=0) / (199 * _targets.shape[0])
    print(_score_name, oof_score)

nn score 0.012054316179004394
triple score 0.011828397898187626
avg score 0.01174698434613743


# Test Prediction

In [21]:
def make_predict(test_df, models, exclude_feature=None):
    """Run feature engineering, scaling and encoding on `test_df` and average the models' CDFs.

    Relies on module-level state: `fe`, the fitted scaler `sc`, the fitted
    encoder `cfe` and the `continuous_features` list.

    # Arguments
        test_df: raw frame to predict on; it is copied, not modified.
        models: non-empty sequence of trained models exposing `predict(df)`.
        exclude_feature: unused; kept for backward compatibility.

    # Returns
        Array with one averaged cumulative distribution per row. The first value
        of every row is forced to 0 and the last to 1 (previously only the first
        row was pinned, which is identical when a single play is passed).

    # Raises
        ValueError: if `models` is empty.
    """
    if len(models) == 0:
        raise ValueError("make_predict needs at least one trained model")

    test_df = test_df.copy()
    test_df = fe(test_df)
    test_df.fillna(0, inplace=True)
    test_df[continuous_features] = sc.transform(test_df[continuous_features])
    test_df = cfe.transform(test_df)

    preds_labels = np.mean([model.predict(test_df) for model in models], axis=0)

    # Pin the CDF end points for every predicted row.
    preds_labels[:, 0] = 0
    preds_labels[:, -1] = 1

    return preds_labels

In [22]:
# Competition inference loop: for every play served by the environment, blend
# the two models' cumulative distributions (33% nn, 67% triple) and submit it.
env = nflrush.make_env()
for (test_df, sample_prediction_df) in tqdm_notebook(env.iter_test()):
    test_triple = test_df.copy()
    test_play_emin = feature_factory.make_feats(test_df, False)
    test_play_emin["min_unblocked_dist3"] = test_play_emin["min_unblocked_dist3"].fillna(99)
    test_play_emin = test_play_emin[useful_raw_features].fillna(test_play_emin[useful_raw_features].mean())

    # 999 is the "unknown" sentinel. A missing value arrives from pandas as NaN
    # rather than None, so it has to be detected with pd.isna before int().
    temp_yard = test_play_emin["YardsFromOwnGoal"].iloc[0]
    if pd.isna(temp_yard):
        temp_yard = 999
    else:
        temp_yard = int(temp_yard)

    nn_pred = gambler.predict_final(test_play_emin)
    nn_pred = np.clip(np.cumsum(nn_pred, axis=1), 0, 1)
    nn_pred = nn_pred.ravel()

    y_triple = make_predict(test_triple, clfs).ravel()

    y_pred = (33*nn_pred+67*y_triple)/100
    y_pred = np.clip(y_pred, 0, 1)

    # Overwrite both tails of the CDF from temp_yard: the last temp_yard entries
    # become 1 and the first 99 - temp_yard entries become 0. Only done for
    # 1..99: a value of 0 would turn `[-0:]` into the whole vector, and a value
    # above 99 would make the second slice bound negative.
    if 0 < temp_yard < 100:
        y_pred[-temp_yard:] = 1
        y_pred[:99-temp_yard] = 0

    sample_prediction_df.iloc[0, :] = y_pred

    env.predict(sample_prediction_df)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [23]:
# Write the submission file from the predictions passed to env.predict() in the loop above.
env.write_submission_file()

Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.
