https://www.kaggle.com/coolcoder22/nn-19-features

In [122]:
# IMPORTS 
import numpy as np
import pandas as pd
import sklearn.metrics as mtr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import Callback, EarlyStopping
from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, merge, Add
from keras.layers.embeddings import Embedding
from sklearn.model_selection import KFold,GroupKFold
import warnings
import random as rn
import tensorflow as tf
from keras.models import load_model
import os
import gc
from scipy.spatial import Voronoi
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [123]:
train = pd.read_csv('../input/train.csv', dtype={'WindSpeed': 'object'})

In [124]:
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()

In [125]:
# evaluation metric
def crps(y_true, y_pred):
    y_true = np.clip(np.cumsum(y_true, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    return ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * y_true.shape[0]) 

In [126]:
# author : nlgn
# Link : https://www.kaggle.com/kingychiu/keras-nn-starter-crps-early-stopping
class Metric(Callback):
    def __init__(self, model, callbacks, data):
        super().__init__()
        self.model = model
        self.callbacks = callbacks
        self.data = data

    def on_train_begin(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_begin(logs)

    def on_train_end(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_end(logs)

    def on_epoch_end(self, batch, logs=None):
        X_train, y_train = self.data[0][0], self.data[0][1]
        y_pred = self.model.predict(X_train)
        y_true = np.clip(np.cumsum(y_train, axis=1), 0, 1)
        y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
        tr_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_train[-1].shape[0])
        tr_s = np.round(tr_s, 6)
        logs['tr_CRPS'] = tr_s

        X_valid, y_valid = self.data[1][0], self.data[1][1]

        y_pred = self.model.predict(X_valid)
        y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
        y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
        val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_valid[-1].shape[0])
        val_s = np.round(val_s, 6)
        logs['val_CRPS'] = val_s
        print('tr CRPS', tr_s, 'val CRPS', val_s)

        for callback in self.callbacks:
            callback.on_epoch_end(batch, logs)

In [127]:
# author : ryancaldwell
# Link : https://www.kaggle.com/ryancaldwell/location-eda
def create_features(df, deploy=False):
    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate

    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2

        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]

        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)

        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')

        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X',
                                            'Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y','back_from_scrimmage','back_oriented_down_field','back_moving_down_field']]

        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        player_distance = player_distance.groupby(['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']

        return player_distance

    
#     def defense_features(df):
#         rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X','Y']]
#         rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

#         df = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
#         defense = df[df['Team'] != df['RusherTeam']][['GameId','PlayId','X','Y','RusherX','RusherY','S','A','Dis','Orientation','Dir']]
#         defense['def_dist_to_back'] = defense[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

#         defense = defense.groupby(['GameId','PlayId'])\
#                          .agg({'def_dist_to_back':['min','max','mean','std']})\
#                          .reset_index()
#         defense.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']
        
#         return defense
    
    
    def add_features(df):
        rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X','Y']]
        rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

        df = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
        df['dist_from_rusher'] = df[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)
        df = df.sort_values(by=['GameId','PlayId','dist_from_rusher'])
        
        defense_summary = df[df['Team'] != df['RusherTeam']].groupby(['GameId','PlayId'])\
                         .agg({'dist_from_rusher':['min','max','mean','std']})\
                         .reset_index()
        defense_summary.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']
        
        defense_closest = df[df['Team'] != df['RusherTeam']][::11][['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir']]  # Rusherに最も近いDefense
        defense_closest.columns = ['GameId','PlayId','X_def_closest','Y_def_closest','S_def_closest','A_def_closest','Dis_def_closest','Orientation_def_closest','Dir_def_closest']
        
        offense_closest = df[df['Team'] == df['RusherTeam']][1::11][['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir']]  # Rusherに最も近いOffense
        offense_closest.columns = ['GameId','PlayId','X_off_closest','Y_off_closest','S_off_closest','A_off_closest','Dis_off_closest','Orientation_off_closest','Dir_def_closest']
        
        df_nn = pd.merge(defense_summary, defense_closest, on=['GameId','PlayId'], how='inner')
        df_nn = pd.merge(df_nn, offense_closest, on=['GameId','PlayId'], how='inner')
        
        # Rusherのボロノイ領域を計算して特徴量を作成
        # RusherがPlayIdの中で最も上に並ぶようにソート
        # ソートしてから、ボロノイ領域を計算
        vor = df.groupby(['GameId','PlayId'])[['X','Y']].apply(Voronoi)

        # Rusherのボロノイ領域の面積
        def PolyArea(arr):
            x = arr[:,0]
            y = arr[:,1]
            return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)))

        df_nn['X_maxInVoronoi'] = 0
        df_nn['X_minInVoronoi'] = 0
        df_nn['AreaInVoronoi'] = 0
        for i, playid in enumerate(list(df_nn['PlayId'].unique())):
            df_nn.loc[df_nn['PlayId']==playid, 'X_maxInVoronoi'] = vor.iloc[i].vertices[vor.iloc[i].regions[vor.iloc[i].point_region[0]]][:,0].max()  # Rusherのボロノイ領域の最大のx座標
            df_nn.loc[df_nn['PlayId']==playid, 'X_minInVoronoi'] = vor.iloc[i].vertices[vor.iloc[i].regions[vor.iloc[i].point_region[0]]][:,0].min()  # Rusherのボロノイ領域の最小のx座標
            df_nn.loc[df_nn['PlayId']==playid, 'AreaInVoronoi'] = PolyArea(vor.iloc[i].vertices[vor.iloc[i].regions[vor.iloc[i].point_region[0]]])  # Rusherのボロノイ領域の面積
    
        return df_nn
    
        
    
    def static_features(df):
        static_features = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                                                            'YardLine','Quarter','Down','Distance','DefendersInTheBox']].drop_duplicates()
        static_features['DefendersInTheBox'] = static_features['DefendersInTheBox'].fillna(np.mean(static_features['DefendersInTheBox']))

        return static_features
    
    def split_personnel(s):
        splits = s.split(',')
        for i in range(len(splits)):
            splits[i] = splits[i].strip()

        return splits

    def defense_formation(l):
        dl = 0
        lb = 0
        db = 0
        other = 0

        for position in l:
            sub_string = position.split(' ')
            if sub_string[1] == 'DL':
                dl += int(sub_string[0])
            elif sub_string[1] in ['LB','OL']:
                lb += int(sub_string[0])
            else:
                db += int(sub_string[0])

        counts = (dl,lb,db,other)

        return counts

    def offense_formation(l):
        qb = 0
        rb = 0
        wr = 0
        te = 0
        ol = 0

        sub_total = 0
        qb_listed = False
        for position in l:
            sub_string = position.split(' ')
            pos = sub_string[1]
            cnt = int(sub_string[0])

            if pos == 'QB':
                qb += cnt
                sub_total += cnt
                qb_listed = True
            # Assuming LB is a line backer lined up as full back
            elif pos in ['RB','LB']:
                rb += cnt
                sub_total += cnt
            # Assuming DB is a defensive back and lined up as WR
            elif pos in ['WR','DB']:
                wr += cnt
                sub_total += cnt
            elif pos == 'TE':
                te += cnt
                sub_total += cnt
            # Assuming DL is a defensive lineman lined up as an additional line man
            else:
                ol += cnt
                sub_total += cnt

        # If not all 11 players were noted at given positions we need to make some assumptions
        # I will assume if a QB is not listed then there was 1 QB on the play
        # If a QB is listed then I'm going to assume the rest of the positions are at OL
        # This might be flawed but it looks like RB, TE and WR are always listed in the personnel
        if sub_total < 11:
            diff = 11 - sub_total
            if not qb_listed:
                qb += 1
                diff -= 1
            ol += diff

        counts = (qb,rb,wr,te,ol)

        return counts
    
    def personnel_features(df):
        personnel = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()
        personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: split_personnel(x))
        personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: defense_formation(x))
        personnel['num_DL'] = personnel['DefensePersonnel'].apply(lambda x: x[0])
        personnel['num_LB'] = personnel['DefensePersonnel'].apply(lambda x: x[1])
        personnel['num_DB'] = personnel['DefensePersonnel'].apply(lambda x: x[2])

        personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: split_personnel(x))
        personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: offense_formation(x))
        personnel['num_QB'] = personnel['OffensePersonnel'].apply(lambda x: x[0])
        personnel['num_RB'] = personnel['OffensePersonnel'].apply(lambda x: x[1])
        personnel['num_WR'] = personnel['OffensePersonnel'].apply(lambda x: x[2])
        personnel['num_TE'] = personnel['OffensePersonnel'].apply(lambda x: x[3])
        personnel['num_OL'] = personnel['OffensePersonnel'].apply(lambda x: x[4])

        # Let's create some features to specify if the OL is covered
        personnel['OL_diff'] = personnel['num_OL'] - personnel['num_DL']
        personnel['OL_TE_diff'] = (personnel['num_OL'] + personnel['num_TE']) - personnel['num_DL']
        # Let's create a feature to specify if the defense is preventing the run
        # Let's just assume 7 or more DL and LB is run prevention
        personnel['run_def'] = (personnel['num_DL'] + personnel['num_LB'] > 6).astype(int)

        personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1, inplace=True)
        
        return personnel

    def combine_features(relative_to_back, static, personnel, add_features, deploy=deploy):
        df = pd.merge(relative_to_back,static,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,personnel,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,add_features,on=['GameId','PlayId'],how='inner')

        if not deploy:
            df = pd.merge(df, outcomes, on=['GameId','PlayId'], how='inner')

        return df
    

    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    static_feats = static_features(df)
    personnel = personnel_features(df)
    
    add_features = add_features(df)
    
    basetable = combine_features(rel_back, static_feats, personnel, add_features, deploy=deploy)

#     display('rel_back', rel_back.head())
#     display('static_feats', static_feats.head())
#     display('personnel', personnel.head())
#     display('add_features', add_features.head())
    
    del rel_back, static_feats, personnel, add_features
    gc.collect()
    return basetable

In [128]:
train_basetable = create_features(train, False)

In [139]:
X = train_basetable.copy()
yards = X.Yards
X = X.drop(['Yards'], axis=1)

y = np.zeros((yards.shape[0], 199))
for idx, target in enumerate(list(yards)):
    y[idx][99 + target] = 1

In [140]:
def process_two(t_):
    t_['fe1'] = pd.Series(np.sqrt(np.absolute(np.square(t_.X.values) + np.square(t_.Y.values))))
    t_['fe5'] = np.square(t_['S'].values) + 2 * t_['A'].values * t_['Dis'].values  # N
    t_['fe7'] = np.arccos(np.clip(t_['X'].values / t_['Y'].values, -1, 1))  # N
    t_['fe8'] = t_['S'].values / np.clip(t_['fe1'].values, 0.6, None)
    radian_angle = (90 - t_['Dir']) * np.pi / 180.0
    t_['fe10'] = np.abs(t_['S'] * np.cos(radian_angle))
    t_['fe11'] = np.abs(t_['S'] * np.sin(radian_angle))
    return t_


X = process_two(X)

important = ['back_from_scrimmage', 'min_dist', 'max_dist', 'mean_dist', 'std_dist',
       'def_min_dist', 'def_max_dist', 'def_mean_dist', 'def_std_dist', 'X',
       'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir', 'YardLine']

cat = ['back_oriented_down_field', 'back_moving_down_field']

# num = ['back_from_scrimmage', 'min_dist', 'max_dist', 'mean_dist', 'std_dist', 'def_min_dist', 'def_max_dist', 'def_mean_dist', 'def_std_dist',
#        'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir', 'YardLine', 'Distance'] + ['fe1', 'fe5', 'fe7','fe8', 'fe10', 'fe11']

num = [col for col in X.columns if col not in cat]


print(len(cat))
print(len(num))

2
57


In [141]:
scaler = StandardScaler()
X[num] = scaler.fit_transform(X[num])

In [142]:
X

Unnamed: 0,GameId,PlayId,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,X,Y,S,A,Dis,Orientation,Dir,YardLine,Quarter,Down,Distance,DefendersInTheBox,num_DL,num_LB,num_DB,num_QB,num_RB,num_WR,num_TE,num_OL,OL_diff,OL_TE_diff,run_def,def_min_dist,def_max_dist,def_mean_dist,def_std_dist,X_def_closest,Y_def_closest,S_def_closest,A_def_closest,Dis_def_closest,Orientation_def_closest,Dir_def_closest_x,X_off_closest,Y_off_closest,S_off_closest,A_off_closest,Dis_off_closest,Orientation_off_closest,Dir_def_closest_y,X_maxInVoronoi,X_minInVoronoi,AreaInVoronoi,fe1,fe5,fe7,fe8,fe10,fe11
0,-1.008048,-1.008048,-1.253018,1,0,0.153870,0.226343,-0.508223,-0.622002,-0.466483,1.063564,-0.535149,0.780300,-0.631894,0.635895,0.327572,-0.515170,-1.315124,2.191672,-1.667696,-0.951527,-1.865015,0.038883,2.111368,-0.072453,-0.467382,0.886008,-0.657778,-0.279712,1.749984,1.038440,-0.924028,0.043930,0.231670,-0.409466,-0.435471,-0.468532,1.429804,-1.557216,-0.178633,-1.889929,1.165374,-1.044877,-0.513752,0.860870,-1.470448,-0.547559,-1.927316,-0.144395,0.024157,-0.516105,0.00657,-0.00657,-0.423973,-0.557856,-0.372904,-0.246612,0.384871,-0.665124
1,-1.008048,-1.008048,-0.911788,0,0,-0.689447,0.406484,-0.056590,-0.001102,-0.162534,0.098764,-1.032635,-0.213714,-1.002146,0.127400,-0.845337,-0.197209,-1.315124,-0.771350,0.440227,-0.951527,-1.865015,0.038883,2.111368,-0.072453,-0.467382,0.886008,-0.657778,-0.279712,1.749984,1.038440,-0.924028,-0.268025,0.412004,-0.042434,-0.029163,-0.156973,0.183968,-0.880858,-1.112813,-0.927737,1.383125,1.030912,-0.198705,0.221287,-1.163991,-0.947294,-0.677786,0.081243,-0.284576,-0.225622,0.00657,-0.00657,-0.214948,-1.003626,-0.372904,-0.678778,-0.513070,-0.333621
2,-1.008048,-1.008048,-1.348989,1,0,0.406039,-0.272615,-0.161567,-0.820498,0.724382,-2.205879,1.332603,-0.203139,1.404492,0.856744,0.747250,0.677184,-1.315124,-0.771350,0.440227,0.057489,-1.865015,0.038883,2.111368,-0.072453,-0.467382,0.886008,-0.657778,-0.279712,1.749984,1.038440,-0.924028,-0.335517,-0.267822,-0.307555,-0.639468,0.719906,-1.558735,0.211721,-0.435532,0.435370,-2.325364,-0.724604,0.682003,-2.007659,0.381914,-1.382300,0.571745,1.197738,-0.209506,0.652970,0.00657,-0.00657,0.590472,1.375561,-0.372904,-0.047633,0.857240,0.980201
3,-1.008048,-1.008048,-1.487614,0,0,-0.527794,-3.501886,-2.493579,-3.098202,2.035562,-0.416560,0.180531,0.621681,0.108610,0.382900,-0.196584,1.988773,-1.315124,0.710161,-1.667696,2.075520,0.665020,1.244971,-2.281028,-0.072453,1.991120,-2.898730,0.881754,3.229332,0.495145,1.038440,1.082218,-0.022750,-3.500548,-2.730224,-3.240833,2.016024,-1.017812,-0.967570,-0.785850,0.275004,-2.069063,0.025571,1.994764,-0.619764,-0.040316,-1.676223,0.071933,-1.194044,0.691335,1.957084,0.00657,-0.00657,2.105144,0.105402,-0.372904,-1.036407,1.349947,-1.293835
4,-1.008048,-1.008048,0.090576,0,0,-1.060177,-0.128348,0.384018,0.009647,-0.912118,0.087312,-0.299498,-0.086819,-0.076516,-1.073179,1.098099,-0.912622,-1.315124,-0.771350,0.440227,0.057489,-0.599997,-1.167206,2.111368,-0.072453,-0.467382,-1.637151,2.421286,-0.279712,0.495145,2.111951,-0.924028,-0.267704,-0.123401,0.469439,0.024478,-0.907169,0.198653,0.367803,0.627098,0.675918,-0.418021,-0.661052,-0.937899,0.150933,-1.327435,0.451779,-1.427504,1.598999,-3.106293,-0.969457,0.00657,-0.00657,-0.913961,-0.396045,-0.372904,0.445731,-1.202821,0.582461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23166,1.057783,1.057784,-0.378616,0,0,-0.966632,-0.681585,-1.549841,-0.929412,0.688367,1.561711,-0.220948,0.812024,-0.539331,-0.673543,-0.457603,0.677184,1.307014,-0.771350,0.440227,1.066504,0.665020,0.038883,-0.816896,-0.072453,1.991120,-0.375572,-0.657778,-0.279712,-0.759695,-1.108583,1.082218,-1.641734,-0.677229,-1.426717,-0.638044,0.619352,1.762680,0.731996,0.253426,0.355187,0.213383,0.449131,0.671725,1.577202,0.572598,-1.382300,0.509268,1.629807,0.809957,0.621273,0.00657,-0.00657,0.753230,-0.290271,-0.372904,-0.764276,0.759034,-0.730831
23167,1.057783,1.057784,-0.986432,0,0,-0.228882,1.137186,0.908062,1.605327,-0.872145,1.195258,-0.055120,0.357315,-0.446768,-0.147409,0.975620,-0.912622,1.307014,-0.771350,0.440227,-0.951527,0.665020,-1.167206,0.647236,-0.072453,-0.467382,0.886008,-0.657778,-0.279712,-0.759695,-1.108583,-0.924028,-1.777933,1.143488,0.956595,1.501639,-0.937256,1.344138,-0.091773,-0.330437,-0.206092,0.237228,0.069615,-0.917344,1.142286,-1.381916,-0.712156,-1.365027,-1.213442,1.039608,-0.956603,0.00657,-0.00657,-0.760925,-0.169784,-0.372904,0.416305,-0.718985,0.606695
23168,1.057783,1.057784,-0.911788,0,0,-0.709724,-0.648190,-0.328217,-0.044066,-0.518725,0.682797,0.372543,1.129262,0.016047,-0.772173,-0.960934,-0.554916,1.307014,2.191672,-1.931187,0.057489,0.665020,-1.167206,0.647236,-0.072453,-0.467382,0.886008,-0.657778,-0.279712,-0.759695,-1.108583,-0.924028,-0.974333,-0.643798,-1.034124,-0.443226,-0.554043,0.205996,0.818709,-0.237019,0.916466,0.787015,0.947373,-0.544980,0.963203,-1.368296,-0.147824,-1.365027,-0.546797,-1.517788,-0.580376,0.00657,-0.00657,-0.507313,0.329623,-0.372904,0.390583,0.186528,0.521988
23169,1.057783,1.057784,-1.189037,0,0,-0.955362,-0.748900,-0.384506,0.200232,1.114214,-0.877488,-0.011481,-0.192565,-0.354205,-0.738461,-0.961111,1.074635,1.307014,-0.771350,0.440227,0.057489,0.665020,-1.167206,0.647236,-0.072453,-0.467382,0.886008,-0.657778,-0.279712,-0.759695,-1.108583,-0.924028,-0.469055,-0.744617,-0.874498,-0.241379,1.098765,-1.157326,0.575913,2.191850,0.836283,0.799935,1.191048,1.082432,-0.933160,-1.729234,-1.593925,-1.677410,-0.710534,-1.723084,1.033974,0.00657,-0.00657,1.073800,-0.164770,-0.372904,-0.804620,-0.063816,0.327826


In [143]:
def model_396_1():
    inputs = []
    embeddings = []
    for i in cat:
        input_ = Input(shape=(1,))
        embedding = Embedding(int(np.absolute(X[i]).max() + 1), 10, input_length=1)(input_)
        embedding = Reshape(target_shape=(10,))(embedding)
        inputs.append(input_)
        embeddings.append(embedding)
    input_numeric = Input(shape=(len(num),))
    embedding_numeric = Dense(512, activation='relu')(input_numeric) 
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    x = Concatenate()(embeddings)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(199, activation='softmax')(x)
    model = Model(inputs, output)
    return model


n_splits = 5
kf = GroupKFold(n_splits=n_splits)
score = []
for i_369, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])):
    print(f'Fold : {i_369}')
    X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]
    X_train = [np.absolute(X_train[i]) for i in cat] + [X_train[num]]
    X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]]
    model = model_396_1()
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=[])
    es = EarlyStopping(monitor='val_CRPS', 
                   mode='min',
                   restore_best_weights=True, 
                   verbose=2, 
                   patience=5)
    es.set_model(model)
    metric = Metric(model, [es], [(X_train,y_train), (X_val,y_val)])
    for i in range(1):
        model.fit(X_train, y_train, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=64, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=128, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=256, verbose=False)
    model.fit(X_train, y_train, callbacks=[metric], epochs=100, batch_size=1024, verbose=False)
    score_ = crps(y_val, model.predict(X_val))
    model.save(f'keras_369_{i_369}.h5')
    print(score_)
    score.append(score_)
    

print(np.mean(score))

Fold : 0
tr CRPS 0.012378 val CRPS 0.012451
tr CRPS 0.012349 val CRPS 0.012444
tr CRPS 0.012312 val CRPS 0.012426
tr CRPS 0.012296 val CRPS 0.012455
tr CRPS 0.01228 val CRPS 0.012438
tr CRPS 0.012232 val CRPS 0.012425
tr CRPS 0.012194 val CRPS 0.012415
tr CRPS 0.012155 val CRPS 0.012418
tr CRPS 0.012125 val CRPS 0.012388
tr CRPS 0.012106 val CRPS 0.01239
tr CRPS 0.012063 val CRPS 0.012394
tr CRPS 0.012043 val CRPS 0.012392
tr CRPS 0.012024 val CRPS 0.012433
tr CRPS 0.011989 val CRPS 0.012383
tr CRPS 0.011949 val CRPS 0.012397
tr CRPS 0.011883 val CRPS 0.012404
tr CRPS 0.011869 val CRPS 0.012436
tr CRPS 0.011827 val CRPS 0.012405
tr CRPS 0.011829 val CRPS 0.01244
Restoring model weights from the end of the best epoch
Epoch 00019: early stopping
0.012383361010701699
Fold : 1
tr CRPS 0.01231 val CRPS 0.012872
tr CRPS 0.012262 val CRPS 0.012849
tr CRPS 0.012263 val CRPS 0.012858
tr CRPS 0.012208 val CRPS 0.012844
tr CRPS 0.012178 val CRPS 0.012833
tr CRPS 0.012145 val CRPS 0.012822
tr CRPS

In [134]:
models = []
for i in range(n_splits):
    models.append(load_model(f'keras_369_{i}.h5'))
    
for (test_df, sample_prediction_df) in iter_test:
    basetable = create_features(test_df, deploy=True)
    basetable = process_two(basetable)
    basetable[num] = scaler.transform(basetable[num])
    test_ = [np.absolute(basetable[i]) for i in cat] + [basetable[num]]
    
    y_pred = np.mean([model.predict(test_) for model in models], axis=0)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]
    
    preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)
    env.predict(preds_df)
    
env.write_submission_file()

NameError: name 'iter_test' is not defined

### GBDT

In [147]:
params = {
    'objective':'regression',
    #'num_class': 199,
    "boosting": "gbdt",
    #"metric": 'mae',
    'learning_rate': 0.01,
    'num_leaves': 63,
    'max_depth': -1,
    # 'min_data_in_leaf': 30, 
    # "min_child_samples": 20,
    # "feature_fraction": 0.9,
    # "bagging_freq": 1,
    # "bagging_fraction": 0.9 ,
    # "bagging_seed": 11,
    # "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": -1,
    "random_state": 16
}

In [155]:
y.shape

(23171, 199)

In [150]:
import lightgbm as lgb
from sklearn.model_selection import RepeatedKFold, KFold, StratifiedKFold
folds = KFold(n_splits=5, shuffle=True, random_state=16)

models = []
oof = np.zeros((509762//22, 199))
feature_importance_df = pd.DataFrame()
for fold_, (tr_idx, val_idx) in enumerate(folds.split(X, y)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    
    tr_data = lgb.Dataset(X[tr_idx], label=y[tr_idx])
    val_data = lgb.Dataset(X[val_idx], label=y[val_idx])

    num_round = 10000
    model = lgb.train(params, tr_data, num_round, valid_sets = [tr_data, val_data], verbose_eval=100, early_stopping_rounds=100)
    oof[val_idx, :] = model.predict(X[val_idx], num_iteration=model.best_iteration)
    
    models.append(model)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = model.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

-
Fold 1


KeyError: "None of [Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,\n               11,\n            ...\n            23159, 23161, 23162, 23163, 23164, 23165, 23166, 23168, 23169,\n            23170],\n           dtype='int64', length=18536)] are in the [columns]"

### Graph features

In [34]:
train

Unnamed: 0,GameId,PlayId,X,Y,S,A,Dis,Orientation,Dir,dist_from_rusher,IsOffense
0,2017090700,20170907000118,73.91,34.84,1.69,1.13,0.40,81.99,177.18,6.480872,0
1,2017090700,20170907000118,74.67,32.64,0.42,1.35,0.01,27.61,198.70,4.593310,0
2,2017090700,20170907000118,74.00,33.20,1.22,0.59,0.31,3.01,202.73,5.448982,0
3,2017090700,20170907000118,71.46,27.70,0.42,0.54,0.02,359.77,105.64,7.820038,0
4,2017090700,20170907000118,69.32,35.42,1.82,2.43,0.16,12.63,164.31,10.622476,0
...,...,...,...,...,...,...,...,...,...,...,...
509757,2018123015,20181230154157,86.77,24.20,2.14,2.12,0.22,44.55,15.31,6.345345,1
509758,2018123015,20181230154157,86.76,27.18,1.16,0.66,0.11,53.63,42.80,6.017516,1
509759,2018123015,20181230154157,87.26,27.05,2.59,1.18,0.26,3.96,21.12,6.497815,1
509760,2018123015,20181230154157,84.57,24.37,4.36,1.79,0.47,148.08,183.34,4.258321,1


In [36]:
train = train.sort_values(by=['GameId','PlayId','dist_from_rusher'])
train.head(200)

Unnamed: 0,GameId,PlayId,X,Y,S,A,Dis,Orientation,Dir,dist_from_rusher,IsOffense
18,2017090700,20170907000118,78.75,30.53,3.63,3.35,0.38,161.98,245.74,0.000000,1
15,2017090700,20170907000118,79.76,29.49,0.84,1.22,0.00,192.18,110.86,1.449724,1
13,2017090700,20170907000118,75.43,32.41,1.50,1.36,0.32,207.08,222.76,3.815337,1
19,2017090700,20170907000118,74.60,31.88,1.86,1.51,0.17,218.49,267.32,4.364058,1
20,2017090700,20170907000118,74.58,29.12,1.00,0.69,0.22,169.36,242.05,4.401931,1
...,...,...,...,...,...,...,...,...,...,...,...
509750,2018123015,20181230154157,87.91,22.38,4.52,2.55,0.44,338.09,306.72,8.143279,0
509749,2018123015,20181230154157,87.82,21.82,2.90,2.67,0.29,295.36,40.07,8.354717,0
509754,2018123015,20181230154157,87.34,41.37,3.96,1.48,0.39,139.94,136.00,16.382063,1
509745,2018123015,20181230154157,99.14,29.81,1.97,1.40,0.19,256.20,248.21,18.663526,0


In [48]:
import networkx as nx
graphs = []

for i, playid in enumerate(train['PlayId'].unique()):
    tmp = train[train.PlayId==playid]
    g = nx.Graph()
    g.add_nodes_from(range(22))
    g.ndata['features'] = np.array(tmp[['X','Y','Dir','Orientation','S','A','IsOffense']])
    for n in range(21):
        g.add_edge(0, n+1, weight=tmp.iloc[n]['dist_from_rusher'].values)

    graphs.append(g)

AttributeError: 'Graph' object has no attribute 'ndata'

In [43]:
g

DGLGraph(num_nodes=22, num_edges=21,
         ndata_schemes={'features': Scheme(shape=(7,), dtype=torch.float64)}
         edata_schemes={'distance': Scheme(shape=(), dtype=torch.float64)})

In [44]:
len(graphs)

23171

### kernelのkerasモデル

In [10]:
def model_396_1():
    inputs = []
    embeddings = []
    for i in cat:
        input_ = Input(shape=(1,))
        embedding = Embedding(int(np.absolute(X[i]).max() + 1), 10, input_length=1)(input_)
        embedding = Reshape(target_shape=(10,))(embedding)
        inputs.append(input_)
        embeddings.append(embedding)
    input_numeric = Input(shape=(len(num),))
    embedding_numeric = Dense(512, activation='relu')(input_numeric) 
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    x = Concatenate()(embeddings)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(199, activation='softmax')(x)
    model = Model(inputs, output)
    return model

In [11]:
n_splits = 5
kf = GroupKFold(n_splits=n_splits)
score = []
for i_369, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])):
    print(f'Fold : {i_369}')
    X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]
    X_train = [np.absolute(X_train[i]) for i in cat] + [X_train[num]]
    X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]]
    model = model_396_1()
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=[])
    es = EarlyStopping(monitor='val_CRPS', 
                   mode='min',
                   restore_best_weights=True, 
                   verbose=2, 
                   patience=5)
    es.set_model(model)
    metric = Metric(model, [es], [(X_train,y_train), (X_val,y_val)])
    for i in range(1):
        model.fit(X_train, y_train, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=64, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=128, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=256, verbose=False)
    model.fit(X_train, y_train, callbacks=[metric], epochs=100, batch_size=1024, verbose=False)
    score_ = crps(y_val, model.predict(X_val))
    model.save(f'keras_369_{i_369}.h5')
    print(score_)
    score.append(score_)

Fold : 0
tr CRPS 0.012911 val CRPS 0.012699
tr CRPS 0.012891 val CRPS 0.012692
tr CRPS 0.01287 val CRPS 0.012686
tr CRPS 0.012847 val CRPS 0.012665
tr CRPS 0.012833 val CRPS 0.012651
tr CRPS 0.012821 val CRPS 0.012646
tr CRPS 0.012802 val CRPS 0.012645
tr CRPS 0.012792 val CRPS 0.01265
tr CRPS 0.012764 val CRPS 0.012613
tr CRPS 0.012753 val CRPS 0.012619
tr CRPS 0.012737 val CRPS 0.012626
tr CRPS 0.01272 val CRPS 0.012607
tr CRPS 0.012723 val CRPS 0.012615
tr CRPS 0.012704 val CRPS 0.012603
tr CRPS 0.012693 val CRPS 0.012586
tr CRPS 0.012682 val CRPS 0.012584
tr CRPS 0.012648 val CRPS 0.012556
tr CRPS 0.012622 val CRPS 0.012564
tr CRPS 0.012603 val CRPS 0.01256
tr CRPS 0.012612 val CRPS 0.012576
tr CRPS 0.012571 val CRPS 0.012547
tr CRPS 0.012572 val CRPS 0.012549
tr CRPS 0.012548 val CRPS 0.012556
tr CRPS 0.012536 val CRPS 0.012554
tr CRPS 0.012499 val CRPS 0.012548
tr CRPS 0.012487 val CRPS 0.012549
Restoring model weights from the end of the best epoch
Epoch 00026: early stopping
0.

KeyboardInterrupt: 

In [None]:
print(np.mean(score))

In [None]:
models = []
for i in range(n_splits):
    models.append(load_model(f'keras_369_{i}.h5'))
    
for (test_df, sample_prediction_df) in iter_test:
    basetable = create_features(test_df, deploy=True)
    basetable = process_two(basetable)
    basetable[num] = scaler.transform(basetable[num])
    test_ = [np.absolute(basetable[i]) for i in cat] + [basetable[num]]
    
    y_pred = np.mean([model.predict(test_) for model in models], axis=0)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]
    
    preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)