In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt  

from timeit import default_timer as timer
from sklearn import preprocessing

#!pip install ultimate
#from ultimate.mlp import MLP 

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

import gc, sys
gc.enable()


Using TensorFlow backend.


In [2]:
def state(message, start=True, time=0):
    """Print a progress line for a named processing step.

    Parameters
    ----------
    message : name of the step, interpolated into the printed text.
    start : when truthy, announce that the step is beginning; otherwise
        report that it finished.
    time : elapsed seconds, shown rounded to three decimals in the
        "finished" line (ignored when ``start`` is truthy).
    """
    if not start:
        elapsed = round(time , 3)
        print(f'Working on {message} took ({elapsed}) Sec \n')
        return
    print(f'Working on {message} ... ')

In [3]:
def _group_stat_with_rank(df, features, stat):
    """Aggregate ``features`` per (matchId, groupId) and rank the result within each match.

    Returns ``(agg, agg_rank)``, both with ``matchId`` / ``groupId`` as regular
    columns: ``agg`` holds the ``stat`` aggregate of every feature for each
    group, ``agg_rank`` the percentile rank (``rank(pct=True)``) of those
    aggregated values among the groups of the same match.
    """
    agg = df.groupby(['matchId', 'groupId'])[features].agg(stat)
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    return agg.reset_index(), agg_rank


def _merge_group_stat(df_out, agg, agg_rank, stat):
    """Left-merge one group statistic and its in-match rank onto ``df_out``.

    The first merge brings the aggregates in under the plain feature names;
    the second merge then collides on those names, so the aggregates end up as
    ``<feature>_<stat>`` and the ranks as ``<feature>_<stat>_rank``.
    """
    keys = ['matchId', 'groupId']
    df_out = df_out.merge(agg, suffixes=["", ""], how='left', on=keys)
    return df_out.merge(agg_rank, suffixes=[f"_{stat}", f"_{stat}_rank"], how='left', on=keys)


def feature_engineering(is_train=True,
                        train_path='../input/train_V2.csv',
                        test_path='../input/test_V2.csv'):
    """Load a PUBG CSV and build the group/match-level feature matrix.

    Parameters
    ----------
    is_train : when True, read ``train_path``, keep only rows with
        ``maxPlace > 1`` and also return the target; otherwise read
        ``test_path``.
    train_path, test_path : CSV locations; the defaults are the paths that
        were previously hard-coded.

    Returns
    -------
    X : float64 ndarray. For training data there is one row per
        (matchId, groupId); for test data there is one row per input row, in
        file order.
    y : float64 ndarray with the per-group mean of ``winPlacePerc`` (same row
        order as ``X``) for training data, ``None`` for test data.
    """
    group_keys = ['matchId', 'groupId']

    if is_train:
        df = pd.read_csv(train_path)
        # Only take the samples with matches that have more than 1 player;
        # there are matches with no players or just one player.
        df = df[df['maxPlace'] > 1]
    else:
        df = pd.read_csv(test_path)

    # New feature: the total distance a player covered.
    state('totalDistance')
    s = timer()
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    e = timer()
    state('totalDistance', False, e - s)

    # Replace non-positive rankPoints values with 0.
    state('rankPoints')
    s = timer()
    df['rankPoints'] = np.where(df['rankPoints'] <= 0, 0, df['rankPoints'])
    e = timer()
    state('rankPoints', False, e - s)

    target = 'winPlacePerc'
    # Identifier / categorical columns (and the target) are not model inputs.
    non_features = ("Id", "matchId", "groupId", "matchDuration", "matchType", target)
    features = [col for col in df.columns if col not in non_features]

    y = None
    if is_train:
        # groupby sorts by key, so y lines up with the group rows used for df_out below.
        y = np.array(df.groupby(group_keys)[target].agg('mean'), dtype=np.float64)

    # Group mean of every feature, plus its rank within the match.
    agg, agg_rank = _group_stat_with_rank(df, features, 'mean')
    # Training rows are one per group; test rows stay one per player so the
    # predictions can later be assigned back to the raw test frame.
    df_out = agg[group_keys] if is_train else df[group_keys]
    df_out = _merge_group_stat(df_out, agg, agg_rank, 'mean')

    # Group max of every feature, plus its rank within the match.
    agg, agg_rank = _group_stat_with_rank(df, features, 'max')
    df_out = _merge_group_stat(df_out, agg, agg_rank, 'max')

    # Group min of every feature, plus its rank within the match.
    print("get group min feature")
    agg, agg_rank = _group_stat_with_rank(df, features, 'min')
    df_out = _merge_group_stat(df_out, agg, agg_rank, 'min')

    # Number of players in each group.
    print("get group size feature")
    agg = df.groupby(group_keys).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=group_keys)

    # Per-match mean of every feature. All earlier feature columns already
    # carry a suffix, so nothing collides here and these columns keep the
    # plain feature names (the "_match_mean" suffix is not applied).
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    # Number of players in each match.
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    df_out = df_out.drop(["matchId", "groupId"], axis=1)
    X = np.array(df_out, dtype=np.float64)

    # Release the large intermediates before returning.
    del df, df_out, agg, agg_rank
    gc.collect()
    return X, y

In [4]:
%%time
# Process the training data :
x_train, y = feature_engineering(True)
# Scale the data to be in the range (-1 , 1)
scaler = preprocessing.MinMaxScaler(feature_range = (-1, 1), copy = False).fit(x_train)

Working on totalDistance ... 
Working on totalDistance took (0.138) Sec 

Working on rankPoints ... 
Working on rankPoints took (0.048) Sec 

get group min feature
get group size feature
CPU times: user 2min 53s, sys: 30.9 s, total: 3min 24s
Wall time: 3min 24s


Scale and normalize

In [5]:
print("x_train: ", x_train.shape, x_train.max(), x_train.min())
# Keep the array returned by transform() instead of relying on the in-place
# side effect of copy=False: if scikit-learn ever has to copy the input,
# x_train would otherwise silently stay unscaled.
# NOTE: this cell is not idempotent - re-running it rescales already-scaled data.
x_train = scaler.transform(x_train)
print("x_train now: ", x_train.shape, x_train.max(), x_train.min())

x_train:  (2026744, 170) 41270.1 0.0
x_train now:  (2026744, 170) 1.0000000000000002 -1.0000000000000002


In [7]:
# Peek at the first five targets returned by feature_engineering(True).
y[:5]

array([0.3333, 0.037 , 0.    , 0.3704, 1.    ])

In [8]:
# Rescale the target in place as y -> 2*y - 1 (maps [0, 1] onto [-1, 1]).
# NOTE: not idempotent - running this cell twice rescales y twice.
np.multiply(y, 2, out=y)
np.subtract(y, 1, out=y)
print("y", y.shape, y.max(), y.min())

y (2026744,) 1.0 -1.0


In [9]:
%%time
model = Sequential()
model.add(Dense(x_train.shape[1],  input_dim = x_train.shape[1], activation = 'relu'))
model.add(Dense(136, activation = 'relu'))
model.add(Dense(136, activation = 'relu'))
model.add(Dense(136, activation = 'relu'))
model.add(Dense(136, activation = 'relu'))

# output Layer
model.add(Dense(1, activation = 'linear'))

CPU times: user 76 ms, sys: 4 ms, total: 80 ms
Wall time: 79.8 ms


In [11]:
# Compile the network :
from keras.optimizers import SGD, Adam
# The optimizer instance is now actually passed to compile(). Before, an
# Adam(lr=0.00001) object was created but the string 'adam' was handed to
# compile(), so Keras used a default Adam and the configured learning rate
# was silently ignored. lr is set to Keras's default (0.001) so training
# matches the run recorded in this notebook; change it here to try another rate.
adam = Adam(lr=0.001)
model.compile(loss='mean_absolute_error', optimizer=adam, metrics=['mean_absolute_error'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 170)               29070     
_________________________________________________________________
dense_2 (Dense)              (None, 136)               23256     
_________________________________________________________________
dense_3 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_4 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_5 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 137       
Total params: 108,359
Trainable params: 108,359
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Save a weights file whenever the validation loss improves; the file name
# embeds the epoch number and the validation loss of that epoch.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]


In [14]:
%%time
model.fit(x = x_train, y = y, batch_size=1000, epochs=30, verbose=1, callbacks=callbacks_list,
            validation_split=0.15, validation_data=None, shuffle=True,
            class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None)
del x_train, y
gc.collect()

Train on 1722732 samples, validate on 304012 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.06892, saving model to Weights-001--0.06892.hdf5
Epoch 2/30

Epoch 00002: val_loss did not improve from 0.06892
Epoch 3/30

Epoch 00003: val_loss improved from 0.06892 to 0.06551, saving model to Weights-003--0.06551.hdf5
Epoch 4/30

Epoch 00004: val_loss improved from 0.06551 to 0.06210, saving model to Weights-004--0.06210.hdf5
Epoch 5/30

Epoch 00005: val_loss improved from 0.06210 to 0.06172, saving model to Weights-005--0.06172.hdf5
Epoch 6/30

Epoch 00006: val_loss improved from 0.06172 to 0.06088, saving model to Weights-006--0.06088.hdf5
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.06088
Epoch 8/30

Epoch 00008: val_loss did not improve from 0.06088
Epoch 9/30

Epoch 00009: val_loss did not improve from 0.06088
Epoch 10/30

Epoch 00010: val_loss improved from 0.06088 to 0.05962, saving model to Weights-010--0.05962.hdf5
Epoch 11/30

Epoch 00011: val_loss did

Build and scale the test-set features

In [None]:
# Build the test features with the same pipeline (no target is returned).
x_test, _ = feature_engineering(False)
# Apply the scaler fitted on the training data. Keep the returned array
# rather than relying on the in-place side effect of copy=False.
x_test = scaler.transform(x_test)
print("x_test", x_test.shape, x_test.max(), x_test.min())
# Test values outside the range seen in training fall outside [-1, 1]; clamp them.
np.clip(x_test, out=x_test, a_min=-1, a_max=1)
print("x_test", x_test.shape, x_test.max(), x_test.min())

Predict the target

In [None]:
%%time
# Predict the (still [-1, 1]-scaled) target for every test row.
# NOTE(review): this uses the weights the model holds after the last epoch;
# none of the visible cells reloads the best checkpoint written by
# ModelCheckpoint - confirm that this is intended.
pred = model.predict(x_test)
# The test feature matrix is no longer needed; free it.
del x_test
gc.collect()

In [None]:
# Flatten the predictions to 1-D and undo the target scaling
# (y -> 2*y - 1 during training, so p -> (p + 1) / 2 here).
pred = (pred.reshape(-1) + 1) / 2

In [None]:
# Reload the raw test CSV; later cells read its 'maxPlace' and 'Id' columns.
df_test = pd.read_csv('../input/test_V2.csv')

In [None]:
%%time
print("fix winPlacePerc")
for i in range(len(df_test)):
    winPlacePerc = pred[i]
    maxPlace = int(df_test.iloc[i]['maxPlace'])
    if maxPlace == 0:
        winPlacePerc = 0.0
    elif maxPlace == 1:
        winPlacePerc = 1.0
    else:
        gap = 1.0 / (maxPlace - 1)
        winPlacePerc = round(winPlacePerc / gap) * gap
    
    if winPlacePerc < 0: winPlacePerc = 0.0
    if winPlacePerc > 1: winPlacePerc = 1.0    
    pred[i] = winPlacePerc

In [None]:
# Attach the predictions by position: feature_engineering(False) keeps one
# row per test row in file order, so pred[i] belongs to df_test row i.
df_test['winPlacePerc'] = pred

In [None]:
# Keep only the two columns written to the submission file, without the index.
submission_columns = ['Id', 'winPlacePerc']
submission = df_test[submission_columns]
submission.to_csv('submission.csv', index=False)