The data used in this experiment comes from the Kaggle competition "Porto Seguro's Safe Driver Prediction".
https://www.kaggle.com/c/porto-seguro-safe-driver-prediction

This work is based on the following excellent kernel:
https://www.kaggle.com/aquatic/entity-embedding-neural-net

In [1]:
import numpy as np
import pandas as pd

# Fix the random seeds used by the stochastic parts of the neural network:
# first NumPy's global seed, then TensorFlow's seed.
np.random.seed(10)
from tensorflow import set_random_seed
set_random_seed(15)

  from ._conv import register_converters as _register_converters


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Merge, Reshape, Dropout
from keras.layers.embeddings import Embedding

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


## Data loading & preprocessing

In [3]:
# Read the competition's training and test tables (train first, then test)
df_train, df_test = pd.read_csv('../input/train.csv'), pd.read_csv('../input/test.csv')

In [4]:
# Label is the 'target' column; training features are every column from position 2 onward
y_train = df_train['target']
X_train = df_train.iloc[:, 2:]
# Test features are every column from position 1 onward
X_test = df_test.iloc[:, 1:]

In [5]:
# Keep only the columns whose names do NOT start with 'ps_calc_'
cols_use = list(filter(lambda name: not name.startswith('ps_calc_'), X_train.columns))

X_train, X_test = X_train[cols_use], X_test[cols_use]

In [6]:
# Number of rows in the training feature table
X_train.shape[0]

595212

In [7]:
# Distinct raw values seen in the full training set for each '_cat' column
col_vals_dict = {}
for c in X_train.columns:
    if c.endswith('_cat'):
        col_vals_dict[c] = list(X_train[c].unique())

# Columns with more than two distinct values are the embedding candidates;
# print their value counts to help choose the embedding dimensions
embed_cols = [c for c, vals in col_vals_dict.items() if len(vals) > 2]
for c in embed_cols:
    print(c + ': %d values' % len(col_vals_dict[c]))

ps_ind_02_cat: 5 values
ps_ind_04_cat: 3 values
ps_ind_05_cat: 8 values
ps_car_01_cat: 13 values
ps_car_02_cat: 3 values
ps_car_03_cat: 3 values
ps_car_04_cat: 10 values
ps_car_05_cat: 3 values
ps_car_06_cat: 18 values
ps_car_07_cat: 3 values
ps_car_09_cat: 6 values
ps_car_10_cat: 3 values
ps_car_11_cat: 104 values


## Choosing 10,000 samples and only 5 columns from the entire dataset
The subset includes 2 numerical features and 3 categorical features.

In [8]:
# Small working subset: the first 10000 rows and the first 5 feature columns
X_tr = X_train.iloc[:, :5].head(10000)
y_tr = y_train.head(10000)

X_tr.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat
0,2,2,5,1,0
1,1,1,7,0,0
2,5,4,9,1,0
3,0,1,2,0,0
4,0,2,0,1,0


In [9]:
# Build the list of model inputs: one integer-coded 1-D array per embedded
# categorical column, followed by one 2-D array holding all remaining columns.
#
# Both groups are derived from the same criterion (membership in embed_cols) so
# that every column of X_tr lands in exactly one of them. Previously the
# embedded columns were picked by name suffix while the remaining columns were
# picked via embed_cols, which only agreed as long as every '_cat' column in
# X_tr also had more than two values.
cat_cols = [c for c in X_tr.columns if c in embed_cols]
other_cols = [c for c in X_tr.columns if c not in embed_cols]

input_list_train = []

# the cols to be embedded (categorical features): rescaling to range [0, # values)
for c in cat_cols:
    # np.unique returns the sorted distinct values, so codes follow sorted order
    raw_vals = np.unique(X_tr[c])
    val_map = {raw: code for code, raw in enumerate(raw_vals)}
    input_list_train.append(X_tr[c].map(val_map).values)

# the rest of the columns (numerical features)
input_list_train.append(X_tr[other_cols].values)

In [10]:
# Show which integer codes occur in each of the first three (categorical) inputs
for encoded in input_list_train[:3]:
    print(np.unique(encoded))

[0 1 2 3 4]
[0 1 2]
[0 1 2 3 4 5 6 7]


In [11]:
# Sanity check: report how many distinct values each categorical column of the subset contains.
# NOTE(review): the list comprehension is used only for print()'s side effect,
# so the value of this cell is a list of None.
[print(c, 'contains',  X_tr[c].nunique(), 'unique values.') for c in X_tr.columns if c.endswith('cat')]

ps_ind_02_cat contains 5 unique values.
ps_ind_04_cat contains 3 unique values.
ps_ind_05_cat contains 8 unique values.


[None, None, None]

In [12]:
# Shape of the non-categorical feature array (the fourth entry of input_list_train)
np.shape(input_list_train[3])

(10000, 2)

## Build NN model for Entity Embedding

In [13]:
def build_embedding_network(embed_specs=((5, 3), (3, 2), (8, 5)), n_numeric=2):
    """Build and compile the entity-embedding binary classifier.

    One small sub-model is created per categorical input (an Embedding
    followed by a Reshape that flattens its output), plus one Dense branch
    for the numeric inputs. The branches are concatenated and passed through
    a stack of Dense/ReLU/Dropout layers ending in a single sigmoid unit.

    Parameters
    ----------
    embed_specs : sequence of (n_values, embed_dim) pairs
        For each categorical input, the number of distinct integer codes and
        the size of its embedding vector. The defaults reproduce the original
        hard-coded branches for ps_ind_02_cat (5 -> 3), ps_ind_04_cat (3 -> 2)
        and ps_ind_05_cat (8 -> 5).
    n_numeric : int
        Number of numeric columns fed to the dense branch (default 2).

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, Adam optimizer).
    Its input is a list of arrays: one per entry of ``embed_specs``, in the
    same order, followed by the numeric array.
    """
    models = []

    # One embedding branch per categorical feature, created in the given order
    for n_values, embed_dim in embed_specs:
        embed_model = Sequential()
        embed_model.add(Embedding(n_values, embed_dim, input_length=1))
        # Flatten the length-1 sequence so the branch outputs a plain vector
        embed_model.add(Reshape(target_shape=(embed_dim,)))
        models.append(embed_model)

    # Numeric features go through a single linear Dense layer
    model_rest = Sequential()
    model_rest.add(Dense(16, input_dim=n_numeric))
    models.append(model_rest)

    model = Sequential()
    model.add(Merge(models, mode='concat'))
    model.add(Dense(80))
    model.add(Activation('relu'))
    model.add(Dropout(.35))
    model.add(Dense(20))
    model.add(Activation('relu'))
    model.add(Dropout(.15))
    model.add(Dense(10))
    model.add(Activation('relu'))
    model.add(Dropout(.15))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [14]:
# Accumulators: out-of-fold validation predictions, and one slot per test row
val_preds = 0
y_preds = np.zeros(len(X_test))

In [15]:
# Training configuration: number of independent training runs, and epochs per run
runs_per_fold, n_epochs = 3, 15

In [16]:
# Fit the network on the training subset against the target values.
# NOTE(review): each iteration builds and fits a fresh network and rebinds NN,
# so only the model from the last run is available after the loop; the
# averaging of test predictions across runs (last line) is commented out.
for j in range(runs_per_fold):
    NN = build_embedding_network()
    NN.fit(input_list_train, y_tr.values, epochs=n_epochs, batch_size=4096, verbose=0)
   
    # y_preds += NN.predict(proc_X_test)[:,0] / runs_per_fold



In [17]:
# Print the layer-by-layer summary of the model left in NN by the training loop
NN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_3 (Merge)              (None, 26)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 80)                2160      
_________________________________________________________________
activation_9 (Activation)    (None, 80)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 20)                1620      
_________________________________________________________________
activation_10 (Activation)   (None, 20)                0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 20)                0         
__________

## Checking weights for each categorical feature

In [18]:
# Collect all weight arrays of the trained model
weights = NN.get_weights()

In [19]:
# Number of entries returned by get_weights()
len(weights)

18

In [20]:
# ps_ind_02_cat embedding: 5 x 3 matrix (one row per category code, one column per embedding dimension)
# NOTE(review): index 0 relies on the weight ordering produced by this Keras version — confirm after upgrades
print(weights[0][0])

[[-0.02200509  0.04319452 -0.03653887]
 [ 0.0770293   0.00489654 -0.00247264]
 [ 0.06709933  0.09106427 -0.01616774]
 [ 0.03431904  0.01542955 -0.00072964]
 [ 0.05137567 -0.02562283 -0.03074536]]


In [21]:
# ps_ind_04_cat embedding: 3 x 2 matrix (one row per category code, one column per embedding dimension)
# NOTE(review): index 2 relies on the weight ordering produced by this Keras version — confirm after upgrades
print(weights[2][0])

[[ 0.0419449  -0.02938309]
 [ 0.0579549  -0.03725957]
 [-0.0150009   0.02484703]]


In [22]:
# ps_ind_05_cat embedding: 8 x 5 matrix (one row per category code, one column per embedding dimension)
# NOTE(review): index 4 relies on the weight ordering produced by this Keras version — confirm after upgrades
print(weights[4][0])

[[ 0.07340115 -0.0658811   0.02774858 -0.04807903 -0.02562266]
 [ 0.05910323 -0.05320253  0.06396441  0.04002887 -0.04610789]
 [-0.03580792  0.02135029  0.02098037  0.04861105 -0.032686  ]
 [-0.02894982 -0.02541417  0.05249073  0.0445907   0.01066875]
 [ 0.02661128 -0.01903299 -0.00638229 -0.00375705  0.02683738]
 [ 0.05469328 -0.01887664  0.00190938 -0.0392197  -0.0403111 ]
 [ 0.01924985 -0.0019205   0.00478835  0.04196957  0.00983921]
 [ 0.0594023  -0.07615691  0.06344219 -0.04673522 -0.01267588]]


In [23]:
# Preview true labels next to the model's predicted probabilities on the training subset
pd.DataFrame(dict(y_true=y_tr.values, y_pred=NN.predict(input_list_train).flatten())).head()

Unnamed: 0,y_pred,y_true
0,0.012422,0
1,0.004006,0
2,0.000406,0
3,0.134801,0
4,0.352139,0


## Evaluation

In [24]:
# ROC AUC of the final model on the same rows it was fitted on
# (y_tr / input_list_train), i.e. a training-set score rather than a
# held-out estimate. `metrics` is imported in the import cell at the top
# of the notebook alongside the other sklearn imports.
score = metrics.roc_auc_score(y_tr.values, NN.predict(input_list_train).flatten())
print("{:.6}".format(score))

0.479835
