#### An exmaple of entity embeddings on the Kaggle's https://www.kaggle.com/c/porto-seguro-safe-driver-prediction challenge  

###### Most of the code has been taken from Joe Eddy's kernel available here https://www.kaggle.com/aquatic/entity-embedding-neural-net

##### Importing libraries: 

In [1]:
import time 
start_time  = int(time.time())
import pandas as pd 
import numpy as np
from keras.layers import *
from keras.models import Sequential
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


##### Reading and shuffling training file:

In [2]:
df_train = pd.read_csv("train.csv")
df_train = df_train.sample(frac=1).reset_index(drop=True)

##### As classes are highly imbalanced, using only a part of the larger-class data while training : 

In [3]:
df_train_cat0 = df_train.loc[ df_train['target'] == 0 ]
df_train_cat1 = df_train.loc[ df_train['target'] == 1 ]
print len(df_train_cat1)
print len(df_train_cat0)

21694
573518


In [4]:
df_train = df_train_cat0[:40000].append(df_train_cat1 )

In [5]:
df_train['target'].value_counts()

0    40000
1    21694
Name: target, dtype: int64

##### Shuffling again:

In [6]:
df_train = df_train.sample(frac=1).reset_index(drop=True)

##### Taking out labels column and droping unnecesary columns: 

In [7]:
Y_labels = df_train['target']

In [8]:
df_train = df_train.drop("id", 1)
df_train = df_train.drop("target", 1)
all_cols = df_train.columns

In [9]:
len(all_cols)

57

##### Breaking into 3 columns type, binary, categorical and other ( continous):

In [10]:
categoical_vars = [ ]
continous_vars = [ ]
categoical_binary_vars = [ ]

for col in all_cols:
    #if "bin" in col :
    #    categoical_binary_vars.append(col)
    if "cat" in col :
        categoical_vars.append(col)
    else:
        continous_vars.append(col )

In [11]:
print ("categorical binary vars: ", len(categoical_binary_vars))
print ("categorical non binary vars: ", len(categoical_vars))
print ("continues vars: ", len(continous_vars))

('categorical binary vars: ', 0)
('categorical non binary vars: ', 14)
('continues vars: ', 43)


##### Displaying number of unique vals in each categorical column: 

In [12]:
for cat_var in categoical_vars:
    print (cat_var, df_train[cat_var].nunique())

('ps_ind_02_cat', 5)
('ps_ind_04_cat', 3)
('ps_ind_05_cat', 8)
('ps_car_01_cat', 13)
('ps_car_02_cat', 3)
('ps_car_03_cat', 3)
('ps_car_04_cat', 10)
('ps_car_05_cat', 3)
('ps_car_06_cat', 18)
('ps_car_07_cat', 3)
('ps_car_08_cat', 2)
('ps_car_09_cat', 6)
('ps_car_10_cat', 3)
('ps_car_11_cat', 104)


##### Making train and val set : 

In [13]:
df_tr = df_train[:40000]
df_val = df_train[40000:]

In [14]:
Y_train  = Y_labels[:40000]
Y_val  = Y_labels[40000: ]

## NOW LETS WORK WITH MODEL BUILDING
---

In [15]:
other_cols = [i for i in all_cols if i not in categoical_vars ]

##### For purpose of understanding lets only consider 2 categorical columns and one unique columns: 

In [16]:
categoical_vars = categoical_vars[:2]
other_cols = other_cols[:3]

In [17]:
#understanding th eembedding size thing: 
 
#say there is a string ["I am a man"]
#coresponding input will be ["12 56 89 74"]
#Now for each word will converted to some vector of lenght embedding size (say 2 here )
#coresponding output will be  [[0.2 0.3], [0.12, 0.25], [78, 45]]
#And for multiple ips, multiple ops 

  # the model will take as input an integer matrix of size (batch, input_length).
  # the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
  # now model.output_shape == (None, 10, 64), where None is the batch dimension.
  # now model.output_shape == (None, 10, 64), where None is the batch dimension, 10 is embedding size and 64 is input length 



In [18]:
def preproc(X_train ) : 

    input_list_train = []
    
    
    #the cols to be embedded: rescaling to range [0, # values)
    for c in categoical_vars :
        
        """
        vals = np.asarray(X_train[c].tolist() )
        input_list_train.append( np.asarray( vals ))
        this fails as keras Expect 0,1,2,3.. as cat and not 1,2,3,5 if there are 4 categories.
        Using below method instead from https://stackoverflow.com/a/45988584 
        
        """
        vals = np.asarray(X_train[c].tolist())
        vals = pd.factorize( vals )[0]
        input_list_train.append( np.asarray(vals)  )
        """
        This below was the original method used in the code by INSERT NAME HERE. But I found the above implemntation much simpler to understand.
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        input_list_train.append(X_train[c].map(val_map).values)
        """
    #the rest of the columns
    input_list_train.append(X_train[other_cols].values)
    return input_list_train 


In [19]:
df_tr_modified = preproc( df_tr ) 

In [23]:
some_df = df_tr[categoical_vars + other_cols[:2] ][:15]

In [24]:
some_df

Unnamed: 0,ps_ind_02_cat,ps_ind_04_cat,ps_ind_01,ps_ind_03
0,1,0,0,8
1,4,0,5,11
2,1,0,3,7
3,1,0,1,2
4,1,1,0,4
5,1,1,7,6
6,2,1,0,0
7,1,0,4,10
8,1,1,4,6
9,1,0,0,6


In [None]:
df_tr_modified = preproc( df_tr ) 

In [51]:
df_val = preproc( df_val )

##### As expected, the proceesed training DataFrame has 3 elements becasue we had 2 categorical columns and one for other continous columns : 

In [52]:
len(df_tr_modified)

3

##### The lenght of each list in the processed dataframe is same as the number of training rows we have: 

In [53]:
print len(df_tr_modified[0])
print len(df_tr_modified[1])
print len(df_tr_modified[2])

40000
40000
40000


In [54]:
print df_tr_modified[0]
print df_tr_modified[0][0]
print df_tr_modified[0].ndim

print df_tr_modified[1]
print df_tr_modified[1][0]
print df_tr_modified[1].ndim

[0 0 0 ... 0 0 0]
0
1
[0 0 1 ... 0 0 0]
0
1


In [55]:
print df_tr_modified[2]
print "---"
print df_tr_modified[2][0]
print "---"
print df_tr_modified[2][0][0]
print "---"
print df_tr_modified[2].ndim

[[ 5  9  0]
 [ 1 10  1]
 [ 3  2  0]
 ...
 [ 2  9  1]
 [ 1  5  0]
 [ 0  5  1]]
---
[5 9 0]
---
5
---
2


In [56]:
df_tr_modified[2].shape

(40000, 3)

#####  The below code adds a embedding network for each of the catgeoriacal variable. The embedding size is set as according to the rule: 
```
Embedding size  = min( no-of-unique-cat/2 , 50 )
```
##### Each  model is appending to a list named models defined intialised in below cell. 

In [71]:
models = []

In [72]:
for categoical_var in categoical_vars :
    print "------------------------------------------------------------------"
    print "for categoical column ", categoical_var     
    model = Sequential()
    no_of_unique_cat  = df_train[categoical_var].nunique()
    print "number of unique cat",no_of_unique_cat
    embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
    embedding_size = int(embedding_size)
    print "embedding_size set as ", embedding_size
    model.add(  Embedding( no_of_unique_cat+1, embedding_size, input_length = 1 ) )
    
    model.add(Reshape(target_shape=( [embedding_size] )))

    
    print model.summary() 
    
    models.append( model )

------------------------------------------------------------------
for categoical column  ps_ind_02_cat
number of unique cat 5
embedding_size set as  2
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1, 2)              12        
_________________________________________________________________
reshape_5 (Reshape)          (None, 2)                 0         
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________
None
------------------------------------------------------------------
for categoical column  ps_ind_04_cat
number of unique cat 3
embedding_size set as  1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 1, 1)              4         
___________________________________

##### Once the categorical columns are made, we add another single model for all the continous variables and add it to the models list .

In [73]:
model_rest = Sequential()
model_rest.add(Dense(16, input_shape = [3] ))
#model_rest.add(Reshape(target_shape=([16] )))
model_rest.summary() 
models.append(model_rest)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 16)                64        
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________


#### Our models list will contain N-cat+1 models. ( N-cat models for each of the categorical columns and one model for all other columns.)

In [74]:
models

[<keras.models.Sequential at 0x7fdf00226ed0>,
 <keras.models.Sequential at 0x7fdf001e98d0>,
 <keras.models.Sequential at 0x7fdeffebe690>]

#### Finally we merge all the models present into a  single model. Concat places the model one after the other 

In [75]:
full_model = Sequential()
full_model.add(Merge(models, mode='concat'))

  


#### The ouptut szie from the merged model is 19 which is actually ( 2 + 1 + 16 ).  (The size e1 + e2 and the 16 outputs from dense layer. ) : 

In [76]:
full_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_3 (Merge)              (None, 19)                0         
Total params: 80
Trainable params: 80
Non-trainable params: 0
_________________________________________________________________


##### the below msg is good for understanding what happens when we merge using the `add` mode. The add mode does a element  wise addtion while concat appends it one after another

```
Code:
full_model = Sequential()
full_model.add(Merge(models, mode='sum'))

O/p: 
ValueError: Only layers of same output shape can be merged using sum mode. Layer shapes: [(None, 2), (None, 1), (None, 16)]
Here the lengths of 3 vectors to be added are 2,1, 16 thus add mode concatenation cannot be done 
```

#### Now are input layers and embedding layers are done. We can build on those as any other keras sequential model.
##### Adding a few layers below:

In [77]:
full_model.add(Dense(1024))
full_model.add(Activation('relu'))
full_model.add(Dense(512))
full_model.add(Activation('relu'))
full_model.add(Dense(256))
full_model.add(Activation('sigmoid'))

full_model.add(Dense(2))
full_model.add(Activation('sigmoid'))
full_model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
full_model.fit( df_tr_modified, Y_train_cat)

In [78]:
full_model.summary() 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_3 (Merge)              (None, 19)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1024)              20480     
_________________________________________________________________
activation_5 (Activation)    (None, 1024)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 512)               524800    
_________________________________________________________________
activation_6 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 256)               131328    
_________________________________________________________________
activation_7 (Activation)    (None, 256)               0         
__________

In [79]:
from keras.utils.np_utils import to_categorical
Y_train_cat = to_categorical(Y_train.tolist() )
Y_val_cat = to_categorical(Y_val.tolist()  )

In [80]:
full_model.fit( df_tr_modified, Y_train_cat)

Epoch 1/1
 8256/40000 [=====>........................] - ETA: 8s - loss: 0.6519 - acc: 0.6467

KeyboardInterrupt: 

In [86]:
end_time  = int(time.time())
net_time  = (end_time - start_time)

In [87]:
print "done execution in ", net_time,"s"

done execution in  1265 s
