In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, SpatialDropout1D, Activation, concatenate
from keras.optimizers import Adam, SGD
from keras.layers.advanced_activations import ReLU, PReLU, LeakyReLU, ELU
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras.utils import to_categorical
from tensorflow.keras.utils import plot_model

Using TensorFlow backend.


In [2]:
# Names of all feature columns, categorical and continuous together.
COLUMNS = [
    'company',
    'from',
    'to',
    'distance',
    'duration',
    'price',
    'depart_session',
    'user_age',
    'user_gender',
    'profission',
]

# Name of the target column.
LABEL_COLUMN = "rating"

# Features that are label-encoded and fed through per-feature embeddings.
CATEGORICAL_COLUMNS = [
    'company',
    'from',
    'to',
    'depart_session',
    'user_gender',
    'profission',
]

# Numeric features that are standardised before being fed to the network.
CONTINUOUS_COLUMNS = [
    'distance',
    'duration',
    'price',
    'user_age',
]

In [3]:
def preprocessing():
    """Load ``dataset.csv`` and build the train/validation/test arrays.

    The function
      1. reads ``dataset.csv`` (``;``-delimited),
      2. replaces the ``(from_lat, from_lan)`` and ``(to_lat, to_lan)``
         coordinate pairs by integer city ids stored in ``from`` / ``to``,
      3. label-encodes every column in ``CATEGORICAL_COLUMNS``,
      4. splits the rows 80/10/10 into train/validation/test with a fixed seed,
      5. standardises ``CONTINUOUS_COLUMNS`` using statistics of the training
         split only.

    Returns
    -------
    list
        ``[x_train, y_train, x_val, y_val, x_test, y_test,
        x_train_categ, x_val_categ, x_test_categ,
        x_train_conti, x_val_conti, x_test_conti, data]`` where the ``x_*``
        entries without suffix are DataFrames, the ``y_*`` entries and the
        ``*_categ`` / ``*_conti`` entries are NumPy arrays, and ``data`` is the
        full encoded DataFrame (before splitting).

    Raises
    ------
    ValueError
        If any of the coordinate columns contains a missing value.
    """
    data = pd.read_csv('dataset.csv', delimiter=';')

    # Turn each distinct (lat, lon) pair into an integer id. ``ngroup`` numbers
    # the groups in sorted key order, i.e. the same numbering obtained by
    # enumerating ``groupby(...).size()``, but without the removed
    # ``Series.iteritems`` API and without a Python-level loop over every row.
    for city_col, coord_cols in (('from', ['from_lat', 'from_lan']),
                                 ('to', ['to_lat', 'to_lan'])):
        if data[coord_cols].isna().any().any():
            raise ValueError(
                'missing coordinates in columns %s; cannot derive %r'
                % (coord_cols, city_col))
        data[city_col] = data.groupby(coord_cols).ngroup()
        data = data.drop(coord_cols, axis=1)

    # Encoders are fitted on the full table so every split shares one code book.
    for col in CATEGORICAL_COLUMNS:
        data[col] = LabelEncoder().fit_transform(data[col])

    # 80% train; the remaining 20% is halved into validation and test.
    train = data.sample(frac=0.8, random_state=200)
    holdout = data.drop(train.index)
    val = holdout.sample(frac=0.5, random_state=200)
    test = holdout.drop(val.index)

    y_train = train[LABEL_COLUMN].values
    y_val = val[LABEL_COLUMN].values
    y_test = test[LABEL_COLUMN].values
    x_train = train.drop([LABEL_COLUMN], axis=1)
    x_val = val.drop([LABEL_COLUMN], axis=1)
    x_test = test.drop([LABEL_COLUMN], axis=1)

    x_train_categ = np.array(x_train[CATEGORICAL_COLUMNS])
    x_val_categ = np.array(x_val[CATEGORICAL_COLUMNS])
    x_test_categ = np.array(x_test[CATEGORICAL_COLUMNS])
    x_train_conti = np.array(x_train[CONTINUOUS_COLUMNS], dtype='float64')
    x_val_conti = np.array(x_val[CONTINUOUS_COLUMNS], dtype='float64')
    x_test_conti = np.array(x_test[CONTINUOUS_COLUMNS], dtype='float64')

    # Fit the scaler on the training split only, then reuse it for val/test.
    scaler = StandardScaler()
    x_train_conti = scaler.fit_transform(x_train_conti)
    x_val_conti = scaler.transform(x_val_conti)
    x_test_conti = scaler.transform(x_test_conti)

    return [x_train, y_train, x_val, y_val, x_test, y_test,
            x_train_categ, x_val_categ, x_test_categ,
            x_train_conti, x_val_conti, x_test_conti, data]


In [4]:
class Wide_and_Deep:
    """Wide & Deep rating classifier built with the Keras functional API.

    The *deep* part embeds every categorical feature, projects the continuous
    features through a dense layer, and stacks three dense layers on top.
    The *wide* part is an input of degree-2 interaction features of the
    categorical columns that is concatenated with the deep output right before
    the softmax layer.

    Parameters
    ----------
    mode : str
        ``'wide and deep'`` (default) or ``'deep'``. Any other value makes
        ``create_model`` / ``train_model`` print ``'wrong mode'`` and return.
    """

    # Values accepted for ``mode``.
    _MODES = ('wide and deep', 'deep')

    def __init__(self, mode='wide and deep'):
        """Run ``preprocessing()`` and cache all splits on the instance."""
        self.mode = mode
        (x_train, y_train, x_val, y_val, x_test, y_test,
         x_train_categ, x_val_categ, x_test_categ,
         x_train_conti, x_val_conti, x_test_conti, data) = preprocessing()

        # One-hot encode all splits with the SAME width. Letting
        # to_categorical infer the width per split would give a narrower
        # matrix for any split that happens to lack the highest label.
        num_classes = int(np.max(np.concatenate([y_train, y_val, y_test]))) + 1
        self.x_train = x_train
        self.y_train = to_categorical(y_train, num_classes=num_classes)
        self.x_val = x_val
        self.y_val = to_categorical(y_val, num_classes=num_classes)
        self.x_test = x_test
        self.y_test = to_categorical(y_test, num_classes=num_classes)

        self.x_train_categ = x_train_categ
        self.x_val_categ = x_val_categ
        self.x_test_categ = x_test_categ
        self.x_train_conti = x_train_conti
        self.x_test_conti = x_test_conti
        self.x_val_conti = x_val_conti
        self.all_data = data

        # Wide features: bias column, the raw codes and their pairwise products.
        # NOTE(review): the products are taken over label-encoded integers,
        # not one-hot crosses - confirm this is the intended wide feature set.
        self.poly = PolynomialFeatures(degree=2, interaction_only=True)
        self.x_train_categ_poly = self.poly.fit_transform(x_train_categ)
        self.x_test_categ_poly = self.poly.transform(x_test_categ)
        self.x_val_categ_poly = self.poly.transform(x_val_categ)

        # Graph handles, filled in by deep_component / wide_component / create_model.
        self.categ_inputs = None
        self.conti_input = None
        self.deep_component_outlayer = None
        self.logistic_input = None
        self.model = None

    def deep_component(self):
        """Build the deep tower and store its inputs and last layer on ``self``."""
        categ_inputs = []
        categ_embeds = []
        # One Input + Embedding per categorical feature.
        for column in CATEGORICAL_COLUMNS:
            input_i = Input(shape=(1,), dtype='int32')
            # Cardinality is measured on the full encoded table so every code
            # seen in any split has an embedding row.
            dim = len(np.unique(self.all_data[column]))
            print(dim)
            embed_dim = int(np.ceil(dim ** 0.5))  # TO TUNE
            embed_i = Embedding(dim, embed_dim, input_length=1)(input_i)
            flatten_i = Flatten()(embed_i)
            categ_inputs.append(input_i)
            categ_embeds.append(flatten_i)

        # All continuous features enter together through one dense layer.
        conti_input = Input(shape=(len(CONTINUOUS_COLUMNS),))
        conti_dense = Dense(256, use_bias=False)(conti_input)

        # Join the continuous projection with every flattened embedding.
        concat_embeds = concatenate([conti_dense] + categ_embeds)
        concat_embeds = Activation('relu')(concat_embeds)
        bn_concat = BatchNormalization()(concat_embeds)

        # Three stacked dense layers (512 -> 256 -> 128), each followed by ReLU;
        # the first two are also batch-normalised.
        fc1 = Dense(512, use_bias=False)(bn_concat)
        ac1 = ReLU()(fc1)
        bn1 = BatchNormalization()(ac1)
        fc2 = Dense(256, use_bias=False)(bn1)
        ac2 = ReLU()(fc2)
        bn2 = BatchNormalization()(ac2)
        fc3 = Dense(128)(bn2)
        ac3 = ReLU()(fc3)

        # Keep the inputs and the last layer for create_model.
        self.categ_inputs = categ_inputs
        self.conti_input = conti_input
        self.deep_component_outlayer = ac3

    def wide_component(self):
        """Create the input that carries the categorical interaction features."""
        dim = self.x_train_categ_poly.shape[1]
        self.logistic_input = Input(shape=(dim,))

    def create_model(self):
        """Assemble ``self.model`` according to ``self.mode``.

        Prints ``'wrong mode'`` and leaves ``self.model`` untouched when the
        mode is not recognised.
        """
        if self.mode not in self._MODES:
            print('wrong mode')
            return

        self.deep_component()
        self.wide_component()
        if self.mode == 'wide and deep':
            out_layer = concatenate([self.deep_component_outlayer, self.logistic_input])
            inputs = [self.conti_input] + self.categ_inputs + [self.logistic_input]
        else:
            out_layer = self.deep_component_outlayer
            inputs = [self.conti_input] + self.categ_inputs

        output = Dense(self.y_train.shape[1], activation='softmax')(out_layer)
        self.model = Model(inputs=inputs, outputs=output)

    def _model_inputs(self, conti, categ, categ_poly):
        """Return the input list for one split, ordered like ``create_model``'s inputs."""
        inputs = [conti] + [categ[:, i] for i in range(categ.shape[1])]
        if self.mode == 'wide and deep':
            inputs.append(categ_poly)
        return inputs

    def train_model(self, epochs=15, optimizer='adam', batch_size=128):
        """Compile and fit ``self.model`` on the training split.

        The validation split is passed as ``validation_data``. Prints a message
        and returns without training when no model has been created or the
        mode is not recognised.

        Parameters
        ----------
        epochs : int
            Number of training epochs.
        optimizer : str or optimizer instance
            Passed straight to ``model.compile``.
        batch_size : int
            Mini-batch size passed to ``model.fit``.
        """
        if not self.model:
            print('You have to create model first')
            return

        if self.mode not in self._MODES:
            print('wrong mode')
            return

        # Both modes build train AND validation inputs; previously the 'deep'
        # branch assigned an unused name and fit() failed with NameError.
        train_data = self._model_inputs(
            self.x_train_conti, self.x_train_categ, self.x_train_categ_poly)
        val_data = self._model_inputs(
            self.x_val_conti, self.x_val_categ, self.x_val_categ_poly)

        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.fit(train_data, self.y_train, validation_data=(val_data, self.y_val),
                       epochs=epochs, batch_size=batch_size)


In [5]:
# Constructing the object runs preprocessing(), which reads and splits dataset.csv.
# create_model() then builds the Keras graph and prints the number of distinct
# values of each categorical column (one line per column).
wide_deep_net = Wide_and_Deep()
wide_deep_net.create_model()

4
Instructions for updating:
Colocations handled automatically by placer.
82
83
6
2
21


In [7]:
# Train with train_model's defaults: epochs=15, optimizer='adam', batch_size=128.
wide_deep_net.train_model()

Instructions for updating:
Use tf.cast instead.
Train on 452302 samples, validate on 56538 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15

KeyboardInterrupt: 

In [6]:
# Write the model architecture diagram to model.png.
# Requires pydot and graphviz to be installed; without them this raises ImportError.
plot_model(wide_deep_net.model, to_file='model.png', show_shapes=True, show_layer_names=False)

ImportError: Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.