# Models pipeline
## Siamese CNN and Siamese LSTM 

In [None]:
from complementary_products_suggestions import helper_functions, embeddings, config
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.metrics import classification_report
import os
import datetime
import tensorflow.python as tf
from tensorflow.python.keras.layers import Input, LSTM, dot, Embedding, Conv1D, Flatten, Dense, Dropout, Activation, MaxPooling1D, ZeroPadding1D
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.regularizers import l1, l2

### Retrieving the datasets

In [None]:
# Load the two sample CSV files that sit one directory above this notebook:
# `content` from the content file and `database` from the matches file.
content = pd.read_csv(filepath_or_buffer="../dummy_sample_content.csv")
database = pd.read_csv(filepath_or_buffer="../dummy_sample_matches.csv")

In [None]:
# Display the `content` DataFrame for a quick visual inspection.
content

In [None]:
# Display the `database` DataFrame for a quick visual inspection.
database

## Splitting the data in train-test split
We use GroupShuffleSplit to make sure that products appearing as add-ons in the train set do not appear as add-ons in the test set. This way, the model's performance is evaluated on unseen data, as it would be in real-life scenarios.

In [None]:
# Split `database` into train/test features and labels with the project helper.
# NOTE(review): the second argument (0.2) is presumably the test-set fraction — confirm
# against helper_functions.train_test_split.
X_train, X_test, y_train, y_test = helper_functions.train_test_split(database, 0.2)

## Using Word2vec to create embeddings for each word in product titles based on the whole corpus

In [None]:
# Build the word2vec embedding matrix with the project helper. The result is passed
# as the initial (frozen) weights of the Embedding layers in both models below.
# NOTE(review): how `content` and `X_train` are combined into the training corpus is
# decided inside embeddings.word2vec — confirm there.
embedding_weights = embeddings.word2vec(content, X_train)

# Tokenizing the data

In [None]:
# Tokenize the train and test sets with the project helper. `t` is used below through
# `t.word_index` to size the Embedding vocabulary; the four returned sets are the
# model inputs for the main and add-on branches.
# NOTE(review): the last argument (30) is presumably the padded sequence length and
# must match the sequence length the models expect — confirm in helper_functions.
t, train_set_main, train_set_addon, test_set_main, test_set_addon = helper_functions.tokenize_train_test_set(X_train, X_test, 30)

# Model comparison

## Siamese CNN
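The code below contains both the Late Merge (LM) and the Intermediate Merge (IM) variants, each delimited by "start"/"end" comments. The Late Merge variant is active by default; to use Intermediate Merge instead, comment out the Late Merge section and uncomment the Intermediate Merge section.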

In [None]:
# Build the Siamese CNN: one shared convolutional encoder applied to both the
# main-product and the add-on input, merged with a dot product and scored by a
# single sigmoid unit.

# One input per branch; the sequence length is taken from the tokenized data.
input1_layer = Input(shape=(train_set_main.shape[1],))
input2_layer = Input(shape=(train_set_addon.shape[1],))

# Shared encoder: the same Sequential instance is called on both inputs below,
# so both branches use the same weights.
model = Sequential()

# Embedding layer with pre-initialized weights from word2vec (kept frozen).
# input_length is derived from the tokenized data rather than hard-coded, so it
# always agrees with the Input layers defined above.
model.add(Embedding(input_dim=len(t.word_index)+1,
                    output_dim=config.feature_dim,
                    weights=[embedding_weights],
                    input_length=train_set_main.shape[1],
                    trainable=False))

# First convolution block: zero-pad both ends of the sequence, convolve, pool.
model.add(ZeroPadding1D(padding=(config.filter1_length-1)))
model.add(Conv1D(filters=config.nb_filter,
                kernel_size=config.filter1_length,
                padding=config.padding,
                activation=config.activation,
                kernel_regularizer=l2(config.regularizer),
                bias_regularizer=l2(config.regularizer),
                activity_regularizer=l2(config.regularizer)))

model.add(MaxPooling1D(pool_size=config.pool1_length))

# Second convolution block, same structure with its own filter/pool lengths.
model.add(ZeroPadding1D(padding=config.filter2_length-1))
model.add(Conv1D(filters=config.nb_filter,
                kernel_size=config.filter2_length,
                padding=config.padding,
                activation=config.activation,
                kernel_regularizer=l2(config.regularizer),
                bias_regularizer=l2(config.regularizer),
                activity_regularizer=l2(config.regularizer)))

model.add(MaxPooling1D(pool_size=config.pool2_length))

model.add(Dropout(config.dropout_rate))

# Flatten the pooled feature maps into one vector per sample.
model.add(Flatten())

#Intermediate merge start
# encoded_main = model(input1_layer)
# encoded_addon = model(input2_layer)

# merged_layer = dot([encoded_main, encoded_addon], axes=1, trainable=True)

# dense = Dense(config.nb_neurons_dense, 
#                 activation=config.activation)(merged_layer)

# # Add a dense layer with a sigmoid unit to generate the similarity score
# prediction = Dense(1,
#                    activation='sigmoid')(dense)
#Intermediate merge end


#Late merge start
# The dense layer is part of the shared encoder, so it is applied to each
# branch separately *before* the two encodings are merged.
model.add(Dense(config.nb_neurons_dense, 
                activation=config.activation))

encoded_main = model(input1_layer)
encoded_addon = model(input2_layer)
merged_layer = dot([encoded_main, encoded_addon],
                   axes=1,
                   trainable=True)

# Add a dense layer with a sigmoid unit to generate the similarity score
prediction = Dense(1,
                   activation='sigmoid')(merged_layer)
#Late merge end


# Connect the inputs with the outputs
siamese_cnn = Model(inputs=[input1_layer,input2_layer],
                    outputs=prediction)

siamese_cnn.compile(optimizer=config.optimizer,
                    loss='binary_crossentropy', 
                    metrics=['accuracy'])

# TensorBoard logs go to a timestamped sub-directory of "logs-cnn"; training stops
# early when the validation loss has not improved for config.stop_epochs epochs.
logdir = os.path.join("logs-cnn", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(logdir, histogram_freq=1)
callbacks = [EarlyStopping(monitor='val_loss', patience=config.stop_epochs, verbose=1, mode='auto'), tensorboard]

Training the Siamese CNN model

In [None]:
# Train the Siamese CNN on the (main, add-on) pairs; validation_split=0.1 holds out
# 10% of the training data for validation, and the callbacks defined with the model
# (early stopping and TensorBoard) are applied.
history_cnn = siamese_cnn.fit(
    x=[train_set_main, train_set_addon],
    y=y_train,
    batch_size=config.batch_size,
    epochs=config.nb_epochs,
    verbose=1,
    callbacks=callbacks,
    validation_split=0.1,
)

Making predictions for the test set using the Siamese CNN model

In [None]:
# Score every (main, add-on) pair of the test set with the trained Siamese CNN.
y_pred_cnn = siamese_cnn.predict(x=[test_set_main, test_set_addon], verbose=1)

Combining the predicted values (scores) with the real values for the test set

In [None]:
# Build a results table for the Siamese CNN: the test rows plus the predicted score
# (rounded to 3 decimals) and the true label.
# Work on a copy: a plain assignment would only alias X_test, so the new columns
# would be written into X_test itself and any later cell writing the same column
# names into X_test would silently overwrite these CNN results.
X_test_cnn = X_test.copy()
X_test_cnn['predicted_label'] = pd.Series(np.round(y_pred_cnn.ravel(), 3), index=X_test_cnn.index)
X_test_cnn['real_label'] = pd.Series(y_test, index=X_test_cnn.index)
# Show the last 100 rows.
X_test_cnn.tail(100)

## Siamese LSTM
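The code below contains both the Late Merge (LM) and the Intermediate Merge (IM) variants, each delimited by "start"/"end" comments. The Intermediate Merge variant is active by default; to use Late Merge instead, comment out the Intermediate Merge section and uncomment the Late Merge section.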

In [None]:
# Build the Siamese LSTM: a shared embedding and a shared LSTM encode both the
# main-product and the add-on input; the two encodings are merged with a normalized
# dot product and scored by a single sigmoid unit.

# One input per branch; the sequence length is taken from the tokenized data.
input_1 = Input(shape=(train_set_main.shape[1],))
input_2 = Input(shape=(train_set_addon.shape[1],))

# Shared, frozen embedding initialized from the word2vec weights.
# input_length is derived from the tokenized data rather than hard-coded, so it
# always agrees with the Input layers defined above.
common_embed = Embedding(input_dim=len(t.word_index)+1,
                         weights=[embedding_weights],
                         trainable=False,
                         output_dim=config.feature_dim,
                         input_length=train_set_main.shape[1])

# Despite their names, lstm_1 / lstm_2 hold the *embedded* sequences that are
# fed into the shared LSTM below.
lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)

# Shared LSTM; return_sequences=True keeps the output of every time step, which is
# then flattened into one vector per branch.
common_lstm = LSTM(config.nb_neurons_lstm,
                   return_sequences=True, 
                   activation=config.activation,
                   kernel_regularizer=l2(config.regularizer),
                   bias_regularizer=l2(config.regularizer),
                   activity_regularizer=l2(config.regularizer))

vector_1 = common_lstm(lstm_1)
vector_1 = Flatten(name='flatten1')(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten(name='flatten2')(vector_2)

#Intermediate merge start
# Merge first (normalized dot product), then apply the dense layer and dropout.
conc = dot([vector_1, vector_2],
           axes=1,
           normalize=True,
           name='dot')

x = Dense(config.nb_neurons_dense,
          activation=config.activation,
          name='conc_layer')(conc)

x = Dropout(config.dropout_rate)(x)
#Intermediate merge end

#Late merge start
# Apply a dense layer and dropout to each branch first, then merge. The merge result
# is bound to `x` because the output layer below consumes `x`; the two dense layers
# need distinct names because Keras requires unique layer names within a model.
# x_1 = Dense(config.nb_neurons_dense,
#             activation=config.activation,
#             name='dense_main')(vector_1)

# x_2 = Dense(config.nb_neurons_dense,
#             activation=config.activation,
#             name='dense_addon')(vector_2)

# x_1 = Dropout(config.dropout_rate)(x_1)
# x_2 = Dropout(config.dropout_rate)(x_2)

# x = dot([x_1, x_2],
#         axes=1,
#         normalize=True,
#         name='dot')
#Late merge end

# Sigmoid unit that produces the final similarity score.
out = Dense(1,
            activation="sigmoid",
            name='out')(x)

siamese_lstm = Model([input_1, input_2],
                     out)

siamese_lstm.compile(loss='binary_crossentropy',
                     optimizer=config.optimizer,
                     metrics=['accuracy'])

# TensorBoard logs go to a timestamped sub-directory of "logs-lstm"; training stops
# early when the validation loss has not improved for config.stop_epochs epochs.
logdir = os.path.join("logs-lstm",
                      datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(logdir, histogram_freq=1)
callbacks = [EarlyStopping(monitor='val_loss',patience=config.stop_epochs, verbose=1, mode='auto'),
             tensorboard]

Training the Siamese LSTM

In [None]:
# Train the Siamese LSTM on the (main, add-on) pairs; validation_split=0.1 holds out
# 10% of the training data for validation, and the callbacks defined with the model
# (early stopping and TensorBoard) are applied.
history_lstm = siamese_lstm.fit(
    x=[train_set_main, train_set_addon],
    y=y_train,
    batch_size=config.batch_size,
    epochs=config.nb_epochs,
    verbose=1,
    callbacks=callbacks,
    validation_split=0.1,
)

Testing the Siamese LSTM

In [None]:
# Score every (main, add-on) pair of the test set with the trained Siamese LSTM.
y_pred_lstm = siamese_lstm.predict(x=[test_set_main, test_set_addon], verbose=1)


Combining the prediction scores for the test set with the real values

In [None]:
# Build a results table for the Siamese LSTM: the test rows plus the predicted score
# (rounded to 3 decimals) and the true label.
# Work on a copy: a plain assignment would only alias X_test, so the new columns
# would be written into X_test itself and would overwrite same-named columns of any
# other table that aliases X_test.
X_test_lstm = X_test.copy()
X_test_lstm['predicted_label'] = pd.Series(np.round(y_pred_lstm.ravel(), 3), index=X_test_lstm.index)
X_test_lstm['real_label'] = pd.Series(y_test, index=X_test_lstm.index)
# Show the last 100 rows.
X_test_lstm.tail(100)

## Analyzing the results
All result graphs and metrics can be used for both siamese_cnn and siamese_lstm networks. We just need to change the name when we want to show specific outcomes for one of them. Where we do comparative analysis, we keep both model outputs in the graph.

### ROC - AUC curve

In [None]:
# ROC curve for siamese_cnn.
# scikit-plot expects one probability column per class, so stack
# P(class 0) = 1 - score next to P(class 1) = score.
y_probas_cnn = np.concatenate((1-y_pred_cnn, y_pred_cnn), axis=1)

# Create the axes up front and hand them to scikit-plot: when no `ax` is given it
# opens a figure of its own, which would leave a separately created figure empty.
fig, ax = plt.subplots()
skplt.metrics.plot_roc_curve(y_test, y_probas_cnn, ax=ax)
plt.show()

In [None]:
# ROC curve for siamese_lstm.
# scikit-plot expects one probability column per class, so stack
# P(class 0) = 1 - score next to P(class 1) = score.
y_probas_lstm = np.concatenate((1-y_pred_lstm, y_pred_lstm), axis=1)

# Create the axes up front and hand them to scikit-plot: when no `ax` is given it
# opens a figure of its own, which would leave a separately created figure empty.
fig, ax = plt.subplots()
skplt.metrics.plot_roc_curve(y_test, y_probas_lstm, ax=ax)
plt.show()

### Results and analysis

In [None]:
# Metrics for siamese_cnn.
# Hard labels: a score above 0.5 counts as a positive prediction.
cnn_labels = y_pred_cnn.ravel() > 0.5

# AUC is computed on the raw scores, accuracy on the thresholded labels; both in percent.
auc = 100 * sklearn.metrics.roc_auc_score(y_test, y_pred_cnn)
acc = 100 * sklearn.metrics.accuracy_score(y_test, cnn_labels)
print('AUC for Siamese CNN: %s\n' % auc)
print('Accuracy for Siamese CNN: %s\n' % acc)

print(sklearn.metrics.confusion_matrix(y_test, cnn_labels))
print(classification_report(y_test, cnn_labels))

# Overlaid score histograms: true negatives' scores in red, true positives' in green.
plt.figure(figsize=(10, 10))
for label_value, colour in ((0, 'red'), (1, 'green')):
    plt.hist(y_pred_cnn[y_test == label_value], bins=50, color=colour, alpha=0.7)
plt.text(0.4, 1000, "Siamese CNN", fontsize=18)
plt.xlabel("probability score")
plt.ylabel("samples")

In [None]:
# Metrics for siamese_lstm.
# Hard labels: a score above 0.5 counts as a positive prediction.
lstm_labels = y_pred_lstm.ravel() > 0.5

# AUC is computed on the raw scores, accuracy on the thresholded labels; both in percent.
auc = 100 * sklearn.metrics.roc_auc_score(y_test, y_pred_lstm)
acc = 100 * sklearn.metrics.accuracy_score(y_test, lstm_labels)
print('AUC for Siamese LSTM: %s\n' % auc)
print('Accuracy for Siamese LSTM: %s\n' % acc)

print(sklearn.metrics.confusion_matrix(y_test, lstm_labels))
print(classification_report(y_test, lstm_labels))

# Overlaid score histograms: true negatives' scores in red, true positives' in green.
plt.figure(figsize=(10, 10))
for label_value, colour in ((0, 'red'), (1, 'green')):
    plt.hist(y_pred_lstm[y_test == label_value], bins=50, color=colour, alpha=0.7)
plt.text(0.7, 3900, "Siamese LSTM", fontsize=18)
plt.xlabel("probability score")
plt.ylabel("samples")

### Plot accuracy and loss curve

In [None]:
# Show which metrics Keras recorded while training the Siamese CNN.
print(history_cnn.history.keys())
print("Siamese CNN")

# One figure per metric (accuracy first, then loss), each with the training curve
# and the matching validation curve stored under the 'val_' prefix.
for metric in ('accuracy', 'loss'):
    plt.plot(history_cnn.history[metric])
    plt.plot(history_cnn.history['val_' + metric])
    plt.title('Siamese CNN ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Show which metrics Keras recorded while training the Siamese LSTM.
print(history_lstm.history.keys())
print("Siamese LSTM")

# One figure per metric (accuracy first, then loss), each with the training curve
# and the matching validation curve stored under the 'val_' prefix.
for metric in ('accuracy', 'loss'):
    plt.plot(history_lstm.history[metric])
    plt.plot(history_lstm.history['val_' + metric])
    plt.title('Siamese LSTM ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()