# Testing the value of adding word2vec embeddings before the Embedding layer

In [None]:
from complementary_products_suggestions import helper_functions, embeddings, config
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.metrics import classification_report
import os
import datetime
import timeit
import tensorflow.python as tf
from tensorflow.keras.layers import Input, LSTM, dot, Embedding, Conv1D, Flatten, Dense, Dropout, Activation, MaxPooling1D, ZeroPadding1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1, l2

## Retrieving the datasets

In [None]:
# Load the two input tables used by the rest of the notebook: the product
# match pairs (`database`) and the product content (`content`).
# NOTE(review): both paths end in ".csv" but are read with pd.read_pickle —
# presumably the files are pickled DataFrames despite the extension; confirm,
# or switch to pd.read_csv if they are real CSV files.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
database = pd.read_pickle("../dummy_sample_matches.csv")
content = pd.read_pickle("../dummy_sample_content.csv")

In [None]:
# Show the loaded matches table as the cell output for a quick visual check.
database

# Splitting the data into train and test sets
We use GroupShuffleSplit because we want to make sure that products appearing as add-ons in the train set do not appear as add-ons in the test set. This way the model's performance is evaluated on unseen data, as in real-life scenarios.

In [None]:
# Split `database` into train/test features and labels with the project helper.
# NOTE(review): 0.2 is presumably the test-set fraction — confirm against
# helper_functions.train_test_split.
X_train, X_test, y_train, y_test = helper_functions.train_test_split(database, 0.2)

## Using Word2vec to create embeddings for each word in product titles based on the whole corpus

In [None]:
# Build the word2vec weight matrix with the project helper; it is used as the
# initial weights of the Embedding layer in the "with word2vec" model.
# NOTE(review): how `content` and `X_train` are combined (corpus vs. vocabulary)
# is defined inside embeddings.word2vec — confirm there.
embedding_weights = embeddings.word2vec(content, X_train)

## Tokenizing the data

In [None]:
# Tokenize the train and test sets with the project helper. It returns `t`
# (whose `word_index` sizes the Embedding vocabulary) plus the main-product and
# add-on arrays for the train and test sets.
# NOTE(review): 30 is presumably the padded sequence length — confirm against
# helper_functions.tokenize_train_test_set.
t, train_set_main, train_set_addon, test_set_main, test_set_addon = helper_functions.tokenize_train_test_set(X_train, X_test, 30)

# Siamese LSTM with pretrained embeddings 

In [None]:
# Siamese LSTM whose shared Embedding layer starts from the word2vec weights
# and is fine-tuned during training (trainable=True).

# One token-sequence input per branch: the main product and the add-on product.
input_1 = Input(shape=(train_set_main.shape[1],))
input_2 = Input(shape=(train_set_addon.shape[1],))

# Embedding shared by both branches. input_length is taken from the tokenized
# data instead of being hard-coded, so it cannot drift from the sequence length
# chosen at tokenization time.
# NOTE(review): the +1 on input_dim presumably accounts for index 0 being
# reserved for padding — confirm against the tokenizer helper.
common_embed = Embedding(input_dim=len(t.word_index) + 1,
                         weights=[embedding_weights],
                         trainable=True,
                         output_dim=config.feature_dim,
                         input_length=train_set_main.shape[1])

lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)

# A single LSTM instance applied to both branches, so the two towers share
# weights. return_sequences=True keeps the output of every time step.
common_lstm = LSTM(config.nb_neurons_lstm,
                   return_sequences=True,
                   activation=config.activation,
                   kernel_regularizer=l2(config.regularizer),
                   bias_regularizer=l2(config.regularizer),
                   activity_regularizer=l2(config.regularizer))

# Flatten the per-time-step outputs into one vector per product.
vector_1 = common_lstm(lstm_1)
vector_1 = Flatten(name='flatten1')(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten(name='flatten2')(vector_2)

# Despite the name `conc`, this is a dot product of the two branch vectors;
# with normalize=True both vectors are L2-normalised first, which makes the
# result their cosine similarity.
conc = dot([vector_1, vector_2],
           axes=1,
           normalize=True,
           name='dot')

# Small classification head on top of the similarity score.
x = Dense(config.nb_neurons_dense,
          activation=config.activation,
          name='conc_layer')(conc)

x = Dropout(config.dropout_rate)(x)

out = Dense(1,
            activation="sigmoid",
            name='out')(x)

siamese_lstm_with_word2vec = Model([input_1, input_2],
                                   out)

siamese_lstm_with_word2vec.compile(loss='binary_crossentropy',
                                   optimizer=config.optimizer,
                                   metrics=['accuracy'])

# TensorBoard logs go to a timestamped sub-directory of "logs-lstm"; training
# stops early once val_loss has not improved for config.stop_epochs epochs.
logdir = os.path.join("logs-lstm",
                      datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(logdir, histogram_freq=1)
callbacks = [EarlyStopping(monitor='val_loss', patience=config.stop_epochs, verbose=1, mode='auto'),
             tensorboard]

In [None]:
# Train the word2vec-initialised model and report the wall-clock training time.
# The last 10% of the training data is held out for validation.
fit_options = dict(validation_split=0.1,
                   batch_size=config.batch_size,
                   epochs=config.nb_epochs,
                   callbacks=callbacks,
                   verbose=1)

start = timeit.default_timer()
history_lstm_with_word2vec = siamese_lstm_with_word2vec.fit(
    [train_set_main, train_set_addon],
    y_train,
    **fit_options,
)
stop = timeit.default_timer()

elapsed = stop - start
print(f"Time: {elapsed}")

In [None]:
# Score every (main product, add-on) pair of the test set with the trained
# word2vec-initialised model.
test_pairs = [test_set_main, test_set_addon]
y_pred_lstm_with_word2vec = siamese_lstm_with_word2vec.predict(test_pairs, verbose=1)


Combining the prediction scores for the test set with the real labels

In [None]:
# Build a results table for this model: the test rows plus the predicted score
# (rounded to 3 decimals) and the true label.
# Work on a copy of X_test. A plain assignment would only alias the frame, so
# the new columns would be written into X_test itself and any other cell that
# annotates X_test the same way would overwrite this model's scores.
X_test_lstm_with_word2vec = X_test.copy()
X_test_lstm_with_word2vec['predicted_label'] = pd.Series(
    np.round(y_pred_lstm_with_word2vec.ravel(), 3),
    index=X_test_lstm_with_word2vec.index)
X_test_lstm_with_word2vec['real_label'] = pd.Series(
    y_test,
    index=X_test_lstm_with_word2vec.index)
# Show the last 100 rows as the cell output.
X_test_lstm_with_word2vec.tail(100)

# Siamese LSTM without pretrained word embeddings 
The only difference is in the Embedding layer: the *weights* and *trainable* parameters are removed, so the embeddings are no longer initialised from word2vec.

In [None]:
# Siamese LSTM baseline: same architecture as the word2vec variant, but the
# shared Embedding layer receives no pretrained weights.

# One token-sequence input per branch: the main product and the add-on product.
input_1 = Input(shape=(train_set_main.shape[1],))
input_2 = Input(shape=(train_set_addon.shape[1],))

# Embedding shared by both branches. input_length is taken from the tokenized
# data instead of being hard-coded, so it cannot drift from the sequence length
# chosen at tokenization time.
# NOTE(review): the +1 on input_dim presumably accounts for index 0 being
# reserved for padding — confirm against the tokenizer helper.
common_embed = Embedding(input_dim=len(t.word_index) + 1,
                         output_dim=config.feature_dim,
                         input_length=train_set_main.shape[1])

lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)

# A single LSTM instance applied to both branches, so the two towers share
# weights. return_sequences=True keeps the output of every time step.
common_lstm = LSTM(config.nb_neurons_lstm,
                   return_sequences=True,
                   activation=config.activation,
                   kernel_regularizer=l2(config.regularizer),
                   bias_regularizer=l2(config.regularizer),
                   activity_regularizer=l2(config.regularizer))

# Flatten the per-time-step outputs into one vector per product.
vector_1 = common_lstm(lstm_1)
vector_1 = Flatten(name='flatten1')(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten(name='flatten2')(vector_2)

# Despite the name `conc`, this is a dot product of the two branch vectors;
# with normalize=True both vectors are L2-normalised first, which makes the
# result their cosine similarity.
conc = dot([vector_1, vector_2],
           axes=1,
           normalize=True,
           name='dot')

# Small classification head on top of the similarity score.
x = Dense(config.nb_neurons_dense,
          activation=config.activation,
          name='conc_layer')(conc)

x = Dropout(config.dropout_rate)(x)

out = Dense(1,
            activation="sigmoid",
            name='out')(x)


siamese_lstm_without_word2vec = Model([input_1, input_2],
                                      out)

siamese_lstm_without_word2vec.compile(loss='binary_crossentropy',
                                      optimizer=config.optimizer,
                                      metrics=['accuracy'])

# TensorBoard logs go to a timestamped sub-directory of "logs-lstm"; training
# stops early once val_loss has not improved for config.stop_epochs epochs.
logdir = os.path.join("logs-lstm",
                      datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(logdir, histogram_freq=1)
callbacks = [EarlyStopping(monitor='val_loss', patience=config.stop_epochs, verbose=1, mode='auto'),
             tensorboard]

In [None]:
start = timeit.default_timer()
history_lstm_without_word2vec = siamese_lstm_without_word2vec.fit([train_set_main, train_set_addon],
                                                                  y_train,
                                                                  validation_split=0.1,
                                                                  batch_size=config.batch_size,
                                                                  epochs=config.nb_epochs,
                                                                  callbacks=callbacks,
                                                                  verbose=1)
stop = timeit.default_timer()
print(f"Time: {stop-start}")

In [None]:
# Score every (main product, add-on) pair of the test set with the trained
# baseline model.
test_pairs = [test_set_main, test_set_addon]
y_pred_lstm_without_word2vec = siamese_lstm_without_word2vec.predict(test_pairs, verbose=1)

In [None]:
# Build a results table for this model: the test rows plus the predicted score
# (rounded to 3 decimals) and the true label.
# Work on a copy of X_test. A plain assignment would only alias the frame, so
# the new columns would be written into X_test itself and would overwrite the
# scores of any other results table that shares the same underlying frame.
X_test_lstm_without_word2vec = X_test.copy()
X_test_lstm_without_word2vec['predicted_label'] = pd.Series(
    np.round(y_pred_lstm_without_word2vec.ravel(), 3),
    index=X_test_lstm_without_word2vec.index)
X_test_lstm_without_word2vec['real_label'] = pd.Series(
    y_test,
    index=X_test_lstm_without_word2vec.index)
# Show the last 100 rows as the cell output.
X_test_lstm_without_word2vec.tail(100)

# Comparative Analysis

In [None]:
# Compare the validation curves of the two training runs.
# Order matters: it must match the legend entries below.
runs = (history_lstm_with_word2vec, history_lstm_without_word2vec)

# Print the metrics recorded in each History object.
for run in runs:
    print(run.history.keys())

# One figure per validation metric: (history key, y-axis label, legend position).
panels = (
    ('val_accuracy', 'accuracy', 'lower right'),
    ('val_loss', 'loss', 'upper right'),
)
for history_key, axis_label, legend_position in panels:
    for run in runs:
        plt.plot(run.history[history_key])
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['with Word2vec', 'without Word2vec'], loc=legend_position)
    plt.show()

In [None]:
# Test-set evaluation of the Siamese LSTM with word2vec.
# Scores above 0.5 are treated as the positive class.
predicted_positive = y_pred_lstm_with_word2vec.ravel() > 0.5

# AUC uses the raw scores, accuracy the thresholded ones; both as percentages.
auc = sklearn.metrics.roc_auc_score(y_test, y_pred_lstm_with_word2vec)*100
acc = sklearn.metrics.accuracy_score(y_test, predicted_positive)*100
print('AUC for Siamese LSTM with word2vec %s\n' % auc)
print('Accuracy for Siamese LSTM with word2vec: %s\n' % acc)

print(sklearn.metrics.confusion_matrix(y_test, predicted_positive))
print(classification_report(y_test, predicted_positive))

# Overlaid score histograms: true negatives in red, true positives in green.
plt.figure(figsize=(10,10))
for true_label, colour in ((0, 'red'), (1, 'green')):
    plt.hist(y_pred_lstm_with_word2vec[y_test == true_label], bins=50, color=colour, alpha=0.7)
plt.text(0.2, 5000, "Siamese LSTM with word2vec", fontsize=18)
plt.xlabel("probability score")
plt.ylabel("samples")

In [None]:
# Test-set evaluation of the Siamese LSTM without word2vec.
# Scores above 0.5 are treated as the positive class.
predicted_positive = y_pred_lstm_without_word2vec.ravel() > 0.5

# AUC uses the raw scores, accuracy the thresholded ones; both as percentages.
auc = sklearn.metrics.roc_auc_score(y_test, y_pred_lstm_without_word2vec)*100
acc = sklearn.metrics.accuracy_score(y_test, predicted_positive)*100
print('AUC for Siamese LSTM without word2vec %s\n' % auc)
print('Accuracy for Siamese LSTM without word2vec: %s\n' % acc)
print(sklearn.metrics.confusion_matrix(y_test, predicted_positive))
print(classification_report(y_test, predicted_positive))

# Overlaid score histograms: true negatives in red, true positives in green.
plt.figure(figsize=(10,10))
for true_label, colour in ((0, 'red'), (1, 'green')):
    plt.hist(y_pred_lstm_without_word2vec[y_test == true_label], bins=50, color=colour, alpha=0.7)
plt.text(0.2, 3900, "Siamese LSTM without word2vec", fontsize=18)
plt.xlabel("probability score")
plt.ylabel("samples")