# Using the Siamese LSTM weights to recast the problem as a KNN (cosine similarity) search

In [None]:
from complementary_products_suggestions import helper_functions, embeddings, config
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.metrics import classification_report
import os
import datetime
import timeit
import tensorflow.python as tf
from tensorflow.keras.layers import Input, LSTM, dot, Embedding, Conv1D, Flatten, Dense, Dropout, Activation, MaxPooling1D, ZeroPadding1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1, l2
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import seaborn as sns

## Retrieving the datasets

In [None]:
# Locations of the pickled sample data, relative to the notebook's directory.
# NOTE(review): pickle files can execute arbitrary code on load; only read
# files from a trusted source.
matches_path = "../dummy_sample_matches.pickle"
content_path = "../dummy_sample_content.pickle"

database = pd.read_pickle(matches_path)
content = pd.read_pickle(content_path)

In [None]:
# Display the loaded matches data for a quick visual check.
database

# Splitting the data into train and test sets
We use GroupShuffleSplit because we want to make sure that products that appear as add-ons in the train set do not appear as add-ons in the test set. This ensures that model performance is evaluated on unseen data (a real-life scenario).

In [None]:
# Split the matches data into train/test features and labels via the project helper.
# NOTE(review): 0.2 is presumably the test-set fraction — confirm against
# helper_functions.train_test_split.
X_train, X_test, y_train, y_test = helper_functions.train_test_split(database, 0.2)


## Using Word2vec to create embeddings for each word in product titles based on the whole corpus

In [None]:
# Build the word-embedding matrix with the project helper; it is passed as the
# initial weights of the Embedding layer when the model is defined.
# NOTE(review): presumably rows are ordered by the tokenizer's word index — confirm
# against embeddings.word2vec.
embedding_weights = embeddings.word2vec(content, X_train)

## Tokenizing the data

In [None]:
# Tokenize the train/test titles with the project helper; `t` is the tokenizer and
# the four arrays are the encoded main / add-on titles for each split.
# NOTE(review): 30 is presumably the padded sequence length — TODO confirm against
# helper_functions.tokenize_train_test_set.
t, train_set_main, train_set_addon, test_set_main, test_set_addon = helper_functions.tokenize_train_test_set(X_train, X_test, 30)

In [None]:
# Siamese LSTM: both product titles go through the SAME embedding and LSTM layers
# (shared weights); the two flattened representations are compared with a
# normalized dot product and the result is mapped to a match probability.

# Take the sequence length from the tokenized data instead of hard-coding it, so
# the Input and Embedding layers always agree with the padded arrays.
sequence_length = train_set_main.shape[1]

input_1 = Input(shape=(sequence_length,))
input_2 = Input(shape=(train_set_addon.shape[1],))

# Frozen embedding initialised from `embedding_weights`; index 0 is reserved for
# padding, hence the +1 on the vocabulary size.
common_embed = Embedding(input_dim=len(t.word_index) + 1,
                         weights=[embedding_weights],
                         trainable=False,
                         output_dim=config.feature_dim,
                         input_length=sequence_length)

# NOTE: despite their names, lstm_1 / lstm_2 hold the embedded sequences that are
# fed INTO the LSTM.
lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)

# return_sequences=True keeps the output of every timestep; Flatten then turns the
# (timesteps, units) output into a single vector per title.
common_lstm = LSTM(config.nb_neurons_lstm,
                   return_sequences=True,
                   activation=config.activation,
                   kernel_regularizer=l2(config.regularizer),
                   bias_regularizer=l2(config.regularizer),
                   activity_regularizer=l2(config.regularizer))

vector_1 = common_lstm(lstm_1)
vector_1 = Flatten(name='flatten1')(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten(name='flatten2')(vector_2)

# normalize=True L2-normalizes both vectors first, so this is their cosine similarity.
conc = dot([vector_1, vector_2],
           axes=1,
           normalize=True,
           name='dot')

x = Dense(config.nb_neurons_dense,
          activation=config.activation,
          name='conc_layer')(conc)

x = Dropout(config.dropout_rate)(x)

# Single sigmoid unit: probability that the pair is a match.
out = Dense(1,
            activation="sigmoid",
            name='out')(x)

siamese_lstm = Model([input_1, input_2],
                     out)

siamese_lstm.compile(loss='binary_crossentropy',
                     optimizer=config.optimizer,
                     metrics=['accuracy'])

# One TensorBoard log directory per run, named by timestamp.
logdir = os.path.join("logs-lstm",
                      datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(logdir, histogram_freq=1)
# Stop training once val_loss has not improved for `config.stop_epochs` epochs.
callbacks = [EarlyStopping(monitor='val_loss', patience=config.stop_epochs, verbose=1, mode='auto'),
             tensorboard]

Training the Siamese LSTM

In [None]:
# Fit the Siamese LSTM on the (main, add-on) title pairs, with 10% of the training
# data used for validation (monitored by the early-stopping callback).
training_inputs = [train_set_main, train_set_addon]
fit_options = dict(
    validation_split=0.1,
    batch_size=config.batch_size,
    epochs=config.nb_epochs,
    callbacks=callbacks,
    verbose=1,
)
history_lstm = siamese_lstm.fit(training_inputs, y_train, **fit_options)

Testing the Siamese LSTM

In [None]:
# Score the test pairs and report how long the prediction took (wall-clock seconds).
test_inputs = [test_set_main, test_set_addon]

start = timeit.default_timer()
y_pred_lstm = siamese_lstm.predict(test_inputs, verbose=1)
stop = timeit.default_timer()

elapsed = stop - start
print(f"Time: {elapsed}")

Combining the prediction scores for the test set with the real values

In [None]:
# Work on a copy: a plain assignment would only alias X_test, and the two new
# columns would then be added to X_test itself every time this cell runs.
X_test_lstm = X_test.copy()
# Predicted score per test pair, rounded to 3 decimals.
X_test_lstm['predicted_label'] = pd.Series(np.round(y_pred_lstm.ravel(), 3), index=X_test_lstm.index)
# NOTE(review): if y_test is a pandas Series, this aligns on index labels rather
# than on position — confirm its index matches X_test.
X_test_lstm['real_label'] = pd.Series(y_test, index=X_test_lstm.index)
X_test_lstm.tail(100)

In [None]:
# Test-set metrics for the Siamese LSTM (the model that uses the frozen word2vec
# embedding weights).
y_score = y_pred_lstm.ravel()   # flat vector of predicted match probabilities
y_pred_label = y_score > 0.5    # hard labels at the 0.5 decision threshold

auc = sklearn.metrics.roc_auc_score(y_test, y_score)*100
acc = sklearn.metrics.accuracy_score(y_test, y_pred_label)*100
print('AUC %s\n' % auc)
print('Accuracy %s\n' % acc)
print(sklearn.metrics.confusion_matrix(y_test, y_pred_label))
print(classification_report(y_test, y_pred_label))

## Saving the weights from the NN before the dot product happens (basically saving the product embeddings/representations)

### Transforming the test set into two lists of target and candidate products
We only store unique products.

In [None]:
# Map product id -> title for every product in the test set. Building a dict
# de-duplicates the ids (the last title seen for an id wins).
main_titles = pd.Series(X_test['title_main'].values, index=X_test['id_main'])
addon_titles = pd.Series(X_test['title_addon'].values, index=X_test['id_addon'])

target_products_dict = main_titles.to_dict()
# Number of unique main products, before the add-on products are merged in.
print(len(target_products_dict))
target_products_dict.update(addon_titles.to_dict())

In [None]:
# Size of the full dataset, for comparison with the train/test sizes.
len(database)

In [None]:
# Number of rows in the training split.
len(X_train)

In [None]:
# Number of rows in the test split.
len(X_test)

In this case we don't know which products are targets and which are candidates, so we put all products from the test set in both sets.

In [None]:
# Candidates are the same products as the targets. This binds a second name to the
# SAME dict object (no copy), so mutating one also changes the other.
candidate_products_dict = target_products_dict

### Tokenizing the target and candidate products

In [None]:
# Encode the product titles with the tokenizer `t` that produced the training
# sequences. Re-fitting a fresh Tokenizer here could assign different word indices
# than the ones the model was trained with, and re-using the names `t`,
# `test_set_main` and `test_set_addon` would clobber the objects the earlier
# prediction cell depends on.
# Pad to the length the model was built for.
title_length = train_set_main.shape[1]

# NOTE(review): padding='post' is kept from the original cell — confirm it matches
# the padding used inside helper_functions.tokenize_train_test_set.
target_sequences = t.texts_to_sequences(list(target_products_dict.values()))
target_products = pad_sequences(target_sequences, maxlen=title_length, padding='post')

candidate_sequences = t.texts_to_sequences(list(candidate_products_dict.values()))
candidate_products = pad_sequences(candidate_sequences, maxlen=title_length, padding='post')

### Saving the weights for the target and candidate products 

In [None]:
# Encoder for the first branch: padded title -> flattened LSTM representation.
# The siamese model has two inputs, but 'flatten1' depends only on the first one;
# building the sub-model on that single input lets predict() take one array
# (a two-input model rejects a single array).
m2 = Model(inputs=siamese_lstm.inputs[0], outputs=siamese_lstm.get_layer('flatten1').output)
target_product_weights = m2.predict(target_products)

In [None]:
# Encoder for the second branch: padded title -> flattened LSTM representation.
# Built on the second siamese input and 'flatten2' so predict() takes a single
# array (a two-input model rejects a single array). The embedding and LSTM layers
# are shared between the branches, so both branches encode a title the same way.
m2 = Model(inputs=siamese_lstm.inputs[1], outputs=siamese_lstm.get_layer('flatten2').output)
candidate_product_weights = m2.predict(candidate_products)

### Calculating the cosine similarity between the vectors of the target and candidate products

In [None]:
# Pairwise cosine similarity: rows are target products, columns are candidate
# products. The candidate representations computed in the previous cell are used
# for Y instead of passing the target matrix twice.
dot_product = sklearn.metrics.pairwise.cosine_similarity(target_product_weights, Y=candidate_product_weights, dense_output=True)

Creating a dataframe

In [None]:
# Label both axes of the similarity matrix with the product ids, in the same order
# in which the titles were encoded.
product_ids = list(target_products_dict.keys())
dot_product_df = pd.DataFrame(dot_product, index=product_ids, columns=product_ids)

In [None]:
# Display the product-by-product cosine-similarity table.
dot_product_df

Some analysis for specific products

### Finding the top K closest products to the selected one 

In [None]:
# Rows (products) with the highest cosine similarity to the selected product.
# NOTE(review): the selected product itself is presumably among the results, since
# every product is both a target and a candidate.
selected_product = '2'
top_k = 5
dot_product_df.nlargest(top_k, columns=selected_product)

Creating a heatmap of the cosine similarities

In [None]:
# Heatmap of the top-left 2x2 corner of the similarity table, drawn on the axes
# created here.
fig, ax = plt.subplots(figsize=(20, 10))
similarity_corner = dot_product_df.iloc[0:2, 0:2]
ax = sns.heatmap(similarity_corner, cmap='RdYlGn', linewidths=1, annot=True, ax=ax)