In [None]:
from complementary_products_suggestions import helper_functions, embeddings, config, data_preprocessing
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.metrics import classification_report
import os
import datetime
import tensorflow.python as tf
from tensorflow.keras.layers import Input, LSTM, dot, Embedding, Conv1D, Flatten, Dense, Dropout, Activation, MaxPooling1D, ZeroPadding1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1, l2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

## Retrieving the datasets

In [None]:
# Load the product-match table (`database`) and the product-content table (`content`).
# NOTE(review): both files carry a .csv extension but are read with pd.read_pickle —
# confirm they really are pickled DataFrames. Unpickling can execute arbitrary code,
# so only load files that come from a trusted source.
database = pd.read_pickle("../dummy_sample_matches.csv")
content = pd.read_pickle("../dummy_sample_content.csv")

In [None]:
# Build one text field per row: "<main product title> <add-on title>".
database['combined'] = (
    database[['title_main', 'title_addon']]
    .apply(' '.join, axis=1)
)

# Splitting the data into train and test sets
We use GroupShuffleSplit so that products appearing as add-ons in the training set do not appear as add-ons in the test set. This ensures that model performance is evaluated on unseen products, as it would be in real-life scenarios.

In [None]:
# Split `database` into train/test features and labels using the project helper.
# NOTE(review): 0.2 is presumably the test-set fraction and single=True presumably
# selects the single-input (combined title) layout — confirm in helper_functions.
X_train, X_test, y_train, y_test = helper_functions.train_test_split(database, 0.2, single=True)

### Helper function reused in this notebook for every classifier

In [None]:
def results(y_pred, y_true=None, threshold=0.5):
    """Print evaluation metrics for binary-classification predictions.

    Prints, in order: the ROC AUC and the accuracy (both as percentages),
    the confusion matrix and the classification report.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores (or labels), one value per evaluated sample. The
        input is flattened, so an ``(n, 1)`` array is accepted as well as
        an ``(n,)`` array.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the notebook-level ``y_test``
        is used, which keeps existing ``results(y_pred)`` calls working.
    threshold : float, optional
        Scores strictly greater than this value count as positive
        predictions for the accuracy, confusion matrix and report.

    Returns
    -------
    None
    """
    if y_true is None:
        y_true = y_test

    # Flatten once and threshold once so every metric sees the same labels.
    scores = np.asarray(y_pred).ravel()
    predicted_labels = scores > threshold

    # AUC is computed on the raw scores; the other metrics on thresholded labels.
    auc = sklearn.metrics.roc_auc_score(y_true, scores) * 100
    acc = sklearn.metrics.accuracy_score(y_true, predicted_labels) * 100

    print('AUC: %s\n' % auc)
    print('Accuracy: %s\n' % acc)
    print(sklearn.metrics.confusion_matrix(y_true, predicted_labels))
    print(classification_report(y_true, predicted_labels))

# Random Forest

We use `CountVectorizer` to turn the combined product titles into word-count features before fitting the model.

In [None]:
# Train and test titles stacked into a single Series (not used for fitting below).
all_text = pd.concat([X_train['combined'], X_test['combined']])

# Word-count features: the vocabulary is learned from the training titles only,
# and the test titles are then encoded with that same vocabulary.
word_vectorizer = CountVectorizer(lowercase=True, analyzer='word')
train_features = word_vectorizer.fit_transform(X_train['combined'])
test_features = word_vectorizer.transform(X_test['combined'])

In [None]:
# Random forest baseline trained on the word-count features.
rf = RandomForestClassifier(n_estimators = 100, random_state = 42, verbose=1)
rf.fit(train_features, y_train)

# Keep the predicted probability of the positive class instead of the hard label
# returned by predict(): ROC AUC needs continuous scores to rank the samples, and
# results() applies the 0.5 threshold itself for the label-based metrics.
# NOTE(review): column 1 assumes the labels are 0/1 so that rf.classes_ == [0, 1].
y_pred_rf = rf.predict_proba(test_features)[:, 1]

Reporting the results using the previously defined function for analyzing the performance

In [None]:
# Print AUC, accuracy, confusion matrix and classification report for the random forest.
results(y_pred_rf)

## Word2vec and tokenization for the single neural networks

### Using Word2vec to create embeddings for each word in product titles based on the whole corpus

In [None]:
# Build the matrix that the Embedding layers below receive as their initial weights.
# NOTE(review): presumably one row per tokenizer index and config.feature_dim columns,
# trained with Word2vec on `content` — confirm in embeddings.word2vec.
embedding_weights = embeddings.word2vec(content, X_train)

## Tokenizing the data
We tokenize the combined data (main + add-on product) because we are now dealing with a single neural network.

In [None]:
# Tokenize the combined titles of the train and test sets. `t` is the tokenizer
# object whose word_index is used to size the Embedding layers of the networks.
# NOTE(review): 60 is presumably the maximum (padded) sequence length — confirm
# in helper_functions.tokenize_train_test_set.
t, train_set_combined, test_set_combined = helper_functions.tokenize_train_test_set(X_train, X_test, 60, single=True)

## Vanilla NN

In [None]:
# Input: one row of token ids per sample; the length is taken from the tokenized data.
sequence_input = Input(shape=(train_set_combined.shape[1],))

# Frozen embedding layer initialised with the precomputed embedding weights.
# input_length is derived from the tokenized data (instead of a hard-coded 60) so
# that it always agrees with the Input layer above.
embedding_layer = Embedding(input_dim =len(t.word_index)+1,  # +1: one more row than there are word_index entries
                            weights=[embedding_weights],
                            output_dim=config.feature_dim,
                            input_length=train_set_combined.shape[1],
                            trainable=False)

embedded_sequences = embedding_layer(sequence_input)

# Flatten the (sequence, feature) embeddings into one vector per sample, then
# apply a single hidden dense layer followed by dropout.
flatten = Flatten()(embedded_sequences)
x = Dense(config.nb_neurons_dense, activation=config.activation)(flatten)

x = Dropout(config.dropout_rate)(x)

# Single sigmoid unit: one score in [0, 1] per sample.
out = Dense(1, activation="sigmoid", name = 'out')(x)

vanilla_nn = Model(sequence_input,
                   out)

vanilla_nn.compile(loss='binary_crossentropy',
                   optimizer=config.optimizer,
                   metrics=['acc'])

In [None]:
# Train the vanilla network, holding out 20% of the training data for validation.
history_vanila_nn = vanilla_nn.fit(
    x=train_set_combined,
    y=y_train,
    epochs=config.nb_epochs,
    batch_size=config.batch_size,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Score the held-out test sequences with the trained vanilla network.
y_pred_vanilla_nn = vanilla_nn.predict(x=test_set_combined, verbose=1)


Results from the Vanilla NN

In [None]:
# Print AUC, accuracy, confusion matrix and classification report for the vanilla NN.
results(y_pred_vanilla_nn)

# Single LSTM

In [None]:
# Input: one row of token ids per sample; the length is taken from the tokenized data.
input_1 = Input(shape=(train_set_combined.shape[1],))

# Frozen embedding layer initialised with the precomputed embedding weights.
# input_length is derived from the tokenized data (instead of a hard-coded 60) so
# that it always agrees with the Input layer above.
common_embed = Embedding(input_dim =len(t.word_index)+1,  # +1: one more row than there are word_index entries
                         weights=[embedding_weights],
                         output_dim=config.feature_dim,
                         input_length=train_set_combined.shape[1],
                         trainable=False)

lstm_1 = common_embed(input_1)

# LSTM that returns its output at every time step (return_sequences=True); the
# kernel, bias and layer activity are all L2-regularised with the same strength.
common_lstm = LSTM(150,
                   return_sequences=True,
                   activation="relu",
                   kernel_regularizer=l2(config.regularizer),
                   bias_regularizer=l2(config.regularizer),
                   activity_regularizer=l2(config.regularizer))

vector_1 = common_lstm(lstm_1)
# Flatten the per-time-step outputs into one vector per sample for the dense head.
vector_1 = Flatten(name='flatten1')(vector_1)

x = Dense(config.nb_neurons_dense, activation=config.activation, name='conc_layer')(vector_1)

# NOTE(review): the dropout rate is hard-coded to 0.01 here, whereas the vanilla
# network uses config.dropout_rate — confirm this difference is intentional.
x = Dropout(0.01)(x)

# Single sigmoid unit: one score in [0, 1] per sample.
out = Dense(1, activation="sigmoid", name = 'out')(x)

single_lstm = Model(input_1, out)

single_lstm.compile(loss='binary_crossentropy',
                    optimizer=config.optimizer,
                    metrics=['accuracy'])

In [None]:
# Train the single LSTM, holding out 10% of the training data for validation.
history_single_lstm = single_lstm.fit(
    x=train_set_combined,
    y=y_train,
    epochs=config.nb_epochs,
    batch_size=config.batch_size,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Score the held-out test sequences with the trained single LSTM.
y_pred_single_lstm = single_lstm.predict(x=test_set_combined, verbose=1)


Reporting the results from the single LSTM

In [None]:
# Print AUC, accuracy, confusion matrix and classification report for the single LSTM.
results(y_pred_single_lstm)