In [1]:
from helpers.helper_funcs import (import_data,
                                  create_customer_sessions,
                                  make_prod_index
                                 )
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding
from random import choices
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf

# Seed NumPy's global RNG so the np.random draws in this notebook repeat on a fresh run.
# NOTE(review): only NumPy is seeded here; no TensorFlow seed is set in the visible cells,
# so the trained weights may still differ between runs — confirm whether that is intended.
np.random.seed(0)

In [2]:
# Hide every GPU from TensorFlow so the rest of the notebook runs on the CPU.
# Only the failures that set_visible_devices is documented to raise are handled;
# a bare `except:` would also hide KeyboardInterrupt and a failing sanity check.
try:
    # Disable all GPUs
    tf.config.set_visible_devices([], 'GPU')
    visible_devices = tf.config.get_visible_devices()
    for device in visible_devices:
        assert device.device_type != 'GPU'
except (ValueError, RuntimeError) as err:
    # Invalid device or cannot modify virtual devices once initialized.
    # Best effort: report it and carry on instead of failing silently.
    print("Could not hide GPUs from TensorFlow: {e}".format(e=err))

#### Step 1: Load the customer sessions

In [3]:
# Build the customer session table with the project helper.
# NOTE(review): judging by the preview below, each row holds a CustomerID and a list of
# StockCodes — confirm against helpers.helper_funcs.
customer_sessions = create_customer_sessions()
# Bare last expression so the notebook renders the first row as a quick preview.
customer_sessions.head(1)

Unnamed: 0,CustomerID,StockCode
0,12347,"[85116, 22375, 71477, 22492, 22771, 22772, 227..."


#### Step 2: Tokenize the sessions and train an LSTM on them

#### FYI: the LSTM is not expected to work well on this data because the training labels are just a random assignment of 0s and 1s

In [4]:
# Turn every session into one whitespace-separated "sentence" of stock codes so the
# Keras text Tokenizer can treat each stock code as a word.
stock_codes = customer_sessions["StockCode"].apply(lambda codes: " ".join(codes))

# Model / preprocessing sizes used below.
vocabulary_size = 3684   # passed to both Tokenizer(num_words=...) and the Embedding input size
embedding_dim = 40       # width of each learned product vector
lstm_units = 40
max_session_len = 50     # sessions are padded/truncated to this many tokens

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(stock_codes)

sequences = tokenizer.texts_to_sequences(stock_codes)
data = pad_sequences(sequences, maxlen=max_session_len)

# Embedding -> LSTM -> single sigmoid unit (binary classifier over a session).
model_lstm = Sequential([
    Embedding(vocabulary_size, embedding_dim, input_length=max_session_len),
    LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid"),
])
model_lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])

# Random 0/1 labels for the sake of the demo; the model should/could instead be trained
# on sequences of browsed products labelled by whether they ended in a sale.
demo_labels = np.random.randint(low=0, high=2, size=len(data))

with tf.device("/cpu:0"):
    model_lstm.fit(data, demo_labels, validation_split=0.4, epochs=3)

2021-10-09 22:01:25.184752: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-09 22:01:25.184901: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/3
Epoch 2/3
Epoch 3/3


#### LSTM Model approach

In [5]:
# The Embedding is the first layer of the model; its only weight tensor is the
# lookup table with one row per token index.
embedding_layer = model_lstm.layers[0]
lstm_embds = embedding_layer.get_weights()[0]

p, d = lstm_embds.shape
print("\n{d}-D vectors for {p} products\n".format(p=p, d=d))

# Lookup from (upper-cased) stock code to its description, built by the project helper.
stock_descrip = make_prod_index()

# Preview the vectors of the entries that come before token index 3.
preview_template = "Index: {i}, StockCode: {s}, Descrip: {d}\n Vector: [{v} ... x_40] "
for index, word in tokenizer.index_word.items():
    if index == 3:
        break
    leading_values = [str(value) for value in lstm_embds[index][:5]]
    description = stock_descrip[word.upper()]
    print(preview_template.format(i=index,
                                  s=word,
                                  d=description,
                                  v=', '.join(leading_values)))


40-D vectors for 3684 products

Index: 1, StockCode: 85123a, Descrip: cream hanging heart t-light holder
 Vector: [0.05075116, -0.04993635, -0.014939041, 0.019268792, -0.047574632 ... x_40] 
Index: 2, StockCode: 22423, Descrip: regency cakestand 3 tier
 Vector: [-0.045232665, 0.014394014, -0.008984042, -0.034486726, 0.02907243 ... x_40] 


#### Recommendations don't make sense here since we don't have real labeled data for the sequences
 - with real labels, an LSTM should be able to learn (product) vectors from sequential browsing patterns that lead to a sale (or no sale)
 - this model could be trained to make recommendations that are more likely to end in a sale
 - the model could be applied in real time: while a customer browses/clicks through a sequence of products, the recommendations would nudge them toward a purchase

In [6]:
# Pick one random product and list the products whose embedding vectors are closest
# to it by cosine similarity.
#
# Row 0 of the embedding matrix is the padding slot (pad_sequences pads with 0 and Keras
# word indices start at 1), so it has no entry in tokenizer.index_word. It must never be
# sampled as the query nor returned as a neighbour, otherwise the lookup raises KeyError.
# The upper bound is also capped at the number of words actually indexed, in case the
# corpus holds fewer than num_words - 1 distinct stock codes.
n_indexed = min(tokenizer.num_words, len(tokenizer.index_word) + 1)
n_recommendations = 10

stockitem = np.random.randint(1, n_indexed)
query_code = tokenizer.index_word[stockitem].upper()

print("\n\n==============================================================================")
print("""
Find similar items to:\n 
    StockCode: {s}, Descrip: {d}
""".format(s=query_code,
           d=stock_descrip[query_code]
          ))
print("==============================================================================")

# Similarity of the query vector to every real product row, best match first.
similarities = cosine_similarity(lstm_embds[stockitem].reshape(1, -1),
                                 lstm_embds[:n_indexed])[0]
ranked = np.argsort(similarities)[::-1]
# Drop the padding row and the query itself, then keep a fixed number of neighbours.
neighbours = [idx for idx in ranked if idx != 0 and idx != stockitem][:n_recommendations]

for item_index in neighbours:
    print("\t StockCode: {s}, Descrip: {d}".format(s=tokenizer.index_word[item_index],
                                                   d=stock_descrip[tokenizer.index_word[item_index].upper()]))
print("==============================================================================")




Find similar items to:
 
    StockCode: 22396, Descrip: magnets pack of 4 retro photo

	 StockCode: 23185, Descrip: french style storage jar jam
	 StockCode: 22299, Descrip: pig keyring with light & sound 
	 StockCode: 79066k, Descrip: retro mod tray
	 StockCode: 16045, Descrip: popart wooden pencils asst
	 StockCode: 84976, Descrip: rectangular shaped mirror
	 StockCode: 72349b, Descrip: set/6 purple butterfly t-lights
	 StockCode: 20932, Descrip: pink pot plant candle
	 StockCode: 21509, Descrip: cowboys and indians birthday card 
	 StockCode: 21894, Descrip: potting shed seed envelopes
	 StockCode: 20769, Descrip: daisy journal 


In [7]:
# Pick a random pair of products that are already close to each other (cosine >= 0.7)
# and list the products closest to the mean of their two embedding vectors.
#
# Row 0 of the embedding matrix is the padding slot (pad_sequences pads with 0 and Keras
# word indices start at 1), so it has no entry in tokenizer.index_word and is excluded
# both from sampling and from the results. The upper bound is capped at the number of
# words actually indexed, in case there are fewer than num_words - 1 distinct stock codes.
n_indexed = min(tokenizer.num_words, len(tokenizer.index_word) + 1)
n_recommendations = 10
sim_threshold = 0.7
max_attempts = 1000000  # upper bound so the search cannot spin forever

sim = 0
for _ in range(max_attempts):
    idxs = np.random.randint(1, n_indexed, size=2)
    if idxs[0] == idxs[1]:
        # Same product drawn twice: similarity would trivially be 1.0, so redraw.
        continue
    stockitems = lstm_embds[idxs]
    sim = cosine_similarity(stockitems)[0, 1]
    if sim >= sim_threshold:
        break
else:
    raise RuntimeError(
        "No pair of products with cosine similarity >= {t} found in {n} attempts".format(
            t=sim_threshold, n=max_attempts))

print("\n\n==============================================================================")
print("""
Find similar items to:\n 
    StockCode: {s}, Descrip: {d}
    StockCode: {s1}, Descrip: {d1}
""".format(s=tokenizer.index_word[idxs[0]].upper(), 
           d=stock_descrip[tokenizer.index_word[idxs[0]].upper()],
           s1=tokenizer.index_word[idxs[1]].upper(), 
           d1=stock_descrip[tokenizer.index_word[idxs[1]].upper()]
          ))
print("==============================================================================")

# Rank every real product row against the average of the two query vectors, best first.
pair_centroid = stockitems.mean(axis=0).reshape(1, -1)
similarities = cosine_similarity(pair_centroid, lstm_embds[:n_indexed])[0]
ranked = np.argsort(similarities)[::-1]
# Drop the padding row and the two query products, then keep a fixed number of results.
neighbours = [idx for idx in ranked if idx != 0 and idx not in idxs][:n_recommendations]

for item_index in neighbours:
    print("\t StockCode: {s}, Descrip: {d}".format(s=tokenizer.index_word[item_index],
                                                   d=stock_descrip[tokenizer.index_word[item_index].upper()]))
print("==============================================================================")




Find similar items to:
 
    StockCode: 23075, Descrip: double ceramic parlour hook
    StockCode: 21058, Descrip: party invites woodland

	 StockCode: 22334, Descrip: dinosaur party bag + sticker set
	 StockCode: 22217, Descrip: t-light holder hanging lace
	 StockCode: 22938, Descrip: cupcake lace paper set 6
	 StockCode: 84792, Descrip: enchanted bird coathanger 5 hook
	 StockCode: 79403, Descrip: frosted white base 
	 StockCode: 22129, Descrip: party cones candy tree decoration
	 StockCode: 22737, Descrip: ribbon reel christmas present 
	 StockCode: 22107, Descrip: pizza plate in box
	 StockCode: 22496, Descrip: set of 2 round tins dutch cheese
	 StockCode: 23528, Descrip: spaceboy wall art
