In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

In [2]:
import os
import sys
import gc
import datetime

In [3]:
import absl.logging
# Only let absl log messages of severity ERROR through.
absl.logging.set_verbosity(absl.logging.ERROR)

In [4]:
import tensorflow as tf

<h3>Load sequences</h3>

In [5]:
%%time
# Load the cleaned sequences table; its 'items' column (one list of tokens per row) is what
# the rest of the notebook works on.
data = pd.read_parquet('../data/clean/sequences.parquet')

CPU times: user 225 ms, sys: 50.3 ms, total: 276 ms
Wall time: 313 ms


In [6]:
data.shape

(130752, 2)

<h4>Clip by length</h4>

In [7]:
# keep only users whose item count is strictly greater than MIN_USER_SEQ_LEN and strictly less than MAX_USER_SEQ_LEN

In [8]:
# number of tokens in each row's item list
seq_len = data['items'].map(len)

In [9]:
# The 0.95 and 0.99 quantiles of the sequence length are the cut-offs for the longest 5% and 1% of rows.
print('top 5%:', np.quantile(seq_len.values, 0.95), 'top 1%:', np.quantile(seq_len.values, 0.99))

top 5%: 25.0 top 0.01%: 56.0


In [10]:
# Exclusive bounds on the number of items per user: the filter below drops sequences
# whose length is <= MIN_USER_SEQ_LEN or >= MAX_USER_SEQ_LEN.
MAX_USER_SEQ_LEN = 300
MIN_USER_SEQ_LEN = 3

In [11]:
# item lists of the users whose sequence length lies strictly between the two bounds
clipped_df = data['items'].loc[(MIN_USER_SEQ_LEN < seq_len) & (seq_len < MAX_USER_SEQ_LEN)]
clipped_df.shape

(64828,)

In [12]:
clipped_df

4         [ИГРУШКИ ДЛЯ МАЛЬЧИКОВ, ID10022107250, ДЕТСКОЕ...
5         [ID10022107250, ИГРУШКИ ДЛЯ ДЕВОЧЕК, КОНСТРУКТ...
7         [ID10011699553, ID9010019033048, ID10022107250...
15        [ID10022107250, ID9010011851452, ПОДГУЗНИКИ, П...
17        [ДЕТСКАЯ КОСМЕТИКА, ID10019991856, ДЕТСКАЯ КОС...
                                ...                        
130724    [ТОВАРЫ ДЛЯ МАМ, ИГРУШКИ ДЛЯ МАЛЬЧИКОВ, ТОВАРЫ...
130726    [ID10022107250, ОБУВЬ ДЕТСКАЯ, ОБУВЬ ДЕТСКАЯ, ...
130729    [ID10022107250, ИГРУШКИ ДЛЯ ДЕВОЧЕК, ИГРУШКИ Д...
130731    [ОБУВЬ ДЕТСКАЯ, ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 ...
130742    [IDL00007304856, IDL00007304856, ИГРУШКИ ДЛЯ Д...
Name: items, Length: 64828, dtype: object

<h3>Sequence padding, convert to fixed length</h3>

In [13]:
# Fixed length every sequence is padded / truncated to (passed as `maxlen` to pad_sequences below).
MAX_HISTORY_LENGTH = 100

In [14]:
%%time
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    clipped_df.values, 
    dtype=object,
    padding='pre',
    truncating='pre',
    value='None',
    maxlen=MAX_HISTORY_LENGTH
)

CPU times: user 653 ms, sys: 176 ms, total: 829 ms
Wall time: 965 ms


In [15]:
# Keep only rows whose last position is not the string 'None'. Padding is added at the
# front ('pre'), so the last position holds the sequence's own final item; this therefore
# drops rows whose final item is itself the literal 'None'.
padded_sequences = padded_sequences[padded_sequences[:, -1] != 'None']

In [16]:
%%time
# Seed NumPy's global RNG so the shuffle below produces the same row order on every run.
np.random.seed(42)
# In-place shuffle along the first axis: rows are reordered, the content of each row is untouched.
np.random.shuffle(padded_sequences)

CPU times: user 154 ms, sys: 709 µs, total: 154 ms
Wall time: 154 ms


In [17]:
# Sanity check: after filtering and shuffling, no row ends with the padding token.
assert np.all(padded_sequences[:, -1] != 'None')

In [18]:
# the raw frame and the filtered series are not needed past this point
del data, clipped_df

gc.collect()

72

In [19]:
padded_sequences.shape

(64793, 100)

<h3>Sliding window</h3>

In [20]:
# Length of each sliding window. Further down, the first WINDOW_LEN-1 tokens of a window are
# the model input and the window's last token is the label.
WINDOW_LEN = 10

In [21]:
# Every run of WINDOW_LEN consecutive positions along the history axis.
# The result is a view of shape (n_rows, MAX_HISTORY_LENGTH - WINDOW_LEN + 1, WINDOW_LEN).
sequences = np.lib.stride_tricks.sliding_window_view(padded_sequences, window_shape=WINDOW_LEN, axis=1)
sequences.shape

(64793, 91, 10)

In [22]:
# the final window of every row is held out; all earlier windows stay available for training
out_of_sample = sequences[:, -1, :]
sequences = sequences[:, :-1, :]

In [23]:
def count_nonzero(seq):
    """Count the non-padding tokens in every row of a 2-D token array.

    Args:
        seq: 2-D array-like of tokens in which the string 'None' marks padding.

    Returns:
        1-D integer array holding, for each row of ``seq``, the number of
        entries that are not 'None'.
    """
    # one element-wise comparison plus a row-wise sum replaces the per-row Python loop
    return (np.asarray(seq) != 'None').sum(axis=1)

# remove sequences that are too short:
# for every row keep only the windows that contain more than MIN_USER_SEQ_LEN non-padding tokens.
# NOTE: this rebinds `sequences` from one 3-D array to a list of 2-D arrays, so the cell
# cannot be re-run on its own without re-running the windowing cells first.
sequences = [user_seq[count_nonzero(user_seq) > MIN_USER_SEQ_LEN] for user_seq in tqdm(sequences)]

100%|███████████████████████████████████| 64793/64793 [00:31<00:00, 2043.27it/s]


In [24]:
%%time
# remove axis
sequences = np.vstack(sequences)

CPU times: user 132 ms, sys: 23.2 ms, total: 155 ms
Wall time: 155 ms


In [25]:
sequences.shape, out_of_sample.shape

((522303, 10), (64793, 10))

In [26]:
# rebalancing: cap the hold-out set at OUT_OF_SAMPLE_SIZE windows and move the surplus
# (taken from the front of the hold-out set) into the training windows.
# The surplus is derived from the current size instead of being hard-coded, so the cell
# keeps working when the row count changes and moves nothing if it is run a second time.
OUT_OF_SAMPLE_SIZE = 50_000
n_surplus = max(len(out_of_sample) - OUT_OF_SAMPLE_SIZE, 0)

sequences = np.append(sequences, out_of_sample[:n_surplus], axis=0)
out_of_sample = out_of_sample[n_surplus:]

In [27]:
sequences.shape, out_of_sample.shape

((537096, 10), (50000, 10))

In [28]:
# Drop training windows whose last token (the position later used as the label) is the padding token.
sequences = sequences[sequences[:, -1] != 'None']

In [29]:
# Sanity check: every training and hold-out window ends with a real token.
assert np.all(sequences[:, -1] != 'None')
assert np.all(out_of_sample[:, -1] != 'None')

In [30]:
# Average share of non-padding tokens per window: (training windows, hold-out windows).
np.mean(count_nonzero(sequences) / WINDOW_LEN), np.mean(count_nonzero(out_of_sample) / WINDOW_LEN)

(0.8349219093462888, 0.7346479999999999)

<h3>Tokens indexing</h3>

In [31]:
%%time
unique = np.unique(np.ravel(padded_sequences))

CPU times: user 5.58 s, sys: 97.4 ms, total: 5.68 s
Wall time: 5.67 s


In [32]:
# The padding token is not a vocabulary entry; the lookup layer defined below treats
# 'None' as its OOV token instead.
unique = unique[unique != 'None']

In [33]:
# Number of distinct tokens in `unique`.
VOCAB_SIZE = len(unique)
VOCAB_SIZE

1255

In [34]:
# The windows and the vocabulary have both been extracted, so this name is dropped.
# NOTE(review): out_of_sample was produced by basic slicing of the sliding-window view,
# so it likely still references this buffer and the memory may not actually be released here.
del padded_sequences
gc.collect()

224

<h3>Get category</h3>

In [35]:
# Table used below to map an 'ID_SKU' to its 'Группа3' value.
positions = pd.read_parquet('../data/clean/positions.parquet')

In [36]:
# dict ID_SKU -> 'Группа3' value; rows with a missing 'Группа3' are skipped.
# If an ID_SKU occurs more than once, to_dict keeps the value of its last occurrence.
item_to_categ = positions.set_index('ID_SKU')['Группа3'].dropna().to_dict()

In [37]:
# category token of every vocabulary entry, in vocabulary order;
# entries without a known category are mapped to themselves
categories = [item_to_categ.get(token, token) for token in unique]

In [38]:
# the category mapping and its source table are no longer needed
del item_to_categ, positions
gc.collect()

72

<h3>Split</h3>

In [39]:
# first half of the hold-out windows -> validation, second half -> test;
# all remaining windows are the training set
half_size = len(out_of_sample) // 2

valid, test = out_of_sample[:half_size], out_of_sample[half_size:]
train = sequences

In [40]:
# persist the test windows for later evaluation
with open('../data/models/tmp/test.npy', mode='wb') as test_file:
    np.save(test_file, test)

In [69]:
# Both arrays are written one after the other into the same file, so they have to be read
# back with two consecutive np.load calls on a single open file handle, in this order:
# first the vocabulary tokens, then the category token of each vocabulary entry.
with open('../data/models/tmp/vocab.npy', 'wb') as f:
    np.save(f, unique)
    np.save(f, categories)

<h3>Model</h3>

In [41]:
def embedding_encoder(vocabulary, embedding_dim, name=None):
    """Build a two-layer encoder that turns string tokens into embedding vectors.

    Args:
        vocabulary: known string tokens handed to the StringLookup layer.
        embedding_dim: width of the embedding vectors.
        name: optional name of the returned Sequential model.

    Returns:
        tf.keras.Sequential made of a StringLookup layer named 'lookup' followed by
        an Embedding layer named 'embedding'.
    """
    # One OOV bucket is reserved in the lookup table; 'None' is declared as the
    # OOV token and no separate mask token is used.
    token_lookup = tf.keras.layers.StringLookup(
        vocabulary=vocabulary,
        mask_token=None,
        oov_token='None',
        num_oov_indices=1,
        name='lookup',
    )

    # The table has one row more than the vocabulary, so 'None' has weights in
    # the embedding matrix too; mask_zero marks index 0 as masked.
    token_embedding = tf.keras.layers.Embedding(
        input_dim=len(vocabulary) + 1,
        output_dim=embedding_dim,
        mask_zero=True,
        name='embedding',
    )

    encoder_name = f"{name}" if name else None
    return tf.keras.Sequential([token_lookup, token_embedding], name=encoder_name)

In [42]:
def point_wise_feed_forward(input_seq, dropout_rate: float, conv_dims: int):
    """Apply a position-wise two-layer feed-forward block with a residual connection.

    Args:
        input_seq: tensor the block is applied to; it is added back to the block
            output, so its last dimension has to match ``conv_dims``.
        dropout_rate: rate of the dropout applied after each convolution.
        conv_dims: number of filters of both kernel-size-1 convolutions.

    Returns:
        ``input_seq`` plus the output of the two convolution / dropout stages.
    """
    # first stage: kernel-size-1 convolution with ReLU, then dropout
    hidden = tf.keras.layers.Conv1D(
        filters=conv_dims, kernel_size=1, activation='relu', use_bias=True
    )(input_seq)
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)

    # second stage: linear kernel-size-1 convolution, then dropout
    projected = tf.keras.layers.Conv1D(
        filters=conv_dims, kernel_size=1, activation=None, use_bias=True
    )(hidden)
    projected = tf.keras.layers.Dropout(dropout_rate)(projected)

    # residual connection
    return projected + input_seq

In [73]:
def get_model(vocabulary, categories, seq_len, num_blocks=2, hidden_size=100, dropout_rate=0.5):
    """Build the self-attention next-item model.

    The network embeds a fixed-length window of string tokens, adds a learned
    positional embedding, runs ``num_blocks`` attention + feed-forward blocks, sums
    the per-position outputs into one vector per sample and scores that vector
    against every row of the item embedding table. The final output combines the
    softmax probability of each token with the probability of that token's
    category token (geometric mean).

    Only layers on the path from the input to the output become part of the
    returned model; the vocabulary itself is held by the 'lookup' layer inside
    the 'item_embedding' sub-model.

    Args:
        vocabulary: known string tokens (without the 'None' padding token).
        categories: category token of every vocabulary entry, in vocabulary order.
        seq_len: number of input positions per sample.
        num_blocks: number of attention + feed-forward blocks.
        hidden_size: embedding width, also used as attention key size and as the
            number of feed-forward filters.
        dropout_rate: dropout rate for the embeddings, the attention layers and
            the feed-forward layers.

    Returns:
        tf.keras.Model named "model" that maps a (batch, seq_len) string tensor to
        a (batch, len(vocabulary) + 1) tensor of non-negative scores. The scores
        are square roots of products of probabilities, so a row does not
        necessarily sum to 1.
    """

    # --- INPUT LAYER ---
    input_layer = tf.keras.layers.Input(shape=(seq_len,), name="seq_input", dtype=tf.string)
    # 1.0 where the token is real, 0.0 where it is the 'None' padding; the extra
    # trailing axis lets the mask broadcast over the hidden dimension
    mask = tf.expand_dims(tf.cast(tf.not_equal(input_layer, 'None'), tf.float32), -1)

    # --- EMBEDDING LAYER ---

    # Get item embedding
    item_embedding = embedding_encoder(
        vocabulary=vocabulary,
        embedding_dim=hidden_size,
        name="item_embedding"
    )

    seq_embedding = item_embedding(input_layer)

    # get positional embedding: position ids 0..seq_len-1, repeated for every sample
    position_ids = tf.expand_dims(tf.range(tf.shape(input_layer)[1]), 0)
    position_ids = tf.tile(position_ids, [tf.shape(input_layer)[0], 1])

    pos_embedding = tf.keras.layers.Embedding(seq_len, hidden_size, name="pos_embedding")(position_ids)

    # add positional embeddings
    seq_embedding += pos_embedding

    # add dropout
    seq_embedding = tf.keras.layers.Dropout(dropout_rate)(seq_embedding)

    # masking: zero out the padded positions
    seq_embedding *= mask

    # --- ATTENTION BLOCKS ---
    attention = seq_embedding

    for i in range(num_blocks):

        # normalize
        attention_norm = tf.keras.layers.LayerNormalization(name="emb_normalize_"+str(i))(attention)

        # attention layer: queries come from the normalized sequence, keys and
        # values from the un-normalized one
        attention = tf.keras.layers.MultiHeadAttention(
            num_heads=1, key_dim=hidden_size, name="attention_"+str(i), dropout=dropout_rate)(attention_norm, attention)

        # Feed forward
        attention = point_wise_feed_forward(attention, dropout_rate, hidden_size)

        # masking
        attention *= mask

    attention = tf.keras.layers.LayerNormalization(name="attention_normalize")(attention)

    # Layer normalization maps an all-zero (padded) position to its learned offset,
    # so the mask is applied once more; otherwise padded positions would leak into
    # the sum over positions below.
    attention *= mask

    # --- PREDICTION LAYER ---

    # one vector per sample: sum over the (non-padded) positions
    user_embedding = tf.reduce_sum(attention, axis=1)

    candidate_ids = tf.range(start=0, limit=len(vocabulary)+1) # the length of vocab + 1 (for "zero" token)
    candidate_emb = item_embedding.get_layer('embedding')(candidate_ids)

    scores = tf.matmul(user_embedding, candidate_emb, transpose_b=True)
    # softmax output, i.e. probabilities over all lookup indices
    item_probs = tf.keras.layers.Softmax()(scores)

    # --- INFERENCE LAYER ---

    # for each item, get the category index (insert 0 for special 0 token)
    # NOTE(review): a category token that is not itself part of `vocabulary` is
    # presumably resolved to the OOV index 0 by the lookup layer - confirm this is intended.
    categories_index = item_embedding.get_layer('lookup')(np.insert(categories, 0, 'None'))

    # for each item, get its category probability
    category_probs = tf.gather(item_probs, categories_index, axis=1)

    #  Geometric mean sqrt(ITEM * CATEGORY)
    combined_scores = tf.multiply(tf.sqrt(item_probs), tf.sqrt(category_probs))

    model = tf.keras.Model(inputs=[input_layer], outputs=combined_scores, name="model")

    return model

In [74]:
# The model sees WINDOW_LEN-1 positions per sample; the remaining position of each window is the label.
model = get_model(unique, categories, WINDOW_LEN-1)

In [43]:
# The model's own string lookup layer; it is reused below to turn label tokens into integer ids.
lookup = model.get_layer('item_embedding').get_layer('lookup')

In [44]:
# Smoke test on two validation windows (label column removed): the output has one score
# per lookup index, i.e. vocabulary size + 1.
model.predict(valid[:2, :-1]).shape

2022-03-05 02:01:16.865488: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


(2, 1256)

In [45]:
#model.summary()

In [46]:
# Integer class labels are used, hence the sparse loss and the sparse top-k metrics.
# The loss is given by name, so Keras' default of treating the model output as
# probabilities (not logits) applies.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=[
        # share of samples whose true label is among the k highest-scored indices
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1, name='top1acc'),
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10, name='top10acc'),
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=100, name='top100acc')
    ]
)

In [47]:
# Stop training once validation top-100 accuracy has failed to improve for 2 consecutive epochs.
# NOTE(review): the fit call below runs only 2 epochs, so with patience=2 this callback
# looks like it can never trigger - confirm whether more epochs were intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_top100acc', patience=2, mode='max')

In [48]:
# Inputs are all tokens of a window except the last one; the label is the integer id
# (from the model's own lookup layer) of the window's last token.
# No `use_multiprocessing` flag is passed: Keras documents it as applying to generator /
# Sequence inputs only, and the inputs here are in-memory arrays.
history = model.fit(train[:, :-1], lookup(train[:, -1]),
                    batch_size=256,
                    initial_epoch=0, epochs=2,
                    validation_data=(valid[:, :-1], lookup(valid[:, -1])),
                    callbacks=[early_stopping])

Epoch 1/2
Epoch 2/2


In [49]:
# Loss and top-k accuracies on the training windows; assigning to `_` keeps the returned list out of the cell output.
_ = model.evaluate(train[:, :-1], lookup(train[:, -1]), batch_size=512)



In [50]:
# Loss and top-k accuracies on the validation windows; assigning to `_` keeps the returned list out of the cell output.
_ = model.evaluate(valid[:, :-1], lookup(valid[:, -1]), batch_size=512)



In [51]:
# Persist the trained model; the .h5 extension selects Keras' HDF5 format.
model.save('../data/models/model.h5')