In [1]:
import pandas as pd
import pickle
import numpy as np
from google.colab import drive
from datetime import datetime, timedelta
drive.mount('/content/drive')
pickle_path = '/content/drive/MyDrive/trendyol_bootcamp_capstone/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def load_pickle(filename):
  infile = open(pickle_path+filename,'rb')
  loaded_data = pickle.load(infile)
  infile.close()
  return loaded_data

In [3]:
test_data = load_pickle('test_data.pickle')
train_data = load_pickle('train_data.pickle')
validation_data = load_pickle('validation_data.pickle')

test_data.reset_index(drop=True,inplace=True)
validation_data.reset_index(drop=True,inplace=True)
train_data.reset_index(drop=True,inplace=True)

In [4]:
train_data = train_data[train_data['t_dat']>= '2020-03-01']
train_data.reset_index(drop=True,inplace=True)

In [5]:
last_day_training = train_data['t_dat'][len(train_data)-1]
first_day_training = train_data['t_dat'][0]
first_day_training

Timestamp('2020-03-01 00:00:00')

In [6]:
train_data['recency'] = last_day_training - train_data['t_dat']
train_data['recency'] = train_data['recency'].dt.days

In [7]:
train_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,recency
0,2020-03-01,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,751628002,0.022017,1,191
1,2020-03-01,0008968c0d451dbc5a9968da03196fe20051965edde741...,675662028,0.035576,2,191
2,2020-03-01,001127bffdda108579e6cb16080440e89bf1250a776c6e...,821152004,0.025407,2,191
3,2020-03-01,001127bffdda108579e6cb16080440e89bf1250a776c6e...,860738001,0.025407,2,191
4,2020-03-01,00117f79ce61af038e143ee26448e8401fdbff51f48d5a...,822957002,0.045746,2,191


In [8]:
boundaries = []
max_recency = (last_day_training-first_day_training).days
for i in range(0,max_recency+60,60):
  boundaries.append(i)

In [9]:
boundaries

[0, 60, 120, 180, 240]

In [10]:
#bucketing
labels = [1,2,3,4]
train_data['recency'][train_data['recency']==0] = 1 # to overcome nan values after bucketing
train_data['recency'] = pd.cut(train_data['recency'], bins=boundaries, labels=labels)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
train_data.drop(columns = ['sales_channel_id','price','t_dat'], inplace = True)
train_data.head()

Unnamed: 0,customer_id,article_id,recency
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,751628002,4
1,0008968c0d451dbc5a9968da03196fe20051965edde741...,675662028,4
2,001127bffdda108579e6cb16080440e89bf1250a776c6e...,821152004,4
3,001127bffdda108579e6cb16080440e89bf1250a776c6e...,860738001,4
4,00117f79ce61af038e143ee26448e8401fdbff51f48d5a...,822957002,4


In [12]:
modified_train_data = train_data.groupby(["customer_id"], as_index=False).agg(lambda x: ",".join(str(i) for i in x))
modified_train_data

Unnamed: 0,customer_id,article_id,recency
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"841260003,887593002,890498002,795440001,859416...",333331
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"751628002,599580055,599580055,811835004,811835...",4333333333333332
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"727808001,727808007,858883002,851400006,750424...",3333333333
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"589440005,827971001,818320001,896152002,730683...",2221111
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,"666444002,349301041,721257001,160442010,849942...",3333333332
...,...,...,...
756048,ffff8f9ecdce722b5bab97fff68a6d1866492209bfe524...,859845001889679001863001002,222
756049,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"832520001,835008005,826955010,826955010,797565...",3333333322222111111111111
756050,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"687034024,739590040,879891001,739819009,816597...","3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,..."
756051,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"816759002,570004009,570004009,636323002,835561...",222111111


In [13]:
article_count = len(train_data.groupby('article_id').size())
pad_type = 'post'
trunc_type = 'post'

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words= article_count, oov_token=',')
tokenizer.fit_on_texts(modified_train_data['article_id'])

# Get our training data word index
word_index = tokenizer.word_index

# Encode training data sentences into sequences
train_sequences = tokenizer.texts_to_sequences(modified_train_data['article_id'])

# Get max training sequence length
maxlen = max([len(x) for x in train_sequences])

# Pad the training sequences
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

# Output the results of our work
print("Word index:\n", word_index)

Word index:
 {',': 1, '706016001': 2, '610776002': 3, '759871002': 4, '610776001': 5, '599580038': 6, '599580055': 7, '751471001': 8, '372860002': 9, '841383002': 10, '372860001': 11, '741356002': 12, '720125001': 13, '599580052': 14, '783346001': 15, '706016002': 16, '841383003': 17, '783346018': 18, '688537004': 19, '464297007': 20, '684209027': 21, '824337001': 22, '749699002': 23, '714790020': 24, '806388002': 25, '684209013': 26, '776237011': 27, '448509014': 28, '749699001': 29, '776237020': 30, '806388003': 31, '850917001': 32, '748355003': 33, '753737001': 34, '739590032': 35, '800691007': 36, '684209004': 37, '733749001': 38, '688537011': 39, '351484002': 40, '806388001': 41, '778064028': 42, '811925009': 43, '799365011': 44, '827968001': 45, '759465001': 46, '806225002': 47, '778064003': 48, '723469001': 49, '688537021': 50, '160442010': 51, '759871025': 52, '816563001': 53, '759871001': 54, '160442007': 55, '760084003': 56, '832361001': 57, '706016003': 58, '768912001': 59, 

In [15]:
modified_train_data['article_id'] = train_sequences
modified_train_data['recency'] = modified_train_data['recency'].str.split(',').apply(list)

In [16]:
modified_train_data

Unnamed: 0,customer_id,article_id,recency
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[1638, 3035, 8004, 319, 2721, 7077]","[3, 3, 3, 3, 3, 1]"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[29909, 7, 7, 478, 478, 478, 478, 120, 60, 637...","[4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]"
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[18018, 12822, 6690, 4043, 1912, 1912, 856, 85...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[5331, 2575, 4032, 482, 113, 6144, 1548]","[2, 2, 2, 1, 1, 1, 1]"
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,"[7894, 2031, 7849, 51, 11136, 11, 55, 16778, 1...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 2]"
...,...,...,...
756048,ffff8f9ecdce722b5bab97fff68a6d1866492209bfe524...,"[11333, 10983, 350]","[2, 2, 2]"
756049,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"[3843, 237, 2170, 2170, 2339, 3748, 161, 210, ...","[3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, ..."
756050,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"[471, 559, 2544, 735, 1494, 7386, 7, 2581, 60,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, ..."
756051,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"[6921, 1583, 1583, 220, 6189, 4570, 3229, 2881...","[2, 2, 2, 1, 1, 1, 1, 1, 1]"


In [17]:
train_seq_temp = []
target_seq = []
for seq in train_sequences:
  train_seq_temp.append(seq[:-1])
  target_seq.append(seq[1:])


In [18]:
recency_temp = []
for recency in modified_train_data['recency']:
  recency_temp.append(recency[:-1])

In [19]:
modified_train_data['article_id'] = train_seq_temp

In [20]:
modified_train_data['recency'] = recency_temp

In [21]:
modified_train_data.head()

Unnamed: 0,customer_id,article_id,recency
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[1638, 3035, 8004, 319, 2721]","[3, 3, 3, 3, 3]"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[29909, 7, 7, 478, 478, 478, 478, 120, 60, 637...","[4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[18018, 12822, 6690, 4043, 1912, 1912, 856, 85...","[3, 3, 3, 3, 3, 3, 3, 3, 3]"
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[5331, 2575, 4032, 482, 113, 6144]","[2, 2, 2, 1, 1, 1]"
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,"[7894, 2031, 7849, 51, 11136, 11, 55, 16778, 1...","[3, 3, 3, 3, 3, 3, 3, 3, 3]"


In [22]:
train_data = modified_train_data
train_data['target'] = target_seq
train_data

Unnamed: 0,customer_id,article_id,recency,target
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[1638, 3035, 8004, 319, 2721]","[3, 3, 3, 3, 3]","[3035, 8004, 319, 2721, 7077]"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[29909, 7, 7, 478, 478, 478, 478, 120, 60, 637...","[4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]","[7, 7, 478, 478, 478, 478, 120, 60, 637, 175, ..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[18018, 12822, 6690, 4043, 1912, 1912, 856, 85...","[3, 3, 3, 3, 3, 3, 3, 3, 3]","[12822, 6690, 4043, 1912, 1912, 856, 856, 1088..."
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[5331, 2575, 4032, 482, 113, 6144]","[2, 2, 2, 1, 1, 1]","[2575, 4032, 482, 113, 6144, 1548]"
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,"[7894, 2031, 7849, 51, 11136, 11, 55, 16778, 1...","[3, 3, 3, 3, 3, 3, 3, 3, 3]","[2031, 7849, 51, 11136, 11, 55, 16778, 10456, ..."
...,...,...,...,...
756048,ffff8f9ecdce722b5bab97fff68a6d1866492209bfe524...,"[11333, 10983]","[2, 2]","[10983, 350]"
756049,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"[3843, 237, 2170, 2170, 2339, 3748, 161, 210, ...","[3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, ...","[237, 2170, 2170, 2339, 3748, 161, 210, 5890, ..."
756050,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"[471, 559, 2544, 735, 1494, 7386, 7, 2581, 60,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, ...","[559, 2544, 735, 1494, 7386, 7, 2581, 60, 461,..."
756051,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"[6921, 1583, 1583, 220, 6189, 4570, 3229, 2881]","[2, 2, 2, 1, 1, 1, 1, 1]","[1583, 1583, 220, 6189, 4570, 3229, 2881, 18093]"


In [23]:
import tensorflow as tf
pad_article = tf.keras.preprocessing.sequence.pad_sequences(train_data['article_id'])
pad_recency = tf.keras.preprocessing.sequence.pad_sequences(train_data['recency'])
pad_target = tf.keras.preprocessing.sequence.pad_sequences(train_data['target'])

In [24]:
train_data['article_id'] = list(pad_article)
train_data['recency'] = list(pad_recency)
train_data['target'] = list(pad_target)

In [25]:
import tensorflow as tf

def create_train_tfdata(train_feat_dict, train_target_tensor,
                        batch_size, buffer_size=None):
    """
    Create train tf dataset for model train input
    :param train_feat_dict: dict, containing the features tensors for train data
    :param train_target_tensor: np.array(), the training TARGET tensor
    :param batch_size: (int) size of the batch to work with
    :param buffer_size: (int) Optional. Default is None. Size of the buffer
    :return: (tuple) 1st element is the training dataset,
                     2nd is the number of steps per epoch (based on batch size)
    """
    if buffer_size is None:
        buffer_size = batch_size*50

    train_steps_per_epoch = len(train_target_tensor) // batch_size

    train_dataset = tf.data.Dataset.from_tensor_slices((train_feat_dict,
                                                        train_target_tensor)).cache()
    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
    train_dataset = train_dataset.repeat().prefetch(tf.data.experimental.AUTOTUNE)
    
    return train_dataset, train_steps_per_epoch
  

In [None]:
np.array(train_data['target'])

In [27]:
type(train_data['recency'])

pandas.core.series.Series

In [28]:
train_feat_dict = {'item_id': train_data['article_id'],
                    'recency': train_data['recency']}
train_target_tensor = train_data['target']

train_dataset, train_steps_per_epoch = create_train_tfdata(train_feat_dict,
                                                            train_target_tensor,
                                                            batch_size=512)

ValueError: ignored

In [None]:
def build_model(hp, max_len, item_vocab_size):
    """
    Build a model given the hyper-parameters with item and nb_days input features
    :param hp: (kt.HyperParameters) hyper-parameters to use when building this model
    :return: built and compiled tensorflow model 
    """
    inputs = {}
    inputs['item_id'] = tf.keras.Input(batch_input_shape=[None, max_len],
                                       name='item_id', dtype=tf.int32)
    # create encoding padding mask
    encoding_padding_mask = tf.math.logical_not(tf.math.equal(inputs['item_id'], 0))

    # nb_days bucketized
    inputs['nb_days'] = tf.keras.Input(batch_input_shape=[None, max_len],
                                       name='nb_days', dtype=tf.int32)

    # Pass categorical input through embedding layer
    # with size equals to tokenizer vocabulary size
    # Remember that vocab_size is len of item tokenizer + 1
    # (for the padding '0' value)
    
    embedding_item = tf.keras.layers.Embedding(input_dim=item_vocab_size,
                                               output_dim=hp.get('embedding_item'),
                                               name='embedding_item'
                                              )(inputs['item_id'])
    # nbins=100, +1 for zero padding
    embedding_nb_days = tf.keras.layers.Embedding(input_dim=100 + 1,
                                                  output_dim=hp.get('embedding_nb_days'),
                                                  name='embedding_nb_days'
                                                 )(inputs['nb_days'])

    #  Concatenate embedding layers
    concat_embedding_input = tf.keras.layers.Concatenate(
     name='concat_embedding_input')([embedding_item, embedding_nb_days])

    concat_embedding_input = tf.keras.layers.BatchNormalization(
     name='batchnorm_inputs')(concat_embedding_input)
    
    # LSTM layer
    rnn = tf.keras.layers.LSTM(units=hp.get('rnn_units_cat'),
                                   return_sequences=True,
                                   stateful=False,
                                   recurrent_initializer='glorot_normal',
                                   name='LSTM_cat'
                                   )(concat_embedding_input)

    rnn = tf.keras.layers.BatchNormalization(name='batchnorm_lstm')(rnn)

    # Self attention so key=value in inputs
    att = tf.keras.layers.Attention(use_scale=False, causal=True,
                                    name='attention')(inputs=[rnn, rnn],
                                                      mask=[encoding_padding_mask,
                                                            encoding_padding_mask])

    # Last layer is a fully connected one
    output = tf.keras.layers.Dense(item_vocab_size, name='output')(att)

    model = tf.keras.Model(inputs, output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(hp.get('learning_rate')),
        loss=loss_function,
        metrics=['sparse_categorical_accuracy'])
    
    return model

In [None]:
def fit_model(model, train_dataset, steps_per_epoch, epochs):
    """
    Fit the Keras model on the training dataset for a number of given epochs
    :param model: tf model to be trained
    :param train_dataset: (tf.data.Dataset object) the training dataset
                          used to fit the model
    :param steps_per_epoch: (int) Total number of steps (batches of samples) before 
                            declaring one epoch finished and starting the next epoch.
    :param epochs: (int) the number of epochs for the fitting phase
    :return: tuple (mirrored_model, history) with trained model and model history
    """
    
    # mirrored_strategy allows to use multi GPUs when available
    mirrored_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
        tf.distribute.experimental.CollectiveCommunication.AUTO)
    
    with mirrored_strategy.scope():
        mirrored_model = model

    history = mirrored_model.fit(train_dataset,
                                 steps_per_epoch=steps_per_epoch,
                                 epochs=epochs, verbose=2)

    return mirrored_model, history

# **Notes**
https://medium.com/decathlontechnology/building-a-rnn-recommendation-engine-with-tensorflow-505644aa9ff3

Articleları tokenize ederken idleri değil de tek kelimelik açıklama kullanıp Word2Vec yapabilirdik