# Setup

In [None]:
import torch
import numpy as np

In [None]:
from tqdm.notebook import tqdm
import os

In [None]:
class PointWiseFeedForward(torch.nn.Module):
    """Two-layer feed-forward block with a residual connection.

    Both layers are kernel-size-1 ``Conv1d`` modules mapping ``hidden_units``
    channels to ``hidden_units`` channels, each followed by dropout, with a
    ReLU between them. The block input is added back onto the result.
    """

    def __init__(self, hidden_units, dropout_rate):
        super().__init__()
        nn = torch.nn
        self.conv1 = nn.Conv1d(hidden_units, hidden_units, kernel_size=1)
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(hidden_units, hidden_units, kernel_size=1)
        self.dropout2 = nn.Dropout(p=dropout_rate)

    def forward(self, inputs):
        """Apply conv -> dropout -> ReLU -> conv -> dropout and add ``inputs``."""
        # Conv1d requires (N, C, Length): swap the last two axes on the way in.
        channels_first = inputs.transpose(-1, -2)
        hidden = self.dropout1(self.conv1(channels_first))
        projected = self.dropout2(self.conv2(self.relu(hidden)))
        # Swap the axes back, then add the residual in place on the new tensor.
        outputs = projected.transpose(-1, -2)
        outputs += inputs
        return outputs

In [None]:
class SASRec(torch.nn.Module):
    """Self-attentive sequential recommender made of stacked attention blocks.

    Every block is: LayerNorm -> ``torch.nn.MultiheadAttention`` under a causal
    mask -> residual add -> LayerNorm -> ``PointWiseFeedForward``. Item id 0 is
    the padding id (``padding_idx=0`` on the item embedding, and positions where
    the input sequence equals 0 are zeroed out).
    """
    def __init__(self, user_num, item_num, args):
        """Build the embeddings and ``args.num_blocks`` attention blocks.

        Reads ``device``, ``hidden_units``, ``maxlen``, ``dropout_rate``,
        ``num_heads`` and ``num_blocks`` from ``args``. ``user_num`` is stored
        but not used by any layer.
        """
        super(SASRec, self).__init__()

        self.user_num = user_num
        self.item_num = item_num
        self.dev = args.device

        # TODO: loss += args.l2_emb for regularizing embedding vectors during training
        # https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch
        # One extra row so that index 0 can serve as the padding entry.
        self.item_emb = torch.nn.Embedding(self.item_num+1, args.hidden_units, padding_idx=0)
        # One learned vector per position in the (fixed-length) input window.
        self.pos_emb = torch.nn.Embedding(args.maxlen, args.hidden_units) # TO IMPROVE
        self.emb_dropout = torch.nn.Dropout(p=args.dropout_rate)

        # Parallel lists: entry i of each list belongs to block i.
        self.attention_layernorms = torch.nn.ModuleList() # to be Q for self-attention
        self.attention_layers = torch.nn.ModuleList()
        self.forward_layernorms = torch.nn.ModuleList()
        self.forward_layers = torch.nn.ModuleList()

        # Applied once to the output of the last block.
        self.last_layernorm = torch.nn.LayerNorm(args.hidden_units, eps=1e-8)

        for _ in range(args.num_blocks):
            new_attn_layernorm = torch.nn.LayerNorm(args.hidden_units, eps=1e-8)
            self.attention_layernorms.append(new_attn_layernorm)

            # Third positional argument of MultiheadAttention is its dropout rate.
            new_attn_layer =  torch.nn.MultiheadAttention(args.hidden_units,
                                                            args.num_heads,
                                                            args.dropout_rate)
            self.attention_layers.append(new_attn_layer)

            new_fwd_layernorm = torch.nn.LayerNorm(args.hidden_units, eps=1e-8)
            self.forward_layernorms.append(new_fwd_layernorm)

            new_fwd_layer = PointWiseFeedForward(args.hidden_units, args.dropout_rate)
            self.forward_layers.append(new_fwd_layer)

            # self.pos_sigmoid = torch.nn.Sigmoid()
            # self.neg_sigmoid = torch.nn.Sigmoid()

    def log2feats(self, log_seqs):
        """Encode item-id sequences into one feature vector per position.

        ``log_seqs`` must expose ``.shape`` with at least two axes (it is
        indexed as ``shape[0]`` and ``shape[1]``) and support ``== 0``;
        callers in this file pass a 2-D NumPy integer array.
        """
        seqs = self.item_emb(torch.LongTensor(log_seqs).to(self.dev))
        # Scale the item embeddings by sqrt(embedding_dim) before adding positions.
        seqs *= self.item_emb.embedding_dim ** 0.5
        # Position ids 0..T-1 repeated for every row of the batch.
        positions = np.tile(np.array(range(log_seqs.shape[1])), [log_seqs.shape[0], 1])
        seqs += self.pos_emb(torch.LongTensor(positions).to(self.dev))
        seqs = self.emb_dropout(seqs)

        # True where the input holds the padding id 0; those positions are zeroed.
        timeline_mask = torch.BoolTensor(log_seqs == 0).to(self.dev)
        seqs *= ~timeline_mask.unsqueeze(-1) # broadcast in last dim

        tl = seqs.shape[1] # time dim len for enforce causality
        # Negated lower-triangular matrix: True strictly above the diagonal,
        # i.e. on (query, key) pairs where the key lies in the future.
        attention_mask = ~torch.tril(torch.ones((tl, tl), dtype=torch.bool, device=self.dev))

        for i in range(len(self.attention_layers)):
            # The attention layers are built without batch_first, so the first
            # two axes are swapped before and after the attention call.
            seqs = torch.transpose(seqs, 0, 1)
            Q = self.attention_layernorms[i](seqs)
            mha_outputs, _ = self.attention_layers[i](Q, seqs, seqs, 
                                            attn_mask=attention_mask)
                                            # key_padding_mask=timeline_mask
                                            # need_weights=False) this arg do not work?
            # Residual connection is taken from the normalised queries, not from seqs.
            seqs = Q + mha_outputs
            seqs = torch.transpose(seqs, 0, 1)

            seqs = self.forward_layernorms[i](seqs)
            seqs = self.forward_layers[i](seqs)
            # Re-zero padding positions after every block.
            seqs *=  ~timeline_mask.unsqueeze(-1)

        log_feats = self.last_layernorm(seqs) # (U, T, C) -> (U, -1, C)

        return log_feats

    def forward(self, user_ids, log_seqs, pos_seqs, neg_seqs): # for training
        """Return ``(pos_logits, neg_logits)`` for training.

        Each logit is the dot product (sum over the last axis) between the
        sequence feature at a position and the embedding of the positive /
        negative item given for that position. ``user_ids`` is ignored.
        """
        log_feats = self.log2feats(log_seqs) # user_ids hasn't been used yet

        pos_embs = self.item_emb(torch.LongTensor(pos_seqs).to(self.dev))
        neg_embs = self.item_emb(torch.LongTensor(neg_seqs).to(self.dev))

        pos_logits = (log_feats * pos_embs).sum(dim=-1)
        neg_logits = (log_feats * neg_embs).sum(dim=-1)

        # pos_pred = self.pos_sigmoid(pos_logits)
        # neg_pred = self.neg_sigmoid(neg_logits)

        return pos_logits, neg_logits # pos_pred, neg_pred

    def predict(self, user_ids, log_seqs, item_indices): # for inference
        """Score candidate items using the feature at the last sequence position.

        Returns the raw (un-squashed) logits obtained by multiplying the
        candidate item embeddings with the last-position feature vector.
        ``user_ids`` is ignored.
        """
        log_feats = self.log2feats(log_seqs) # user_ids hasn't been used yet

        final_feat = log_feats[:, -1, :] # only use last QKV classifier, a waste

        item_embs = self.item_emb(torch.LongTensor(item_indices).to(self.dev)) # (U, I, C)

        logits = item_embs.matmul(final_feat.unsqueeze(-1)).squeeze(-1)

        # preds = self.pos_sigmoid(logits) # rank same item list for different users

        return logits # preds # (U, I)


In [None]:
import sys
import copy
import torch
import random
import numpy as np
from collections import defaultdict
from multiprocessing import Process, Queue

# sampler for batch generation
def random_neq(l, r, s):
    """Draw a random integer from ``[l, r)`` that is not contained in ``s``.

    Candidates come from ``np.random.randint(l, r)`` and are redrawn until one
    falls outside ``s``; the call never returns if ``s`` covers the whole range.
    """
    while True:
        candidate = np.random.randint(l, r)
        if candidate not in s:
            return candidate


def sample_function(user_train, usernum, itemnum, batch_size, maxlen, result_queue, SEED):
    """Worker loop: keep pushing freshly sampled training batches onto a queue.

    Seeds NumPy's global RNG with ``SEED`` and then loops forever. Each batch
    is ``zip(*samples)`` over ``batch_size`` samples, where a sample is
    ``(user, seq, pos, neg)``: three right-aligned, zero-padded ``int32``
    arrays of length ``maxlen`` holding the input items, the next item at each
    position, and one random item the user has not interacted with.
    """
    def draw_user():
        # Only users with at least two interactions give an (input, target) pair.
        uid = np.random.randint(1, usernum + 1)
        while len(user_train[uid]) <= 1:
            uid = np.random.randint(1, usernum + 1)
        return uid

    def sample():
        user = draw_user()
        history = user_train[user]
        seen = set(history)

        seq, pos, neg = (np.zeros([maxlen], dtype=np.int32) for _ in range(3))
        target = history[-1]
        slot = maxlen - 1

        # Walk the history backwards so the most recent items fill the right end.
        for item in reversed(history[:-1]):
            seq[slot] = item
            pos[slot] = target
            if target != 0:
                neg[slot] = random_neq(1, itemnum + 1, seen)
            target = item
            slot -= 1
            if slot == -1:
                break

        return (user, seq, pos, neg)

    np.random.seed(SEED)
    while True:
        result_queue.put(zip(*[sample() for _ in range(batch_size)]))


class WarpSampler:
    """Pool of daemon processes that pre-compute training batches.

    Every worker runs ``sample_function`` with its own random seed and pushes
    batches onto a shared bounded queue (capacity ``n_workers * 10``).
    """

    def __init__(self, User, usernum, itemnum, batch_size=64, maxlen=10, n_workers=1):
        self.result_queue = Queue(maxsize=n_workers * 10)
        self.processors = []
        for _ in range(n_workers):
            worker_seed = np.random.randint(2e9)
            worker = Process(
                target=sample_function,
                args=(User, usernum, itemnum, batch_size, maxlen,
                      self.result_queue, worker_seed),
            )
            self.processors.append(worker)
            worker.daemon = True
            worker.start()

    def next_batch(self):
        """Block until a batch is available and return it."""
        return self.result_queue.get()

    def close(self):
        """Terminate every worker process and wait for it to exit."""
        for worker in self.processors:
            worker.terminate()
            worker.join()


# train/val/test data generation
def data_partition(fname, type_dataset='train'):
    """Read a ``"<user> <item>"`` interaction file and split it per user.

    Args:
        fname: Path of a text file with one space-separated ``user item`` pair
            of integers per line, in interaction order. Blank lines are skipped.
        type_dataset: Split mode.
            ``'train'``: users with fewer than 3 interactions keep everything
            in train (empty valid/test); otherwise the last item goes to test,
            the one before it to valid and the rest to train.
            ``'final_train'``: the whole sequence goes to train; valid and
            test stay empty dicts.
            anything else: the whole sequence goes to test; train and valid
            stay empty dicts.

    Returns:
        ``[user_train, user_valid, user_test, usernum, itemnum]`` where the
        first three map user id -> list of item ids and the last two are the
        largest user id and item id seen in the file.
    """
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fname, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            u, i = line.split(' ')
            u = int(u)
            i = int(i)
            usernum = max(u, usernum)
            itemnum = max(i, itemnum)
            User[u].append(i)

    for user, items in User.items():
        if type_dataset == 'train':
            if len(items) < 3:
                # Too short to hold out anything.
                user_train[user] = items
                user_valid[user] = []
                user_test[user] = []
            else:
                user_train[user] = items[:-2]
                user_valid[user] = [items[-2]]
                user_test[user] = [items[-1]]
        elif type_dataset == 'final_train':
            user_train[user] = items
        else:
            user_test[user] = items
    return [user_train, user_valid, user_test, usernum, itemnum]

# TODO: merge evaluate functions for test and val set
# evaluate on test set
def predict(model, dataset, args, itemnum_train, output_file, index_dict, reindex_dict=None, possible_items=None):
    """Write the top-scored items of every user to ``output_file``.

    For each user ``1..usernum`` the user's sequence (taken from the test
    split of ``dataset``) is right-aligned into a window of ``args.maxlen``
    items, the candidates are scored with ``model.predict`` and the best
    ones are written as one comma-separated line of original item ids.

    Args:
        model: Object exposing ``predict(user_ids, seqs, item_indices)`` and
            returning a tensor accepted by ``torch.topk``.
        dataset: ``[train, valid, test, usernum, itemnum]``; only ``test`` and
            ``usernum`` are read.
        args: Object exposing ``maxlen``.
        itemnum_train: Highest item index to score when no per-user candidate
            list is given (all of ``1..itemnum_train`` are scored).
        output_file: Path of the text file to create.
        index_dict: Maps internal item index -> original item id.
        reindex_dict: Maps original item id -> internal item index; ``-1`` or a
            missing key means "unknown item". Required with ``possible_items``.
        possible_items: Optional per-user candidate lists of original item
            ids, indexed by ``user - 1``.

    Returns:
        List of user ids that had an unknown candidate (one entry per unknown
        candidate, so a user may appear several times).

    Notes:
        Scoring stops at the first user whose prediction fails; the error is
        printed and the lines written so far are kept.
    """
    # The dataset is only read here, so it is used directly instead of being
    # deep-copied (the copy was pure overhead on large datasets).
    _, _, test, usernum, _ = dataset
    users = range(1, usernum + 1)

    users_with_problems = []
    if not possible_items:
        # Production mode without candidate lists: score every known item.
        item_idx = list(range(1, itemnum_train + 1))

    # The context manager closes the output file on every exit path.
    with open(output_file, 'w') as f:
        for u in tqdm(users):
            seq = np.zeros([args.maxlen], dtype=np.int32)
            idx = args.maxlen - 1

            # Fill the window from the right with the user's most recent items.
            for i in reversed(test[u]):
                seq[idx] = i
                idx -= 1
                if idx == -1:
                    break

            if possible_items:
                item_idx = []
                for x in possible_items[u - 1]:
                    # .get() avoids inserting unknown ids into a defaultdict.
                    item = reindex_dict.get(x, -1)
                    if item == -1:
                        # Unknown candidates are reported and left out.
                        print(f'Item {x} no esta en reindex')
                        users_with_problems.append(u)
                    else:
                        item_idx.append(item)

            try:
                predictions = model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
                # Never ask for more entries than there are candidates.
                top = torch.topk(predictions[0], min(10, len(item_idx)))
                indices = top.indices.tolist()
            except Exception as exc:
                # Report only values that are guaranteed to exist at this point.
                print(item_idx)
                print(f'{type(exc).__name__}: {exc}')
                break

            # One line per user: the selected items as original ids.
            f.write(",".join(str(index_dict[item_idx[x]]) for x in indices) + '\n')

    return users_with_problems
    
def evaluate(model, dataset, args):
    """Rank each user's test item against 100 sampled negatives.

    The input window holds the user's validation item in the last slot,
    preceded by the most recent training items. Users with an empty train or
    test list are skipped; at most 10000 randomly chosen users are scored.

    Returns:
        ``(NDCG@10, HR@10)`` averaged over the users that were scored.
    """
    train, valid, test, usernum, itemnum = copy.deepcopy(dataset)

    ndcg_total = 0.0
    hit_total = 0.0
    evaluated = 0.0

    if usernum > 10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)

    last = args.maxlen - 1
    for u in users:
        if len(train[u]) == 0 or len(test[u]) == 0:
            continue

        # Right-aligned window: validation item last, training items before it.
        seq = np.zeros([args.maxlen], dtype=np.int32)
        seq[last] = valid[u][0]
        slot = last - 1
        for item in reversed(train[u]):
            seq[slot] = item
            slot -= 1
            if slot == -1:
                break

        # Candidate 0 is the true item; the rest are items absent from train.
        excluded = set(train[u])
        excluded.add(0)
        candidates = [test[u][0]]
        while len(candidates) < 101:
            draw = np.random.randint(1, itemnum + 1)
            if draw not in excluded:
                candidates.append(draw)

        user_arr, seq_arr, cand_arr = (np.array(part) for part in ([u], [seq], candidates))
        # Negated so that an ascending argsort orders by descending score.
        scores = -model.predict(user_arr, seq_arr, cand_arr)
        rank = scores[0].argsort().argsort()[0].item()

        evaluated += 1

        if rank < 10:
            ndcg_total += 1 / np.log2(rank + 2)
            hit_total += 1
        if evaluated % 100 == 0:
            # Progress marker every 100 scored users.
            print('.', end="")
            sys.stdout.flush()

    return ndcg_total / evaluated, hit_total / evaluated


# evaluate on val set
def evaluate_valid(model, dataset, args):
    """Rank each user's validation item against 100 sampled negatives.

    The input window holds only the most recent training items. Users with an
    empty train or valid list are skipped; at most 10000 randomly chosen users
    are scored.

    Returns:
        ``(NDCG@10, HR@10)`` averaged over the users that were scored.
    """
    train, valid, test, usernum, itemnum = copy.deepcopy(dataset)

    ndcg_total = 0.0
    evaluated = 0.0
    hit_total = 0.0

    if usernum > 10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)

    for u in users:
        if len(train[u]) == 0 or len(valid[u]) == 0:
            continue

        # Right-aligned window filled with the latest training items.
        seq = np.zeros([args.maxlen], dtype=np.int32)
        slot = args.maxlen - 1
        for item in reversed(train[u]):
            seq[slot] = item
            slot -= 1
            if slot == -1:
                break

        # Candidate 0 is the true item; the rest are items absent from train.
        excluded = set(train[u])
        excluded.add(0)
        candidates = [valid[u][0]]
        while len(candidates) < 101:
            draw = np.random.randint(1, itemnum + 1)
            if draw not in excluded:
                candidates.append(draw)

        user_arr, seq_arr, cand_arr = (np.array(part) for part in ([u], [seq], candidates))
        # Negated so that an ascending argsort orders by descending score.
        scores = -model.predict(user_arr, seq_arr, cand_arr)
        rank = scores[0].argsort().argsort()[0].item()

        evaluated += 1

        if rank < 10:
            ndcg_total += 1 / np.log2(rank + 2)
            hit_total += 1
        if evaluated % 100 == 0:
            # Progress marker every 100 scored users.
            print('.', end="")
            sys.stdout.flush()

    return ndcg_total / evaluated, hit_total / evaluated

In [None]:
class Args:
    """Plain attribute container for the experiment settings.

    Stands in for a parsed command line: every constructor argument is stored
    unchanged under an attribute of the same name.
    """

    def __init__(
        self,
        dataset,
        train_dir,
        batch_size=128,
        lr=0.001,
        maxlen=30,
        hidden_units=64,
        num_blocks=3,
        num_epochs=50,
        num_heads=1,
        dropout_rate=0.5,
        l2_emb=0.0,
        device='cuda',
        inference_only=False,
        state_dict_path=None,
    ):
        # Data location and output directory name.
        self.dataset, self.train_dir = dataset, train_dir
        # Batch size and learning rate.
        self.batch_size, self.lr = batch_size, lr
        # Model size.
        self.maxlen, self.hidden_units, self.num_blocks = maxlen, hidden_units, num_blocks
        self.num_epochs = num_epochs
        # Attention heads and regularisation.
        self.num_heads, self.dropout_rate, self.l2_emb = num_heads, dropout_rate, l2_emb
        # Runtime behaviour.
        self.device = device
        self.inference_only, self.state_dict_path = inference_only, state_dict_path

In [None]:
# Experiment configuration: only the interaction file and the training
# directory name are given; every other setting keeps the default from Args.
args = Args('/content/drive/Shareddrives/RecSys/Datasets/mercadolibre.txt', 'data')

In [None]:
# 'final_train' keeps every interaction of every user in the training split.
dataset = data_partition(args.dataset, 'final_train')
user_train, user_valid, user_test, usernum, itemnum = dataset

# Integer division: a trailing partial batch is not counted.
num_batch = len(user_train) // args.batch_size # tail? + ((len(user_train) % args.batch_size) != 0)
print(f'Number of batches: {num_batch}')

# Total number of interactions across users, used for the average below.
cc = float(sum(len(history) for history in user_train.values()))
print('average sequence length: %.2f' % (cc / len(user_train)))

Number of batches: 4161
average sequence length: 16.62


In [None]:
# Build the network and move its parameters to the configured device.
model = SASRec(usernum, itemnum, args).to(args.device)  # no ReLU activation in original SASRec implementation?

In [None]:
# Three background workers keep a queue of training batches filled.
sampler = WarpSampler(user_train, usernum, itemnum, batch_size=args.batch_size, maxlen=args.maxlen, n_workers=3)

# Xavier-initialise every weight matrix. xavier_uniform_ needs at least two
# dimensions to compute fan-in/fan-out, so 1-D parameters (biases, LayerNorm
# weights) are skipped explicitly and keep their default initialisation.
# An explicit check replaces a blanket try/except so real errors surface.
for name, param in model.named_parameters():
    if param.dim() < 2:
        continue
    torch.nn.init.xavier_uniform_(param.data)

# this fails embedding init 'Embedding' object has no attribute 'dim'
# model.apply(torch.nn.init.xavier_uniform_)

model.train() # enable model training

# Resume from a checkpoint when a path is configured. The starting epoch is
# parsed from the 'epoch=<n>.' fragment of the checkpoint file name.
epoch_start_idx = 1
if args.state_dict_path is not None:
    try:
        model.load_state_dict(torch.load(args.state_dict_path, map_location=torch.device(args.device)))
        tail = args.state_dict_path[args.state_dict_path.find('epoch=') + 6:]
        epoch_start_idx = int(tail[:tail.find('.')]) + 1
    except Exception as exc: # in case your pytorch version is not 1.6 etc., pls debug by pdb if load weights failed
        print('failed loading state_dicts, pls check file path: ', end="")
        print(args.state_dict_path)
        # Show the actual cause instead of discarding it.
        print(f'reason: {type(exc).__name__}: {exc}')
        print('pdb enabled for your quick check, pls type exit() if you do not need it')
        import pdb; pdb.set_trace()


# Inference-only runs just report the test metrics of the loaded weights.
if args.inference_only:
    model.eval()
    t_test = evaluate(model, dataset, args)
    print('test (NDCG@10: %.4f, HR@10: %.4f)' % (t_test[0], t_test[1]))

# ce_criterion = torch.nn.CrossEntropyLoss()
# https://github.com/NVIDIA/pix2pixHD/issues/9 how could an old bug appear again...
# The model returns raw logits, hence the "with logits" variant of BCE.
bce_criterion = torch.nn.BCEWithLogitsLoss() # torch.nn.BCELoss()
adam_optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98))


In [None]:
import time

#f = open(os.path.join(args.dataset + '_' + args.train_dir, 'log.txt'), 'w')
# T accumulates training wall-clock seconds; t0 marks the start of the
# current timing window.
T = 0.0
t0 = time.time()

for epoch in range(epoch_start_idx, args.num_epochs + 1):
    if args.inference_only: break # just to decrease indentation
    for step in range(num_batch): # tqdm(range(num_batch), total=num_batch, ncols=70, leave=False, unit='b'):
        u, seq, pos, neg = sampler.next_batch() # tuples to ndarray
        u, seq, pos, neg = np.array(u), np.array(seq), np.array(pos), np.array(neg)
        pos_logits, neg_logits = model(u, seq, pos, neg)
        # Targets: 1 for the true next item, 0 for the sampled negative.
        pos_labels, neg_labels = torch.ones(pos_logits.shape, device=args.device), torch.zeros(neg_logits.shape, device=args.device)
        # print("\neye ball check raw_logits:"); print(pos_logits); print(neg_logits) # check pos_logits > 0, neg_logits < 0
        adam_optimizer.zero_grad()
        # Only positions with a real target (pos != 0, i.e. not padding)
        # contribute to the loss.
        indices = np.where(pos != 0)
        loss = bce_criterion(pos_logits[indices], pos_labels[indices])
        loss += bce_criterion(neg_logits[indices], neg_labels[indices])
        # Norm penalty on the item embedding weights, scaled by args.l2_emb.
        for param in model.item_emb.parameters(): loss += args.l2_emb * torch.norm(param)
        loss.backward()
        adam_optimizer.step()
        if step % 100 == 0:
          print("loss in epoch {} iteration {}: {}".format(epoch, step, loss.item())) # expected 0.4~0.6 after init few epochs


    # Changed to every 200 epochs here so that no evaluation takes place
    # (the default Args.num_epochs is 50, so this branch is never reached).
    if epoch % 200 == 0:
        model.eval()
        t1 = time.time() - t0
        T += t1
        print('Evaluating', end='')
        t_test = evaluate(model, dataset, args)
        t_valid = evaluate_valid(model, dataset, args)
        print('epoch:%d, time: %f(s), valid (NDCG@10: %.4f, HR@10: %.4f), test (NDCG@10: %.4f, HR@10: %.4f)'
                % (epoch, T, t_valid[0], t_valid[1], t_test[0], t_test[1]))

        #f.write(str(t_valid) + ' ' + str(t_test) + '\n')
        #f.flush()
        # Restart the timing window so evaluation time is not counted.
        t0 = time.time()
        model.train()

    # Save the weights once, after the final epoch; the hyper-parameters are
    # encoded in the file name.
    if epoch == args.num_epochs:
        folder = '/content/drive/Shareddrives/RecSys/MeLi/checkpoints'
        fname = 'SASRec.epoch={}.lr={}.layer={}.head={}.hidden={}.maxlen={}.pth'
        fname = fname.format(args.num_epochs, args.lr, args.num_blocks, args.num_heads, args.hidden_units, args.maxlen)
        torch.save(model.state_dict(), os.path.join(folder, fname))

#f.close()
# Stop the background sampling processes.
sampler.close()
print("Done")

loss in epoch 1 iteration 0: 1.3861044645309448
loss in epoch 1 iteration 100: 1.3078761100769043
loss in epoch 1 iteration 200: 1.2382941246032715
loss in epoch 1 iteration 300: 1.2038002014160156
loss in epoch 1 iteration 400: 1.178877353668213
loss in epoch 1 iteration 500: 1.1447324752807617
loss in epoch 1 iteration 600: 1.151376485824585
loss in epoch 1 iteration 700: 1.1258927583694458
loss in epoch 1 iteration 800: 1.1080471277236938
loss in epoch 1 iteration 900: 1.1122419834136963
loss in epoch 1 iteration 1000: 1.1020481586456299
loss in epoch 1 iteration 1100: 1.1086832284927368
loss in epoch 1 iteration 1200: 1.1112143993377686
loss in epoch 1 iteration 1300: 1.1118680238723755
loss in epoch 1 iteration 1400: 1.0771541595458984
loss in epoch 1 iteration 1500: 1.061126947402954
loss in epoch 1 iteration 1600: 1.076577067375183
loss in epoch 1 iteration 1700: 1.0782902240753174
loss in epoch 1 iteration 1800: 1.0417503118515015
loss in epoch 1 iteration 1900: 1.0680860280990

In [None]:
# Restore the trained weights. map_location remaps the stored tensors onto
# args.device, so a checkpoint saved on another device (e.g. a GPU) still
# loads when that device is not available.
model.load_state_dict(torch.load("/content/drive/Shareddrives/RecSys/MeLi/checkpoints/SASRec.epoch=50.lr=0.001.layer=3.head=1.hidden=64.maxlen=30.pth", map_location=torch.device(args.device)))

<All keys matched successfully>

In [None]:
# Switch to evaluation mode, then report the ranking metrics on the test split.
model.eval()
t_test = evaluate(model, dataset, args)
print('test (NDCG@10: %.4f, HR@10: %.4f)' % (t_test[0], t_test[1]))

..................................................................................test (NDCG@10: 0.7270, HR@10: 0.8654)


In [None]:
model.eval()
# 'xd' is neither 'train' nor 'final_train', so data_partition takes its
# fallback branch and stores each user's whole sequence in the test split.
dataset_test = data_partition('/content/drive/Shareddrives/RecSys/Datasets/mercadolibre_test.txt', 'xd')

In [33]:
import pickle

# Load the pickled baseline predictions (passed to predict() as its per-user
# candidate lists further down).
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/Shareddrives/RecSys/MeLi/pickles/baseline231_50.pkl', 'rb') as f:
   predictions = pickle.load(f)

In [34]:
# Number of entries in the loaded baseline predictions.
len(predictions)

177070

In [35]:
# Re-rank each user's baseline candidates (`predictions`) with the model and
# write the result to a CSV under /content. The return value lists the users
# that had at least one candidate missing from final_reindex.
u_problems = predict(model, dataset_test, args, itemnum, f'/content/SASRec.epoch={args.num_epochs}.lr={args.lr}.layer={args.num_blocks}.head={args.num_heads}.hidden={args.hidden_units}.maxlen={args.maxlen}with_predictions.csv', final_index, final_reindex, predictions)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))

Item 506575 no esta en reindex
Item 1791625 no esta en reindex
Item 506575 no esta en reindex
Item 1791625 no esta en reindex
Item 1047106 no esta en reindex
Item 1047106 no esta en reindex
Item 241469 no esta en reindex
Item 1632751 no esta en reindex
Item 1873574 no esta en reindex
Item 1545772 no esta en reindex
Item 403558 no esta en reindex
Item 246833 no esta en reindex
Item 915201 no esta en reindex
Item 1047106 no esta en reindex
Item 2086215 no esta en reindex
Item 403558 no esta en reindex
Item 493812 no esta en reindex
Item 1800012 no esta en reindex
Item 230235 no esta en reindex
Item 749818 no esta en reindex
Item 417263 no esta en reindex
Item 1659656 no esta en reindex
Item 1492931 no esta en reindex
Item 2039233 no esta en reindex
Item 2039233 no esta en reindex
Item 16746 no esta en reindex
Item 435484 no esta en reindex
Item 512644 no esta en reindex
Item 2039233 no esta en reindex
Item 1707526 no esta en reindex
Item 1881931 no esta en reindex
Item 1276876 no esta en

In [None]:
from google.colab import files

# Download the results CSV from the Colab runtime.
# NOTE(review): this path has no 'with_predictions' suffix, unlike the file
# written by the predict(...) call in this notebook - confirm which file is
# meant to be downloaded.
files.download(f'/content/SASRec.epoch={args.num_epochs}.lr={args.lr}.layer={args.num_blocks}.head={args.num_heads}.hidden={args.hidden_units}.maxlen={args.maxlen}.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Preparar dataset

In [None]:
from google.colab import drive
# Mount Google Drive so the dataset and checkpoint paths under /content/drive resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gzip
from collections import defaultdict
from datetime import datetime

def kk():
    """Default factory for the id-mapping dictionaries: unknown keys map to -1."""
    return -1
def parse(path):
    """Yield one evaluated record per line of the gzip-compressed file ``path``.

    The file is opened inside a context manager so the handle is released
    once the generator is exhausted or closed.
    """
    # SECURITY: eval() executes whatever expression a line contains. Only use
    # this on trusted data files; a safe parser (json.loads or
    # ast.literal_eval) should be preferred if the format allows it.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield eval(l)

def create_dataset_train(path_input, path_output):
    """Write the training interaction file ``<path_output>.txt``.

    For every record of ``path_input`` the 'view' events are sorted by
    timestamp and written as ``"<user> <item>"`` lines, followed by one line
    for the bought item. Records without any view are skipped and do not
    consume a user id. Items receive consecutive ids starting at 1 in order
    of first appearance.

    Returns:
        ``(reindex, index, user_id, item_count)``: original id -> new id,
        new id -> original id, and the next unused user id and item id.
    """
    reindex = defaultdict(kk)
    index = defaultdict(kk)
    user_id = 1
    item_count = 1

    with open(f"{path_output}.txt", 'w') as file:
        for record in tqdm(parse(path_input)):
            views = []
            for event in record['user_history']:
                if event['event_type'] != 'view':
                    continue
                raw_id = event['event_info']
                if reindex[raw_id] == -1:
                    # First time this item is seen: give it the next id.
                    reindex[raw_id] = item_count
                    index[item_count] = raw_id
                    item_count += 1
                stamp = int(datetime.strptime(event['event_timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z').timestamp())
                views.append((reindex[raw_id], stamp))

            # Chronological order.
            views.sort(key=lambda pair: pair[1])
            for item, _ in views:
                file.write(f"{user_id} {item}\n")

            if len(views) > 0:
                # The purchase closes the user's sequence.
                bought = record['item_bought']
                if reindex[bought] == -1:
                    reindex[bought] = item_count
                    index[item_count] = bought
                    item_count += 1
                file.write(f"{user_id} {reindex[bought]}\n")
                user_id += 1

    return reindex, index, user_id, item_count

def add_test_dataset(path_input, path_dataset, reindex, index, last_user_id, last_item_id):
    """Append the test-set view sequences to the training interaction file.

    A sequence is only appended when it contains at least 2 viewed items;
    shorter ones are not useful for training. User ids continue from
    ``last_user_id`` and new item ids from ``last_item_id``. The mappings
    passed in are copied, not modified.

    Returns:
        ``(reindex, index)``: the extended copies of both mappings.
    """
    reindex, index = reindex.copy(), index.copy()
    user_id, item_count = last_user_id, last_item_id

    with open(f"{path_dataset}.txt", "a") as file:
        for record in tqdm(parse(path_input)):
            views = []
            for event in record['user_history']:
                if event['event_type'] != 'view':
                    continue
                raw_id = event['event_info']
                if reindex[raw_id] == -1:
                    # Item not seen before: give it the next id.
                    reindex[raw_id] = item_count
                    index[item_count] = raw_id
                    item_count += 1
                stamp = int(datetime.strptime(event['event_timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z').timestamp())
                views.append((reindex[raw_id], stamp))

            # Chronological order.
            views.sort(key=lambda pair: pair[1])
            if len(views) > 1:
                for item, _ in views:
                    file.write(f"{user_id} {item}\n")
                user_id += 1

    return reindex, index
def create_dataset_test(path_input, path_output, reindex):
    """Write the test interaction file ``<path_output>.txt``.

    Every record becomes one user (ids start at 1, one per record). Only
    'view' events whose item is known to ``reindex`` (value other than -1)
    are kept, sorted by timestamp. A user left without any item gets a single
    placeholder line with item 1 so that every user id appears in the file.

    Returns:
        The ``reindex`` mapping that was passed in.
    """
    user_id = 1
    with open(f"{path_output}.txt", 'w') as file:
        for record in tqdm(parse(path_input)):
            views = []
            for event in record['user_history']:
                if event['event_type'] != 'view':
                    continue
                item = reindex[event['event_info']]
                if item == -1:
                    # Unknown items are dropped from the sequence.
                    continue
                stamp = int(datetime.strptime(event['event_timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z').timestamp())
                views.append((item, stamp))

            # Chronological order.
            views.sort(key=lambda pair: pair[1])
            for item, _ in views:
                file.write(f"{user_id} {item}\n")
            if len(views) == 0:
                file.write(f"{user_id} 1\n")
            user_id += 1

    return reindex


In [None]:
# Build 'mercadolibre.txt' from the training records and keep the id mappings
# plus the next unused user and item ids.
reindex_train, index_train, user_id, item_count = create_dataset_train('/content/drive/Shareddrives/RecSys/Datasets/train_dataset.jl.gz', 'mercadolibre')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# Append the test-set view sequences to 'mercadolibre.txt', continuing the
# user and item numbering where the training set stopped.
final_reindex, final_index = add_test_dataset('/content/drive/Shareddrives/RecSys/Datasets/test_dataset.jl.gz', 'mercadolibre', reindex_train, index_train, user_id, item_count)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# Write 'mercadolibre_test.txt' with one sequence per test record, using the
# combined item mapping.
reindex_test = create_dataset_test('/content/drive/Shareddrives/RecSys/Datasets/test_dataset.jl.gz', 'mercadolibre_test', final_reindex)

177070it [01:30, 1963.57it/s]


In [None]:
# Next unused item id after processing the training records.
item_count

1605251

In [None]:
# Largest item id found in the interaction file read by data_partition.
itemnum

2101271

In [None]:
import pickle

In [None]:

# Persist the original-id -> internal-id mapping.
with open('reindex.pkl', 'wb') as f:
  pickle.dump(final_reindex, f)

In [None]:
# Persist the internal-id -> original-id mapping.
with open('index.pkl', 'wb') as f:
  pickle.dump(final_index, f)

In [None]:
# Reload both id mappings from the shared drive.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/Shareddrives/RecSys/MeLi/reindex.pkl', 'rb') as f:
  final_reindex = pickle.load(f)

with open('/content/drive/Shareddrives/RecSys/MeLi/index.pkl', 'rb') as f:
  final_index = pickle.load(f)

In [None]:
def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    An empty file yields 0 (the loop-variable based count used before raised
    ``UnboundLocalError`` in that case).
    """
    with open(fname) as f:
        return sum(1 for _ in f)

In [None]:
# Count the lines of the results CSV.
file_len('/content/drive/Shareddrives/RecSys/MeLi/results/SASRec.epoch=50.lr=0.001.layer=3.head=1.hidden=64.maxlen=30.csv')

177070