In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import joblib as jb

import torch
from torch.nn import functional as F
from torch import nn

import json
import dask

import itertools
import joblib
import time

import tqdm

from dask.diagnostics import ProgressBar
ProgressBar().register()

In [None]:
# Load the train / test samples; lines=True reads one JSON record per line.
train = pd.read_json(path_or_buf="./data/sample_train.jl", lines=True)
test = pd.read_json(path_or_buf="./data/sample_test.jl", lines=True)

In [None]:
# Preview the first rows of the training sample.
train.head(n=5)

In [None]:
# (rows, columns) of the train and test samples, shown as a pair.
(train.shape, test.shape)

In [None]:
# Item catalogue sample; lines=True reads one JSON record per line.
item_data = pd.read_json("./data/sample_item.jl", lines=True)

# item_id -> title lookup.
# The column is selected explicitly instead of calling .squeeze() on a
# one-column frame: squeeze() collapses a single-row frame to a scalar,
# which has no .to_dict().
item_title_map = item_data.set_index("item_id")["title"].to_dict()

# Random preview; capped so it also works when fewer than 10 items are loaded.
item_data.sample(min(10, len(item_data)))

# data

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves the ``title`` column of an item table.

    Parameters
    ----------
    item_data : pandas.DataFrame
        Table with a ``title`` column. Rows are addressed by position.
    """

    def __init__(self, item_data):
        self.item_data = item_data

    def __len__(self):
        """Return the number of rows in the item table."""
        return len(self.item_data)

    def __getitem__(self, index):
        """Return the title stored at positional row ``index``."""
        # Select the column first: ``.iloc[index]['title']`` materialises the
        # whole row (every column) as a Series on each access just to read
        # one field.
        return self.item_data['title'].iloc[index]

## Instancia o modelo BERT e prepara os dados dos itens para uso

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample
from torch.utils.data import DataLoader

pretrained = 'neuralmind/bert-large-portuguese-cased'

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is created.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer(pretrained, device=device)
train_data = Dataset(item_data)
# No shuffling is requested, so batches follow item_data row order. The index
# cell later pairs the stacked embeddings with item_data['item_id'] by
# position and depends on that order.
train_loader = DataLoader(train_data, batch_size=2048)

## Cria os embeddings dos itens e extrai as features

In [None]:
%%time
embs_list = list()
for data in tqdm.tqdm(train_loader):
    embs = model.encode(data)
    embs_list.append(embs)

In [None]:
# Stack the per-batch embedding arrays row-wise into a single matrix.
embs_np = np.vstack(embs_list)

In [None]:
# Cache the item-title embedding matrix on disk.
joblib.dump(value=embs_np, filename="22a_embs_np.pkl.z")

In [None]:
import nmslib

## Initialize a new index, using an HNSW index on cosine similarity

In [None]:
%%time
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data=embs_np, ids=item_data['item_id'].values)
index.createIndex(print_progress=True)

## Calcula os k itens mais similares com base nas features dos nomes e calcula o recall

In [None]:
TOP_K = 50  # number of nearest neighbours retrieved per session

# item_id -> title embedding.
# This mapping was used below but never defined, so the cell only ran with
# leftover kernel state. Row i of embs_np is paired with row i of item_data,
# the same positional pairing used when the index was populated.
item_emb_map = dict(zip(item_data['item_id'].tolist(), embs_np))

recall = 0
n_skipped = 0  # sessions without any viewed item that has an embedding
n_failed = 0   # sessions whose nearest-neighbour query raised

for elist, t in tqdm.tqdm(train[['user_history', 'item_bought']].values):
    # Keep only events whose event_info is an integer item id (views) that we
    # have an embedding for. The dict lookup replaces a linear scan of the
    # item_id array for every event.
    rep = [
        item_emb_map[e['event_info']]
        for e in elist
        if isinstance(e['event_info'], int) and e['event_info'] in item_emb_map
    ]
    if not rep:
        # Nothing to average: skip explicitly instead of querying with NaN.
        n_skipped += 1
        continue

    # Session representation: mean of the viewed items' title embeddings.
    h = np.mean(rep, axis=0)

    try:
        # Retrieve candidate item ids (first element of the query result).
        neighbour_ids, _ = index.knnQuery(h, k=TOP_K)
    except Exception:
        # Best effort, as before, but no longer a bare `except:` (which also
        # swallowed KeyboardInterrupt), and failures are counted.
        n_failed += 1
        continue

    # A hit is counted when the bought item is among the suggestions.
    recall += int(t in set(neighbour_ids))

# Skipped and failed sessions stay in the denominator, as in the original metric.
print(recall/train.shape[0])
if n_skipped or n_failed:
    print(f"sessions skipped (no viewed item with embedding): {n_skipped}; failed queries: {n_failed}")

- recall@10 - 0.13778097264275843
- recall@20 - 0.15457821731374785
- recall@100 - 0.18157240604797623
- recall@1000 - 0.18950632074992194
- recall cs = viewed - 0.29388401187908886

In [None]:
# Write the built nearest-neighbour index to a file in the working directory.
index.saveIndex("22a_sbert_neuralmind.nms")

# search

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Dataset over a sequence of ``(seq_index, search)`` pairs.

    NOTE: this reuses the name ``Dataset`` and so replaces the item-title
    dataset class defined earlier in the notebook.
    """

    def __init__(self, search_data):
        # Indexable collection whose entries hold the session index at
        # position 0 and the search text at position 1.
        self.search_data = search_data

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.search_data)

    def __getitem__(self, index):
        """Return ``(seq_index, search)`` for the entry at ``index``."""
        entry = self.search_data[index]
        return entry[0], entry[1]

In [None]:
# Collect the distinct lower-cased search strings of every training session
# as (session position, text) pairs. The set removes repeated searches
# within a session.
search_data = set()
for seq_index, (hist, bought) in enumerate(
    tqdm.tqdm(train[['user_history', 'item_bought']].values)
):
    search_data.update(
        (seq_index, item['event_info'].lower())
        for item in hist
        if item['event_type'] == 'search'
    )
search_data = list(search_data)

In [None]:
# Reuse the sentence encoder (`model`) and the DataLoader import from the
# item-title section. This cell used to re-import them and reload the same
# 'neuralmind/bert-large-portuguese-cased' checkpoint, which only cost time
# and device memory.
# Note: `train_data` / `train_loader` now refer to the search strings,
# not to the item titles.
train_data = Dataset(search_data)
train_loader = DataLoader(train_data, batch_size=2048)

In [None]:
# Encode the search strings batch by batch and keep one
# (session index, embedding row) pair per search string.
res = list()

for seq_ix, search in tqdm.tqdm(train_loader):
    emb = model.encode(search)
    # Pair each session index of the batch with its embedding row.
    res.extend(zip(seq_ix.numpy(), emb))

In [None]:
from collections import Counter

# Number of search strings collected for each session index.
ctr = Counter(seq_ix for seq_ix, _ in res)

# Take the embedding width from the encoded vectors instead of hard-coding
# it, so a different encoder does not break the accumulation below. 1024
# (the previous fixed value) is used only when there is nothing to aggregate.
emb_dim = res[0][1].shape[0] if res else 1024

# Row i accumulates the sum of the search embeddings of training session i;
# sessions without searches keep an all-zero row.
seq_index_embs_map = np.zeros((train.shape[0], emb_dim))
for seqix, emb in tqdm.tqdm(res):
    seq_index_embs_map[seqix, :] += emb

# Turn sums into means with one vectorised division. Sessions without
# searches divide by 1 and therefore stay at zero.
n_searches = np.ones(train.shape[0])
for seqix, count in ctr.items():
    n_searches[seqix] = count
seq_index_embs_map /= n_searches[:, None]

In [None]:
# Cache the per-session mean search embeddings of the training set on disk.
joblib.dump(value=seq_index_embs_map, filename="22a_embs_search_np.pkl.z")

# teste

In [None]:
# Collect the distinct lower-cased search strings of every test session
# as (session position, text) pairs.
search_data = set()
for seq_index, hist in enumerate(tqdm.tqdm(test['user_history'].values)):
    # The training loop iterates user_history directly, so the column may
    # already hold decoded lists, and json.loads raises TypeError on a list.
    # Decode only when the history is still a raw JSON string.
    events = json.loads(hist) if isinstance(hist, str) else hist
    for item in events:
        if item['event_type'] == 'search':
            search_data.add((seq_index, item['event_info'].lower()))
search_data = list(search_data)

In [None]:
# Reuse the sentence encoder (`model`) and the DataLoader import from the
# item-title section. This cell used to re-import them and reload the same
# 'neuralmind/bert-large-portuguese-cased' checkpoint, which only cost time
# and device memory.
test_data = Dataset(search_data)
test_loader = DataLoader(test_data, batch_size=2048)

In [None]:
# Encode the test search strings batch by batch and keep one
# (session index, embedding row) pair per search string.
res = list()

for seq_ix, search in tqdm.tqdm(test_loader):
    emb = model.encode(search)
    # Pair each session index of the batch with its embedding row.
    res.extend(zip(seq_ix.numpy(), emb))

In [None]:
from collections import Counter

# Number of search strings collected for each session index.
ctr = Counter(seq_ix for seq_ix, _ in res)

# Take the embedding width from the encoded vectors instead of hard-coding
# it, so a different encoder does not break the accumulation below. 1024
# (the previous fixed value) is used only when there is nothing to aggregate.
emb_dim = res[0][1].shape[0] if res else 1024

# Row i accumulates the sum of the search embeddings of test session i;
# sessions without searches keep an all-zero row.
seq_index_embs_map = np.zeros((test.shape[0], emb_dim))
for seqix, emb in tqdm.tqdm(res):
    seq_index_embs_map[seqix, :] += emb

# Turn sums into means with one vectorised division. Sessions without
# searches divide by 1 and therefore stay at zero.
n_searches = np.ones(test.shape[0])
for seqix, count in ctr.items():
    n_searches[seqix] = count
seq_index_embs_map /= n_searches[:, None]

In [None]:
# Cache the per-session mean search embeddings of the test set on disk.
joblib.dump(value=seq_index_embs_map, filename="22a_embs_search_test_np.pkl.z")