In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import joblib as jb

import json
import tqdm
import joblib
import gc

In [None]:
# Load the sampled training sessions; the file is JSON Lines (one record per line).
train = pd.read_json(path_or_buf="./data/sample_train.jl", lines=True)
train.head(n=5)

In [None]:
# Load the test sessions from their parquet file and preview the first rows.
test = pd.read_parquet(path="./data/test.parquet")
test.head(n=5)

In [None]:
# Load the item catalogue (JSON Lines: one item per line) and preview the first rows.
item_data = pd.read_json(path_or_buf="./data/item_data.jl", lines=True)
item_data.head(n=5)

## Gera três dicionários em que a chave é o item_id e os valores são title, price e domain_id

In [None]:
def _item_lookup(frame, column):
    """Return a dict mapping ``item_id`` to the value of ``column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding at least the columns ``item_id`` and ``column``.
    column : str
        Name of the column whose values become the dict values.

    Returns
    -------
    dict
        ``{item_id: value}``; when an ``item_id`` occurs more than once with
        different values, the last occurrence wins.

    Notes
    -----
    The value column is selected by name instead of with ``squeeze()``:
    ``squeeze()`` collapses a one-row frame to a scalar, which has no
    ``to_dict`` method.
    """
    pairs = frame[['item_id', column]].drop_duplicates()
    return pairs.set_index("item_id")[column].to_dict()


# One lookup table per item attribute used by the feature-building cells.
item_title_map = _item_lookup(item_data, 'title')
item_price_map = _item_lookup(item_data, 'price')
item_domain_map = _item_lookup(item_data, 'domain_id')

# knn

## Importa os indices do knn
Dados de treino: features dos word embeddings dos nomes dos itens

In [None]:
%%time
import nmslib
# NOTE(review): init() is called without arguments, so nmslib's default
# method/space settings apply; they presumably have to match the settings
# the index file was built with — confirm against the notebook that built it.
index = nmslib.init()
# Load the pre-built nearest-neighbour index from disk into `index`.
index.loadIndex('22a_sbert_neuralmind.nms')

## Importa as features dos word embeddings dos nomes dos itens e cria um dicionário associando cada item_id aos valores

In [None]:
# Title embeddings of the items, stored as a joblib artifact.
# joblib artifacts are pickle-based: only load files from a trusted source.
embs_np = joblib.load("22a_embs_np.pkl.z")

# Pair each item_id with the embedding row at the same position.
# NOTE(review): assumes the rows of embs_np follow item_data's row order — confirm.
item_emb_map = {
    item_id: embs_np[position]
    for position, item_id in enumerate(item_data['item_id'].values)
}

In [None]:
# Number of nearest neighbours requested from the index for each session.
k=50

# train

## Estruturação dos dados adicionando info do item (join manual) e novas features

In [None]:
%%time
# Melt the training sessions into one row per history event and append, for
# every session, `k` nearest-neighbour candidate rows (event_type == 'knn').
# The result is written to two parquet files: search events and everything else.

# Search embeddings used as a fallback for sessions without any view event.
# This cell reads and later deletes `embs_search_np`, but no earlier cell
# defines it, so on a fresh kernel the cell failed with NameError only after
# the whole loop had run. Loading it here makes the cell self-contained.
# NOTE(review): the file name is assumed by analogy with the test artifact
# ("22a_embs_search_test_np.pkl.z") — confirm the actual train file name.
# joblib artifacts are pickle-based: only load files from a trusted source.
embs_search_np = joblib.load("22a_embs_search_train_np.pkl.z")

data = []
for seq_index, (hist, bought) in enumerate(
    tqdm.tqdm(train[['user_history', 'item_bought']].values)
):
    # id, title, price and domain_id of the purchased item; constant within a
    # session, so they are looked up once and copied into every row.
    bought_info = {
        'bought_id': bought,
        'bought_title': item_title_map[bought],
        'bought_price': item_price_map[bought],
        'bought_domain': item_domain_map[bought],
    }

    # item_id -> title embedding, one entry per distinct viewed item.
    rep = dict()
    for seq, item in enumerate(json.loads(hist)):
        i = item['event_info']
        item.update(bought_info)

        # Only view events reference an item, so only they get item columns.
        if item['event_type'] == 'view':
            item['item_title'] = item_title_map[i]
            item['item_price'] = item_price_map[i]
            item['item_domain'] = item_domain_map[i]
            # 1 when the viewed item is the one that was bought.
            item['has_bought'] = int(bought == i)
            # 1/0 depending on whether the domain id contains 'MLB';
            # NaN when the domain is missing.
            item['pt'] = int('MLB' in item['item_domain']) if item['item_domain'] else np.nan
            item['viewed'] = 1
            rep[i] = item_emb_map[i]

        # Position of the event inside the session, and position of the session.
        item['seq_pos'] = seq
        item['seq_index'] = seq_index
        data.append(item)

    # Query vector: mean embedding of the viewed items, or the session's
    # search embedding when nothing was viewed.
    lrep = list(rep.values())
    if len(lrep) == 0:
        view_embedding_mean = embs_search_np[seq_index, :]
    else:
        view_embedding_mean = np.mean(lrep, axis=0)

    # NOTE(review): the ids returned by knnQuery are used directly as item_ids,
    # which presumes the index was built with item_ids as point ids — confirm.
    for neighbor in index.knnQuery(view_embedding_mean, k=k)[0]:
        item = {'event_info': neighbor, 'event_type': 'knn', **bought_info}
        item['item_title'] = item_title_map[neighbor]
        item['item_price'] = item_price_map[neighbor]
        item['item_domain'] = item_domain_map[neighbor]
        item['has_bought'] = int(bought == neighbor)
        item['pt'] = int('MLB' in item['item_domain']) if item['item_domain'] else np.nan
        # Candidates are not part of the history: no position, not viewed.
        item['seq_pos'] = -1
        item['seq_index'] = seq_index
        item['viewed'] = 0
        data.append(item)

df = pd.DataFrame(data)
# Free the large intermediates before writing the output files.
del data, embs_search_np
gc.collect()
# Drop the timezone so the timestamps are stored as naive datetimes.
df['event_timestamp'] = pd.to_datetime(df['event_timestamp']).dt.tz_localize(None)
df[df['event_type'] != 'search'].to_parquet("./data/22_train_view_melted.parquet",engine='fastparquet', compression=None)
df[df['event_type'] == 'search'].to_parquet("./data/22_train_search_melted.parquet",engine='fastparquet', compression=None)
df.head()

In [None]:
%%time
# Scratch version of the training melt: enriches every history event with the
# purchased-item and viewed-item attributes, without k-NN candidates and
# without the seq_pos / seq_index columns.
data = []
for hist, bought in tqdm.tqdm(train[['user_history', 'item_bought']].values):
    # The other cells decode user_history with json.loads; iterating the raw
    # string would yield single characters and fail on item['event_info'].
    # An already-decoded list is accepted as well.
    events = json.loads(hist) if isinstance(hist, str) else hist

    # Attributes of the purchased item are the same for the whole session.
    bought_info = {
        'bought_id': bought,
        'bought_title': item_title_map[bought],
        'bought_price': item_price_map[bought],
        'bought_domain': item_domain_map[bought],
    }

    for item in events:
        i = item['event_info']
        item.update(bought_info)

        if item['event_type'] == 'view':
            item['item_title'] = item_title_map[i]
            item['item_price'] = item_price_map[i]
            item['item_domain'] = item_domain_map[i]
            item['has_bought'] = int(bought == i)
            # 1/0 depending on whether the domain id contains 'MLB';
            # NaN when the domain is missing.
            item['pt'] = int('MLB' in item['item_domain']) if item['item_domain'] else np.nan

        # The per-event print() was removed: it emitted one line per event.
        # Inspect the collected records through a DataFrame preview instead.
        data.append(item)

In [None]:
# Preview the first rows of the records currently collected in `data`.
pd.DataFrame(data).head()

# test

## Estruturação dos dados adicionando info do item (join manual) e novas features

In [None]:
# Search embeddings for the test sessions; the next cell indexes them by
# session position as a fallback when a session has no view events.
# joblib artifacts are pickle-based: only load files from a trusted source.
embs_search_np = joblib.load("22a_embs_search_test_np.pkl.z")

In [None]:
# Melt the test sessions into one row per history event and append, for every
# session, `k` nearest-neighbour candidate rows (event_type == 'knn').
# Test sessions have no purchased item, so no bought_* / has_bought columns
# are produced here.
data = []
for seq_index, hist in enumerate(tqdm.tqdm(test['user_history'].values)):
    # item_id -> title embedding, one entry per distinct viewed item.
    rep = dict()
    for seq, item in enumerate(json.loads(hist)):
        i = item['event_info']

        # Only view events reference an item, so only they get item columns.
        if item['event_type'] == 'view':
            item['item_title'] = item_title_map[i]
            item['item_price'] = item_price_map[i]
            item['item_domain'] = item_domain_map[i]
            # 1/0 depending on whether the domain id contains 'MLB';
            # NaN when the domain is missing.
            item['pt'] = int('MLB' in item['item_domain']) if item['item_domain'] else np.nan
            item['viewed'] = 1
            rep[i] = item_emb_map[i]

        # Position of the event inside the session, and position of the session.
        item['seq_pos'] = seq
        item['seq_index'] = seq_index
        data.append(item)

    # Query vector: mean embedding of the viewed items, or the session's
    # search embedding when nothing was viewed.
    lrep = list(rep.values())
    if len(lrep) == 0:
        view_embedding_mean = embs_search_np[seq_index, :]
    else:
        view_embedding_mean = np.mean(lrep, axis=0)

    # NOTE(review): the ids returned by knnQuery are used directly as item_ids,
    # which presumes the index was built with item_ids as point ids — confirm.
    for neighbor in index.knnQuery(view_embedding_mean, k=k)[0]:
        item = {'event_info': neighbor, 'event_type': 'knn'}
        item['item_title'] = item_title_map[neighbor]
        item['item_price'] = item_price_map[neighbor]
        item['item_domain'] = item_domain_map[neighbor]
        item['pt'] = int('MLB' in item['item_domain']) if item['item_domain'] else np.nan
        # Candidates are not part of the history: no position, not viewed.
        item['seq_pos'] = -1
        item['seq_index'] = seq_index
        item['viewed'] = 0
        data.append(item)

df = pd.DataFrame(data)
# Free the large intermediates before writing the output files. Re-running
# this cell afterwards requires re-running the cells that load them.
del data, embs_search_np, embs_np, item_emb_map
gc.collect()
# Drop the timezone so the timestamps are stored as naive datetimes.
df['event_timestamp'] = pd.to_datetime(df['event_timestamp']).dt.tz_localize(None)
df[df['event_type'] != 'search'].to_parquet("./data/22_test_view_melted.parquet",engine='fastparquet', compression=None)
df[df['event_type'] == 'search'].to_parquet("./data/22_test_search_melted.parquet",engine='fastparquet', compression=None)
df.head()
