# Candidate Generator

In [1]:
#Import modules
import gzip
import json
import gc
import math
import random
import numpy as np
from collections import Counter, defaultdict
from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split

In [3]:
import pickle

with open("reindex.pkl", 'rb') as f:
    reindex = pickle.load(f)

## Carga de datos

In [4]:
#Auxiliary function
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [5]:
path = Path('data')

In [4]:
rows = jl_to_list(path/'train_dataset.jl.gz')
    
rows_train, rows_test= train_test_split(rows, test_size=0.2, random_state=42)

In [6]:
rows_train = jl_to_list(path/'train_dataset.jl.gz')
rows_test = jl_to_list(path/'test_dataset.jl.gz')

In [7]:
item_data = jl_to_list(path/'item_data.jl.gz')
metadata = {x['item_id']:x for x in item_data} #We create a dictionary to access easily access the item metadata
all_items = list(metadata.keys())

In [9]:
del rows

## Métrica de NDCG@10 máximo obtenible

Esta métrica indica el NCDG@10 máximo que se puede obtener con los candidatos generados

In [8]:
def custom_score(y_true, y_pred, metadata):
    ndcg_sum = 0
    idcg = 12 + sum([(1 / np.log2(1 + i)) for i in range(2, 11) ])
    for i in tqdm(range(len(y_pred))):
        dcg = 0
        domain = metadata[y_true[i]]['domain_id']  
        if y_true[i] in y_pred[i]:
            dcg += 12 
            domain_hits = len([x for x in y_pred[i] if metadata[x]['domain_id'] == domain]) - 1
            dcg += sum([(1 / np.log2(1 + i)) for i in range(2, min(domain_hits + 3, 11)) ])
        else:
            domain_hits = len([x for x in y_pred[i] if metadata[x]['domain_id'] == domain])
            dcg += sum([(1 / np.log2(1 + i)) for i in range(1, min(domain_hits + 1, 11))])
        ndcg_sum += (dcg / idcg)
    return ndcg_sum / len(y_pred)

## Funciones y cargas de diccionarios para los tres baselines

In [9]:
def last_viewed(row):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    for item in viewed:
        if item not in recom and reindex[item] != -1:
            recom.append(item)
    if len(recom) > 100:
        recom = recom[:100]
    return recom


In [10]:
views_purchases = defaultdict(lambda: defaultdict(int))
for row in tqdm(rows_train):
    for ev in row['user_history']:
        if ev['event_type']=='view':
            views_purchases[int(ev['event_info'])][int(row['item_bought'])]+=1

100%|██████████| 413163/413163 [00:19<00:00, 20858.01it/s]


In [11]:
def get_item_scores(row):
    """
    Given a user history (row) returns a counter of the items purchased 
    for the items viewed by the user
    """
    item_scores = defaultdict(int)
    for ev in row['user_history']:
        if ev['event_type']=='view':
            for k,v in views_purchases[int(ev['event_info'])].items():
                item_scores[k]+=v

    return Counter(item_scores)

In [12]:
def domains_visited(row, max_views=30):
    """
    For a given user story (row), returns a Counter 
    of the domains visited by the user.
    """
    
    domains = Counter()
    
    viewed = [ev['event_info'] for ev in row['user_history'] 
              if ev['event_type']=='view']
    
    if len(viewed) > max_views:
        viewed = viewed[:15]
        
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] += 1
        
    return domains

In [13]:
def top_items(domain):
    """
    Given the sales_x_domain info and a certain domain, 
    returns the top k selling items in that domain.
    """
    
    top = sales_x_domain[domain]
    top = Counter(top)
    top = top.most_common()
    return [x[0] for x in top]

In [14]:
sales_x_domain = defaultdict(lambda: defaultdict(int))

for row in tqdm(rows_train):
    #viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    items = [row['item_bought']]
    for item in items:
        domain = metadata[item]['domain_id']
        sales_x_domain[domain][item]+=1

100%|██████████| 413163/413163 [00:01<00:00, 288501.49it/s]


## Ensambles

Este ensamble al llegar a utilizar último baseline (seleccionar ítems más populares del dominio más visitado), va seleccionando en 'profundidad' (DF) los items por dominio. Es decir, comienza con el dominio más popular y selecciona sus ítems desde el más popular al menos popular. Solo cuando se acaban los ítems de ese dominio se continúa a seleccionar ítems del segundo dominio más popular y asi sucesivamente hasta completar las recomendaciones. Este approach fue el utilizado ya que localmente dio mejores resultados en relación al segundo método que se muestra a continuación.

In [15]:
def get_recs_df(row, l):
    # Baseline 1: últimos visitados
    recom = last_viewed(row)
    # Baseline 2: dict view - purchases
    scores = get_item_scores(row)
    most_common = scores.most_common()
    for item, score in most_common:
        if item not in recom and reindex[item] != -1:
            recom.append(item)
        if len(recom) == l:
            return recom
    # Baseline 3: dominio más popular
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items, k=l)
    most_common_domains = domains_visited(row).most_common()
    for domain, score in most_common_domains:
        for item in top_items(domain):
            if item not in recom and reindex[item] != -1:
                recom.append(item)
        if len(recom) == l:
            return recom
    # random 
    if len(recom) < l:
        k = l - len(recom)
        recom += random.choices(all_items, k=k)
    return recom


Este approach va seleccionando en 'amplitud' cuando se llega al baseline de dominio. Dado un valor de step, se toman los 5 top ítems del dominio más popular, despues los 5 ítems del segundo dominio más popular y asi sucesivamente hasta pasar por todos los dominios. Despues continua con los siguientes 5.

In [14]:
def get_recs_bf(row, step=10, n_domains=3):
    # Baseline 1: últimos visitados
    recom = last_viewed(row)
    # Baseline 2: dict view - purchases
    scores = get_item_scores(row)
    most_common = scores.most_common()
    for item, score in most_common:
        if item not in recom and reindex[item] != -1:
            recom.append(item)
        if len(recom) == 100:
            return recom
    # Baseline 3: dominio más popular
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items, k=100)
    most_common_domains = domains_visited(row).most_common(n_domains)
    for k in range(0, 100, step):
        for domain, score in most_common_domains:
            items = top_items(domain)
            if len(items) >= k + step:
                for item in items[k:k + step]:
                    if item not in recom and reindex[item] != -1:
                        recom.append(item)
            elif len(items) >= k + 5 and len(items) >= k:
                for item in items[k:]:
                    if item not in recom and reindex[item] != -1:
                        recom.append(item)
            if len(recom) == 100:
                return recom
    # random
    if len(recom) < 100:
        k = 100 - len(recom)
        recom += random.choices(all_items, k=k)
    return recom

### Generación de recomendaciones

In [16]:
y_pred = []
for row in tqdm(rows_test):
    recom = get_recs_df(row, 20)
    y_pred.append(recom)

100%|██████████| 177070/177070 [16:25<00:00, 179.61it/s]


### Scoring local

In [16]:
y_true = [row['item_bought'] for row in rows_test]

In [17]:
#score = ndcg_score(y_true, y_pred1, item_data,n_predictions=20)
score = custom_score(y_true, y_pred, metadata)
print(f'Your score is: {score}')

100%|██████████| 82633/82633 [00:19<00:00, 4161.52it/s]Your score is: 0.4191720015167095



### Exportación de recomendaciones para envíar al ranker

In [17]:
import pickle

In [18]:
with open('y_pred_df_20_reindex.pkl', 'wb') as f:
    pickle.dump(y_pred, f)

In [19]:
len(y_pred)

177070