In [1]:
import pandas as pd
import numpy as np
import time
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [2]:
def jl_to_list(fname):
    """Read a gzip-compressed JSON Lines file into a list.

    Args:
        fname: Path to a gzip file containing one JSON document per line.

    Returns:
        A list with one decoded JSON value per line, in file order.
    """
    with gzip.open(fname, 'rb') as handle:
        # Iterating the binary handle yields one line at a time;
        # json.loads accepts the raw bytes directly.
        return [json.loads(raw_line) for raw_line in handle]

In [3]:
# Load the training dataset (gzip JSON Lines) into `rows` and print the
# wall-clock seconds the load took.
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  29.823493242263794


In [4]:
# Load the item catalogue (gzip JSON Lines) into `item_data` and print the
# wall-clock seconds the load took.
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  21.753265142440796


In [20]:
# Load the test dataset file into `test_rows` (distinct from `rows_test`, the
# hold-out split taken from the training rows), then print the load time in
# seconds and the number of rows read.
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

tiempo:  11.645015001296997
177070


In [5]:
# Optional subsampling of `rows` for faster experimentation (currently disabled):
#samples = 100000
#if samples:
#    rows = rows[:samples]

# Split the training rows 80/20 into a fitting set and a local hold-out set;
# random_state is fixed so the split is reproducible.
rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

330530 82633


In [6]:
# Index the item catalogue by item_id so each item's record can be looked up
# directly, and keep the full list of known item ids for random fallbacks.
metadata = dict((record['item_id'], record) for record in item_data)
all_items = list(metadata)

## Items más vendidos del dominio más visitado
### Reviso el dominio que más miró el usuario y tomo los 10 items que más se venden en ese dominio.

In [7]:
# Nested mapping: domain_id -> item_id -> count.
# NOTE: despite the name ("sales per domain"), the count is incremented once
# per 'view' event found in the user histories of the training rows.
ventas_x_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    for ev in row['user_history']:
        if ev['event_type'] != 'view':
            continue
        item_id = ev['event_info']
        ventas_x_dominio[metadata[item_id]['domain_id']][item_id] += 1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [8]:
# Returns a Counter with every domain the user visited and how many of the
# considered views fell in each one.
def dominios_visitados(row, max_views=15):
    """Count the domains of the items viewed in a row's user history.

    Args:
        row: Mapping with a 'user_history' list of events; each event has an
            'event_type' and an 'event_info' (the item id for 'view' events).
        max_views: Only the first `max_views` view events (in history order)
            are counted. Pass None to count every view.

    Returns:
        collections.Counter mapping domain_id -> number of counted views.
        Empty when the history has no 'view' events.

    Raises:
        KeyError: if a viewed item id is missing from the global `metadata`.
    """
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    # Honour the max_views argument (the cut-off used to be hard-coded to 15,
    # so passing a different value had no effect).
    return Counter(metadata[item]['domain_id'] for item in viewed[:max_views])

In [9]:
# Sanity check: domain counts for a single training row.
dominios_visitados(rows_train[1])

Counter({'MLB-PERFUMES': 14, 'MLB-HAIR_SHAMPOOS_AND_CONDITIONERS': 1})

In [10]:
# Given a domain, returns the k items with the highest count recorded for it
# in ventas_x_dominio.
def top_items(domain, k=10):
    """Return up to `k` item ids of `domain`, highest count first.

    Args:
        domain: domain_id key into the global `ventas_x_dominio` mapping.
        k: Maximum number of item ids to return.

    Returns:
        List of item ids ordered by descending count; shorter than `k` when
        the domain has fewer items recorded.
    """
    ranking = Counter(ventas_x_dominio[domain]).most_common(k)
    return [item_id for item_id, _count in ranking]

In [11]:
# Sanity check: top items recorded for one example domain.
top_items('MLB-DRUM_BRAKE_WHEEL_CYLINDERS')

[896415,
 1232519,
 481256,
 1913520,
 1226818,
 1409684,
 1369096,
 1022320,
 2055221,
 1855874]

In [12]:
def top_by_best_domain(row, k=10):
    """Recommend `k` items taken from the domain the user viewed the most.

    The most frequent domain among the row's views (as counted by
    `dominios_visitados`) is looked up with `top_items`. If the user has no
    views, or the domain has fewer than `k` items, the remaining slots are
    filled with distinct random items drawn from `all_items`.

    Args:
        row: Mapping with a 'user_history' list of events.
        k: Number of recommendations to return.

    Returns:
        List of item ids without duplicates. It has `k` entries unless
        `all_items` holds fewer than `k` ids. Domain items, when present,
        come first in ranking order.
    """
    domains = dominios_visitados(row)
    # An empty Counter means the history has no 'view' events.
    recom = top_items(domains.most_common(1)[0][0], k=k) if domains else []

    missing = k - len(recom)
    if missing > 0:
        already = set(recom)
        # random.sample draws without replacement, so the filler cannot repeat
        # an item. Sampling k candidates guarantees at least `missing` of them
        # survive the filter, since at most len(recom) can collide.
        candidates = random.sample(all_items, min(k, len(all_items)))
        recom = recom + [item for item in candidates if item not in already][:missing]
    return recom

In [13]:
# Sanity check: recommendations for a single row of the local hold-out split.
top_by_best_domain(rows_test[200])

[160527,
 179595,
 1557017,
 394243,
 1381571,
 324280,
 1178286,
 1022234,
 1620782,
 1977523]

In [16]:
# Build one recommendation list per row of `test_rows` (the rows loaded from
# the test dataset file), showing a progress bar while iterating.
y_pred = [top_by_best_domain(row) for row in tqdm_notebook(test_rows)]

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [17]:
# One row per prediction list, one column per recommendation slot.
df_pred = pd.DataFrame(y_pred)
# Create the output folder if needed: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs("./submission", exist_ok=True)
# Written without header or index: the file holds only the item ids.
df_pred.to_csv("./submission/baseline1_submission.csv", index=False, header=False)

In [18]:
# Count missing values per recommendation slot (column) of the predictions frame.
df_pred.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [19]:
# Number of prediction rows; should match the number of rows that were scored.
len(df_pred)

82633

In [54]:
# Take the item actually bought in each row of the local hold-out split
# (`rows_test`), to compute the score on our own test set.
y_true = [row['item_bought'] for row in rows_test]

In [None]:
from challenge_metric import ndcg_score

# Score the baseline on the local hold-out split. `y_true` comes from
# `rows_test`, so the predictions must be computed for those same rows;
# `y_pred` is built from `test_rows`, a different dataset with a different
# length.
y_pred_holdout = [top_by_best_domain(row) for row in rows_test]

# The previous call `ndcg_score(y=true, y_pred, ...)` could not run: a
# positional argument after a keyword argument is a SyntaxError, and `true`
# is not defined.
# NOTE(review): assumes ndcg_score takes (y_true, y_pred, item_data)
# positionally - confirm against challenge_metric.
score = ndcg_score(y_true, y_pred_holdout, item_data, n_predictions=10)
print (f'El score es: {score}')
print (f'El score es: {score}')