In [1]:
import pandas as pd
import numpy as np
import time
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [2]:
def jl_to_list(fname):
    """Load a gzip-compressed JSON Lines file into a list.

    Parameters
    ----------
    fname : str or path-like
        Path to a gzip file containing one JSON document per line.

    Returns
    -------
    list
        The decoded documents, in file order. Blank lines are ignored.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            # A blank line (e.g. a stray newline at the end of the file) is
            # not a JSON document; skip it instead of letting json.loads raise.
            if not line.strip():
                continue
            output.append(json.loads(line))
    return output

In [3]:
# Load train_dataset.jl.gz into `rows` and print the elapsed wall-clock seconds.
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  28.953083753585815


In [4]:
# Load item_data.jl.gz into `item_data` and print the elapsed wall-clock seconds.
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  21.188710927963257


In [5]:
# Load test_dataset.jl.gz into `test_rows`, then print the elapsed wall-clock
# seconds and the number of rows read.
# NOTE: `test_rows` (this file) is a different object from `rows_test`
# (the hold-out split carved out of `rows` further down).
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

tiempo:  13.834031343460083
177070


In [6]:
# Optional: uncomment the three lines below to work on a subsample of `rows`.
#samples = 100000
#if samples:
#    rows = rows[:samples]

# 80/20 split of `rows`; random_state is fixed so the split is reproducible.
# `rows_test` is the local hold-out, not the `test_rows` loaded from test_dataset.jl.gz.
rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

330530 82633


In [7]:
# Index the item records by their 'item_id' so an item's fields can be looked up directly.
metadata = {x['item_id']:x for x in item_data}
# Every known item id; used below as the pool for random filler recommendations.
all_items = list(metadata.keys())

## BL_1: Items mas vendidos del dominio mas visitado
### a) Reviso el dominio que mas miro un usuario
### b) Para ese dominio busco los 10 items mas vendidos
### c) Si hay menos de 10 relleno al azar
### Score: 0.13451

In [8]:
# Dictionary of dictionaries: for every domain, maps item -> number of times
# that item was tallied within the domain.
# NOTE(review): despite the "ventas" (sales) name, the tally below is
# incremented once per 'view' event in the training histories, not per
# purchase -- confirm which one the baseline is meant to rank by.
ventas_x_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    for ev in row['user_history']:
        # Only 'view' events carry an item id in 'event_info' that we tally.
        if ev['event_type'] != 'view':
            continue
        item = ev['event_info']
        domain = metadata[item]['domain_id']
        ventas_x_dominio[domain][item] += 1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [9]:
def dominios_visitados(row, max_views=15):
    """Count the domains of the items a user viewed.

    Only the first ``max_views`` 'view' events, in the order they appear in
    ``row['user_history']``, are taken into account.

    Parameters
    ----------
    row : dict
        A record with a 'user_history' list of events; each event has an
        'event_type' and an 'event_info' (the item id for 'view' events).
    max_views : int, optional
        Maximum number of view events to consider (default 15).

    Returns
    -------
    collections.Counter
        Maps domain_id -> number of considered views in that domain.
        Empty when the row has no view events.

    Raises
    ------
    KeyError
        If a viewed item id is not present in the global ``metadata``.
    """
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    # Truncate with the parameter itself; the slice used to be hard-coded to
    # 15, so any other max_views value was silently ignored.
    return Counter(metadata[item]['domain_id'] for item in viewed[:max_views])

In [10]:
# Spot check: domain counts for one training row.
dominios_visitados(rows_train[1])

Counter({'MLB-PERFUMES': 14, 'MLB-HAIR_SHAMPOOS_AND_CONDITIONERS': 1})

In [11]:
def top_items(domain, k=10):
    """Return the ``k`` items with the highest tally in ``domain``.

    Parameters
    ----------
    domain : hashable
        Domain id used as key in the global ``ventas_x_dominio``.
    k : int, optional
        Maximum number of items to return (default 10).

    Returns
    -------
    list
        Item ids ordered from highest to lowest tally. Shorter than ``k``
        when the domain has fewer items, and empty for an unknown domain.
    """
    # Use .get() rather than indexing: ventas_x_dominio is a defaultdict, so
    # ventas_x_dominio[domain] would insert an empty entry for every unknown
    # domain that is merely looked up.
    counts = ventas_x_dominio.get(domain)
    if not counts:
        return []
    return [item for item, _ in Counter(counts).most_common(k)]

In [12]:
# Spot check: top items for one domain.
top_items('MLB-DRUM_BRAKE_WHEEL_CYLINDERS')

[896415,
 1232519,
 481256,
 1913520,
 1226818,
 1409684,
 1369096,
 1022320,
 2055221,
 1855874]

In [13]:
def top_by_best_domain(row, k=10):
    """Recommend the top items of the domain the user viewed the most.

    The user's most visited domain is taken from ``dominios_visitados(row)``
    and its highest-tally items from ``top_items``. If that yields fewer than
    ``k`` items (or the user has no views at all), the remaining slots are
    filled with distinct random items from the global ``all_items``.

    Parameters
    ----------
    row : dict
        A record with a 'user_history' list of events.
    k : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    list
        ``k`` item ids without repetitions (fewer only if ``all_items``
        itself holds fewer than ``k`` items).
    """
    domains = dominios_visitados(row)
    # No view events -> no domain to rank by; every slot becomes filler.
    # top_items is computed once (it used to be evaluated twice per row).
    recom = top_items(domains.most_common(1)[0][0], k=k) if domains else []

    missing = k - len(recom)
    if missing <= 0:
        return recom[:k]

    # Keep the ranked items and only fill the gap. Previously a domain with
    # fewer than k items had its ranking discarded in favour of an all-random
    # list drawn with replacement, which could contain duplicates.
    taken = set(recom)
    # Drawing k distinct candidates is enough: at most len(recom) of them can
    # collide with the ranked items, leaving at least `missing` usable ones.
    pool = random.sample(all_items, min(k, len(all_items)))
    filler = [item for item in pool if item not in taken][:missing]
    return recom + filler

In [14]:
# Spot check: recommendations for one row of the local hold-out split.
top_by_best_domain(rows_test[200])

[160527,
 179595,
 1557017,
 394243,
 1381571,
 324280,
 1178286,
 1022234,
 1620782,
 1977523]

In [15]:
# Baseline 1 predictions for every row of `test_rows` (the data loaded from
# test_dataset.jl.gz), in the same order as the rows.
y_pred = []
for row in tqdm_notebook(test_rows):
    recom = top_by_best_domain(row)
    y_pred.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [16]:
# One row per prediction list, one column per recommended position.
df_pred = pd.DataFrame(y_pred)  
# Uncomment to write the submission file (no index, no header).
#df_pred.to_csv("./submission/baseline1_submission.csv", index=False, header=False)

In [17]:
# Sanity check: a NaN in any column would mean some row got a shorter list.
df_pred.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [18]:
# Sanity check: expect one prediction row per entry of `test_rows`.
len(df_pred)

177070

In [19]:
# Ground truth for local scoring: the item actually bought in each row of the
# hold-out split `rows_test`.
# NOTE(review): `y_pred` above was built from `test_rows`, not `rows_test`;
# predictions must be generated for `rows_test` before comparing with y_true.
y_true = [row['item_bought'] for row in rows_test]

In [None]:
from challenge_metric import ndcg_score

# Score baseline 1 on the local hold-out split. The predictions are generated
# here for `rows_test` so they line up one-to-one with `y_true`; `y_pred` was
# built from `test_rows` (177070 rows) and cannot be compared against the
# 82633 hold-out labels.
y_pred_val = [top_by_best_domain(row) for row in rows_test]
# The previous call `ndcg_score(y=true, y_pred, ...)` was a SyntaxError
# (positional argument after a keyword argument) and referenced an undefined
# name `true`.
# NOTE(review): assumes ndcg_score takes (y_true, y_pred, item_data) in this
# positional order -- confirm against challenge_metric.
score = ndcg_score(y_true, y_pred_val, item_data, n_predictions=10)
print (f'El score es: {score}')

## BL_2: Ultimos Items Vistos
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, relleno al azar.
### Score: 0.20757379669640982 (relativo 33)


In [15]:
def last_viewed(row, k=10):
    """Recommend the items the user viewed last, padded with random items.

    View events are ordered by 'event_timestamp' descending (assumes the
    timestamps compare chronologically), repeated items are dropped keeping
    their first position in that order, and the first ``k`` are kept. Any
    remaining slots are filled with distinct random items from the global
    ``all_items`` that are not already recommended.

    Parameters
    ----------
    row : dict
        A record with a 'user_history' list of events; view events carry the
        item id in 'event_info' and a sortable 'event_timestamp'.
    k : int, optional
        Number of recommendations to return (default 10). The value used to
        be ignored and hard-coded to 10.

    Returns
    -------
    list
        ``k`` item ids without repetitions (fewer only if ``all_items``
        cannot provide enough filler).
    """
    views = [ev for ev in row['user_history'] if ev['event_type']=='view']
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)
    # dict.fromkeys de-duplicates in linear time while preserving order.
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]

    missing = k - len(recom)
    if missing <= 0:
        return recom

    # Fill without repetition: random.choices draws with replacement and could
    # repeat a filler item or an item the user already viewed.
    taken = set(recom)
    # k distinct candidates always contain at least `missing` unused ones,
    # since at most len(recom) of them can collide with the viewed items.
    pool = random.sample(all_items, min(k, len(all_items)))
    return recom + [item for item in pool if item not in taken][:missing]

In [16]:
# Spot check: last-viewed recommendations for one row of `test_rows`.
last_viewed(test_rows[13])

[1837360,
 1589341,
 97061,
 1930888,
 612730,
 1188730,
 1919088,
 1762125,
 1958709,
 871242]

In [22]:
# Baseline 2 predictions for every row of `test_rows`, in row order.
y_pred_2 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed(row)
    y_pred_2.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [23]:
# Build the baseline 2 prediction frame and print the elapsed wall-clock seconds.
t_0 = time.time()
df_pred_2 = pd.DataFrame(y_pred_2)  
# Uncomment to write the submission file (no index, no header).
#df_pred_2.to_csv("./submission/baseline2_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)

tiempo:  1.6090893745422363


In [24]:
# Sanity check: expect one prediction row per entry of `test_rows`.
len(df_pred_2)

177070

In [25]:
# Sanity check: a NaN in any column would mean some row got a shorter list.
df_pred_2.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

## BL_3: Ultimos Items Vistos con relleno de los items mas vendidos
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, relleno con los items mas vendidos del dominio mas visitado por el usuario
### Score: 

In [17]:
# Experiment: draw 4 filler items from the top-domain recommendations.
# NOTE: random.choices samples WITH replacement, so the same item can be
# drawn more than once (as the output below shows).
random.choices(top_by_best_domain(test_rows[13]), k=4)

[225360, 225360, 1891622, 1891622]

In [46]:
def last_viewed_smart_filling(row, k=10):
    """Recommend the last viewed items, padded with top items of the user's domain.

    View events are ordered by 'event_timestamp' descending (assumes the
    timestamps compare chronologically), repeated items are dropped keeping
    their first position in that order, and the first ``k`` are kept. Any
    remaining slots are filled with the items returned by
    ``top_by_best_domain(row, k=k)``, in the order returned and skipping
    items already recommended.

    Parameters
    ----------
    row : dict
        A record with a 'user_history' list of events; view events carry the
        item id in 'event_info' and a sortable 'event_timestamp'.
    k : int, optional
        Number of recommendations to return (default 10). The value used to
        be ignored and hard-coded to 10.

    Returns
    -------
    list
        ``k`` item ids without repetitions (fewer only if neither the domain
        ranking nor ``all_items`` can provide enough filler).
    """
    views = [ev for ev in row['user_history'] if ev['event_type']=='view']
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)
    # dict.fromkeys de-duplicates in linear time while preserving order.
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]
    if len(recom) >= k:
        return recom

    taken = set(recom)
    # Walk the domain ranking in order instead of sampling it with
    # random.choices: sampling with replacement produced duplicate filler,
    # could repeat items the user had already viewed, and ignored the ranking.
    for item in top_by_best_domain(row, k=k):
        if len(recom) == k:
            break
        if item not in taken:
            taken.add(item)
            recom.append(item)

    # Safety net: if the ranking overlapped too much with the viewed items,
    # top up with distinct random items so the list still has k entries.
    missing = k - len(recom)
    if missing > 0:
        pool = random.sample(all_items, min(k, len(all_items)))
        recom += [item for item in pool if item not in taken][:missing]
    return recom

In [48]:
# Spot check: smart-filling recommendations for one row of `test_rows`.
last_viewed_smart_filling(test_rows[1])

[849692,
 26324,
 48097,
 788705,
 831243,
 943786,
 1125393,
 1761121,
 361733,
 1875533]

In [52]:
# Baseline 3 predictions for every row of `test_rows`, in row order.
y_pred_3 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed_smart_filling(row)
    y_pred_3.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [53]:
# Build the baseline 3 prediction frame, write it as the submission CSV
# (no index, no header) and print the elapsed wall-clock seconds.
# NOTE: to_csv does not create directories; ./submission/ must already exist.
t_0 = time.time()
df_pred_3 = pd.DataFrame(y_pred_3)  
df_pred_3.to_csv("./submission/baseline3_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)

tiempo:  1.906029462814331


In [54]:
# Sanity check: expect one prediction row per entry of `test_rows`.
len(df_pred_3)

177070

In [56]:
# Sanity check: a NaN in any column would mean some row got a shorter list.
df_pred_3.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64