In [2]:
import pandas as pd
import numpy as np
import time
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [3]:
def jl_to_list(fname):
    output=[]
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [4]:
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  27.855225563049316


In [5]:
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  20.489689826965332


In [6]:
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

tiempo:  12.030776023864746
177070


In [7]:
#samples = 100000
#if samples:
#    rows = rows[:samples]

rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

330530 82633


In [8]:
metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

## BL_1: Items mas vendidos del dominio mas visitado
### a) Reviso el domino que mas miro un usuario
### b) Para ese domino busco los 10 items mas vendidos
### c) Si hay menos de 10 relleno al azar
### Score: 0.13451

In [22]:
# Diccionario de diccionarios: Para cada dominio tiene un diccionario con items, y dice cuatas ventas  tuvo ese item de ese domino.
ventas_x_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    for item in viewed:
        domain = metadata[item]['domain_id']
        ventas_x_dominio[domain][item] += 1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [23]:
# Devuelve una lista con todos los dominios que visito un usuario y cuantas veces visito cada uno.
def dominios_visitados(row, max_views=15):
    domains = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] +=1
    return domains

In [24]:
dominios_visitados(rows_train[1])

Counter({'MLB-PERFUMES': 14, 'MLB-HAIR_SHAMPOOS_AND_CONDITIONERS': 1})

In [25]:
# Le paso un dominio, y devuelve cuales son los K items mas vendidos del dominio que le paso.
def top_items(domain, k=10):
    top = ventas_x_dominio[domain]
    top = Counter(top)
    top = top.most_common(k)
    return [x[0] for x in top]

In [26]:
top_items('MLB-DRUM_BRAKE_WHEEL_CYLINDERS')

[896415,
 1232519,
 481256,
 1913520,
 1226818,
 1409684,
 1369096,
 1022320,
 2055221,
 1855874]

In [27]:
def top_by_best_domain(row, k=10):
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items,  k=k)
    domain = dominios_visitados(row)
    domain = domain.most_common(1)[0][0]
    #print(len(top_items(domain, k=k)))
    if len(top_items(domain, k=k)) < k:
        return random.choices(all_items,  k=k)
    return top_items(domain, k=k)

In [28]:
top_by_best_domain(rows_test[200])

[160527,
 179595,
 1557017,
 394243,
 1381571,
 324280,
 1178286,
 1022234,
 1620782,
 1977523]

In [15]:
y_pred = []
for row in tqdm_notebook(test_rows):
    recom = top_by_best_domain(row)
    y_pred.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [16]:
df_pred = pd.DataFrame(y_pred)  
#df_pred.to_csv("./submission/baseline1_submission.csv", index=False, header=False)

In [17]:
df_pred.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [18]:
len(df_pred)

177070

In [19]:
# Tomo del dataset de test cual es el que compro realmente, para computar el score de nuestro test.
y_true = [row['item_bought'] for row in rows_test]

In [None]:
from challenge_metric import ndcg_score
score = ndcg_score(y=true, y_pred, item_data, n_predictions=10)
print (f'El score es: {score}')

## BL_2: Ultimos Items Vistos
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, relleno al azar.
### Score: 0.20757379669640982 (relativo 33)


In [29]:
def last_viewed(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(all_items, k=k)
    return recom + relleno

In [30]:
last_viewed(test_rows[13])

[1837360,
 1589341,
 1487760,
 12458,
 1274340,
 1412866,
 1591130,
 34265,
 2084079,
 204920]

In [22]:
y_pred_2 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed(row)
    y_pred_2.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [23]:
t_0 = time.time()
df_pred_2 = pd.DataFrame(y_pred_2)  
#df_pred_2.to_csv("./submission/baseline2_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)

tiempo:  1.6090893745422363


In [24]:
len(df_pred_2)

177070

In [25]:
df_pred_2.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

## BL_3: Vistas Compras (Quienes vieron esto, compraron lo otro)
### Score: 

In [9]:
# Diccionario de vistas compras
vistas_compras = defaultdict(lambda: defaultdict(int))
for row in tqdm_notebook(rows_train):
    for ev in row['user_history']:
        if ev['event_type']=='view':
            vistas_compras[int(ev['event_info'])][int(row['item_bought'])] +=1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [10]:
#calculo de Scores
def get_item_score(row):
    item_scores = defaultdict(int)
    for ev in row['user_history']:
        if ev['event_type']=='view':
            for k,v in vistas_compras[int(ev['event_info'])].items():
                item_scores[k] += v
    return Counter(item_scores)

In [11]:
# Recomendador: Agarra los 10 items que mejor score tienen, y sino hay 10 rellena con random.
def vc_reco(row):
    reco = []
    scores = get_item_score(row)
    most_common = scores.most_common()
    for item, score in most_common:
        reco.append(item)
        if len(reco) == 10:
            return reco
    
    k=10-len(reco)
    relleno = random.choices(all_items, k=k)

    return reco + relleno


In [12]:
y_pred_3 = []
for row in tqdm_notebook(test_rows):
    recom = vc_reco(row)
    y_pred_3.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [20]:
t_0 = time.time()
df_pred_3 = pd.DataFrame(y_pred_3)  
df_pred_3.to_csv("./submission/baseline3_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_3))
print("")
print("Check de NAs")
print(df_pred_3.isnull().sum())

tiempo:  2.0694820880889893
longitud:  177070

Check de NAs
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64


## Ensamble_basico_1: Ultimos Items Vistos con relleno de los items mas vendidos
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, con los items mas vendidos del dominio mas visitado por el usuario
### Score: 0.2334158892435007

In [31]:
random.choices(top_by_best_domain(test_rows[13]), k=4)

[1896472, 1589341, 1112926, 889151]

In [32]:
def last_viewed_smart_filling(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(top_by_best_domain(row), k=k)
    #if len(relleno) < 10:
     #   print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [33]:
last_viewed_smart_filling(test_rows[1])

[849692,
 26324,
 48097,
 788705,
 831243,
 943786,
 1125393,
 1676401,
 353783,
 1007213]

In [34]:
y_pred_4 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed_smart_filling(row)
    y_pred_4.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [35]:
t_0 = time.time()
df_pred_4 = pd.DataFrame(y_pred_4)  
df_pred_4.to_csv("./submission/ensablB1_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_4))
print("")
print("Check de NAs")
print(df_pred_4.isnull().sum())

tiempo:  1.6895322799682617
longitud:  177070

Check de NAs
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64


## Ensamble_basico_2: Ultimos Items Vistos con relleno de los vistas_compra
### a) Lo mismo que antes pero con vc_reco
### Score: 0.22104

In [40]:
def last_viewed_smart_filling_2(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(vc_reco(row), k=k)
    #if len(relleno) > 7:
       #print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [41]:
last_viewed_smart_filling_2(test_rows[1])

[849692, 26324, 48097, 788705, 831243, 943786, 1125393, 849692, 1180937, 17614]

In [42]:
y_pred_5 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed_smart_filling_2(row)
    y_pred_5.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [43]:
t_0 = time.time()
df_pred_5 = pd.DataFrame(y_pred_5)  
df_pred_5.to_csv("./submission/ensablB2_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_5))
print("")
print("Check de NAs")
print(df_pred_5.isnull().sum())

tiempo:  1.6219103336334229
longitud:  177070

Check de NAs
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64


## Ensamble_basico_3: Ultimos Items Vistos con relleno de los items mas vendidos de la "categoria" mas visitada por el usuario
### Score: 0.23334950336442892

In [None]:
# Diccionario de diccionarios: Para cada Categoria tiene un diccionario con items, y dice cuatas ventas  tuvo ese item de ese Categoria
ventas_x_categoria = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    for item in viewed:
        domain = metadata[item]['category_id']
        ventas_x_categoria[domain][item] += 1

In [None]:
# Devuelve una lista con todas las categories que visito un usuario y cuantas veces visito cada uno.
def categorias_visitadas(row, max_views=15):
    categories = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        category = metadata[item]['category_id']
        categories[category] +=1
    return categories

In [None]:
categorias_visitadas(rows[1])

In [None]:
# Le paso una categoria, y devuelve cuales son los K items mas vendidos de la categoria que pase.
def top_items_cat(category, k=10):
    top = ventas_x_categoria[category]
    top = Counter(top)
    top = top.most_common(k)
    return [x[0] for x in top]

In [None]:
top_items_cat('MLB269588')

In [None]:
def top_by_best_category(row, k=10):
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items,  k=k)
    category = categorias_visitadas(row)
    category = category.most_common(1)[0][0]
    #print(len(top_items(domain, k=k)))
    if len(top_items_cat(category, k=k)) < k:
        return random.choices(all_items,  k=k)
    return top_items_cat(category, k=k)

In [None]:
top_by_best_category(rows[200])

In [None]:
def last_viewed_smart_filling_cat(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(top_by_best_category(row), k=k)
    #if len(relleno) < 10:
     #   print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [None]:
last_viewed_smart_filling_cat(rows[1])

In [None]:
y_pred_6 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed_smart_filling_cat(row)
    y_pred_6.append(recom)

In [None]:
t_0 = time.time()
df_pred_6 = pd.DataFrame(y_pred_6)  
#df_pred_6.to_csv("./submission/ensablB3_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_6))
print("")
print("Check de NAs")
print(df_pred_6.isnull().sum())

- Revisar si combinando los baselines, no puedo conbiar 3 para el caso de evitar lo mas posible los randoms.
- Hacer votar a cada baseline y elegir cuanto vale cada voto con algun algoritmo de ML.
- Usar estos baselines como generadores para un modelo tipo signal vs noise.