In [11]:
import pandas as pd
import numpy as np
import time
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df
from challenge_metric import ndcg_score

In [2]:
def jl_to_list(fname):
    output=[]
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [3]:
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  27.493866682052612


In [4]:
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  21.31541895866394


In [5]:
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

tiempo:  13.160120964050293
177070


In [6]:
#samples = 1000
#if samples:
#    rows = rows[:samples]

rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

330530 82633


In [9]:
metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

In [12]:
### Local Prediction ###
def local_prediction(y_pred):
    # Tomo del dataset de test cual es el que compro realmente, para computar el score de nuestro test.
    y_true = [row['item_bought'] for row in rows_test]
    print(len(y_true), len(y_pred))    
    local_score = ndcg_score(y_true, y_pred, item_data,n_predictions=10)
    print (f'El score local es: {local_score}')

## BL_1: Items mas vendidos del dominio mas visitado
### a) Reviso el domino que mas miro un usuario
### b) Para ese domino busco los 10 items mas vendidos
### c) Si hay menos de 10 relleno al azar
### Score: 0.13451

In [27]:
# Diccionario de diccionarios: Para cada dominio tiene un diccionario con items, y dice cuatas ventas  tuvo ese item de ese domino.
ventas_x_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    for item in viewed:
        domain = metadata[item]['domain_id']
        ventas_x_dominio[domain][item] += 1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [25]:
# Devuelve una lista con todos los dominios que visito un usuario y cuantas veces visito cada uno.
def dominios_visitados(row, max_views=15):
    domains = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] +=1
    return domains

In [12]:
dominios_visitados(rows_train[1])

Counter({'MLB-DOLLS': 13})

In [24]:
# Le paso un dominio, y devuelve cuales son los K items mas vendidos del dominio que le paso.
def top_items(domain, k=10):
    top = ventas_x_dominio[domain]
    top = Counter(top)
    top = top.most_common(k)
    return [x[0] for x in top]

In [14]:
top_items('MLB-DRUM_BRAKE_WHEEL_CYLINDERS')

[]

In [23]:
def top_by_best_domain(row, k=10):
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items,  k=k)
    domain = dominios_visitados(row)
    domain = domain.most_common(1)[0][0]
    #print(len(top_items(domain, k=k)))
    if len(top_items(domain, k=k)) < k:
        return random.choices(all_items,  k=k)
    return top_items(domain, k=k)

In [16]:
top_by_best_domain(rows_test[10])

[1045327,
 1343230,
 1086366,
 515702,
 723460,
 328824,
 371202,
 1241717,
 1253408,
 614475]

In [17]:
y_pred = []
# !!!
#"rows_test" for local score and "test_rows" for remote
for row in tqdm_notebook(rows_test):
    recom = top_by_best_domain(row)
    y_pred.append(recom)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [18]:
df_pred = pd.DataFrame(y_pred)  
#df_pred.to_csv("./submission/baseline1_submission.csv", index=False, header=False)

In [19]:
#df_pred.isnull().sum()

In [22]:
local_prediction(y_pred)

200 200
El score local es: 0.04777606090887466


## BL_2: Ultimos Items Vistos
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, relleno al azar.
### Score: 0.20757379669640982 (relativo 33)


In [13]:
def last_viewed(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(all_items, k=k)
    return recom + relleno

In [14]:
last_viewed(test_rows[13])

[1837360,
 1589341,
 656661,
 690599,
 1774737,
 1233082,
 2026863,
 1258821,
 1314761,
 1947479]

In [16]:
y_pred_2 = []
# !!!
#"rows_test" for local score and "test_rows" for remote
for row in tqdm_notebook(rows_test):
    recom = last_viewed(row)
    y_pred_2.append(recom)

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [17]:
t_0 = time.time()
df_pred_2 = pd.DataFrame(y_pred_2)  
#df_pred_2.to_csv("./submission/baseline2_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.5067980289459229


In [18]:
local_prediction(y_pred_2)

82633 82633
El score local es: 0.2088520215041085


## BL_3: Vistas Compras (Quienes vieron esto, compraron lo otro)
### Score: 

In [9]:
# Diccionario de vistas compras
vistas_compras = defaultdict(lambda: defaultdict(int))
for row in tqdm_notebook(rows_train):
    for ev in row['user_history']:
        if ev['event_type']=='view':
            vistas_compras[int(ev['event_info'])][int(row['item_bought'])] +=1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [10]:
#calculo de Scores
def get_item_score(row):
    item_scores = defaultdict(int)
    for ev in row['user_history']:
        if ev['event_type']=='view':
            for k,v in vistas_compras[int(ev['event_info'])].items():
                item_scores[k] += v
    return Counter(item_scores)

In [11]:
# Recomendador: Agarra los 10 items que mejor score tienen, y sino hay 10 rellena con random.
def vc_reco(row):
    reco = []
    scores = get_item_score(row)
    most_common = scores.most_common()
    for item, score in most_common:
        reco.append(item)
        if len(reco) == 10:
            return reco
    
    k=10-len(reco)
    relleno = random.choices(all_items, k=k)

    return reco + relleno


In [12]:
y_pred_3 = []
for row in tqdm_notebook(test_rows):
    recom = vc_reco(row)
    y_pred_3.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [20]:
t_0 = time.time()
df_pred_3 = pd.DataFrame(y_pred_3)  
df_pred_3.to_csv("./submission/baseline3_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_3))
print("")
print("Check de NAs")
print(df_pred_3.isnull().sum())

tiempo:  2.0694820880889893
longitud:  177070

Check de NAs
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64


## Ensamble_basico_1: Ultimos Items Vistos con relleno de los items mas vendidos
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, con los items mas vendidos del dominio mas visitado por el usuario
### Score: 0.2334158892435007

In [28]:
random.choices(top_by_best_domain(test_rows[13]), k=4)

[1589341, 889151, 225360, 1891622]

In [29]:
def last_viewed_smart_filling(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(top_by_best_domain(row), k=k)
    #if len(relleno) < 10:
     #   print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [30]:
last_viewed_smart_filling(test_rows[1])

[849692,
 26324,
 48097,
 788705,
 831243,
 943786,
 1125393,
 1007213,
 1007213,
 1761121]

In [31]:
y_pred_4 = []
# !!!
#"rows_test" for local score and "test_rows" for remote
for row in tqdm_notebook(rows_test):
    recom = last_viewed_smart_filling(row)
    y_pred_4.append(recom)

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [32]:
df_pred_4 = pd.DataFrame(y_pred_4)  
#df_pred_4.to_csv("./submission/ensablB1_submission.csv", index=False, header=False)
#print(df_pred_4.isnull().sum())

In [33]:
local_prediction(y_pred_4)

82633 82633
El score local es: 0.23528769806441457


## Ensamble_basico_2: Ultimos Items Vistos con relleno de los vistas_compra
### a) Lo mismo que antes pero con vc_reco
### Score: 0.22104

In [40]:
def last_viewed_smart_filling_2(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(vc_reco(row), k=k)
    #if len(relleno) > 7:
       #print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [41]:
last_viewed_smart_filling_2(test_rows[1])

[849692, 26324, 48097, 788705, 831243, 943786, 1125393, 849692, 1180937, 17614]

In [42]:
y_pred_5 = []
for row in tqdm_notebook(test_rows):
    recom = last_viewed_smart_filling_2(row)
    y_pred_5.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [43]:
t_0 = time.time()
df_pred_5 = pd.DataFrame(y_pred_5)  
df_pred_5.to_csv("./submission/ensablB2_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_5))
print("")
print("Check de NAs")
print(df_pred_5.isnull().sum())

tiempo:  1.6219103336334229
longitud:  177070

Check de NAs
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64


- Revisar si combinando los baselines, no puedo conbiar 3 para el caso de evitar lo mas posible los randoms.
- Hacer votar a cada baseline y elegir cuanto vale cada voto con algun algoritmo de ML.
- Usar estos baselines como generadores para un modelo tipo signal vs noise.