In [1]:
import pandas as pd
import numpy as np
import re
import time
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df
from challenge_metric import ndcg_score

In [2]:
def jl_to_list(fname):
    output=[]
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [3]:
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  27.39821171760559


In [4]:
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  20.220482349395752


In [5]:
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

tiempo:  12.461353778839111
177070


In [5]:
#samples = 100000
#if samples:
#    rows = rows[:samples]

rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

330530 82633


In [6]:
metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

In [7]:
#Creo lista de items por PAIS
items_Brasil = [d for d in item_data if re.search('MLB.+', str(d['domain_id']))]
items_Mexico = [d for d in item_data if re.search('MLM.+', str(d['domain_id']))]

In [8]:
metadata_brasil = {x['item_id']:x for x in items_Brasil}
metadata_mexico = {x['item_id']:x for x in items_Mexico}
all_items_brasil = list(metadata_brasil.keys())
all_items_mexico = list(metadata_mexico.keys())

In [18]:
### Local Prediction ###
def local_prediction(y_pred):
    # Tomo del dataset de test cual es el que compro realmente, para computar el score de nuestro test.
    y_true = [row['item_bought'] for row in rows_test]
    print(len(y_true), len(y_pred))    
    local_score = ndcg_score(y_true, y_pred, item_data,n_predictions=10)
    print (f'El score local es: {local_score}')

In [17]:
# Diccionario de diccionarios: Para cada dominio tiene un diccionario con items, y dice cuatas ventas  tuvo ese item de ese domino.
ventas_x_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    for item in viewed:
        domain = metadata[item]['domain_id']
        ventas_x_dominio[domain][item] += 1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [18]:
# Devuelve una lista con todos los dominios que visito un usuario y cuantas veces visito cada uno.
def dominios_visitados(row, max_views=15):
    domains = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] +=1
    return domains

In [19]:
# Devuelve el pais del dominio mas visitado
def dominios_pais(domain):
    if re.search('MLB.+', str(domain)):
        return "Brasil"
    elif re.search('MLM.+', str(domain)):
        return "Mexico"  

In [20]:
dominios_pais('MLM-EARRINGS')

'Mexico'

In [21]:
# Le paso un dominio, y devuelve cuales son los K items mas vendidos del dominio que le paso.
def top_items(domain, k=10):
    top = ventas_x_dominio[domain]
    top = Counter(top)
    top = top.most_common(k)
    return [x[0] for x in top]

In [22]:
top_items('MLB-STYLUSES')

[327390,
 853217,
 1227769,
 437622,
 1894243,
 676375,
 1142694,
 550317,
 193913,
 1649032]

In [30]:
## En vez de all_items tengo que pasar el del pais correspondiente
def top_by_best_domain(row, k=10):
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items,  k=k)
    domain = dominios_visitados(row)
    domain = domain.most_common(1)[0][0]
    domain_country = dominios_pais(domain)
    #print(domain_country)
    tItems = top_items(domain, k=k)
    if len(tItems) < k:
            filling = random.choices(all_items,  k=k-len(tItems))
            tItemsWithFilling = tItems + filling
            return tItemsWithFilling
    return top_items(domain, k=k)

In [31]:
top_items('MLB-STYLUSES', k=10)

[327390,
 853217,
 1227769,
 437622,
 1894243,
 676375,
 1142694,
 550317,
 193913,
 1649032]

In [32]:
top_by_best_domain(rows_train[2], k=10)

[144417,
 1246365,
 734383,
 1178261,
 828545,
 1091806,
 174220,
 2091992,
 722431,
 1028116]

## Modelito 1: Ultimos Items Vistos con relleno de los items mas vendidos del mismo pais
### a) Tomo los items que miro el usuario.
### b) Selecciono los ultimos 10.
### c) Si hay menos de 10, con los items mas vendidos del dominio mas visitado por el usuario
### Score: 0.2334158892435007
### El score local es: 0.23514438282397668
### El score local es: 0.23525713095693013 (filling en top_by_best_domain)
### Score: 0.23344948510371125

In [33]:
def last_viewed_smart_filling(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(top_by_best_domain(row), k=k)
    #if len(relleno) < 10:
     #   print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [34]:
y_pred_m1 = []
# !!!
#"rows_test" for local score and "test_rows" for remote
for row in tqdm_notebook(test_rows):
    recom = last_viewed_smart_filling(row)
    y_pred_m1.append(recom)

HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [35]:
df_pred_m1 = pd.DataFrame(y_pred_m1)  
df_pred_m1.to_csv("./submission/modelito1.1_submission.csv", index=False, header=False)
print(df_pred_m1.isnull().sum())

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64


In [42]:
local_prediction(y_pred_m1)

82633 82633
El score local es: 0.2352399149614516


## Modelito 2: Ultimos Items Vistos con relleno de los items mas vendidos de la "categoria" mas visitada por el usuario
### Revisar el filling
### Score: 0.23334950336442892
### El score local es: 0.23511979632660793

In [7]:
# Diccionario de diccionarios: Para cada Categoria tiene un diccionario con items, y dice cuatas ventas  tuvo ese item de ese Categoria
ventas_x_categoria = defaultdict(lambda: defaultdict(int))

for row in tqdm_notebook(rows_train):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    for item in viewed:
        domain = metadata[item]['category_id']
        ventas_x_categoria[domain][item] += 1

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




In [8]:
# Devuelve una lista con todas las categories que visito un usuario y cuantas veces visito cada uno.
def categorias_visitadas(row, max_views=15):
    categories = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        category = metadata[item]['category_id']
        categories[category] +=1
    return categories

In [9]:
categorias_visitadas(rows[1])

Counter({'MLB269588': 1, 'MLB264021': 7})

In [10]:
# Le paso una categoria, y devuelve cuales son los K items mas vendidos de la categoria que pase.
def top_items_cat(category, k=10):
    top = ventas_x_categoria[category]
    top = Counter(top)
    top = top.most_common(k)
    return [x[0] for x in top]

In [11]:
top_items_cat('MLB269588')

[1165777,
 285756,
 1639443,
 1172716,
 178545,
 986486,
 1468462,
 1982437,
 388651,
 445732]

In [12]:
def top_by_best_category(row, k=10):
    views = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(views) == 0:
        return random.choices(all_items,  k=k)
    category = categorias_visitadas(row)
    category = category.most_common(1)[0][0]
    #print(len(top_items(domain, k=k)))
    if len(top_items_cat(category, k=k)) < k:
        return random.choices(all_items,  k=k)
    return top_items_cat(category, k=k)

In [13]:
def last_viewed_smart_filling_cat(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recom = []
    
    for item in viewed:
        if item not in recom:
            recom.append(item)
    if len(recom) > 10:
        return recom[:10]
    if len(recom) == 10:
        return recom
    
    k=10-len(recom)    
    relleno = random.choices(top_by_best_category(row), k=k)
    #if len(relleno) < 10:
     #   print("filling:", k, "| len final:",len(recom + relleno))
    return recom + relleno

In [14]:
last_viewed_smart_filling_cat(rows[1])

[228737,
 1282813,
 206667,
 1943604,
 1156086,
 643652,
 864577,
 228737,
 868817,
 1341158]

In [16]:
y_pred_m2 = []
# !!!
#"rows_test" for local score and "test_rows" for remote
for row in tqdm_notebook(rows_test):
    recom = last_viewed_smart_filling_cat(row)
    y_pred_m2.append(recom)

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [19]:
local_prediction(y_pred_m2)

82633 82633
El score local es: 0.23511979632660793


In [None]:
t_0 = time.time()
df_pred_6 = pd.DataFrame(y_pred_6)  
#df_pred_6.to_csv("./submission/ensablB3_submission.csv", index=False, header=False)
print ("tiempo: ", time.time() - t_0)
print("longitud: ",len(df_pred_6))
print("")
print("Check de NAs")
print(df_pred_6.isnull().sum())