In [1]:
import pandas as pd
import numpy as np
import time
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [2]:
def jl_to_list(fname):
    """Read a gzip-compressed JSON Lines file into a list.

    Parameters
    ----------
    fname : str or path-like
        Path to a ``.jl.gz`` file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON documents, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            # A blank line (typically a trailing newline at the end of the
            # file) is not a JSON document; json.loads would raise on it,
            # so skip it instead of aborting the whole load.
            if not line.strip():
                continue
            output.append(json.loads(line))
    return output

In [3]:
# Load the training rows and print the elapsed wall-clock time in seconds.
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  27.142229318618774


In [4]:
# Load the item records and print the elapsed wall-clock time in seconds.
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  20.389840364456177


In [5]:
# Load the test rows, print the elapsed wall-clock time in seconds,
# then print how many rows were read.
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

tiempo:  14.62082314491272
177070


In [5]:
# Optional subsampling for quick experiments (currently disabled):
#samples = 10000
#if samples:
#    rows = rows[:samples]
# Hold out 20% of the training rows; the fixed random_state makes the split
# repeatable across runs.
# NOTE(review): `rows_test` is this hold-out slice of `rows`. It is a different
# object from `test_rows`, which is loaded from test_dataset.jl.gz.
rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

330530 82633


## Análisis Exploratorio

In [5]:
# Index the item records by item id so an item's attributes can be looked up
# directly; `all_items` lists every indexed id.
metadata = {item['item_id']: item for item in item_data}
all_items = list(metadata)

In [6]:
# Map each purchased item id to a training row that bought it. Dict keys are
# unique, so when several rows share the same `item_bought` only the last such
# row is kept. In the recorded run this yields 64,928 keys for 413,163 rows,
# i.e. the same item IS purchased in more than one row.
metadata_bought = {purchase['item_bought']: purchase for purchase in rows}
all_bought = list(metadata_bought)

In [7]:
# Peek at a single item record to see which fields it carries.
item_data[1]

{'item_id': 871377,
 'title': 'Resident Evil Origins Collection Nintendo Switch (en D3gamer',
 'domain_id': 'MLM-VIDEO_GAMES',
 'product_id': '15270800',
 'price': '1392.83',
 'category_id': 'MLM151595',
 'condition': 'new'}

In [8]:
# Number of distinct purchased item ids, then number of distinct item ids in item_data.
print(len(all_bought))
print(len(all_items))

64928
2102277


In [9]:
# Keep one item record per distinct domain_id and per distinct category_id.
# When several items share a key, the last one in item_data wins.
domains = {item['domain_id']: item for item in item_data}
categories = {item['category_id']: item for item in item_data}

In [10]:
# Number of distinct domain_id values, then distinct category_id values, in item_data.
print(len(domains))
print(len(categories))

7894
11493


In [12]:
# Collect the item that was actually purchased in each training row.
y_true = [row['item_bought'] for row in rows]

In [13]:
# One purchased item per training row, so this equals len(rows).
len(y_true)

413163

In [17]:
def see_and_bought(row):
    """Tell whether the purchased item is among the items the user viewed.

    Parameters
    ----------
    row : dict
        Must provide ``'item_bought'`` and ``'user_history'``. The history is
        an iterable of event dicts with ``'event_type'`` and ``'event_info'``
        keys.

    Returns
    -------
    bool
        True if any event with ``event_type == 'view'`` has an
        ``event_info`` equal to ``row['item_bought']``; False otherwise
        (including when the history is empty).
    """
    bought = row['item_bought']
    # Only membership matters, so the history is scanned once and stops at the
    # first match; ordering the events by timestamp would not change the answer.
    return any(
        ev['event_type'] == 'view' and ev['event_info'] == bought
        for ev in row['user_history']
    )

In [18]:
# Spot-check the helper on a single training row.
see_and_bought(rows[1])

True

In [24]:
# Count the rows whose purchased item was among the viewed items, then print
# the total number of rows, that count, and the count as a percentage.
count = 0
for row in tqdm_notebook(rows):
    if see_and_bought(row):
        count += 1
print("Totales: ", len(rows))
print("Compro de entre las que vio: ", count)
print(np.around(100 * count / len(rows), 2), "%")

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales:  413163
Compro de entre las que vio:  121422
29.39 %


In [93]:
def visited_domains(row):
    """Return the distinct domains of the items viewed in ``row``.

    Each event in ``row['user_history']`` whose ``event_type`` is ``'view'``
    is resolved to a domain through the module-level ``metadata`` mapping
    (``metadata[event_info]['domain_id']``). Duplicates are removed by going
    through a set, so the order of the returned list is not meaningful.
    """
    return list({
        metadata[ev['event_info']]['domain_id']
        for ev in row['user_history']
        if ev['event_type'] == 'view'
    })

In [94]:
# Spot-check: domains of the items viewed in one training row.
visited_domains(rows[1])

['MLB-SHOWER_HEADS', 'MLB-MILK_EXTRACTORS']

In [95]:
# Check whether the domain of item 228737 is among the domains viewed in rows[1].
# NOTE(review): 228737 is a hard-coded item id, presumably rows[1]['item_bought'] (confirm).
metadata[228737]['domain_id'] in visited_domains(rows[1])

True

In [117]:
def same_domain(row):
    """Tell whether the purchased item's domain was among the viewed domains.

    The domain of ``row['item_bought']`` is looked up in the module-level
    ``metadata`` mapping and compared against the list returned by
    ``visited_domains(row)``.

    Returns
    -------
    bool
        True if the purchased item's ``domain_id`` is one of the domains of
        the items viewed in ``row['user_history']``; False otherwise.
    """
    domain_bought = metadata[row['item_bought']]['domain_id']
    return domain_bought in visited_domains(row)

In [115]:
# Spot-check the helper on a single training row.
# NOTE(review): the recorded output of this cell contains extra printed lines,
# but the same_domain definition in this notebook has no print calls, so that
# output comes from a different version of the function. Re-run to refresh it.
same_domain(rows[0])

MLB-SMARTWATCHES
['MLB-SMARTWATCHES']
True
--------------------------


True

In [120]:
# Count the rows whose purchased item belongs to a domain the user had viewed,
# then print the total number of rows, that count, and the count as a percentage.
count = 0
for row in tqdm_notebook(rows):
    # same_domain already returns a bool; no comparison against True needed.
    if same_domain(row):
        count += 1
print("Totales: ", len(rows))
# Label spelling fixed: "domio" -> "dominio".
print("Compro en el mismo dominio: ", count)
print(np.around(count*100/len(rows), 2), "%")

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales:  413163
Compro en el mismo domio:  203857
49.34 %
