In [1]:
import pandas as pd
import numpy as np
import re
import time, datetime
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [2]:
def jl_to_list(fname):
    output=[]
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [3]:
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  30.443601846694946


In [4]:
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  24.33327889442444


In [5]:
t_0 = time.time()
test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)
print(len(test_rows))

In [12]:
samples = 10000
if samples:
    rows = rows[:samples]
rows_train, rows_test =  train_test_split(rows, test_size=0.2, random_state=42)
print(len(rows_train), len(rows_test))

8000 2000


## Analisis Exploratorio

In [5]:
metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

In [6]:
metadata_bought = {x['item_bought']:x for x in rows}
all_bought = list(metadata_bought.keys())
# Por lo que revise no hay dos compras del mismo producto.

In [7]:
item_data[1]

{'item_id': 871377,
 'title': 'Resident Evil Origins Collection Nintendo Switch (en D3gamer',
 'domain_id': 'MLM-VIDEO_GAMES',
 'product_id': '15270800',
 'price': '1392.83',
 'category_id': 'MLM151595',
 'condition': 'new'}

In [7]:
print(len(all_bought))
print(len(all_items))

64928
2102277


In [8]:
domains = {x['domain_id']:x for x in item_data}
categories = {x['category_id']:x for x in item_data}
all_domains = list(domains.keys())
all_categories = list(categories.keys())

In [9]:
print(len(all_domains))
print(len(all_categories))

7894
11493


In [11]:
# Funcion que devuelve si un usuario compro un articulo de entre los que vio con anterioridad
def see_and_bought(row):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    bought = row['item_bought']
    boolean = bought in viewed
    return boolean

In [12]:
see_and_bought(rows[1])

True

In [13]:
count = 0
for row in tqdm_notebook(rows):    
    if see_and_bought(row) == True:
        count +=1
print("Totales: ",len(rows))
print("Compro de entre las que vio: ",count)
print(np.around(count*100/len(rows), 2),"%")

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales:  413163
Compro de entre las que vio:  121422
29.39 %


In [14]:
# Devuelve una lista con todos los dominios que visito un usuario.
def visited_domains(row):
    domains_set = set()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains_set.add(domain)
    return list(domains_set)

In [15]:
visited_domains(rows[1])

['MLB-SHOWER_HEADS', 'MLB-MILK_EXTRACTORS']

In [16]:
metadata[228737]['domain_id'] in visited_domains(rows[1])

True

In [17]:
# Funcion que devuelve si un usuario compro un articulo de los dominios que vio con anterioridad
def same_domain(row):
    viewed = [ev for ev in row['user_history'] if ev['event_type']=='view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    domain_bought = metadata[row['item_bought']]['domain_id']
    boolean = domain_bought in visited_domains(row)
    return boolean

In [18]:
same_domain(rows[0])

True

In [19]:
count = 0
for row in tqdm_notebook(rows):    
    if same_domain(row) == True:
        count +=1
print("Totales: ",len(rows))
print("Compro en el mismo domio: ",count)
print(np.around(count*100/len(rows), 2),"%")

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales:  413163
Compro en el mismo domio:  203857
49.34 %


## Revisar Views vs Search

In [20]:
all_domains[1]

'MLM-VIDEO_GAMES'

In [21]:
all_domains.count('MLM-VIDEO_GAMES')

1

In [23]:
# Cuantos dominios de cada pais hay
count_Mexico = 0
count_Brasil = 0
count_none = 0
for domain in all_domains:
    if re.search('MLM.+', str(domain)):   
        count_Mexico +=1
    elif re.search('MLB.+', str(domain)):   
        count_Brasil +=1
    else:
        count_none +=1
print(len(all_domains))        
print(count_none, count_Mexico, count_Brasil) 

7894
1 3688 4205


In [24]:
metadata[4]

{'item_id': 4,
 'title': 'Ps4 Playstation 4 Slim 500gb Em Campinas Sp Com Nota Fiscal',
 'domain_id': 'MLB-GAME_CONSOLES',
 'product_id': '10813731',
 'price': '2349.00',
 'category_id': 'MLB11172',
 'condition': 'new'}

In [25]:
# Creo una lista de items por pais (Mexico y Brasil)
brasil = dict()
brasil_list = []
for key, value in metadata.items():
    #print(key, ':', value)
    if re.search('MLB.+', str(value)):  
        brasil['key']=value
        brasil_list.append(brasil)

brasil

{'key': {'item_id': 2010306,
  'title': 'Kit Turbo Turbina Virtual Simulador Som Apito Carro E Moto',
  'domain_id': 'MLB-VEHICLE_ACCESSORIES',
  'product_id': None,
  'price': '64.99',
  'category_id': 'MLB117639',
  'condition': 'new'}}

In [26]:
item_data[4]

{'item_id': 934912,
 'title': 'Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windows 7',
 'domain_id': 'MLM-NOTEBOOKS',
 'product_id': None,
 'price': '1599.00',
 'category_id': 'MLM1652',
 'condition': 'used'}

In [27]:
# Funcion que revisa si una row tiene un search
def view_search(row):
    searched = [ev for ev in row['user_history'] if ev['event_type']=='search']
    return searched

In [28]:
count = 0
for row in tqdm_notebook(rows):    
    if view_search(row):
        count +=1
print("Totales: ",len(rows))
print("Filas con Search: ",count)
print(np.around(count*100/len(rows), 2),"%")

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales:  413163
Filas con Search:  340636
82.45 %


In [22]:
#rows[1]['user_history']
# DESMAMADEIRA ELETRICA

In [29]:
metadata[111260]['title'].lower()

'casa sola en venta con gran patio solo pago de contado.'

In [30]:
'DESMAMADEIRA ELETRICA' in metadata[111260]['title']

False

In [32]:
exampl = 'DESMAMADEIRA ELETRICA'
boolean = False
find = ''
for item in all_items:
    if(exampl.lower() in metadata[item]['title']):
        find =  metadata[item]['title']
        boolean = True
boolean   
find 
    

''

In [33]:
metadata[1]['domain_id']

'MLB-BOOTS_AND_BOOTIES'

### Busquedas dentro del mismo Pais

In [34]:
#Creo lista de items por PAIS
items_Brasil = [d for d in item_data if re.search('MLB.+', str(d['domain_id']))]
items_Mexico = [d for d in item_data if re.search('MLM.+', str(d['domain_id']))]

In [35]:
metadata_brasil = {x['item_id']:x for x in items_Brasil}
metadata_mexico = {x['item_id']:x for x in items_Mexico}
all_items_brasil = list(metadata_brasil.keys())
all_items_mexico = list(metadata_mexico.keys())

In [36]:
print(len(all_items_brasil), len(all_items_mexico))

1723216 378210


## Historial de Navegacion

In [37]:
len(rows[1]['user_history'])

13

In [40]:
count_men = 0
count_men_search = 0
for row in tqdm_notebook(rows):
   lenRow = len(row['user_history'])
   if (lenRow < 10):
       count_men+=1
       if(view_search(row)):
           count_men_search+=1

print('Totales:', len(rows) )
print('Men 10: ', count_men,' - ', np.around(count_men*100/len(rows), 2),"%")
print('Men 10 & search: ',count_men_search,' - ', np.around(count_men_search*100/len(rows), 2),"%")

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales: 413163
Men 10:  154207  -  37.32 %
Men 10 & search:  93596  -  22.65 %


In [41]:
rows[5]

{'user_history': [{'event_info': 'TAMPA TRASEIRA IPHONE8',
   'event_timestamp': '2019-10-16T13:19:43.349-0400',
   'event_type': 'search'},
  {'event_info': 2029385,
   'event_timestamp': '2019-10-16T13:19:47.561-0400',
   'event_type': 'view'},
  {'event_info': 'TAMPA TRASEIRA IPHONE8',
   'event_timestamp': '2019-10-16T13:20:01.542-0400',
   'event_type': 'search'},
  {'event_info': 794504,
   'event_timestamp': '2019-10-16T13:20:48.524-0400',
   'event_type': 'view'},
  {'event_info': 'CAMBIO C4 PALLAS',
   'event_timestamp': '2019-10-18T09:59:12.467-0400',
   'event_type': 'search'},
  {'event_info': 1135701,
   'event_timestamp': '2019-10-18T09:59:18.962-0400',
   'event_type': 'view'},
  {'event_info': 'CAMBIO C4 PALLAS',
   'event_timestamp': '2019-10-18T10:12:27.052-0400',
   'event_type': 'search'},
  {'event_info': 1135701,
   'event_timestamp': '2019-10-18T10:12:30.186-0400',
   'event_type': 'view'}],
 'item_bought': 1046119}

In [42]:
[d for d in item_data if d['item_id']==1135701] 

[{'item_id': 1135701,
  'title': 'Cambio Automatico C4 Pallas Automáticos Mooca Com Garantia',
  'domain_id': 'MLB-CAR_GEARBOXES',
  'product_id': None,
  'price': '4499.00',
  'category_id': 'MLB194806',
  'condition': 'new'}]

In [43]:
#item_data[1]['item_id']==871377
[d for d in item_data if d['item_id']==1046119]   

[{'item_id': 1046119,
  'title': 'Placa Conector Carga Micro Usb Moto G5 Xt1672 Flex ',
  'domain_id': 'MLB-CELLPHONES_AND_TELEPHONY',
  'product_id': None,
  'price': '13.90',
  'category_id': 'MLB1915',
  'condition': 'new'}]

## TimeStamp
#### https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior

In [44]:
#[ev['event_timestamp'] for ev in rows[1]['user_history'] if ev['event_type']=='view']
[ev['event_timestamp'] for ev in rows[1]['user_history']]

['2019-10-06T18:02:53.893-0400',
 '2019-10-07T09:45:29.322-0400',
 '2019-10-07T09:46:07.960-0400',
 '2019-10-07T09:46:17.100-0400',
 '2019-10-07T09:46:19.173-0400',
 '2019-10-07T09:47:53.958-0400',
 '2019-10-07T18:53:20.113-0400',
 '2019-10-07T18:53:26.670-0400',
 '2019-10-07T18:54:36.944-0400',
 '2019-10-07T18:54:50.998-0400',
 '2019-10-07T18:56:43.678-0400',
 '2019-10-07T19:01:44.718-0400',
 '2019-10-07T19:46:18.382-0400']

In [45]:
ts_1= datetime.strptime('2019-10-07T19:46:18.382-0400', '%Y-%m-%dT%H:%M:%S.%f%z')
ts_2= datetime.strptime('2019-10-07T09:47:53.958-0400', '%Y-%m-%dT%H:%M:%S.%f%z')
diff = ts_1 - ts_2
diff.seconds/3600

9.973333333333333

In [46]:
count_men = 0
count_men_search = 0
for row in tqdm_notebook(rows):
   lenRow = len(row['user_history'])
   if (lenRow < 10):
       count_men+=1
       if(view_search(row)):
           count_men_search+=1

print('Totales:', len(rows) )
print('Men 10: ', count_men)
print('Men 10 & search: ',count_men_search)

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))


Totales: 413163
Men 10:  154207
Men 10 & search:  93596


## Lista de dominios

In [10]:
# Devuelve una lista con todos los dominios que visito un usuario y cuantas veces visito cada uno.
def dominios_visitados(row, max_views=15):
    domains = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] +=1
    #sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)    
    return domains

In [13]:
dominios_visitados(rows_train[1]).most_common()

[('MLB-RIVETS', 8),
 ('MLB-RIVET_GUNS', 3),
 ('MLB-ELECTRONIC_ACCESSORIES_AND_SPARE_PARTS', 1),
 ('MLB-HOME_APPLIANCE_CONTACTORS_AND_RELAYS', 1),
 ('MLB-DRILL_BITS', 1)]

In [14]:
dominios_visitados(rows_train[1]).most_common(1)[0][0]

'MLB-RIVETS'

In [15]:
item= rows_train[1]['item_bought']
domain_bought = metadata[item]['domain_id']
domain_bought

'MLB-CELLPHONES'

In [16]:
df = pd.DataFrame(dominios_visitados(rows_train[1]).most_common()) 
df 

Unnamed: 0,0,1
0,MLB-RIVETS,8
1,MLB-RIVET_GUNS,3
2,MLB-ELECTRONIC_ACCESSORIES_AND_SPARE_PARTS,1
3,MLB-HOME_APPLIANCE_CONTACTORS_AND_RELAYS,1
4,MLB-DRILL_BITS,1


In [55]:
#df_domains = pd.DataFrame(all_domains, columns =['Domain'])
#df_domains["count"] = 0
#df_domains = df_domains.transpose()
#df_domains.rename(columns=df_domains.iloc[0], inplace = True)
#df_domains

Unnamed: 0,MLM-INDIVIDUAL_HOUSES_FOR_SALE,MLM-VIDEO_GAMES,MLM-SKIRTS,MLM-GRAPHICS_CARDS,MLM-NOTEBOOKS,MLM-VEHICLE_ACCESSORIES,MLM-CELLPHONE_COVERS,MLM-WALL_AND_CEILING_LIGHTS,MLM-NAPKIN_HOLDERS,MLM-FLATS,...,MLB-SKATE_BOOT_COVERS,MLB-ADVERTISING_INFLATABLES,MLB-PRESSURE_TANKS,MLB-PRINTER_CLEANING_KITS,MLB-ELECTRIC_CREAM_SEPARATORS,MLB-BURETTES,MLB-GLASS_SAFETY_FILMS,MLB-SCREEN_PRINTING_KITS,MLB-ANTIQUE_AUDIO_ANTENNAS,MLB-RUGBY_HELMETS
Domain,MLM-INDIVIDUAL_HOUSES_FOR_SALE,MLM-VIDEO_GAMES,MLM-SKIRTS,MLM-GRAPHICS_CARDS,MLM-NOTEBOOKS,MLM-VEHICLE_ACCESSORIES,MLM-CELLPHONE_COVERS,MLM-WALL_AND_CEILING_LIGHTS,MLM-NAPKIN_HOLDERS,MLM-FLATS,...,MLB-SKATE_BOOT_COVERS,MLB-ADVERTISING_INFLATABLES,MLB-PRESSURE_TANKS,MLB-PRINTER_CLEANING_KITS,MLB-ELECTRIC_CREAM_SEPARATORS,MLB-BURETTES,MLB-GLASS_SAFETY_FILMS,MLB-SCREEN_PRINTING_KITS,MLB-ANTIQUE_AUDIO_ANTENNAS,MLB-RUGBY_HELMETS
count,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
all_domains.append("ID")
all_domains.append("domain_bought")

In [80]:
#Generos todos los id de Dominios
domains_array = np.array(all_domains)
domains_array

array(['MLM-INDIVIDUAL_HOUSES_FOR_SALE', 'MLM-VIDEO_GAMES', 'MLM-SKIRTS',
       ..., 'MLB-RUGBY_HELMETS', 'ID', 'domain_bought'], dtype=object)

In [123]:
df_domains = pd.DataFrame(np.zeros((4, len(domains_array))),columns = domains_array)
#df_domains[['domain_bought']] = df_domains[['domain_bought']].astype(str)
df_domains = df_domains.astype(str)
df_domains

Unnamed: 0,MLM-INDIVIDUAL_HOUSES_FOR_SALE,MLM-VIDEO_GAMES,MLM-SKIRTS,MLM-GRAPHICS_CARDS,MLM-NOTEBOOKS,MLM-VEHICLE_ACCESSORIES,MLM-CELLPHONE_COVERS,MLM-WALL_AND_CEILING_LIGHTS,MLM-NAPKIN_HOLDERS,MLM-FLATS,...,MLB-PRESSURE_TANKS,MLB-PRINTER_CLEANING_KITS,MLB-ELECTRIC_CREAM_SEPARATORS,MLB-BURETTES,MLB-GLASS_SAFETY_FILMS,MLB-SCREEN_PRINTING_KITS,MLB-ANTIQUE_AUDIO_ANTENNAS,MLB-RUGBY_HELMETS,ID,domain_bought
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
dominios_visitados(rows_train[1]).most_common()
first_domain = dominios_visitados(rows_train[1]).most_common()[0][0]
first_count = dominios_visitados(rows_train[1]).most_common()[0][1]
print(first_domain,first_count)

MLB-RIVETS 8


In [122]:
df_domains.loc[0]['ID']='1'
df_domains.loc[0]['MLM-INDIVIDUAL_HOUSES_FOR_SALE']='6'
df_domains

Unnamed: 0,MLM-INDIVIDUAL_HOUSES_FOR_SALE,MLM-VIDEO_GAMES,MLM-SKIRTS,MLM-GRAPHICS_CARDS,MLM-NOTEBOOKS,MLM-VEHICLE_ACCESSORIES,MLM-CELLPHONE_COVERS,MLM-WALL_AND_CEILING_LIGHTS,MLM-NAPKIN_HOLDERS,MLM-FLATS,...,MLB-PRESSURE_TANKS,MLB-PRINTER_CLEANING_KITS,MLB-ELECTRIC_CREAM_SEPARATORS,MLB-BURETTES,MLB-GLASS_SAFETY_FILMS,MLB-SCREEN_PRINTING_KITS,MLB-ANTIQUE_AUDIO_ANTENNAS,MLB-RUGBY_HELMETS,ID,domain_bought
0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,MLB-RIVETS
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
count = 0
for index in dominios_visitados(rows_train[1]).most_common():
    print(count,index[0],index[1])
    df_domains.loc[count][index[0]]=index[1]



0 MLB-RIVETS 8
0 MLB-RIVET_GUNS 3
0 MLB-ELECTRONIC_ACCESSORIES_AND_SPARE_PARTS 1
0 MLB-HOME_APPLIANCE_CONTACTORS_AND_RELAYS 1
0 MLB-DRILL_BITS 1


In [133]:
df_domains[0]

Unnamed: 0,MLM-INDIVIDUAL_HOUSES_FOR_SALE,MLM-VIDEO_GAMES,MLM-SKIRTS,MLM-GRAPHICS_CARDS,MLM-NOTEBOOKS,MLM-VEHICLE_ACCESSORIES,MLM-CELLPHONE_COVERS,MLM-WALL_AND_CEILING_LIGHTS,MLM-NAPKIN_HOLDERS,MLM-FLATS,...,MLB-PRESSURE_TANKS,MLB-PRINTER_CLEANING_KITS,MLB-ELECTRIC_CREAM_SEPARATORS,MLB-BURETTES,MLB-GLASS_SAFETY_FILMS,MLB-SCREEN_PRINTING_KITS,MLB-ANTIQUE_AUDIO_ANTENNAS,MLB-RUGBY_HELMETS,ID,domain_bought
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
