In [1]:
import pandas as pd
import numpy as np
import re
import time, datetime
import math
import json
import random
import warnings, sys, os, gzip, gc
from collections import Counter, defaultdict
from sklearn import datasets
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [2]:
def jl_to_list(fname):
    output=[]
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [3]:
t_0 = time.time()
rows = jl_to_list('./data/raw/train_dataset.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  29.05925440788269


In [4]:
t_0 = time.time()
item_data = jl_to_list('./data/raw/item_data.jl.gz')
print ("tiempo: ", time.time() - t_0)

tiempo:  20.95676016807556


In [5]:
#t_0 = time.time()
#test_rows = jl_to_list('./data/raw/test_dataset.jl.gz')
#print ("tiempo: ", time.time() - t_0)
#print(len(test_rows))

In [6]:
samples = 10000
if samples:
    rows = rows[:samples]
print(len(rows))

10000


## Analisis Exploratorio

In [7]:
metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

In [8]:
metadata_bought = {x['item_bought']:x for x in rows}
all_bought = list(metadata_bought.keys())
# Por lo que revise no hay dos compras del mismo producto.

In [9]:
domains = {x['domain_id']:x for x in item_data}
#categories = {x['category_id']:x for x in item_data}
all_domains = list(domains.keys())
#all_categories = list(categories.keys())

In [10]:
print(len(all_domains))
#print(len(all_categories))

7894


## Lista de dominios

In [11]:
# Devuelve una lista con todos los dominios que visito un usuario y cuantas veces visito cada uno.
def dominios_visitados(row, max_views=15):
    domains = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type']=='view']
    if len(viewed) > max_views:
        viewed = viewed[:15]
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] +=1
    #sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)    
    return domains

In [17]:
dominios_visitados(rows[1]).most_common()

[('MLB-MILK_EXTRACTORS', 7), ('MLB-SHOWER_HEADS', 1)]

In [18]:
dominios_visitados(rows[1]).most_common(1)[0][0]

'MLB-MILK_EXTRACTORS'

In [19]:
item= rows[1]['item_bought']
domain_bought = metadata[item]['domain_id']
domain_bought

'MLB-MILK_EXTRACTORS'

In [20]:
df = pd.DataFrame(dominios_visitados(rows[1]).most_common()) 
df 

Unnamed: 0,0,1
0,MLB-MILK_EXTRACTORS,7
1,MLB-SHOWER_HEADS,1


In [21]:
dominios_visitados(rows[1]).most_common()
first_domain = dominios_visitados(rows[1]).most_common()[0][0]
first_count = dominios_visitados(rows[1]).most_common()[0][1]
print(first_domain,first_count)

MLB-MILK_EXTRACTORS 7


In [22]:
list_perf = []
row_id = 0
for row in tqdm_notebook(rows[:10000]):
    row_perf = dict()        
    row_perf['ID']=row_id
    row_perf['domain_bought'] = metadata[row['item_bought']]['domain_id']
    for index in dominios_visitados(row).most_common():        
        #print(count,index[0],index[1])
        row_perf[index[0]]=index[1]
    row_id +=1
    list_perf.append(row_perf)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [23]:
#del [rows]
#del [item_data]
#del [all_items]
#del [all_domains]
#del [all_bought]
#gc.collect()

In [24]:
len(list_perf)

10000

In [25]:
list_perf[2500]

{'ID': 2500,
 'domain_bought': 'MLB-SHORTS',
 'MLB-CLOTHING': 3,
 'MLB-TACTICAL_AND_SPORTING_KNIVES_AND_BLADES': 3,
 'MLB-BOOKS': 2,
 'MLB-KEYCHAINS': 2,
 'MLB-BACKPACKS': 1,
 'MLB-FANNY_PACKS': 1,
 'MLB-HAIRDRESSING_SCISSORS': 1,
 'MLB-PANTS': 1,
 'MLB-T_SHIRTS': 1}

In [26]:
df_list_perf = pd.DataFrame(list_perf)
#df_list_perf

In [27]:
len(df_list_perf.columns)

3130

In [28]:
df_list_perf.fillna(0, inplace=True)

In [29]:
df_list_perf

Unnamed: 0,ID,domain_bought,MLB-SMARTWATCHES,MLB-MILK_EXTRACTORS,MLB-SHOWER_HEADS,MLB-TOWEL_SETS,MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,MLB-OFFICE_SOFTWARE,MLB-TENTS,MLB-TOYS_AND_GAMES,...,MLB-PASTAS,MLM-DEPILATORY_WAXES,MLM-HAIR_REMOVAL_STRIPS,MLB-RAZOR_CARTRIDGES,MLM-MAKEUP_TRAIN_CASES,MLM-HOUSE_NUMBERS,MLM-BABY_BATHTUBS,MLB-BICYCLE_WRENCHES,MLB-LAMINATORS,MLB-IDENTIFICATION_AND_PRESENTATION_CARD_CUTTERS
0,0,MLB-SMARTWATCHES,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,MLB-MILK_EXTRACTORS,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,MLB-CELLPHONE_ACCESSORIES,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,MLB-CARD_PAYMENT_TERMINALS,0.0,0.0,0.0,0.0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,MLB-SMARTWATCHES,5.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,MLB-WRISTWATCHES,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,9996,MLB-CELLPHONES,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,9997,MLB-SPEAKERS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,9998,MLB-CRIB_BEDDING_SETS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df_list_perf.to_csv("./data/df_list_perf.csv", index=False, header=True)

### Prediccion

In [47]:
df_prefiles = pd.read_csv("./data/df_list_perf.csv")

In [49]:
df_prefiles['domain_bought'] = df_prefiles['domain_bought'].astype('category')
df_prefiles['ID'] = df_prefiles['ID'].astype('float64')
print(df_prefiles.dtypes)

ID                                                   float64
domain_bought                                       category
MLB-SMARTWATCHES                                     float64
MLB-MILK_EXTRACTORS                                  float64
MLB-SHOWER_HEADS                                     float64
                                                      ...   
MLM-HOUSE_NUMBERS                                    float64
MLM-BABY_BATHTUBS                                    float64
MLB-BICYCLE_WRENCHES                                 float64
MLB-LAMINATORS                                       float64
MLB-IDENTIFICATION_AND_PRESENTATION_CARD_CUTTERS     float64
Length: 3130, dtype: object


In [50]:
#cols=[i for i in df_prefiles.columns if i not in ['domain_bought']]
#for col in cols:
    #df_prefiles[col]=df_prefiles[col].astype('int')

In [56]:
full_x = df_prefiles.drop(['domain_bought'], axis=1)
full_y = df_prefiles[['domain_bought']]
X_train, X_test, y_train, y_test = train_test_split(full_x, full_y, test_size=0.2, random_state=0)

In [57]:
import lightgbm as lgb

In [61]:
#lgb_train = lgb.Dataset(full_x, full_y)
lgb_train = lgb.Dataset(X_train, y_train)

In [62]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
# acotar hojas y profundidad
# Staking

In [63]:
print('Starting training...')
# train
#gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_eval,early_stopping_rounds=5)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)
# save model to file
#gbm.save_model('model.txt')

Starting training...


ValueError: DataFrame.dtypes for label must be int, float or bool