### Dependencies

In [88]:
#importo librerias
import gzip
import pandas as pd
import urllib
import tarfile
import urllib.request
import numpy as np
import random
import json
import bisect


### Data Sets

In [89]:
url_item_data = "https://meli-data-challenge.s3.amazonaws.com/2020/item_data.jl.gz"
url_train_data = "https://meli-data-challenge.s3.amazonaws.com/2020/train_dataset.jl.gz"

In [90]:
train_data = []
with urllib.request.urlopen(url_train_data) as handle:
  gz = gzip.GzipFile(fileobj=handle)
  for i, line in enumerate(gz):
    train_data.append(json.loads(line.strip().decode('utf-8')))

In [91]:
df = pd.DataFrame(train_data)
df.head()

Unnamed: 0,user_history,item_bought
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110
3,"[{'event_info': 'RADIOBOSS', 'event_timestamp'...",1197370
4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207


In [92]:
item_data = []
with urllib.request.urlopen(url_item_data) as handle:
  gz = gzip.GzipFile(fileobj=handle)
  for i, line in enumerate(gz):
    item_data.append(json.loads(line.strip().decode('utf-8')))

itemdf = pd.DataFrame(item_data)

In [93]:
itemdf.head()

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
0,111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,,1150000.0,MLM170527,new
1,871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,15270800.0,1392.83,MLM151595,new
2,490232,Falda De Imitación Piel Negra,MLM-SKIRTS,,350.0,MLM7697,new
3,1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,,3200.0,MLM9761,used
4,934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,,1599.0,MLM1652,used


### Exploración de los Data Sets

In [94]:
print('Compras totales en el dataset: ', len(df))
print('-'*50)
print('Item comprados (item_bought) únicos: ', len(df.item_bought.unique()))
print('-'*50)
print('Proporción de items únicos en el dataset: ', ( len(df.item_bought.unique())/len(df))*100)

Compras totales en el dataset:  413163
--------------------------------------------------
Item comprados (item_bought) únicos:  64928
--------------------------------------------------
Proporción de items únicos en el dataset:  15.714863141181567


### Unión de los data sets por item bought

In [95]:
df_joined = pd.merge(df, itemdf, left_on='item_bought', right_on='item_id', how='left')
df_joined.shape

(413163, 9)

### Exploración y curación del data set

#### **Manejo de datos faltantes**

In [96]:
# Check for nulls
df_joined.isna().sum()

user_history         0
item_bought          0
item_id              0
title                0
domain_id            0
product_id      305666
price                0
category_id          0
condition            0
dtype: int64

In [97]:
#Solo la columna 'product_id' presenta valores nulos. Y un considerable numero: 305666 de 413163. Por ello, la eliminamos:

del df_joined['product_id']

#### **Eliminar registros duplicados**

Por el momento no se observan registros duplicados. 

### División del data set según Mexico o Brasil. 

In [98]:
df_joined[['country', 'domain']] = df_joined['domain_id'].str.split('-', expand=True)
df_joined.head()

Unnamed: 0,user_history,item_bought,item_id,title,domain_id,price,category_id,condition,country,domain
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830,1748830,Relógio Medidor Inteligente Pulso Freqüência C...,MLB-SMARTWATCHES,90.0,MLB135384,new,MLB,SMARTWATCHES
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737,228737,Bomba Eletrica Tira Leite Materno Bivolt G-tech,MLB-MILK_EXTRACTORS,169.0,MLB264021,new,MLB,MILK_EXTRACTORS
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110,1909110,"Kit Youtuber Tripé 1,20 Microfone Lapela Anel ...",MLB-CELLPHONE_ACCESSORIES,300.0,MLB5092,new,MLB,CELLPHONE_ACCESSORIES
3,"[{'event_info': 'RADIOBOSS', 'event_timestamp'...",1197370,1197370,Leia A Descrição Por Favor - Maquininha Point ...,MLB-CARD_PAYMENT_TERMINALS,16.9,MLB277951,new,MLB,CARD_PAYMENT_TERMINALS
4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES


In [99]:
print('Publicaciones totales: ', len(itemdf))
print('-'*50)
print('Total de dominios: ', len(df_joined.domain.unique()))
print('-'*50)
print('País en el que se realizo la compra'+ '\n'*1, df_joined.country.value_counts())

Publicaciones totales:  2102277
--------------------------------------------------
Total de dominios:  2275
--------------------------------------------------
País en el que se realizo la compra
 MLB    354907
MLM     58256
Name: country, dtype: int64


In [100]:
df_mlb = df_joined[df_joined['country'] == 'MLB']
df_mlm = df_joined[df_joined['country'] == 'MLM']

print(df_mlb.shape)
print(df_mlm.shape)

df_mlb

(354907, 10)
(58256, 10)


Unnamed: 0,user_history,item_bought,item_id,title,domain_id,price,category_id,condition,country,domain
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830,1748830,Relógio Medidor Inteligente Pulso Freqüência C...,MLB-SMARTWATCHES,90.00,MLB135384,new,MLB,SMARTWATCHES
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737,228737,Bomba Eletrica Tira Leite Materno Bivolt G-tech,MLB-MILK_EXTRACTORS,169.00,MLB264021,new,MLB,MILK_EXTRACTORS
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110,1909110,"Kit Youtuber Tripé 1,20 Microfone Lapela Anel ...",MLB-CELLPHONE_ACCESSORIES,300.00,MLB5092,new,MLB,CELLPHONE_ACCESSORIES
3,"[{'event_info': 'RADIOBOSS', 'event_timestamp'...",1197370,1197370,Leia A Descrição Por Favor - Maquininha Point ...,MLB-CARD_PAYMENT_TERMINALS,16.90,MLB277951,new,MLB,CARD_PAYMENT_TERMINALS
4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES
...,...,...,...,...,...,...,...,...,...,...
413157,"[{'event_info': 912949, 'event_timestamp': '20...",459011,459011,Torneira Cozinha Gourmet Monocomando Mangueira...,MLB-KITCHEN_FAUCETS,399.90,MLB270023,new,MLB,KITCHEN_FAUCETS
413159,"[{'event_info': 289961, 'event_timestamp': '20...",1845503,1845503,Kit Unhas Gel Uv Acrigel Mini Lixa Eletrica Ca...,MLB-GEL_NAIL_KITS,169.57,MLB196796,new,MLB,GEL_NAIL_KITS
413160,"[{'event_info': 'ALUGUEL BOB CAT', 'event_time...",2022477,2022477,Bateria Celular Positivo Twist S430 S430b Bt-s...,MLB-CELLPHONE_BATTERIES,46.88,MLB3812,new,MLB,CELLPHONE_BATTERIES
413161,"[{'event_info': 'XAOMI', 'event_timestamp': '2...",1111021,1111021,Capa Anti Queda Xiaomi Redmi Mi 9t/ K20 + Pelí...,MLB-CELLPHONE_COVERS,22.99,MLB5095,new,MLB,CELLPHONE_COVERS


### División del set de BRASIL en train, validation y test


*   Usamos el de Brasil xq es el más grande de ambos dataset y además, como deseamos analizar texto, necesitamos que dicho texto esté todo en el mismo idioma (trabajaremos con las columnas en portugués). 



###### **Empleamos sklearn para la división**

In [101]:
# Separamos el target (variable a predecir) del conjunto de datos a utilizar para predecirla

xtrain = df_mlb.drop(['item_bought'], axis=1)
labels = df_mlb['item_bought']

In [102]:
from sklearn.model_selection import train_test_split

x, x_test, y, y_test = train_test_split(xtrain,labels,test_size=0.2,train_size=0.8)
x_train, x_validation, y_train, y_validation = train_test_split(x,y,test_size = 0.25,train_size =0.75)

In [103]:
print('Size set for train: ', x_train.shape)
print('Size set for validation: ', x_validation.shape)
print('Size set for test: ', x_test.shape)
print('-'*50)
print('Size labels for train: ', y_train.shape)
print('Size labels for validation: ', y_validation.shape)
print('Size labels for test: ', y_test.shape)

Size set for train:  (212943, 9)
Size set for validation:  (70982, 9)
Size set for test:  (70982, 9)
--------------------------------------------------
Size labels for train:  (212943,)
Size labels for validation:  (70982,)
Size labels for test:  (70982,)


### Análisis de los campos de texto. 


*   Realizaremos el análisis sobre el dataset de training (x_train,y_train).  




#### Primero desglosamos la columna user_history del df de training

In [104]:
#x_train=x_train.sample(1000)


In [105]:
##Agregamos columna "user_id". 
x_train['user_id']=x_train.index
x_train['user_id2']=x_train['user_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [106]:
x_train.set_index(x_train.user_id,inplace=True)

x_train.head()

Unnamed: 0_level_0,user_history,item_id,title,domain_id,price,category_id,condition,country,domain,user_id,user_id2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
103284,"[{'event_info': 'PLATAFORMA DAY TRADE', 'event...",1590141,Kit Com 10 Cuecas Boxer De Cotton 4.0 - Polo M...,MLB-MALE_UNDERWEAR,59.9,MLB108789,new,MLB,MALE_UNDERWEAR,103284,103284
341159,"[{'event_info': 'XIAOMU NOTE 8', 'event_timest...",1213787,Xiaomi Mi 9t Dual Sim 64 Gb Preto-carvão 6 Gb Ram,MLB-CELLPHONES,2000.0,MLB1055,new,MLB,CELLPHONES,341159,341159
200269,"[{'event_info': 'BRANCA NEVE', 'event_timestam...",1078512,Kit 35 Bolas De Vinil Personalizadas Em Alto B...,MLB-SOUVENIRS,145.4,MLB40189,new,MLB,SOUVENIRS,200269,200269
10352,"[{'event_info': 'GELADEIRAS', 'event_timestamp...",675977,Rack Com Painel Para Tv Até 65 Polegadas Mades...,MLB-TV_STORAGE_UNITS,349.9,MLB33443,new,MLB,TV_STORAGE_UNITS,10352,10352
300508,"[{'event_info': 'CAPA TATICA MODULAR', 'event_...",1114176,Fone Ouvido Xiaomi Redmi Airdots - Original - ...,MLB-HEADPHONES,209.98,MLB7457,new,MLB,HEADPHONES,300508,300508


In [107]:
#El siguiente codigo desglosa la columna user_history del df. Se obtiene un nuevo df (final_df)
#que contiene las columnas de df y las columnas event_type, event_timestamp y event_info. 

import time
start_time = time.time()


#total_users=len(x_train)

#x_train['user_id']=np.arange(1,total_users+1)
#

new_df=pd.DataFrame()
new_df2=pd.DataFrame()

print("Data Import: --- %s seconds ---" % (time.time() - start_time))






#####################COMENTADO LO DE ABAJO ASI PRUEBO OPTIMIZACION DE NATAN #######
def clean_json(idx,col):
    parsed = pd.json_normalize(col)
    parsed['user_id']=idx
    return parsed

print('Desglosando user_history...')

for label, content in x_train.user_history.items():
  newcol=clean_json(label,content)
  if label==0:
    new_df=newcol
  else:
    new_df=new_df.append(newcol)
  if (np.mod(label,1000)==0): ## ir cada 1000 parece ser lo más optimo. 
        new_df2 = new_df2.append(new_df)
        del new_df
        new_df=pd.DataFrame()
        
  if (np.mod(label,20000)==0):
        #print('Voy por la fila n°:',label)
        print('Desglosando user_history...')
  del newcol
        

#new_df2.set_index(new_df2.user_id,inplace=True)


#final_df=new_df2.join(x_train,how='left',lsuffix='_desglose')


Data Import: --- 0.29790282249450684 seconds ---
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...
Desglosando user_history...


In [108]:
new_df2.head()

Unnamed: 0,event_info,event_timestamp,event_type,user_id
0,PLATAFORMA DAY TRADE,2019-10-26T23:10:35.919-0400,search,103284
0,XIAOMU NOTE 8,2019-10-23T15:07:00.036-0400,search,341159
1,381738,2019-10-23T17:08:08.746-0400,view,341159
2,381738,2019-10-23T17:12:04.246-0400,view,341159
3,381738,2019-10-23T17:14:45.215-0400,view,341159


In [109]:
new_df2.set_index(new_df2.user_id,inplace=True)
final_df=new_df2.join(x_train,how='left',lsuffix='_desglose')

In [110]:
final_df.head()

Unnamed: 0_level_0,event_info,event_timestamp,event_type,user_id_desglose,user_history,item_id,title,domain_id,price,category_id,condition,country,domain,user_id,user_id2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4,AMAZFIT BIP,2019-09-25T08:38:43.284-0400,search,4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES,4,4
4,130440,2019-09-25T08:38:59.237-0400,view,4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES,4,4
4,130440,2019-09-25T08:39:15.804-0400,view,4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES,4,4
4,AMAZFIT BIP,2019-09-25T08:39:27.624-0400,search,4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES,4,4
4,AMAZFIT BIPAMAZFIT BIP LITE,2019-09-25T08:39:47.235-0400,search,4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,MLB-SMARTWATCHES,355.99,MLB135384,new,MLB,SMARTWATCHES,4,4


In [111]:
#Eliminamos columnas remanentes que no nos interesan. 
del final_df['user_id_desglose'] 
del final_df['user_history']
del final_df['user_id']
del final_df['domain_id']

El dataset final contiene 6112470 filas. Los datos corresponden a 212943 usuarios (los usuarios de MLB). Por cada usuario aparecen tantas filas como eventos tengan resgitrados durante el tiempo de observacion de su historial. 

In [112]:
final_df.head()

Unnamed: 0_level_0,event_info,event_timestamp,event_type,item_id,title,price,category_id,condition,country,domain,user_id2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4,AMAZFIT BIP,2019-09-25T08:38:43.284-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,130440,2019-09-25T08:38:59.237-0400,view,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,130440,2019-09-25T08:39:15.804-0400,view,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIP,2019-09-25T08:39:27.624-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIPAMAZFIT BIP LITE,2019-09-25T08:39:47.235-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4


In [113]:
final_df.to_csv('Train_Data_Desglosado.csv') # Guardamos este dataset en caso de que lo necesitemos. 

####Eliminar registros duplicados y analizamos valores nulos. 

In [114]:
#Analizamos si hay filas duplicadas. Para ello, vemos si hay valores repetidos en la columna "time_stamp"
final_df.event_timestamp.duplicated().sum()

9424

No aparecen filas duplicadas. Seguramente algunos usuarios registran el mismo evento realizado en distintos tiempos. Por el momento, no trabajaremos con eso. 

Veamos si aparecen valores nulos. 

In [115]:
final_df.isna().sum()

event_info         0
event_timestamp    0
event_type         0
item_id            0
title              0
price              0
category_id        0
condition          0
country            0
domain             0
user_id2           0
dtype: int64

No se observan valores nulos. 

In [116]:
#Por las dudas, eliminamos los que lleguen a aparecer. 
final_df = final_df.dropna()

In [117]:
final_df[:50]

Unnamed: 0_level_0,event_info,event_timestamp,event_type,item_id,title,price,category_id,condition,country,domain,user_id2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4,AMAZFIT BIP,2019-09-25T08:38:43.284-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,130440,2019-09-25T08:38:59.237-0400,view,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,130440,2019-09-25T08:39:15.804-0400,view,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIP,2019-09-25T08:39:27.624-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIPAMAZFIT BIP LITE,2019-09-25T08:39:47.235-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIPAMAZFIT BIP LITE,2019-09-25T08:40:14.645-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIPAMAZFIT BIP,2019-09-25T08:40:22.743-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,AMAZFIT BIP,2019-09-25T08:40:33.414-0400,search,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,2049207,2019-09-25T08:40:39.726-0400,view,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4
4,2049207,2019-09-25T08:41:05.024-0400,view,2049207,Smartwatch Xiaomi Amazfit Bip Huami A1608 Orig...,355.99,MLB135384,new,MLB,SMARTWATCHES,4


Nos queda entonces un dataset de 6112470 filas y 11 columnas. 

In [118]:
#cols_text=final_df.select_dtypes(include=['object', 'category']).columns.to_list()

#### Seleccionamos las columnas con texto (en portugues) e importamos las librerias necesarias.

In [119]:
final_df2=final_df.copy(deep=True)

cols_text=['event_info', 'title', 'domain', 'category_id'] #tokenizamos solo columnas en portugues

import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.stem.snowball import PortugueseStemmer


##### Definimos las funciones que nos servirán para procesar nuestros textos: 
*Pasamos todo a minuscula: 
*Eliminamos el underscore de la columna "domain"
*Definimimos la función de tokenizacion.
*Removemos las stopwords.
*Aplicamos el PortugueseStemmer
*Armamos una función de funciones ("normalize") que aplica todos estos cambios en un unico paso. 

In [120]:
#https://medium.com/datos-y-ciencia/preprocesamiento-de-datos-de-texto-un-tutorial-en-python-5db5620f1767
#Definimos la funcion que pone todo en minuscula.
final_df2=final_df2.sample(10000)
def minus(cell):
  aux=str(cell).lower() # Lowercase string 
  
  return aux

###########################################################################
# Eliminamos el "guion bajo" de la columna "domain"

#Definimos la funcion que reemplaza _ por un espacio vacio.
def replace(cell):
  aux=str(cell).replace('_',' ') 
  
  return aux

###########################################################################
#Definimos la funcion de tokenizacion. Separa cada celda en una lista de palabras. 

def token(cell):

  aux=nltk.word_tokenize(cell) # Tokenization
  
  return aux

###########################################################################
#Eliminamos palabras del tipo "stopwords" del idioma portugues.
#Son palabras tales como las preposiciones las que se eliminan, palabras que 
#se repiten mucho y sin valor semantico.

def remove_stopwords(cell):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in cell:
        if word not in stopwords.words('portuguese'):
            new_words.append(word)
    return new_words

###########################################################################

# Definimos una funcion que stemmizará cada celda cada celda. 
#Empleamos un stemmizador en portugues.
#Buscamos simplificar las palabras cortando su extremo con la intención de unificar palabras
#con similar significado tales como "reloj", "relojes", "relojeria", por ejemplo, con el fin 
#de lograr comparaciones correctas entre palabras.


def stem_words(cell):
    """Stem words in list of tokenized words"""
    stemmer = PortugueseStemmer()
    stems = []
    for word in cell:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

###########################################################################
#funcion de funciones:

def normalize(words):
    words = minus(words)
    words = replace(words)
    words = token(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    return words



# Modificamos las columnas de texto aplicando las funciones mencionadas.
for col in cols_text:
  print(col)
  final_df2.loc[:,col] = final_df2[col].apply(normalize)  


event_info
title
domain
category_id


In [121]:
final_df2[:20]

Unnamed: 0_level_0,event_info,event_timestamp,event_type,item_id,title,price,category_id,condition,country,domain,user_id2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
318661,"[mes, comput]",2019-10-01T08:52:47.810-0400,search,722856,"[microfon, fio, jwl, u-585, dupl, uhf, profiss...",565.0,[mlb4469],new,MLB,[microphon],318661
309346,"[pot, vidr, gel]",2019-10-17T20:46:52.931-0400,search,923252,"[selador, vácu, freshpac, sac, plástic, 30, cm...",173.9,[mlb49073],new,MLB,"[industrial, and, commercial, bag, sealers]",309346
231975,[1204536],2019-10-29T07:06:51.495-0400,view,1474739,"[celul, lg, k8, plus, azul, 16gb, 2gb, ram, te...",599.0,[mlb1055],new,MLB,[cellphon],231975
28753,[climatiz],2019-10-28T13:24:15.749-0400,search,1547526,"[celul, lg, k12, plus, pret, 32gb, 3gb, androi...",719.0,[mlb1055],new,MLB,[cellphon],28753
220300,"[duch, carr]",2019-10-26T11:45:49.433-0400,search,79388,"[lavador, alta, pressã, 1400w, rod, alça, long...",384.9,[mlb120294],new,MLB,"[electric, pressur, washers]",220300
184352,"[capinh, a50]",2019-10-02T21:19:44.726-0400,search,1225041,"[xiaom, redm, not, 7, (, 48, mpx, ), dual, sim...",1566.99,[mlb1055],new,MLB,[cellphon],184352
295863,[1030804],2019-10-23T19:01:41.529-0400,view,187074,"[kit, 4, bermud, masculin, sarj, color, atac, ...",129.0,[mlb188064],new,MLB,[shorts],295863
162552,"[xiaom, redm, 7, dual, sim, 32gb, azul]",2019-10-03T09:34:03.080-0400,search,545438,"[celul, xiaom, redm, 7, 32gb, 4g, capa+películ...",759.0,[mlb1055],new,MLB,[cellphon],162552
406050,[871031],2019-10-22T15:03:27.911-0400,view,871031,"[kit, 10, sai, evangél, god, bab, ret, ofert]",197.0,[mlb185489],new,MLB,[skirts],406050
401008,[615934],2019-10-12T18:44:24.161-0400,view,426665,"[banquet, alta, bistrô, aço, encost, cozinh, a...",89.0,[mlb106882],new,MLB,[stools],401008


##### Lemmatizamos.
No encontramos lemmatizador en portugues.

Se busca la raíz de los verbos con el objetivo de eliminar ambiguedades con las distintas conjugaciones verbales. 

In [122]:
# Definimos una funcion que lemmatizará cada celda

def lemmatize_verbs(cell):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in cell:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

In [123]:
#No hemos aplicado la función. Por el momento no la consideramos importante, ya que segun entendemos,
#busca raices de verbos y no tenemos verbos en nuestros textos. 

#for col in cols_text:
 # final_df2.loc[:,col] = final_df2[col].apply(lemmatize_verbs) 

In [124]:
final_df2.head()

Unnamed: 0_level_0,event_info,event_timestamp,event_type,item_id,title,price,category_id,condition,country,domain,user_id2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
318661,"[mes, comput]",2019-10-01T08:52:47.810-0400,search,722856,"[microfon, fio, jwl, u-585, dupl, uhf, profiss...",565.0,[mlb4469],new,MLB,[microphon],318661
309346,"[pot, vidr, gel]",2019-10-17T20:46:52.931-0400,search,923252,"[selador, vácu, freshpac, sac, plástic, 30, cm...",173.9,[mlb49073],new,MLB,"[industrial, and, commercial, bag, sealers]",309346
231975,[1204536],2019-10-29T07:06:51.495-0400,view,1474739,"[celul, lg, k8, plus, azul, 16gb, 2gb, ram, te...",599.0,[mlb1055],new,MLB,[cellphon],231975
28753,[climatiz],2019-10-28T13:24:15.749-0400,search,1547526,"[celul, lg, k12, plus, pret, 32gb, 3gb, androi...",719.0,[mlb1055],new,MLB,[cellphon],28753
220300,"[duch, carr]",2019-10-26T11:45:49.433-0400,search,79388,"[lavador, alta, pressã, 1400w, rod, alça, long...",384.9,[mlb120294],new,MLB,"[electric, pressur, washers]",220300
