<a href="https://colab.research.google.com/github/natanrajch/DiploDatos/blob/main/MELI/MELI_NLP_domain_top_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gzip
import pandas as pd
import urllib
import tarfile
import urllib.request
import numpy as np
import random
import json
import bisect
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from tqdm.notebook import tqdm

In [None]:
# All three files live under the same "2020" prefix of the meli-data-challenge S3 bucket.
_MELI_2020_BASE = "https://meli-data-challenge.s3.amazonaws.com/2020"
url_item_data = _MELI_2020_BASE + "/item_data.jl.gz"
url_train_data = _MELI_2020_BASE + "/train_dataset.jl.gz"
url_test_data = _MELI_2020_BASE + "/test_dataset.jl.gz"

In [None]:
# Stream the gzipped JSON-Lines item catalogue from url_item_data and parse it
# into a list of dicts (one per line).
item_data = []
with urllib.request.urlopen(url_item_data) as handle:
  # Context manager so the decompressor is closed as well as the HTTP response.
  with gzip.GzipFile(fileobj=handle) as gz:
    for line in gz:
      record = line.strip()
      if not record:
        # A blank line would make json.loads raise; skip it.
        continue
      item_data.append(json.loads(record.decode('utf-8')))

In [None]:
# One row per parsed item record.
item_df = pd.DataFrame(item_data)
# Drop the raw record list and the gzip reader; only the DataFrame is used from here on.
del item_data, gz
item_df

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
0,111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,,1150000.00,MLM170527,new
1,871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,15270800,1392.83,MLM151595,new
2,490232,Falda De Imitación Piel Negra,MLM-SKIRTS,,350.00,MLM7697,new
3,1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,,3200.00,MLM9761,used
4,934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,,1599.00,MLM1652,used
...,...,...,...,...,...,...,...
2102272,1099649,Carrinho De Bebê Stoke,MLB-BABY_STROLLERS,,1600.00,MLB1386,used
2102273,1482187,Grelha Para Hambúrguer Preta Com Cabo Em Madei...,MLB-KITCHEN_SUPPLIES,,69.90,MLB193425,new
2102274,1118904,Meia Tam 7/8 Anti Embolia Trombose Antitrombo,MLB-SOCKS,,118.00,MLB108791,new
2102275,237229,Pano De Boca Cremer Menina Luxo Bordado C/3 Und,MLB-DISPOSABLE_BABY_DIAPERS,,26.90,MLB40629,new


In [None]:
# Split the catalogue by the 3-letter prefix of domain_id ('MLM' vs 'MLB').
# Rows whose domain_id is missing match neither prefix and end up in neither frame.
dfm = item_df[item_df.domain_id.str.startswith('MLM', na=False)]
dfb = item_df[item_df.domain_id.str.startswith('MLB', na=False)]

In [None]:
import nltk
# Fetch only the resources this notebook imports instead of nltk.download('all'),
# which pulls every NLTK corpus and model.
#  - 'stopwords': the Spanish / Portuguese stopword lists used for the TF-IDF vectorizer.
#  - 'punkt': tokenizer models for word_tokenize, which the notebook imports
#    (NOTE(review): newer NLTK releases may need 'punkt_tab' instead; confirm if word_tokenize is used).
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Union of the Spanish and Portuguese stopword lists, without duplicates.
sw = list(set(stopwords.words('spanish')) | set(stopwords.words('portuguese')))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF over the titles of every item that has a domain_id.
tfidf = TfidfVectorizer(
  stop_words=sw,
  ngram_range=(1, 2),
  min_df=5,
  sublinear_tf=True,
  norm='l2',
  encoding='utf-8',
)

# Compute the "has a domain" mask once and reuse it for both titles and labels.
has_domain = item_df.domain_id.notna()
features = tfidf.fit_transform(item_df.loc[has_domain, 'title'])
labels = item_df.loc[has_domain, 'domain_id']
features.shape

(2101426, 493117)

In [None]:
# Only for dfb, to see whether it runs faster.
from sklearn.feature_extraction.text import TfidfVectorizer

# Same vectorizer settings as before, fitted on the dfb titles only.
tfidf = TfidfVectorizer(
  stop_words=sw,
  ngram_range=(1, 2),
  min_df=5,
  sublinear_tf=True,
  norm='l2',
  encoding='utf-8',
)

features = tfidf.fit_transform(dfb['title'])
labels = dfb['domain_id']
features.shape

(1723216, 412295)

In [None]:
from sklearn.feature_selection import chi2
from random import sample
domain_list = labels.unique().tolist()
# Debug aid: uncomment to try the loop on two randomly chosen domains first.
# random_sample = sample(domain_list,2)
# print(random_sample)
# input()
N = 10  # how many top unigrams, and how many top bigrams, to keep per domain
domain_top_words = {}

# Loop-invariant work, done once instead of once per domain: the vocabulary
# array and the classification of each term as unigram or bigram.
# get_feature_names() is gone from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())
words_per_term = np.array([len(term.split(' ')) for term in all_feature_names])
is_unigram = words_per_term == 1
is_bigram = words_per_term == 2

for domain_id in tqdm(domain_list):
  # Chi-squared score of every term against the binary target "label == domain_id".
  chi2_scores, _ = chi2(features, labels == domain_id)
  ranked = np.argsort(chi2_scores)  # ascending: the highest-scoring terms come last
  # Filter the ranked indices by n-gram size, then keep the last N of each kind.
  unigrams = all_feature_names[ranked[is_unigram[ranked]][-N:]]
  bigrams = all_feature_names[ranked[is_bigram[ranked]][-N:]]
  domain_top_words[domain_id] = ' '.join(list(unigrams) + list(bigrams))

HBox(children=(FloatProgress(value=0.0, max=4205.0), HTML(value='')))




In [None]:
# Now do dfm. The other one took 3 hours... but this one is much smaller.
from sklearn.feature_extraction.text import TfidfVectorizer

# Same vectorizer settings as before, fitted on the dfm titles only.
tfidf = TfidfVectorizer(
  stop_words=sw,
  ngram_range=(1, 2),
  min_df=5,
  sublinear_tf=True,
  norm='l2',
  encoding='utf-8',
)

features = tfidf.fit_transform(dfm['title'])
labels = dfm['domain_id']
features.shape

(378210, 96465)

In [None]:
from sklearn.feature_selection import chi2
from random import sample
domain_list = labels.unique().tolist()
# Debug aid: uncomment to try the loop on two randomly chosen domains first.
# random_sample = sample(domain_list,2)
# print(random_sample)
# input()
N = 10  # how many top unigrams, and how many top bigrams, to keep per domain

# domain_top_words is deliberately not re-created here: the entries computed
# below are added to the dictionary that already exists in the kernel.

# Loop-invariant work, done once instead of once per domain: the vocabulary
# array and the classification of each term as unigram or bigram.
# get_feature_names() is gone from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())
words_per_term = np.array([len(term.split(' ')) for term in all_feature_names])
is_unigram = words_per_term == 1
is_bigram = words_per_term == 2

for domain_id in tqdm(domain_list):
  # Chi-squared score of every term against the binary target "label == domain_id".
  chi2_scores, _ = chi2(features, labels == domain_id)
  ranked = np.argsort(chi2_scores)  # ascending: the highest-scoring terms come last
  # Filter the ranked indices by n-gram size, then keep the last N of each kind.
  unigrams = all_feature_names[ranked[is_unigram[ranked]][-N:]]
  bigrams = all_feature_names[ranked[is_bigram[ranked]][-N:]]
  domain_top_words[domain_id] = ' '.join(list(unigrams) + list(bigrams))

HBox(children=(FloatProgress(value=0.0, max=3688.0), HTML(value='')))




In [None]:
# One row per domain_id (used as the index) and a single column holding the
# space-joined top terms; written to CSV in the working directory.
domain_top_words_df = pd.DataFrame(
  list(domain_top_words.values()),
  index=list(domain_top_words.keys()),
)
domain_top_words_df.to_csv('domain_top_words.csv')