<a href="https://colab.research.google.com/github/mfilipak/AFRAC_IA/blob/main/002_Embeddings1_Portal_da_Transpar%C3%AAncia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EM CONSTRUÇÃO

# Experimento 002
##Objetivo: Experimentos com word embeddings usando como corpo o campo de descrição do dataset público do portal da transparência.
###Descrição: Experimentos iniciais para criação e visualização de word embeddings

Dica: No COLAB Use CTRL SPACE ao invés de TAB para "autocompletar". Ex:pd.re [CTRL SPACE] vai mostrar uma lista contendo as funções e atributos que começam com pd.re (como read_csv, ...) 

In [1]:
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pprint

from gensim.models import Word2Vec
#from sentence_transformers import SentenceTransformer
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#1 - Carga do dataset

In [3]:
#Copia os dados das NFEs do portal da cidadância pro drive virtual.
import requests  
file_url = "https://raw.githubusercontent.com//mfilipak/AFRAC_IA/main/DATASET/202201_NFe_NotaFiscalItem.zip"
r = requests.get(file_url, stream = True) 

with open("portal.zip", "wb") as file:  
    for block in r.iter_content(chunk_size = 1024): 
         if block:  
             file.write(block)

DATA_FILE = "portal.zip"
df = pd.read_csv(DATA_FILE, encoding="CP1252",sep=";")
print("O dataframe completo contém:",len(df),"linhas")

O dataframe completo contém: 324056 linhas


In [4]:
df3 = df[['DATA EMISSÃO','DESCRIÇÃO DO PRODUTO/SERVIÇO', 'CÓDIGO NCM/SH', 'CFOP']]
df3.columns = ["DATA", "DESCR", "NCM", "CFOP"]
df3 = df3[df3["NCM"]!=-1] #Filtrando NCMs = -1

text_lengths = np.array([len(_) for _ in df3['DESCR']])
df3 = df3[text_lengths>=3]

#Caso queira eliminar as repetições rodar a linha abaixo
df3 = df3.drop_duplicates(subset=["DESCR"])

#2 - Carga do text corpus

In [5]:
#No jargão do doc2vec, cada descrição é um document e o conjunto de docs um corpus
#Link: https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py
random.seed(42)
#text_corpus = random.sample(list(df3["DESCR"]), 100)
text_corpus = random.sample(list(df3["DESCR"][:30]), 10)
text_corpus
#Deixar bem curtinho o corpo pra ir arrumando o código e testando os resultados. Pra sequência abaixo fazer mais sentido é bom que tenha alguma repetição de palavras

['GASOLINA COMUM',
 'MLTD203UXAZ CARTUCHO DE TONER PRETO 15K PAGINAS',
 'HP RESERVATORIO DE RESIDUO DE TONER',
 'UVA ITALIA',
 'CLTC603LXAZ CARTUCHO DE TONER CIANO 10K PAGINAS',
 'CARTUCHO DE TONER AMARELO 3.5K PAGINAS',
 'CENOURA',
 'CLTC506LXAZ CARTUCHO DE TONER CIANO 3.5K PAGINAS',
 'PLACA LED ILUMINACAO',
 'OLEO DIESEL B S10 ADITIVADO GRID']

#3 - Determinando as palavras muito frequentes 

In [6]:
all_words = []
for d in list(df3["DESCR"]):
    all_words += d.split()
words_counts = pd.DataFrame(all_words).value_counts()
print("Palavras mais frequentemente encontradas")
print(" ".join([_[0] for _ in words_counts.index[:30]]))

Palavras mais frequentemente encontradas
- DE E PARA X COM 1 EM TIPO de A | DO FILTRO C/ DA MM 2 / PARAFUSO CABO Lote: P/ OLEO 100 Ed O MATERIAL 10 KG


#4 - Seguindo com o tutorial do doc2vec (Que usa o modelo mais simples tf-idf)

In [7]:
stoplist = set('- DE E PARA X COM EM TIPO de A | DO C/ DA / P/ Ed O'.split(' '))

In [8]:
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
#Ignorei esse filtro pois com poucos exemplos iria cortar quase tudo
processed_corpus = [[token for token in text if frequency[token] > 0] for text in texts]
pprint.pprint(processed_corpus)

[['gasolina', 'comum'],
 ['mltd203uxaz', 'cartucho', 'toner', 'preto', '15k', 'paginas'],
 ['hp', 'reservatorio', 'residuo', 'toner'],
 ['uva', 'italia'],
 ['cltc603lxaz', 'cartucho', 'toner', 'ciano', '10k', 'paginas'],
 ['cartucho', 'toner', 'amarelo', '3.5k', 'paginas'],
 ['cenoura'],
 ['cltc506lxaz', 'cartucho', 'toner', 'ciano', '3.5k', 'paginas'],
 ['placa', 'led', 'iluminacao'],
 ['oleo', 'diesel', 'b', 's10', 'aditivado', 'grid']]


In [9]:
processed_corpus[0] = processed_corpus[0]+processed_corpus[0] #Forcei uma palavra repetida na descrição somente para ter um exemplo
processed_corpus

[['gasolina', 'comum', 'gasolina', 'comum'],
 ['mltd203uxaz', 'cartucho', 'toner', 'preto', '15k', 'paginas'],
 ['hp', 'reservatorio', 'residuo', 'toner'],
 ['uva', 'italia'],
 ['cltc603lxaz', 'cartucho', 'toner', 'ciano', '10k', 'paginas'],
 ['cartucho', 'toner', 'amarelo', '3.5k', 'paginas'],
 ['cenoura'],
 ['cltc506lxaz', 'cartucho', 'toner', 'ciano', '3.5k', 'paginas'],
 ['placa', 'led', 'iluminacao'],
 ['oleo', 'diesel', 'b', 's10', 'aditivado', 'grid']]

In [10]:
from gensim import corpora
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(29 unique tokens: ['comum', 'gasolina', '15k', 'cartucho', 'mltd203uxaz']...)


In [11]:
pprint.pprint(dictionary.token2id)

{'10k': 13,
 '15k': 2,
 '3.5k': 16,
 'aditivado': 23,
 'amarelo': 17,
 'b': 24,
 'cartucho': 3,
 'cenoura': 18,
 'ciano': 14,
 'cltc506lxaz': 19,
 'cltc603lxaz': 15,
 'comum': 0,
 'diesel': 25,
 'gasolina': 1,
 'grid': 26,
 'hp': 8,
 'iluminacao': 20,
 'italia': 11,
 'led': 21,
 'mltd203uxaz': 4,
 'oleo': 27,
 'paginas': 5,
 'placa': 22,
 'preto': 6,
 'reservatorio': 9,
 'residuo': 10,
 's10': 28,
 'toner': 7,
 'uva': 12}


In [12]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)
#The first entry in each tuple corresponds to the ID of the token in the dictionary, the second corresponds to the count of this token.

[[(0, 2), (1, 2)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1)],
 [(3, 1), (5, 1), (7, 1), (13, 1), (14, 1), (15, 1)],
 [(3, 1), (5, 1), (7, 1), (16, 1), (17, 1)],
 [(18, 1)],
 [(3, 1), (5, 1), (7, 1), (14, 1), (16, 1), (19, 1)],
 [(20, 1), (21, 1), (22, 1)],
 [(23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]]


In [13]:
new_doc = "gasolina hp hp interaction uva"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)
#Como interaction não existe no dicionário ele não insere o token 

[(1, 1), (8, 2), (12, 1)]


In [14]:
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
words = "gasolina hp CIANO".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(1, 0.633907694445084), (8, 0.633907694445084), (14, 0.44308246393491596)]


#5 - Agora com word2vec
Base: https://github.com/mfilipak/AFRAC_IA/blob/main/Analise_Descricao_versus_NCM.ipynb

In [15]:
random.seed=42
text_corpus = random.sample(list(df3["DESCR"]),50000)
len(text_corpus)
#Deixar bem curtinho o corpo pra ir arrumando o código e testando os resultados. Pra sequência abaixo fazer mais sentido é bom que tenha alguma repetição de palavras

50000

In [16]:
text_corpus[:5]

['MEDIDOR DE CONDILO',
 'PULVERIZADOR BOMBA FLITZ 370ML - GUARANY',
 'MACA ARGENTINA KG',
 'Case para Arduino Uno em MDF',
 'REAGENTE PLACA 96 POCOS OTICA']

#5.1 - Tokeniza

In [17]:
stoplist = set('- DE E PARA X COM EM TIPO de A | DO C/ DA / P/ Ed O'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
#Ignorei esse filtro pois com poucos exemplos iria cortar quase tudo
processed_corpus = [[token for token in text if frequency[token] > 0] for text in texts]
pprint.pprint(processed_corpus[:5])

[['medidor', 'condilo'],
 ['pulverizador', 'bomba', 'flitz', '370ml', 'guarany'],
 ['maca', 'argentina', 'kg'],
 ['case', 'para', 'arduino', 'uno', 'em', 'mdf'],
 ['reagente', 'placa', '96', 'pocos', 'otica']]


#5.2 - Cria modelo

In [18]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(processed_corpus)]

In [19]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        try:
          loss = model.get_latest_training_loss()
          print('Loss after epoch {}: {}'.format(self.epoch, loss))
        except:
          print(f"EXCEPTION - Epoch:{self.epoch}")
        self.epoch += 1

#model = Word2Vec(common_texts, size=100, window=5, min_count=1, 
#                 compute_loss=True, callbacks=[callback()])

In [20]:
#Para 5000 itens e 200 epochs leva cerca de 2min
#Para 50000 e 10 epochs cerca de 1 minuto
model = Doc2Vec(tagged_data, vector_size = 50, window = 2, min_count = 1, epochs = 10, callbacks=[callback()])

EXCEPTION - Epoch:0
EXCEPTION - Epoch:1
EXCEPTION - Epoch:2
EXCEPTION - Epoch:3
EXCEPTION - Epoch:4
EXCEPTION - Epoch:5
EXCEPTION - Epoch:6
EXCEPTION - Epoch:7
EXCEPTION - Epoch:8
EXCEPTION - Epoch:9


In [21]:
[ [k,model.wv.vocab[k]] for k in list(model.wv.vocab.keys())[:10] ] 

[['medidor', <gensim.models.keyedvectors.Vocab at 0x7f9f2fc562d0>],
 ['condilo', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b7190>],
 ['pulverizador', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b71d0>],
 ['bomba', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b7210>],
 ['flitz', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b7450>],
 ['370ml', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b7250>],
 ['guarany', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b7390>],
 ['maca', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b75d0>],
 ['argentina', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b73d0>],
 ['kg', <gensim.models.keyedvectors.Vocab at 0x7f9f2f1b7410>]]

In [22]:
model.wv.most_similar('banana')

[('melancia', 0.9952705502510071),
 ('condimento', 0.9929630160331726),
 ('molho', 0.992384672164917),
 ('linguica', 0.9921910166740417),
 ('toucinho', 0.9918076992034912),
 ('maracuja', 0.9914619326591492),
 ('alcatra', 0.9911345839500427),
 ('cong.', 0.9908254742622375),
 ('suina', 0.9902585744857788),
 ('americana', 0.9899332523345947)]

In [23]:
def print_res(train_sentences, df, test):    
    for t in test:      
        print(t[1], " | Descrição Original: ", df['xProd'].iloc[t[0]], " | NCM: ", df['NCM'].iloc[t[0]])

In [24]:
test_doc = word_tokenize("papel sulfite".lower())
test_doc_vector = model.infer_vector(test_doc)
model.docvecs.most_similar(positive = [test_doc_vector])
test = model.docvecs.most_similar(positive = [test_doc_vector])
#print_res(processed_corpus, df, test)

In [25]:
test

[(14062, 0.9049729704856873),
 (9776, 0.8736184239387512),
 (29021, 0.865247368812561),
 (29780, 0.8649479150772095),
 (36060, 0.8638114333152771),
 (7148, 0.8631036877632141),
 (48947, 0.8609281182289124),
 (42317, 0.8584352731704712),
 (15712, 0.8551925420761108),
 (16211, 0.8550198674201965)]

In [26]:
[[f, text_corpus[i]] for i,f in test]

[[0.9049729704856873, 'FILTR0 DE COMBUSTIVEL SEDMENTADOR'],
 [0.8736184239387512,
  'TINTA ACRILICA COMPONENTES AGUA RESINA ACRILICA PIGMENTOS ORGANICOS E INORGANICOS ASPECTO FISICO LIQUIDO VISCOSO COR BRA'],
 [0.865247368812561, 'PAPEL HIG. FOLHA DUPLA 16X4X30MT QUALITE'],
 [0.8649479150772095, 'TINTA ESMALTE SINTÉTICO BRANCO NEVE 3600ML'],
 [0.8638114333152771,
  'COMPRESSA GAZE, MATERIAL RAYON, LARGURA 7,50 CM, COMPRIMENTO 15 CM, CARACTERISTICAS ADICIONAIS EMBEBIDA EM OLEO DERMOPRO'],
 [0.8631036877632141,
  'CARTUCHO CILINDRO IMPRESSORA HP, ORIGINAL COR PRETA,REFERÊNCIA: W1104A'],
 [0.8609281182289124, 'PAPEL A4 75G C/500 FLS - ARMAZEM DO PAPEL'],
 [0.8584352731704712, 'Papel A4 Mega Paper Resma 500fls'],
 [0.8551925420761108, 'ANTI RESPINGO SEM SILICONE SPRAY 280G MUNDIAL PRIME'],
 [0.8550198674201965,
  'PAPEL DE IMPRESSAO FORMATADO, TAMANHO A4, GRAMATURA 75G, COR COLORIDO, PH ALCALINO']]

In [33]:
model.save("w2vec_pdt_50000")

In [34]:
model2 = Word2Vec.load("w2vec_pdt_50000")

In [31]:
test_doc_vector

array([ 0.01800523, -0.04991013,  0.0185887 , -0.00257981, -0.02103942,
        0.01591977,  0.00751989,  0.00833964, -0.02726729, -0.04064235,
       -0.04550948,  0.00619743,  0.00495163,  0.01858483, -0.00775046,
        0.01305139, -0.01468516,  0.02675515,  0.01028167, -0.03538405,
        0.00446837, -0.00993148, -0.01049118,  0.0071055 ,  0.03721777,
        0.04068387, -0.00681104,  0.0143122 ,  0.02843466,  0.00777871,
       -0.00082526,  0.03841264,  0.03397006, -0.02204213, -0.00027338,
        0.00143366, -0.00556263,  0.01352011, -0.00914467,  0.02682112,
        0.00754324,  0.01171425,  0.05972837,  0.04511961,  0.0012959 ,
        0.01109084, -0.01625376, -0.0004921 , -0.02112934,  0.00968779],
      dtype=float32)

In [32]:
model2.infer_vector(test_doc)

array([ 0.01882276, -0.04699792,  0.03210271, -0.00119155, -0.02704529,
        0.01287987,  0.02143767, -0.00590753, -0.03285823, -0.03793808,
       -0.03860652,  0.00632336,  0.0052625 ,  0.02911218,  0.00118628,
        0.01616548, -0.01733688,  0.0238259 ,  0.01753198, -0.03056025,
        0.00812793, -0.00881012, -0.0065726 ,  0.00220483,  0.04933191,
        0.03196358, -0.00600734,  0.01025085,  0.02780269,  0.01065063,
       -0.01800748,  0.03328613,  0.03862325, -0.01533649,  0.00803987,
       -0.00407112, -0.00162246,  0.0198715 , -0.01594796,  0.03032946,
        0.01261197,  0.01210154,  0.05338154,  0.03924512, -0.00113517,
        0.01921435, -0.02555567,  0.00249498, -0.02810187,  0.0132193 ],
      dtype=float32)