# Descrição x NCM


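Identificar o código NCM (Nomenclatura Comum do Mercosul) de um produto a partir da sua descrição, buscando as descrições mais semelhantes com um modelo Doc2Vec.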

In [1]:
import pandas as pd
import numpy as np
import warnings

from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# `warnings` is already imported in the imports cell; this second import is redundant but harmless.
import warnings
# Install a blanket "ignore" filter so no warning is displayed in later cells.
# NOTE(review): this also hides deprecation warnings from the libraries used below —
# consider narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

In [3]:
# Notebook-wide pandas display settings (they affect rendering only, not the data).
# Show up to 64 columns and up to 100000 rows before pandas truncates the output.
pd.set_option('display.max_columns', 64)
pd.set_option('display.max_rows', 100000)
# Render floats in fixed-point notation instead of scientific notation.
pd.set_option('display.float_format', '{:f}'.format)
# Never truncate the text shown inside a cell (product descriptions can be long).
pd.set_option('display.max_colwidth', None)

In [4]:
def convert_lower_case(data):
    """Lowercase text element-wise with ``np.char.lower``.

    Args:
        data: A string or an array-like of strings.

    Returns:
        numpy.ndarray: The lowercased text (a 0-d array when ``data`` is a
        single string).
    """
    lowered = np.char.lower(data)
    return lowered

In [5]:
def remove_punctuation(data):
    """Replace punctuation and selected non-ASCII characters with spaces; drop commas.

    Every character listed in ``symbols`` is replaced by a single space, and
    after each replacement one pass of double-space collapsing is applied.
    Commas are handled last and are removed outright (no space is inserted),
    so ``"1,5"`` becomes ``"15"``.

    Args:
        data: A string or an array-like of strings.

    Returns:
        numpy.ndarray: The cleaned text (a 0-d array when ``data`` is a single
        string).
    """
    # NOTE(review): the leading non-ASCII characters look like encoding
    # artifacts (mojibake) present in the input descriptions — confirm. Be aware
    # that some of them are also legitimate Portuguese letters, which are
    # therefore stripped from correctly encoded words as well.
    # The backslash is written as "\\" so the literal has no invalid escape sequence.
    symbols = "ãƒâ‰‡!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse the double spaces that consecutive replacements may create.
        data = np.char.replace(data, "  ", " ")
    # Commas are deleted rather than replaced by a space.
    data = np.char.replace(data, ',', '')
    return data

In [6]:
def remove_stop_words(data,stop_words):
    """Drop stop words and one-character tokens from a text.

    The text is converted with ``str`` and split with ``word_tokenize``; a
    token is kept when it is not in ``stop_words`` and is longer than one
    character.

    Args:
        data: The text to filter (converted with ``str`` before tokenizing).
        stop_words: Container of tokens to discard (membership is tested with
            ``in``).

    Returns:
        str: The kept tokens, each one preceded by a single space (so a
        non-empty result starts with a space); ``""`` when nothing is kept.
    """
    kept_tokens = [
        token
        for token in word_tokenize(str(data))
        if token not in stop_words and len(token) > 1
    ]
    return "".join(" " + token for token in kept_tokens)

In [7]:
def remove_apostrophe(data):
    """Delete every apostrophe (``'``) from the text, element-wise.

    Args:
        data: A string or an array-like of strings.

    Returns:
        numpy.ndarray: The text without apostrophes (a 0-d array when ``data``
        is a single string).
    """
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

In [8]:
def preprocess(data):
    """Clean a product description before tokenization.

    Steps, in order: lowercase the text, replace punctuation with spaces
    (commas are removed inside ``remove_punctuation``), delete apostrophes,
    then drop Portuguese stop words and one-character tokens.

    Args:
        data: The text to clean (a string or an array-like of strings accepted
            by the ``np.char`` helpers).

    Returns:
        str: The string built by ``remove_stop_words`` from the kept tokens.
    """
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    # The stop-word list is loaded once and cached on the function object:
    # this function runs once per DataFrame row, and reloading the corpus on
    # every call is wasted work. A frozenset also makes membership tests O(1).
    stop_words = getattr(preprocess, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('portuguese'))
        preprocess._stop_words = stop_words
    return remove_stop_words(data, stop_words)

## Carga e limpeza dos dados

In [9]:
# Load the CSV of invoice items.
df = pd.read_csv('./notas_doadas.csv', sep=',', encoding='utf-8')
# Build the cleaned description used for training from the raw 'xProd' text.
df['descricao_limpa'] = df['xProd'].map(lambda description: preprocess(description.lower()))
# Keep a single row per cleaned description. The index is not reset, so later
# lookups into this frame must be positional (.iloc).
df = df.drop_duplicates(subset=['descricao_limpa'])
df.head()

Unnamed: 0.1,Unnamed: 0,CFOP,NCM,cEAN,cEANTrib,cProd,indTot,qCom,qTrib,uCom,uTrib,vProd,vUnCom,vUnTrib,xProd,comb,CEST,nItemPed,indEscala,EXTIPI,xPed,vDesc,cBenef,nFCI,med,vOutro,vFrete,rastro,descricao_limpa
0,0,5102,64019200,0,0,1,1,2.0,2.0,UN,UN,107.0,53.5,53.5,PAR DE SAPATO DE SEGURANÃƒÂ‡A MASCULINO NÃ‚Âº 39,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,par sapato seguran masculino 39
1,1,5102,64019200,0,0,2,1,2.0,2.0,UN,UN,107.0,53.5,53.5,PAR DE SAPATO DE SEGURANÃƒÂ‡A NÃ‚Âº 40,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,par sapato seguran 40
2,2,5102,64034000,0,0,3,1,2.0,2.0,UN,UN,240.0,120.0,120.0,PAR DE COTURNO DE CADARÃƒÂ‡O NÃ‚Âº 40,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,par coturno cadar 40
3,3,5102,39233000,0,0,4,1,5.0,5.0,UN,UN,150.0,30.0,30.0,GARRAFA TÃƒÂ‰RMICA 5 LT,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,garrafa rmica lt
4,4,5102,20079990,0,0,1609,1,20.0,20.0,UN,UN,26.8,1.34,1.34,BANANA C/ D. LEITE,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,banana leite


## Tokenização

In [10]:
# Cleaned descriptions as a plain list, in the same order as the rows of df.
train_sentences = df['descricao_limpa'].str.lower().tolist()
# One token list per description; list position i corresponds to df.iloc[i].
tokenized_sent = [word_tokenize(sentence.lower()) for sentence in train_sentences]
tokenized_sent[:5]

[['par', 'sapato', 'seguran', 'masculino', '39'],
 ['par', 'sapato', 'seguran', '40'],
 ['par', 'coturno', 'cadar', '40'],
 ['garrafa', 'rmica', 'lt'],
 ['banana', 'leite']]

## Cria Modelo

In [11]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Tag each tokenized description with its position in tokenized_sent, so a
# document tag returned by the model can be used as a positional row lookup.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_sent)
]


In [12]:
# Train Doc2Vec on the tagged descriptions: 50-dimensional vectors, a context
# window of 2 tokens, every token kept (min_count=1), 200 training epochs.
# NOTE(review): no seed is passed, so results are presumably not reproducible
# between runs — confirm whether `seed=` (with `workers=1`) should be set.
model = Doc2Vec(
    documents=tagged_data,
    vector_size=50,
    window=2,
    min_count=1,
    epochs=200,
)
# Display the learned vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim 3.x API — verify before
# upgrading gensim.
model.wv.vocab

{'par': <gensim.models.keyedvectors.Vocab at 0x20b7e8b7f88>,
 'sapato': <gensim.models.keyedvectors.Vocab at 0x20b7e8b96c8>,
 'seguran': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9708>,
 'masculino': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9748>,
 '39': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9808>,
 '40': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9888>,
 'coturno': <gensim.models.keyedvectors.Vocab at 0x20b7e8b98c8>,
 'cadar': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9908>,
 'garrafa': <gensim.models.keyedvectors.Vocab at 0x20b7e8b97c8>,
 'rmica': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9848>,
 'lt': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9948>,
 'banana': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9988>,
 'leite': <gensim.models.keyedvectors.Vocab at 0x20b7e8b99c8>,
 'bolinho': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9a08>,
 'aipim': <gensim.models.keyedvectors.Vocab at 0x20b7e8b9a48>,
 'cachorrinho': <gensim.models.keyedvectors.Vocab at

## Testes e Análises

In [13]:
# Sanity check on the word vectors: list the vocabulary words most similar to 'leite'.
model.wv.most_similar('leite')

[('bis', 0.961297869682312),
 ('lacta', 0.9604376554489136),
 ('elege', 0.9524224996566772),
 ('126g', 0.9462490081787109),
 ('banana', 0.9457299113273621),
 ('camponesa', 0.9351269006729126),
 ('bombinha', 0.9139132499694824),
 ('430ml', 0.8857569694519043),
 ('folhado', 0.881889283657074),
 ('erva', 0.8813177347183228)]

In [14]:
def print_res(train_sentences, df, test):
    """Print one line per similarity hit: score, original description and NCM.

    Args:
        train_sentences: Not used by this function; kept so existing calls
            keep working.
        df: DataFrame with 'xProd' and 'NCM' columns; rows are looked up by
            position (``.iloc``).
        test: Iterable of hits where ``hit[0]`` is a row position in ``df``
            and ``hit[1]`` is the value printed first (the similarity score
            in this notebook).
    """
    for hit in test:
        row_position, score = hit[0], hit[1]
        original_description = df['xProd'].iloc[row_position]
        ncm_code = df['NCM'].iloc[row_position]
        print(score, " | Descrição Original: ", original_description, " | NCM: ", ncm_code)

In [15]:
# Query: clean the free-text description with the same pipeline used on the
# training data, then tokenize it.
test_doc = word_tokenize(preprocess("sapato masculino"))
# Infer a document vector for the query tokens.
test_doc_vector = model.infer_vector(test_doc)
# Retrieve the most similar training documents (a single lookup is enough).
test = model.docvecs.most_similar(positive = [test_doc_vector])
print_res(train_sentences, df, test)

0.8891231417655945  | Descrição Original:  PAR DE SAPATO DE SEGURANÃƒÂ‡A NÃ‚Âº 40  | NCM:  64019200
0.8835933208465576  | Descrição Original:  PAR DE SAPATO DE SEGURANÃƒÂ‡A MASCULINO NÃ‚Âº 39  | NCM:  64019200
0.8347089886665344  | Descrição Original:  SAPATO DE PREGO  | NCM:  64022000
0.816364049911499  | Descrição Original:  PAR DE COTURNO DE CADARÃƒÂ‡O NÃ‚Âº 40  | NCM:  64034000
0.7463095784187317  | Descrição Original:  CAIXA SAPATO PQ     60000 ORDENE  | NCM:  39249000
0.7360522747039795  | Descrição Original:  CAIXA SAPATO MD     60200 ORDENE  | NCM:  39249000
0.7114656567573547  | Descrição Original:  PARES DE CAIXA ACUSTICA CLARITY CL 100P  | NCM:  85182200
0.7097424268722534  | Descrição Original:  ACABAMENTO VP 40 .1/2 3/4 1 ABS  | NCM:  84819010
0.705060601234436  | Descrição Original:  CERATO 40%  | NCM:  30043290
0.7037459015846252  | Descrição Original:  CAIXA DE BEGÃƒÂ”NIAS COM 15 MUDAS  | NCM:  6039000


In [16]:
# Query: clean the free-text description with the same pipeline used on the
# training data, then tokenize it.
test_doc = word_tokenize(preprocess("creme de leite camponesa tp"))
# Infer a document vector for the query tokens.
test_doc_vector = model.infer_vector(test_doc)
# Retrieve the most similar training documents (a single lookup is enough).
test = model.docvecs.most_similar(positive = [test_doc_vector])
print_res(train_sentences, df, test)

0.9520816206932068  | Descrição Original:  CREME DE LEITE CAMPONESA TP 200G  | NCM:  4015021
0.9366070628166199  | Descrição Original:  BANANA C/ D. LEITE  | NCM:  20079990
0.9262491464614868  | Descrição Original:  LEITE ELEGE  | NCM:  4021010
0.9195842146873474  | Descrição Original:  BIS LACTA AO LEITE          126G  | NCM:  19053200
0.8897718787193298  | Descrição Original:  CROISSANT CHOCOLATE AO LEITE CT  | NCM:  19022000
0.8764390349388123  | Descrição Original:  BRIG GOURMET AO LEITE  | NCM:  20079990
0.8556551933288574  | Descrição Original:  BOMBINHA DOCE LEITE  | NCM:  20079990
0.8440322279930115  | Descrição Original:  BOMBINHA CHOCOLATE  | NCM:  20079990
0.8403213024139404  | Descrição Original:  LEITE FLUIDO  | NCM:  4022110
0.8291023373603821  | Descrição Original:  ROCAMBOLE DOCE DE LEITE  | NCM:  20079990
