

### Termos de NLP

- Token - um token significa uma 'palavra'.
- Documento - Um documento refere-se a uma frase ou parágrafo.
- Corpus - Refere-se a uma coleção de documentos como um saco de palavras (BoW).
- Dicionário - O objeto de dicionário é usado para criar um corpus de saco de palavras (BoW) que posteriormente é usado como entrada para a modelagem de tópicos e outros modelos também
- BOW - O modelo Bag-Of-Words é uma representação simplificada utilizada no processamento de linguagem natural e na recuperação de informações. Neste modelo, o texto é representado como um multiconjunto de suas palavras, desconsiderando a estrutura gramatical e até mesmo a ordenação delas, mas mantendo sua multiplicidade.

In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
documents = ["kolkata big city india trade", "mumbai financial capital india", "delhi capital india",
       "kolkata capital colonial times",
       "bangalore tech hub india software", "mumbai hub trade commerce stock exchange", "kolkata victoria memorial",
       "delhi india gate",
       "mumbai gate way india trade business", "delhi red fort india", "kolkata metro oldest india",
       "delhi metro largest metro network india"]
print("Documentos = ", documents, "\nQtd de Sentencas = ", len(documents))

Documentos =  ['kolkata big city india trade', 'mumbai financial capital india', 'delhi capital india', 'kolkata capital colonial times', 'bangalore tech hub india software', 'mumbai hub trade commerce stock exchange', 'kolkata victoria memorial', 'delhi india gate', 'mumbai gate way india trade business', 'delhi red fort india', 'kolkata metro oldest india', 'delhi metro largest metro network india'] 
Qtd de Sentencas =  12


In [4]:
tokens = []
for sentence in documents:
    words = nltk.word_tokenize(sentence.lower())
    for word in words:
        tokens.append(word)
print("Tokens = ", tokens, "\nQtd de Tokens = ", len(tokens))

Tokens =  ['kolkata', 'big', 'city', 'india', 'trade', 'mumbai', 'financial', 'capital', 'india', 'delhi', 'capital', 'india', 'kolkata', 'capital', 'colonial', 'times', 'bangalore', 'tech', 'hub', 'india', 'software', 'mumbai', 'hub', 'trade', 'commerce', 'stock', 'exchange', 'kolkata', 'victoria', 'memorial', 'delhi', 'india', 'gate', 'mumbai', 'gate', 'way', 'india', 'trade', 'business', 'delhi', 'red', 'fort', 'india', 'kolkata', 'metro', 'oldest', 'india', 'delhi', 'metro', 'largest', 'metro', 'network', 'india'] 
Qtd de Tokens =  53


In [5]:
vocabulary = []
for token in tokens:
    if token not in vocabulary:
        vocabulary.append(token)
print("Vocabulario = ", vocabulary, "\nQtd de tokens unicos = ", len(vocabulary))

Vocabulario =  ['kolkata', 'big', 'city', 'india', 'trade', 'mumbai', 'financial', 'capital', 'delhi', 'colonial', 'times', 'bangalore', 'tech', 'hub', 'software', 'commerce', 'stock', 'exchange', 'victoria', 'memorial', 'gate', 'way', 'business', 'red', 'fort', 'metro', 'oldest', 'largest', 'network'] 
Qtd de tokens unicos =  29


In [6]:
word2count = {}
for sentence in documents:
    words = nltk.word_tokenize(sentence.lower())
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1
print("Contagem de palavras = ", word2count)

Contagem de palavras =  {'kolkata': 4, 'big': 1, 'city': 1, 'india': 9, 'trade': 3, 'mumbai': 3, 'financial': 1, 'capital': 3, 'delhi': 4, 'colonial': 1, 'times': 1, 'bangalore': 1, 'tech': 1, 'hub': 2, 'software': 1, 'commerce': 1, 'stock': 1, 'exchange': 1, 'victoria': 1, 'memorial': 1, 'gate': 2, 'way': 1, 'business': 1, 'red': 1, 'fort': 1, 'metro': 3, 'oldest': 1, 'largest': 1, 'network': 1}


In [7]:
bow = []
for word in vocabulary:
    aux = []
    for sentence in documents:
        if word in sentence:
            aux.append(1)
        else:
            aux.append(0)
    bow.append(aux)

bow = np.asarray(bow).T
print("Bag of Word\n\n", bow)

Bag of Word

 [[1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1]]


In [8]:
df_bow = pd.DataFrame(
    data=bow,
    index=[f"sentence {i + 1}" for i in range(bow.shape[0])],
    columns=[word for word in vocabulary]
)

In [9]:
word2count_row = pd.Series(data=word2count, name="Count")
df_bow = df_bow.append(word2count_row, ignore_index=False)

In [10]:
df_bow

Unnamed: 0,kolkata,big,city,india,trade,mumbai,financial,capital,delhi,colonial,...,memorial,gate,way,business,red,fort,metro,oldest,largest,network
sentence 1,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sentence 2,0,0,0,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
sentence 3,0,0,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
sentence 4,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
sentence 5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sentence 6,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sentence 7,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
sentence 8,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
sentence 9,0,0,0,1,1,1,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
sentence 10,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0


### Criando bag of word com Sklearn

In [11]:
voca
vectorizer  = CountVectorizer(vocabulary=vocabulario)
word_count = vectorizer .fit_transform(documents)
word_count.shape

NameError: name 'vocabulario' is not defined