# Modelagem de Tópicos - Parte II

## Exemplo 1 - LDA básico
    
#### 1 - Limpeza dos dados

In [1]:
import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

#nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma2(token) for token in tokens]
    return tokens

import random
text_data = []
with open('./Data/comments.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .8:
            print(tokens)
            text_data.append(tokens)

['quantitative', 'assured', 'forwarding', 'service']
['business', 'policy', 'modeling', 'enforcement', 'database']
['optimized', 'numerical', 'mapping', 'scheme', 'filter', 'based', 'location', 'using', 'quasi', 'newton', 'algorithm']
['power', 'oriented', 'delay', 'budgeting', 'combinational', 'circuit']
['sensor', 'network', 'navigation', 'without', 'location']
['analysis', 'price', 'competition', 'slotted', 'resource', 'allocation']
['language', 'shading', 'lighting', 'calculation']
['adaptive', 'electricity', 'scheduling', 'microgrids']
['parallelizing', 'query', 'optimization']
['distributed', 'amplifier', 'based', 'dispersive', 'delay']
['simple', 'sybil', 'proof', 'mechanism', 'multi', 'level', 'marketing']
['summary', 'demand', 'query', 'linked']
['compressive', 'sampling', 'signal']
['calm', 'cloud', 'assisted', 'medium', 'streaming', 'globalized', 'demand', 'region', 'diversity']
['analysis', 'power', 'supply', 'bandgap', 'reference']
['framework', 'benchmarking', 'entity', '

['heuristic', 'algorithm', 'joint', 'configuration', 'optical', 'electrical', 'layer', 'multi', 'wavelength', 'routing', 'network']
['stochastic', 'learning', 'algorithm', 'application', 'contextual', 'advertising']
['handling', 'multiprocessor', 'database', 'computer', 'using', 'partition', 'tuning']
['decentralized', 'accurate', 'network', 'bandwidth', 'prediction']
['optimal', 'configuration', 'aggregate']
['polymesh', 'level', 'algorithm']
['model', 'directed', 'transaction', 'constrained', 'modality']
['energy', 'scalable', 'margin', 'propagation', 'based', 'analog', 'support', 'vector', 'machine']
['potocol', 'checker', 'efficient', 'debugging', 'mechanism']
['bandwidth', 'broadband', 'amplifier', 'ethernet', 'application']
['interactive', 'control', 'avatar', 'animated', 'human', 'motion']
['analysis', 'blocking', 'probability', 'noise', 'crosstalk', 'impaired', 'optical', 'network']
['fault', 'aware', 'dynamic', 'routing', 'algorithm', 'network']
['architecture', 'integrating',

['network', 'measurement', 'architecture', 'adaptive', 'application']
['scalable', 'cycle', 'breaking', 'algorithm', 'gigabit', 'ethernet', 'backbone']
['opportunistic', 'routing', 'congestion', 'diversity', 'wireless', 'multi', 'network']
['robust', 'channel', 'digital', 'timing', 'recovery', 'multi', 'format', 'optical']
['multi', 'column', 'foreign', 'discovery']
['novel', 'current', 'fully', 'differential', 'implementation']
['capacitor', 'scaling', 'power', 'design', 'cyclic', 'analog', 'digital', 'converter']
['radialize', 'social', 'listening', 'experience', 'based', 'radio', 'station', 'program']
['modeling', 'change', 'information']
['finding', 'significant', 'difference', 'network', 'stream']
['study', 'consensus', 'complex', 'social', 'network', 'using', 'eigen', 'theory']
['simrank++', 'query', 'rewriting', 'analysis', 'clickgraph', 'poster']
['revenue', 'maximizing', 'pricing', 'capacity', 'expansion', 'user', 'regime']
['ultrafast', 'photonic', 'label', 'switch', 'asynchr

#### 2 - Preparando os dados para o Gensim

In [2]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1)],
 [(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(12, 1), (26, 1), (27, 1), (28, 1), (29, 1)],
 [(30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)],
 [(36, 1), (37, 1), (38, 1), (39, 1)],
 [(40, 1), (41, 1), (42, 1), (43, 1)],
 [(44, 1), (45, 1), (46, 1)],
 [(10, 1), (23, 1), (47, 1), (48, 1), (49, 1)],
 [(50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1)],
 [(46, 1), (57, 1), (58, 1), (59, 1)],
 [(60, 1), (61, 1), (62, 1)],
 [(57, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1)],
 [(25, 1), (31, 1), (71, 1), (72, 1), (73, 1)],
 [(74, 1), (75, 1), (76, 1), (77, 1), (78, 1)],
 [(79, 1), (80, 1), (81, 1)],
 [(9, 1),
  (10, 1),
  (11, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  

#### 3 - Solicitamos para separar em 3 tópicos

In [3]:
import gensim
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.014*"based" + 0.012*"network" + 0.011*"image" + 0.010*"system"')
(1, '0.036*"network" + 0.012*"system" + 0.009*"database" + 0.009*"sensor"')
(2, '0.021*"based" + 0.021*"using" + 0.014*"model" + 0.011*"efficient"')


#### 4 - Classificando um novo documento

In [4]:
new_doc = 'The database stopped working'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(5, 1)]
[(0, 0.16677234), (1, 0.66410553), (2, 0.16912217)]


#### 5 - Visualização LDA

In [8]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

lda_display = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(


## Exemplo 2 - LDA & LSI

#### 6 - Carregando bibliotecas e dados

In [10]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

import pandas as pd

data = pd.read_csv('./Data/abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

documents.head()



  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


#### 7 - Limpeza dos dados

In [11]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


doc_sample = documents[documents['index'] == 10].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)

print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['australia', 'to', 'contribute', '10', 'million', 'in', 'aid', 'to', 'iraq']


 tokenized and lemmatized document: 
['australia', 'contribute', 'million', 'iraq']


In [12]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

#### 8 - Criando um dicionário

In [13]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 community
2 decide
3 licence
4 aware
5 defamation
6 witness
7 call
8 infrastructure
9 protection
10 summit


#### 9 - Filtrando o dicionário

In [14]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#### 10 - Mala de palavras

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(35, 1), (37, 1), (40, 1), (41, 1)]

#### 11 - Executar o LDA usando a mala de palavras

In [16]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=6)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [17]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.022*"council" + 0.020*"market" + 0.019*"queensland" + 0.019*"open" + 0.017*"australian" + 0.016*"rise" + 0.014*"price" + 0.013*"share" + 0.013*"time" + 0.010*"tasmania"
Topic: 1 
Words: 0.029*"charge" + 0.026*"court" + 0.026*"police" + 0.019*"kill" + 0.019*"murder" + 0.018*"attack" + 0.015*"face" + 0.015*"woman" + 0.015*"jail" + 0.014*"accuse"
Topic: 2 
Words: 0.020*"election" + 0.018*"say" + 0.016*"state" + 0.014*"labor" + 0.014*"power" + 0.010*"mine" + 0.010*"hill" + 0.009*"need" + 0.009*"party" + 0.009*"royal"
Topic: 3 
Words: 0.022*"north" + 0.021*"test" + 0.010*"appeal" + 0.008*"say" + 0.008*"damage" + 0.008*"lose" + 0.008*"storm" + 0.008*"right" + 0.008*"korea" + 0.007*"england"
Topic: 4 
Words: 0.024*"change" + 0.020*"miss" + 0.015*"news" + 0.014*"australia" + 0.013*"dead" + 0.012*"search" + 0.012*"final" + 0.011*"beat" + 0.009*"tasmanian" + 0.008*"asylum"
Topic: 5 
Words: 0.031*"interview" + 0.020*"country" + 0.016*"hospital" + 0.016*"hour" + 0.013*"gold" + 0

#### 12 - TF-IDF

In [18]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5955827965774221),
 (1, 0.3948194439634864),
 (2, 0.49764520713864374),
 (3, 0.49167874359312774)]


#### 13 - Executar o LSI com TF-IDF

In [19]:
lda_model_tfidf = gensim.models.LsiModel(corpus_tfidf, num_topics=10, id2word=dictionary)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.572*"police" + 0.329*"charge" + 0.184*"court" + 0.164*"murder" + 0.154*"crash" + 0.151*"face" + 0.112*"kill" + 0.110*"woman" + 0.110*"death" + 0.109*"miss"
Topic: 1 Word: 0.489*"govt" + -0.387*"police" + 0.318*"plan" + 0.198*"council" + 0.197*"urge" + 0.190*"fund" + -0.170*"charge" + 0.130*"water" + 0.114*"say" + 0.105*"boost"
Topic: 2 Word: 0.573*"charge" + -0.512*"police" + 0.334*"court" + 0.326*"face" + 0.219*"murder" + 0.115*"accuse" + -0.112*"miss" + -0.097*"probe" + -0.088*"crash" + -0.087*"search"
Topic: 3 Word: -0.581*"govt" + 0.433*"kill" + 0.331*"interview" + 0.262*"crash" + -0.230*"police" + 0.146*"iraq" + -0.123*"charge" + -0.105*"urge" + 0.099*"rural" + 0.091*"plan"
Topic: 4 Word: -0.856*"interview" + 0.349*"kill" + 0.200*"crash" + 0.094*"iraq" + -0.081*"tsunami" + -0.080*"police" + 0.068*"bomb" + 0.059*"council" + -0.057*"michael" + 0.056*"attack"
Topic: 5 Word: -0.528*"govt" + -0.424*"kill" + 0.320*"plan" + -0.286*"interview" + -0.273*"crash" + 0.212*"co

#### 14 - Classificando uma amostra

In [20]:
for index, score in sorted(lda_model_tfidf[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7864028187610842	 
Topic: 0.717*"iraq" + -0.488*"crash" + -0.129*"die" + 0.129*"troop" + -0.122*"fatal" + 0.095*"attack" + -0.094*"plan" + -0.092*"road" + 0.092*"say" + -0.092*"council"

Score: 0.2080020004613419	 
Topic: -0.581*"govt" + 0.433*"kill" + 0.331*"interview" + 0.262*"crash" + -0.230*"police" + 0.146*"iraq" + -0.123*"charge" + -0.105*"urge" + 0.099*"rural" + 0.091*"plan"

Score: 0.12075459439896494	 
Topic: 0.489*"govt" + -0.387*"police" + 0.318*"plan" + 0.198*"council" + 0.197*"urge" + 0.190*"fund" + -0.170*"charge" + 0.130*"water" + 0.114*"say" + 0.105*"boost"

Score: 0.090950570587123	 
Topic: -0.856*"interview" + 0.349*"kill" + 0.200*"crash" + 0.094*"iraq" + -0.081*"tsunami" + -0.080*"police" + 0.068*"bomb" + 0.059*"council" + -0.057*"michael" + 0.056*"attack"

Score: 0.08620637705031027	 
Topic: 0.572*"police" + 0.329*"charge" + 0.184*"court" + 0.164*"murder" + 0.154*"crash" + 0.151*"face" + 0.112*"kill" + 0.110*"woman" + 0.110*"death" + 0.109*"miss"

Score: 0

## Exemplo 3 - Sklearn

#### 15 - Carregando corpus

In [None]:
from nltk.corpus import brown
 
data = []
 
for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)
 
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

#### 16 - Preparando a mala de palavras

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
  
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized = vectorizer.fit_transform(data)

#### 17 - Executando modelagem de tópicos com NMF

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

 
NUM_TOPICS = 10
 
# Build a Latent Dirichlet Allocation Model
#lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
#lda_Z = lda_model.fit_transform(data_vectorized)
#print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
#lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
#lsi_Z = lsi_model.fit_transform(data_vectorized)
#print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

In [None]:
nmf_Z[0]

#### 18 - Inspecionando tópicos NMF

In [None]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

#### 19 - Classificando um documento não visto

In [None]:
text = "The economy is working better than ever"
x = nmf_model.transform(vectorizer.transform([text]))[0]
print(x)