# Tópicos

In [1]:
import logging

# Show INFO-level log records, timestamped, in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from gensim import corpora

# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

2018-08-06 15:56:13,234 : INFO : 'pattern' package not found; tag filters are not available for English


In [17]:
# Sanity check: number of documents in the toy corpus.
len(documents)

9

In [3]:
from collections import defaultdict
from pprint import pprint  # pretty-printer

# Tokenize: lowercase, split on whitespace, drop common (stop) words.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Count how often each token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Remove words that appear only once.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [4]:
# Build the token -> integer id mapping from the filtered texts.
dictionary = corpora.Dictionary(texts)

# Store the dictionary on disk, for future reference.
dictionary.save('/tmp/deerwester.dict')

print(dictionary)

2018-08-06 15:58:30,805 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-06 15:58:30,809 : INFO : built Dictionary(12 unique tokens: ['trees', 'graph', 'survey', 'response', 'interface']...) from 9 documents (total 29 corpus positions)
2018-08-06 15:58:30,811 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2018-08-06 15:58:30,813 : INFO : saved /tmp/deerwester.dict


Dictionary(12 unique tokens: ['trees', 'graph', 'survey', 'response', 'interface']...)


In [5]:
# Show which integer id each token was assigned.
print(dictionary.token2id)

{'trees': 9, 'graph': 10, 'survey': 4, 'response': 3, 'interface': 2, 'user': 7, 'time': 6, 'human': 1, 'minors': 11, 'system': 5, 'computer': 0, 'eps': 8}


In [6]:
# Vectorize an unseen document with the existing dictionary.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())

# The word "interaction" does not appear in the dictionary and is ignored.
print(new_vec)

[(0, 1), (1, 1)]


In [8]:
# Convert every tokenized document into its bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]

# Store to disk (Matrix Market format), for later use.
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

pprint(corpus)

2018-08-06 15:59:12,714 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2018-08-06 15:59:12,717 : INFO : saving sparse matrix to /tmp/deerwester.mm
2018-08-06 15:59:12,719 : INFO : PROGRESS: saving document #0
2018-08-06 15:59:12,721 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2018-08-06 15:59:12,722 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [10]:
# Disabled cell: when enabled it reloads the dictionary and corpus saved under
# /tmp instead of reusing the in-memory objects.
# NOTE(review): the `from gensim import corpora, models, similarities` line below
# is commented out too, so cells that use `models` cannot rely on this cell for
# the import.
# import os
# >>> from gensim import corpora, models, similarities
# >>> if (os.path.exists("/tmp/deerwester.dict")):
# >>>    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
# >>>    corpus = corpora.MmCorpus('/tmp/deerwester.mm')
# >>>    print("Used files generated from first tutorial")
# >>> else:
# >>>    print("Please run first tutorial to generate data set")

In [11]:
# `models` has to be imported in this cell: the only other import of it sits in
# the commented-out cell above, so on a fresh kernel ("Restart & Run All") the
# TfidfModel line would otherwise raise NameError.
from gensim import models

# step 1 -- initialize a model
tfidf = models.TfidfModel(corpus)

2018-08-06 16:01:07,401 : INFO : collecting document frequencies
2018-08-06 16:01:07,405 : INFO : PROGRESS: processing document #0
2018-08-06 16:01:07,410 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [12]:
# Same bag-of-words vector as `new_vec` ("Human computer interaction").
doc_bow = [(0, 1), (1, 1)]

# step 2 -- use the model to transform vectors
print(tfidf[doc_bow])

[(0, 0.70710678118654757), (1, 0.70710678118654757)]


In [13]:
# Apply the TF-IDF model to the whole corpus and print each transformed document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548)]
[(2, 0.5710059809418182), (5, 0.41707573620227772), (7, 0.41707573620227772), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.71848116070837686), (8, 0.49182558987264147)]
[(3, 0.62825804686700459), (6, 0.62825804686700459), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.70710678118654746), (10, 0.70710678118654746)]
[(9, 0.50804290089167492), (10, 0.50804290089167492), (11, 0.69554641952003704)]
[(4, 0.62825804686700459), (10, 0.45889394536615247), (11, 0.62825804686700459)]


In [37]:
# Initialize an LSI transformation with 3 topics on top of the TF-IDF vectors.
lsi = models.LsiModel(corpus_tfidf, num_topics=3, id2word=dictionary)

# Create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi.
corpus_lsi = lsi[corpus_tfidf]

2018-08-06 16:29:28,516 : INFO : using serial LSI version on this node
2018-08-06 16:29:28,521 : INFO : updating model with new documents
2018-08-06 16:29:28,526 : INFO : preparing a new chunk of documents
2018-08-06 16:29:28,529 : INFO : using 100 extra samples and 2 power iterations
2018-08-06 16:29:28,531 : INFO : 1st phase: constructing (12, 103) action matrix
2018-08-06 16:29:28,533 : INFO : orthonormalizing (12, 103) action matrix
2018-08-06 16:29:28,536 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-08-06 16:29:28,539 : INFO : computing the final decomposition
2018-08-06 16:29:28,541 : INFO : keeping 3 factors (discarding 31.801% of energy spectrum)
2018-08-06 16:29:28,542 : INFO : processed documents up to #9
2018-08-06 16:29:28,543 : INFO : topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
2018-08-06 16:29:28,545 : INFO

In [38]:
# Confirm how many topics the LSI model holds.
lsi.num_topics

3

In [39]:
# Show the three topics as weighted combinations of terms.
lsi.print_topics(3)

2018-08-06 16:29:34,648 : INFO : topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
2018-08-06 16:29:34,656 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"
2018-08-06 16:29:34,657 : INFO : topic #2(1.191): 0.456*"time" + 0.456*"response" + -0.352*"eps" + -0.340*"human" + -0.318*"interface" + -0.277*"system" + 0.272*"survey" + 0.213*"user" + -0.183*"trees" + 0.114*"minors"


[(0,
  '-0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"'),
 (2,
  '0.456*"time" + 0.456*"response" + -0.352*"eps" + -0.340*"human" + -0.318*"interface" + -0.277*"system" + 0.272*"survey" + 0.213*"user" + -0.183*"trees" + 0.114*"minors"')]

In [40]:
# Both the bow->tfidf and tfidf->lsi transformations are actually executed
# here, on the fly, as the wrapped corpus is iterated.
for doc in corpus_lsi:
    print(doc)

[(0, -0.066007833960905093), (1, -0.52007033063618413), (2, -0.37649581219168937)]
[(0, -0.19667592859142732), (1, -0.76095631677000486), (2, 0.50806745810016563)]
[(0, -0.089926399724467088), (1, -0.72418606267525032), (2, -0.40898973155376483)]
[(0, -0.075858476521784221), (1, -0.632055158600342), (2, -0.53935336057338967)]
[(0, -0.1015029918498031), (1, -0.57373084830029608), (2, 0.67093385852958987)]
[(0, -0.70321089393783032), (1, 0.1611518021402604), (2, -0.18266089635241509)]
[(0, -0.87747876731198238), (1, 0.16758906864659689), (2, -0.10880822642632884)]
[(0, -0.90986246868185705), (1, 0.14086553628719281), (2, 0.00087117874886882829)]
[(0, -0.61658253505692806), (1, -0.053929075663892309), (2, 0.25568697959599385)]
