# Tópicos

In [1]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from collections import Counter

import numpy as np
from gensim import corpora, models

## Loading data

In [3]:
def _read_text(path):
    """Return the full contents of the text file at ``path``.

    The encoding is passed explicitly because the texts contain accented
    Spanish characters and ``open`` otherwise falls back to a
    platform-dependent default.
    NOTE(review): assumes the cleaned files were written as UTF-8 -- confirm.
    """
    with open(path, encoding='utf-8') as f:
        return f.read()


# One string per cleaned volume; the variable names are used by later cells.
doc1 = _read_text('data/out/cleaned_tomo_1.txt')
doc5a = _read_text('data/out/cleaned_tomo_5a.txt')
doc5b = _read_text('data/out/cleaned_tomo_5b.txt')
doc7 = _read_text('data/out/cleaned_tomo_7.txt')

## Cleaning data

In [4]:
# One entry per loaded volume, in the order tomo 1, 5a, 5b, 7.
documents = [doc1, doc5a, doc5b, doc7]

In [5]:
# Concatenate all volumes into a single string so that word counts can be
# taken over the whole collection rather than per volume.
document = ' '.join(documents)

In [6]:
# Tally how often each whitespace-separated token occurs in the combined text.
counter = Counter()
counter.update(document.split())

In [7]:
# Tokenise each volume and keep only the tokens that occur more than once in
# the whole collection (tokens seen a single time are dropped).
texts = [
    [token for token in volume.split() if counter[token] > 1]
    for volume in documents
]

## Processing data

### LSI (Latent Semantic Indexing)

In [8]:
# Build the gensim vocabulary from the filtered token lists.
dictionary = corpora.Dictionary(texts)

In [9]:
# Summarise the vocabulary (number of unique tokens and a small sample).
print(dictionary)

Dictionary(10541 unique tokens: ['miseria', 'ucrós', 'destrucción', 'enfrentamiento', 'agradecer']...)


In [10]:
# Encode every token list as a bag-of-words vector using the dictionary.
corpus = list(map(dictionary.doc2bow, texts))

In [11]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [12]:
# Summarise the fitted TF-IDF model (document count and non-zero entries).
print(tfidf)

TfidfModel(num_docs=4, num_nnz=26861)


In [13]:
# Apply the TF-IDF weighting to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]

In [14]:
# Number of topics requested from the LSI model fitted in the next cell.
# N_TOPICS is rebound again before each later model.
N_TOPICS = 4

In [15]:
# Initialise an LSI transformation on top of the TF-IDF vectors.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=N_TOPICS,
)
# Wrap the TF-IDF corpus with the fitted model; the projection itself is
# carried out when the wrapped corpus is iterated.
corpus_lsi = lsi[corpus_tfidf]

In [16]:
# Show the number of topics held by the fitted LSI model.
lsi.num_topics

4

In [17]:
# List every LSI topic as a signed, weighted combination of its top terms.
lsi.print_topics(lsi.num_topics)

[(0,
  '0.262*"cbm" + 0.126*"piloto" + 0.118*"exploratorio" + 0.108*"descontaminación" + 0.102*"ubpd" + 0.098*"panel" + 0.096*"sexual" + 0.095*"orejón" + 0.089*"informes" + 0.084*"esclarecer"'),
 (1,
  '0.382*"exploratorio" + 0.240*"henry" + 0.217*"epa" + 0.201*"secreto" + -0.171*"panel" + 0.145*"preparatorio" + 0.137*"dígales" + 0.137*"calarcá" + -0.135*"paneles" + 0.121*"exploratorias"'),
 (2,
  '-0.230*"panel" + -0.228*"exploratorio" + -0.181*"paneles" + -0.147*"henry" + -0.134*"mesas" + -0.131*"epa" + -0.129*"socialización" + -0.121*"secreto" + -0.121*"gráfico" + 0.120*"cbm"'),
 (3,
  '0.405*"cbm" + -0.137*"piloto" + -0.135*"descontaminación" + -0.122*"sentencias" + -0.116*"hdc" + 0.115*"cdv" + -0.109*"ubpd" + -0.108*"orejón" + -0.103*"apn" + 0.103*"victimizante"')]

In [18]:
# Print the LSI vector of each volume as (topic id, weight) pairs.
# Both the bow->tfidf and the tfidf->lsi transformations are executed here,
# on the fly, while the wrapped corpus is iterated.
for doc in corpus_lsi:
    print(doc)

[(0, 0.34204791329787609), (1, 0.81477063176363063), (2, -0.46733697479608949), (3, 0.027353146300071757)]
[(0, 0.74145425742050486), (1, -0.11152117545813098), (2, 0.23379310000824158), (3, 0.61899062833455898)]
[(0, 0.70775978189230526), (1, 0.059047027532580176), (2, 0.41165320402068289), (3, -0.57107896064801955)]
[(0, 0.43531499353014075), (1, -0.54625698387884414), (2, -0.70029022373322314), (3, -0.14730161748982434)]


In [19]:
# LSI vector of the first entry of `documents` (tomo 1).
corpus_lsi[0]

[(0, 0.34204791329787609),
 (1, 0.81477063176363063),
 (2, -0.46733697479608949),
 (3, 0.027353146300071757)]

### RP (Random Projections)

In [20]:
# Number of dimensions for the random projection fitted in the next cell.
# This rebinds N_TOPICS to the same value already used for LSI.
N_TOPICS = 4

In [21]:
# The RP projection matrix is made of random +1/-1 entries, so without a fixed
# seed every run of this cell yields different document vectors.
# NOTE(review): RpModel is called without any seed argument; it is assumed to
# draw from NumPy's global RNG, so seeding that RNG right before fitting pins
# the projection -- confirm against the installed gensim version.
np.random.seed(42)
rp = models.RpModel(corpus_tfidf, num_topics=N_TOPICS)
# Wrap the TF-IDF corpus with the fitted random projection.
corpus_rp = rp[corpus_tfidf]

In [22]:
# Show the number of dimensions held by the fitted RP model.
rp.num_topics

4

In [23]:
# Inspect the projection matrix used by the RP model.
rp.projection

array([[-1., -1., -1., ...,  1.,  1., -1.],
       [ 1., -1.,  1., ...,  1.,  1., -1.],
       [ 1.,  1., -1., ..., -1., -1.,  1.],
       [ 1.,  1.,  1., ..., -1., -1., -1.]], dtype=float32)

In [24]:
# Print the projected vector of each volume as (dimension id, value) pairs.
for doc in corpus_rp:
    print(doc)

[(0, -0.49495112895965576), (1, 0.017419002950191498), (2, -0.8641291260719299), (3, -0.6086920499801636)]
[(0, -0.20614978671073914), (1, -0.15201213955879211), (2, 0.22678709030151367), (3, 0.7231939435005188)]
[(0, -0.7200427055358887), (1, 0.11906775832176208), (2, -0.7675713300704956), (3, 0.2852909564971924)]
[(0, 0.3205200433731079), (1, 0.3057404160499573), (2, 1.2161310911178589), (3, 0.20154863595962524)]


### LDA (Latent Dirichlet Allocation)

In [25]:
# Number of topics for the LDA model fitted in the next cell.
# This rebinds the N_TOPICS value used by the earlier LSI and RP cells.
N_TOPICS = 2

In [26]:
# LDA training is stochastic; pinning `random_state` makes the topics and the
# per-document weights repeatable across kernel restarts.
# The model is fitted on the raw bag-of-words corpus, not the TF-IDF one.
lda = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=N_TOPICS,
    random_state=42,
)
# Wrap the bag-of-words corpus with the fitted model to obtain, for each
# volume, its (topic id, weight) pairs.
corpus_lda = lda[corpus]

In [27]:
# Show the number of topics held by the fitted LDA model.
lda.num_topics

2

In [28]:
# List the LDA topics as weighted combinations of their top terms.
lda.print_topics()

[(0,
  '0.016*"paz" + 0.011*"víctimas" + 0.009*"conflicto" + 0.009*"gobierno" + 0.008*"farc-ep" + 0.007*"proceso" + 0.007*"acuerdo" + 0.007*"nacional" + 0.006*"mesa" + 0.005*"participación"'),
 (1,
  '0.015*"paz" + 0.010*"víctimas" + 0.009*"conflicto" + 0.007*"farc-ep" + 0.007*"gobierno" + 0.007*"nacional" + 0.006*"acuerdo" + 0.006*"colombia" + 0.006*"mujeres" + 0.006*"conversaciones"')]

In [29]:
# Print the topic weights of each volume as (topic id, weight) pairs.
for doc in corpus_lda:
    print(doc)

[(0, 0.63620955), (1, 0.36379039)]
[(0, 0.50800985), (1, 0.49199018)]
[(0, 0.74987602), (1, 0.25012398)]
[(0, 0.27493426), (1, 0.72506577)]


### HDP (Hierarchical Dirichlet Process)

In [30]:
# NOTE(review): the HdpModel call in the next cell does not pass N_TOPICS, so
# this assignment has no effect on the HDP fit shown in this section.
N_TOPICS = 4

In [31]:
# HDP training is stochastic; pinning `random_state` makes the topics and the
# per-document assignments repeatable across kernel restarts.
# No topic count is passed to the model here.
hdp = models.HdpModel(corpus, id2word=dictionary, random_state=42)
# Wrap the bag-of-words corpus with the fitted model to obtain, for each
# volume, its (topic id, weight) pairs.
corpus_hdp = hdp[corpus]

In [32]:
# List the HDP topics as weighted combinations of their top terms.
hdp.print_topics()

[(0,
  '0.017*paz + 0.012*conflicto + 0.012*víctimas + 0.009*gobierno + 0.009*farc-ep + 0.008*acuerdo + 0.007*verdad + 0.006*ciclo + 0.006*proceso + 0.006*justicia'),
 (1,
  '0.019*paz + 0.015*mujeres + 0.012*nacional + 0.011*participación + 0.009*mesa + 0.009*organizaciones + 0.009*género + 0.008*foro + 0.007*conversaciones + 0.007*acuerdo'),
 (2,
  '0.019*víctimas + 0.014*paz + 0.013*conflicto + 0.008*farc-ep + 0.008*gobierno + 0.007*proceso + 0.007*ciclo + 0.006*mesa + 0.006*colombia + 0.006*derechos'),
 (3,
  '0.013*gobierno + 0.012*paz + 0.012*farc-ep + 0.008*acuerdo + 0.008*conflicto + 0.007*colombia + 0.007*proceso + 0.006*presidente + 0.006*encuentro + 0.005*país'),
 (4,
  '0.001*uniforme + 0.001*exper + 0.001*medición + 0.001*étnica + 0.000*surgir + 0.000*fácilmente + 0.000*promoción + 0.000*elige + 0.000*benedetti + 0.000*inimaginables'),
 (5,
  '0.001*ferviente + 0.001*definido + 0.001*relatores + 0.001*aproximaciones + 0.000*ejecutiva + 0.000*pnis + 0.000*capacitar + 0.000*

In [33]:
# Print the topic weights of each volume as (topic id, weight) pairs.
for doc in corpus_hdp:
    print(doc)

[(3, 0.99996327660929818)]
[(2, 0.99985605339839179)]
[(0, 0.99995984986946496)]
[(1, 0.99998297201473074)]
