# Tópicos

In [1]:
# Optional: uncomment the two lines below to turn on INFO-level logging
# (timestamp, level name and message) while the notebook runs.
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from collections import Counter

import numpy as np
from gensim import corpora, models

## Loading data

In [3]:
def _read_text(path):
    """Return the entire contents of the text file at ``path`` as one string.

    The file is decoded explicitly as UTF-8 instead of the platform's default
    encoding, so the accented Spanish text loads the same way on every machine.
    NOTE(review): assumes the cleaned files were written as UTF-8 -- confirm
    against the step that produced ``data/out``.
    """
    with open(path, encoding='utf-8') as f:
        return f.read()


# One string per cleaned input file.
doc1 = _read_text('data/out/cleaned_tomo_1.txt')
doc5a = _read_text('data/out/cleaned_tomo_5a.txt')
doc5b = _read_text('data/out/cleaned_tomo_5b.txt')
doc7 = _read_text('data/out/cleaned_tomo_7.txt')

## Cleaning data

In [4]:
# Gather the four loaded texts into one list, one string per input file.
documents = [
    doc1,
    doc5a,
    doc5b,
    doc7,
]

In [5]:
# Concatenate every document into a single space-separated string; the next
# cell counts word frequencies over this combined text.
document = ' '.join(documents)

In [6]:
# Occurrence count of each whitespace-delimited token in the combined text.
counter = Counter(document.split())

In [7]:
# Tokenise each document on whitespace and keep only the tokens that occur
# more than once across the whole collection (per `counter`).
texts = [
    [token for token in text.split() if counter[token] > 1]
    for text in documents
]

## Processing data

### LSI

In [8]:
# Build the gensim Dictionary (the set of unique tokens) from the filtered
# token lists.
dictionary = corpora.Dictionary(texts)

In [9]:
# Show the dictionary summary: unique-token count plus a few sample tokens.
print(dictionary)

Dictionary(10541 unique tokens: ['compartieron', 'obliga', 'enterada', 'cárceles', 'diferente']...)


In [10]:
# Turn every token list into its bag-of-words representation using the
# dictionary; the result has one entry per document, in the order of `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [11]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [12]:
# Show the model summary (its num_docs and num_nnz fields).
print(tfidf)

TfidfModel(num_docs=4, num_nnz=26861)


In [13]:
# TF-IDF-weighted version of the bag-of-words corpus; this is the input to the
# LSI and RP models below.
corpus_tfidf = tfidf[corpus]

In [14]:
# Number of topics requested from the LSI model in the next cell.
N_TOPICS = 4

In [15]:
# Initialise the LSI transformation on the TF-IDF corpus with N_TOPICS topics.
lsi = models.LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=N_TOPICS,
)
# Wrap the TF-IDF corpus with the fitted LSI transformation.
corpus_lsi = lsi[corpus_tfidf]

In [16]:
# Check the number of topics stored on the fitted LSI model.
lsi.num_topics

4

In [17]:
# Display every LSI topic as a signed, weighted combination of its top terms.
lsi.print_topics(lsi.num_topics)

[(0,
  '0.262*"cbm" + 0.126*"piloto" + 0.118*"exploratorio" + 0.108*"descontaminación" + 0.102*"ubpd" + 0.098*"panel" + 0.096*"sexual" + 0.095*"orejón" + 0.089*"informes" + 0.084*"esclarecer"'),
 (1,
  '-0.382*"exploratorio" + -0.240*"henry" + -0.217*"epa" + -0.201*"secreto" + 0.171*"panel" + -0.145*"preparatorio" + -0.137*"dígales" + -0.137*"calarcá" + 0.135*"paneles" + -0.121*"exploratorias"'),
 (2,
  '-0.230*"panel" + -0.228*"exploratorio" + -0.181*"paneles" + -0.147*"henry" + -0.134*"mesas" + -0.131*"epa" + -0.129*"socialización" + -0.121*"secreto" + -0.121*"gráfico" + 0.120*"cbm"'),
 (3,
  '-0.405*"cbm" + 0.137*"piloto" + 0.135*"descontaminación" + 0.122*"sentencias" + 0.116*"hdc" + -0.115*"cdv" + 0.109*"ubpd" + 0.108*"orejón" + 0.103*"apn" + -0.103*"victimizante"')]

In [18]:
# Print each document as a list of (topic_id, weight) pairs in LSI space.
# Both the bow->tfidf and tfidf->lsi transformations are actually executed
# here, on the fly, as the wrapped corpus is iterated.
for doc in corpus_lsi:
    print(doc)

[(0, 0.34204791329790379), (1, -0.81477063176349873), (2, -0.46733697479631214), (3, -0.02735314630006434)]
[(0, 0.74145425742051863), (1, 0.11152117545806482), (2, 0.23379310000832354), (3, -0.61899062833450635)]
[(0, 0.70775978189226996), (1, -0.059047027532677425), (2, 0.41165320402066591), (3, 0.57107896064807118)]
[(0, 0.43531499353015007), (1, 0.54625698387905353), (2, -0.70029022373305283), (3, 0.14730161748979806)]


### RP

In [19]:
# Number of output dimensions requested from the RP model in the next cell.
N_TOPICS = 4

In [20]:
# RpModel builds its +1/-1 projection matrix from NumPy's global random
# generator, so without a seed `rp.projection` and every projected vector
# change on each run. Seed the generator first so a fresh-kernel re-run
# reproduces the same numbers.
RP_SEED = 42
np.random.seed(RP_SEED)
rp = models.RpModel(corpus_tfidf, num_topics=N_TOPICS)
# Wrap the TF-IDF corpus with the random-projection transformation.
corpus_rp = rp[corpus_tfidf]

In [21]:
# Check the number of topics (output dimensions) stored on the RP model.
rp.num_topics

4

In [22]:
# Inspect the projection matrix itself; as displayed, its entries are +1/-1.
rp.projection

array([[ 1., -1.,  1., ...,  1., -1.,  1.],
       [-1.,  1.,  1., ..., -1.,  1., -1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ..., -1.,  1.,  1.]], dtype=float32)

In [23]:
# Print each document as a list of (dimension_id, value) pairs after the
# random projection.
for doc in corpus_rp:
    print(doc)

[(0, 0.10112765431404114), (1, -0.8606229424476624), (2, -0.736624538898468), (3, 0.8527173399925232)]
[(0, 0.08899615705013275), (1, -0.19061729311943054), (2, 0.4063926935195923), (3, -0.3256755471229553)]
[(0, 0.15313038229942322), (1, -0.6347262263298035), (2, -0.3414880037307739), (3, 0.20387393236160278)]
[(0, -0.14046519994735718), (1, -0.045565567910671234), (2, 1.147280216217041), (3, 0.3387052118778229)]


### LDA

In [24]:
# Number of topics requested from the LDA model in the next cell.
N_TOPICS = 2

In [25]:
# LDA training is randomised; pin the random state so the learned topics and
# the per-document mixtures are identical on every fresh-kernel re-run.
LDA_RANDOM_STATE = 42
lda = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=N_TOPICS,
    random_state=LDA_RANDOM_STATE,
)
# Wrap the raw bag-of-words corpus (not the TF-IDF one) with the fitted model.
corpus_lda = lda[corpus]

In [26]:
# Check the number of topics stored on the fitted LDA model.
lda.num_topics

2

In [27]:
# Display each LDA topic as a weighted list of its top terms.
lda.print_topics()

[(0,
  '0.016*"paz" + 0.011*"víctimas" + 0.009*"conflicto" + 0.008*"nacional" + 0.008*"farc-ep" + 0.007*"gobierno" + 0.007*"acuerdo" + 0.006*"mujeres" + 0.006*"conversaciones" + 0.005*"proceso"'),
 (1,
  '0.015*"paz" + 0.010*"víctimas" + 0.010*"conflicto" + 0.009*"gobierno" + 0.008*"farc-ep" + 0.007*"mesa" + 0.006*"proceso" + 0.006*"colombia" + 0.006*"acuerdo" + 0.006*"participación"')]

In [28]:
# Print each document's topic mixture as (topic_id, proportion) pairs.
for doc in corpus_lda:
    print(doc)

[(0, 0.125499), (1, 0.87450099)]
[(0, 0.37531364), (1, 0.62468636)]
[(0, 0.49615833), (1, 0.5038417)]
[(0, 0.81025672), (1, 0.18974333)]


### HDP

In [29]:
# NOTE(review): this value is not passed to HdpModel in the next cell, which
# is built without any topic-count argument -- confirm whether this assignment
# is still needed in the HDP section.
N_TOPICS = 4

In [35]:
# HDP training is randomised; pin the random state so the discovered topics
# and the document-to-topic assignments are identical on every fresh-kernel
# re-run.
HDP_RANDOM_STATE = 42
hdp = models.HdpModel(
    corpus,
    id2word=dictionary,
    random_state=HDP_RANDOM_STATE,
)
# Wrap the raw bag-of-words corpus with the fitted model.
corpus_hdp = hdp[corpus]

In [36]:
# Display the HDP topics as weighted lists of their top terms.
hdp.print_topics()

[(0,
  '0.017*paz + 0.012*conflicto + 0.012*víctimas + 0.009*gobierno + 0.009*farc-ep + 0.008*acuerdo + 0.007*verdad + 0.006*ciclo + 0.006*proceso + 0.006*justicia'),
 (1,
  '0.019*paz + 0.015*mujeres + 0.012*nacional + 0.011*participación + 0.009*mesa + 0.009*organizaciones + 0.009*género + 0.008*foro + 0.007*conversaciones + 0.007*acuerdo'),
 (2,
  '0.019*víctimas + 0.014*paz + 0.013*conflicto + 0.008*farc-ep + 0.008*gobierno + 0.007*proceso + 0.007*ciclo + 0.006*mesa + 0.006*colombia + 0.006*derechos'),
 (3,
  '0.013*gobierno + 0.012*paz + 0.012*farc-ep + 0.008*acuerdo + 0.008*conflicto + 0.007*colombia + 0.007*proceso + 0.006*presidente + 0.006*encuentro + 0.005*país'),
 (4,
  '0.001*incorporado + 0.001*sobrevivido + 0.000*suficiente + 0.000*rezagos + 0.000*asumí + 0.000*terrepaz + 0.000*compañera + 0.000*ops + 0.000*niñas + 0.000*deben'),
 (5,
  '0.001*útil + 0.001*marginados + 0.001*bautista + 0.001*armamento + 0.001*ciudad + 0.000*brindarle + 0.000*wola + 0.000*dividendo + 0.000

In [37]:
# Print each document's topic assignment as (topic_id, proportion) pairs.
for doc in corpus_hdp:
    print(doc)

[(3, 0.99996328044163219)]
[(2, 0.99985524951969851)]
[(0, 0.99996025652488585)]
[(1, 0.9999829712922258)]
