# Tomo 1

In [1]:
import os
from collections import Counter, defaultdict
from string import punctuation

import nltk
import numpy as np
from gensim import corpora, models
from nltk.corpus import stopwords
from scipy import spatial

## Loading data

In [2]:
def key_sort_files(x):
    return int(x[:-4].split('-')[0])

path = 'data/aux/biblioteca/text_parts/1/'
unsorted_file_list = [filename for filename in os.listdir(path) if filename.endswith('.txt')]
file_list = sorted(unsorted_file_list, key=key_sort_files)

raw_texts = []
for filename in file_list:
    with open(path + filename) as f:
        raw_texts.append(f.read())

## Cleaning data

In [3]:
chars = []
for text in raw_texts:
    for c in text:
        if not c.isalnum():
            chars.append(c)

In [4]:
characters = set(chars)

In [5]:
with open('data/stopwords/spanish_stopwords.txt') as f:
    sp_stopwords = list(set(map(str.strip, f.readlines())))

with open('data/stopwords/my_stopwords.txt') as f:
    my_stopwords = list(set(map(str.strip, f.readlines())))

stop = stopwords.words('spanish') + sp_stopwords + my_stopwords + list(punctuation) + list(characters)

In [6]:
def clean(s):
    r = s.lower().strip()
    for c in characters:
        r = r.replace(c, ' ')
    r = r.replace('farc ep', 'farc-ep')
    r = r.replace('confianz a', 'confianza')
    r = r.replace('cons trucción', 'construcción')
    rs = [w for w in nltk.word_tokenize(r) if w not in stop and len(w) > 2 and not w.isdecimal()]
    r = ' '.join(rs)
    return r

In [7]:
cleaned_texts = [clean(text) for text in raw_texts]

In [8]:
char_counter = defaultdict(int)
for c in characters:
    for text in cleaned_texts:
        if c in text:
            char_counter[c]+=text.count(c)
char_counter

defaultdict(int, {' ': 37776, '-': 417})

## Preparing data

In [9]:
documents = cleaned_texts

In [10]:
document = ' '.join(documents)

In [11]:
# Only texts that contain MORE THAN or EQUALS TO 100 words
texts = [[word for word in document.split()] for document in documents if len(document.split()) >= 100]

In [12]:
len(texts)

35

In [13]:
dictionary = corpora.Dictionary(texts)

In [14]:
len(dictionary)

8832

## Processing data

In [15]:
%%time

RS = 3

lsi_topics_lst = []
lda_topics_lst = []
hdp_topics_lst = []

MIN_DFs = [1, 2, 4, 7]
MAX_DFs = [0.6, 0.7, 0.8, 0.9]
MAX_FTs = [100, 1000, len(dictionary)]
N_TOPICSs = [4, 10, 35, 100]

total_iterations = len(MIN_DFs) * len(MAX_DFs) * len(MAX_FTs) * len(N_TOPICSs)
iteration = 1
for MIN_DF in MIN_DFs:
    for MAX_DF in MAX_DFs:
        for MAX_FT in MAX_FTs:
            dictionary = corpora.Dictionary(texts)
            dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF, keep_n=MAX_FT)
            corpus = [dictionary.doc2bow(text) for text in texts]
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            
            for N_TOPICS in N_TOPICSs:
                print('Iteration: {} / {}'.format(iteration, total_iterations), \
                      '- MIN_DF:', MIN_DF, '- MAX_DF:', MAX_DF, '- MAX_FT:', MAX_FT, '- N_TOPICS:', N_TOPICS)
                iteration += 1
                
                # LSI
#                 lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
#                 corpus_lsi = lsi[corpus_tfidf]
#                 ts = lsi.show_topics(lsi.num_topics, formatted=False)
#                 result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lsi)
#                 lsi_topics_lst.append(result)
            
                # LDA
                lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS, random_state=RS)
                corpus_lda = lda[corpus]
                ts = sorted(lda.show_topics(lda.num_topics, formatted=False))
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lda)
                lda_topics_lst.append(result)
            
                # HDP
                hdp = models.HdpModel(corpus, id2word=dictionary, random_state=RS)
                corpus_hdp = hdp[corpus]
                ts = hdp.show_topics(N_TOPICS, num_words=10, formatted=False)
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_hdp)
                hdp_topics_lst.append(result)

Iteration: 1 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 4
Iteration: 2 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 3 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 35
Iteration: 4 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 5 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 4
Iteration: 6 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 7 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 35
Iteration: 8 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 100
Iteration: 9 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 8832 - N_TOPICS: 4
Iteration: 10 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 8832 - N_TOPICS: 10
Iteration: 11 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 8832 - N_TOPICS: 35
Iteration: 12 / 192 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 8832 - N_TOPICS: 100
Iteration: 13 / 192 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 4
Iteration: 14 / 192 - MIN_D

Iteration: 111 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 35
Iteration: 112 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 113 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 4
Iteration: 114 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 115 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 35
Iteration: 116 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 100
Iteration: 117 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 8832 - N_TOPICS: 4
Iteration: 118 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 8832 - N_TOPICS: 10
Iteration: 119 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 8832 - N_TOPICS: 35
Iteration: 120 / 192 - MIN_DF: 4 - MAX_DF: 0.7 - MAX_FT: 8832 - N_TOPICS: 100
Iteration: 121 / 192 - MIN_DF: 4 - MAX_DF: 0.8 - MAX_FT: 100 - N_TOPICS: 4
Iteration: 122 / 192 - MIN_DF: 4 - MAX_DF: 0.8 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 123 / 192 - MIN_DF: 4 - MAX_DF: 0.8 - MAX_FT: 100 - N_TOPICS: 35
Iter

In [16]:
def extract_vocabulary(ts):
    vocab = set()
    for t in ts:
        words = [word for word, weight in t]
        vocab = vocab.union(words)
    vocab = sorted(vocab)
    r = {w: i for i, w in enumerate(vocab)}
    return r

def get_weights(t):
    return dict(t)

def create_vector(vocab, weights):
    r = [0] * len(vocab)
    for wo in weights:
        r[vocab[wo]] = weights[wo]
    return r

def vectorize(ts):
    r = []
    ts = [t for i, t in ts]
    vocab = extract_vocabulary(ts)
    for t in ts:
        weights = get_weights(t)
        vector = create_vector(vocab, weights)
        r.append(vector)
    return r

In [17]:
def two_topics_distance(t1, t2):
    r = spatial.distance.cosine(t1, t2)
    return r

def multiple_topics_distance(ts):
    ds = []
    for i, t1 in enumerate(ts[:-1]):
        for j in range(i + 1, len(ts)):
            t2 = ts[j]
            d = two_topics_distance(t1, t2)
            ds.append(d)
    return np.mean(ds)

def best_topics(ts_lst):
    r = []
    for ts in ts_lst:
        ts_vector = vectorize(ts[0])
        d = multiple_topics_distance(ts_vector)
        r.append((d, ts))
    r = sorted(r, reverse=True)
    return r[0]

In [18]:
%%time

# print('LSI')
# lsi_best_topics = best_topics(lsi_topics_lst)
print('LDA')
lda_best_topics = best_topics(lda_topics_lst)
print('HDP')
hdp_best_topics = best_topics(hdp_topics_lst)

LDA
HDP
CPU times: user 32.8 s, sys: 292 ms, total: 33.1 s
Wall time: 32.8 s


## Results

### LSI

#### Topics

In [19]:
# lsi_best_topics[1][0]

#### Association document-topics

In [20]:
# corpus = lsi_best_topics[1][2]
# for doc in corpus:
#     print(doc)

#### Topics sorted by recurrence

In [21]:
# total_weights = defaultdict(float)
# for doc in corpus:
#     for topic, weight in doc:
#         total_weights[topic] += abs(weight)
# total_weights = dict(total_weights)

In [22]:
# Counter(total_weights).most_common()

### LDA

#### Topics

In [23]:
lda_best_topics[1][0]

[(0,
  [('general', 0.015070368),
   ('mesa', 0.011503357),
   ('marco', 0.00990339),
   ('internacional', 0.0096022291),
   ('ver', 0.0095808012),
   ('lucha', 0.008745254),
   ('años', 0.0082478374),
   ('colombiano', 0.0077144005),
   ('ejemplo', 0.0072955685),
   ('temas', 0.0070030349)]),
 (1,
  [('víctimas', 0.044044901),
   ('ley', 0.028937127),
   ('reparación', 0.022934349),
   ('farc', 0.016626451),
   ('hoy', 0.01286369),
   ('carlos', 0.012476814),
   ('sociedad', 0.011274189),
   ('atención', 0.010892768),
   ('zona', 0.010325162),
   ('justicia', 0.01002058)]),
 (2,
  [('agenda', 0.014964212),
   ('participación', 0.013239958),
   ('fase', 0.012774641),
   ('mesa', 0.010934199),
   ('conversaciones', 0.010594905),
   ('desarrollo', 0.0090510938),
   ('puntos', 0.0086087752),
   ('acordado', 0.0086054588),
   ('terminación', 0.0081842206),
   ('farc', 0.008114134)]),
 (3,
  [('equipo', 0.015996316),
   ('farc', 0.0098988106),
   ('importante', 0.0095360698),
   ('forma', 0

#### Association document-topics

In [24]:
corpus = lda_best_topics[1][2]
for doc in corpus:
    print(doc)

[(54, 0.99673271)]
[(2, 0.044665169), (20, 0.033743046), (60, 0.83309066), (82, 0.01972905), (93, 0.050558787)]
[(57, 0.23077063), (76, 0.75342292)]
[(2, 0.11798243), (6, 0.045397252), (17, 0.08273299), (20, 0.05890942), (25, 0.063138582), (34, 0.076957859), (39, 0.052816395), (43, 0.010576108), (54, 0.082798593), (59, 0.018047627), (60, 0.12912481), (77, 0.21283244), (92, 0.016223356), (93, 0.019003602)]
[(34, 0.9930281)]
[(20, 0.87713802), (80, 0.12125546)]
[(67, 0.98714286)]
[(25, 0.99589211)]
[(20, 0.016903166), (41, 0.28414685), (54, 0.074459374), (60, 0.022531917), (65, 0.57187194)]
[(2, 0.073010653), (17, 0.15234599), (20, 0.090350166), (55, 0.44309044), (59, 0.024978546), (67, 0.054867692), (70, 0.15798695)]
[(70, 0.98349994)]
[(6, 0.47893724), (17, 0.16948582), (59, 0.064065687), (70, 0.28189719)]
[(63, 0.9943428)]
[(59, 0.98643839)]
[(2, 0.012056808), (25, 0.018005235), (41, 0.036852323), (54, 0.72699249), (81, 0.16277973), (93, 0.036786981)]
[(17, 0.59160578), (34, 0.2287870

#### Topics sorted by recurrence

In [25]:
total_weights = defaultdict(float)
for doc in corpus:
    for topic, weight in doc:
        total_weights[topic] += abs(weight)
total_weights = dict(total_weights)

In [26]:
Counter(total_weights).most_common()

[(17, 3.1244077254086733),
 (2, 2.7160263359546661),
 (54, 2.1148478537797928),
 (93, 2.1090674083679914),
 (60, 2.0912751704454422),
 (41, 1.5464098192751408),
 (70, 1.4233634769916534),
 (34, 1.2986726015806198),
 (20, 1.2376726446673274),
 (25, 1.1556511064991355),
 (59, 1.1440475583076477),
 (39, 1.1064300257712603),
 (96, 1.1050218716263771),
 (67, 1.1013595480471849),
 (92, 1.0036254096776247),
 (62, 0.99700003862380981),
 (63, 0.99434280395507812),
 (85, 0.97524994611740112),
 (65, 0.96176449954509735),
 (87, 0.93601882457733154),
 (43, 0.76728417631238699),
 (55, 0.76269321888685226),
 (76, 0.75283116102218628),
 (15, 0.66009402275085449),
 (6, 0.55159792490303516),
 (40, 0.45285332202911377),
 (82, 0.37025244906544685),
 (53, 0.26204594969749451),
 (57, 0.23136238753795624),
 (77, 0.21516917645931244),
 (24, 0.20858719944953918),
 (81, 0.1618371456861496),
 (80, 0.15548631548881531),
 (30, 0.026036109775304794)]

### HDP

#### Topics

In [27]:
hdp_best_topics[1][0]

[(0,
  [('mesa', 0.0075346773828548277),
   ('fase', 0.0051975000963632298),
   ('conversaciones', 0.0049505642434960985),
   ('agenda', 0.00486544241929568),
   ('general', 0.004844063991800188),
   ('diálogo', 0.0045111058902907383),
   ('noruega', 0.0043126942813266501),
   ('delegados', 0.0042413638479691478),
   ('agosto', 0.0039877273342128353),
   ('jaramillo', 0.0038794211596285279)]),
 (1,
  [('desarrollo', 0.0055187465428750476),
   ('gran', 0.0054166570017025444),
   ('hoy', 0.0051105905410830238),
   ('colombianos', 0.0049448370964122242),
   ('exploratorio', 0.004139841714708545),
   ('seguridad', 0.0038930326825079055),
   ('social', 0.0035023685209391356),
   ('vida', 0.0033652822808889994),
   ('mundo', 0.0033367307861352281),
   ('fuerzas', 0.0032790124686234586)]),
 (2,
  [('negociación', 0.012126821911735374),
   ('farc', 0.00970372253076319),
   ('víctimas', 0.0074994395105206424),
   ('sistema', 0.0042129006490405144),
   ('paso', 0.0035948127365890525),
   ('justi

#### Association document-topics

In [28]:
corpus = hdp_best_topics[1][2]
for doc in corpus:
    print(doc)

[(12, 0.9982108900405563)]
[(2, 0.99961348842383657)]
[(26, 0.99142230968466494)]
[(0, 0.9996518433846785)]
[(0, 0.89715709430113677), (1, 0.099494712319693437)]
[(8, 0.99912483862812485)]
[(22, 0.99434558515896632)]
[(0, 0.99802454979367861)]
[(1, 0.99964994992828582)]
[(15, 0.99804615975468725)]
[(29, 0.99035525672952196)]
[(16, 0.9969028261701578)]
[(17, 0.99685575005364735)]
[(23, 0.99240447174089108)]
[(4, 0.99931056173366384)]
[(9, 0.82657819079915429), (14, 0.16998285134874885)]
[(20, 0.9968564286411804)]
[(19, 0.9963528270119385)]
[(14, 0.9980449148040289)]
[(6, 0.99926037152639569)]
[(7, 0.99916415781389278)]
[(11, 0.99881707897653127)]
[(24, 0.99070659833138441)]
[(10, 0.99907388799799945)]
[(1, 0.17869457540427702), (10, 0.81934901816722061)]
[(18, 0.99656662112623118)]
[(3, 0.99836908159405346)]
[(9, 0.99872933470312686)]
[(5, 0.99927352528591651)]
[(3, 0.99935220510323264)]
[(25, 0.98588484083552275)]
[(21, 0.99685170760359054)]
[(0, 0.99963863470644854)]
[(13, 0.998198474

#### Topics sorted by recurrence

In [29]:
total_weights = defaultdict(float)
for doc in corpus:
    for topic, weight in doc:
        total_weights[topic] += abs(weight)
total_weights = dict(total_weights)

In [30]:
Counter(total_weights).most_common()

[(0, 4.8928092380961345),
 (3, 1.9977212866972862),
 (9, 1.8253075255022813),
 (10, 1.8184229061652202),
 (1, 1.2778392376522563),
 (14, 1.1680277661527778),
 (2, 0.99961348842383657),
 (4, 0.99931056173366384),
 (5, 0.99927352528591651),
 (6, 0.99926037152639569),
 (7, 0.99916415781389278),
 (8, 0.99912483862812485),
 (11, 0.99881707897653127),
 (12, 0.9982108900405563),
 (13, 0.99819847465909206),
 (15, 0.99804615975468725),
 (16, 0.9969028261701578),
 (20, 0.9968564286411804),
 (17, 0.99685575005364735),
 (21, 0.99685170760359054),
 (18, 0.99656662112623118),
 (19, 0.9963528270119385),
 (22, 0.99434558515896632),
 (23, 0.99240447174089108),
 (26, 0.99142230968466494),
 (24, 0.99070659833138441),
 (29, 0.99035525672952196),
 (25, 0.98588484083552275)]