# Tomo 1

In [1]:
import os
from collections import Counter, defaultdict
from pprint import pprint
from string import punctuation

import nltk
import numpy as np
from gensim import corpora, models
from nltk.corpus import stopwords
from scipy import spatial

## Loading data

In [2]:
def key_sort_files(x):
    """Return the integer prefix of a file name, for numeric sorting.

    The last four characters of ``x`` are dropped (a ``.txt`` suffix for the
    files listed in this notebook) and whatever precedes the first ``-`` in
    the remainder is parsed as an ``int``.
    """
    stem = x[:-4]
    prefix, _, _ = stem.partition('-')
    return int(prefix)

# Read every .txt file of the corpus directory, ordered by the numeric prefix
# of its file name, and keep each file's full contents in raw_texts.
# NOTE(review): files are opened with the platform default encoding; confirm
# the corpus encoding and consider passing it explicitly.
path = 'data/aux/biblioteca/text_parts/5a/'
unsorted_file_list = [name for name in os.listdir(path) if name.endswith('.txt')]
file_list = sorted(unsorted_file_list, key=key_sort_files)

raw_texts = []
for filename in file_list:
    with open(os.path.join(path, filename)) as f:
        raw_texts.append(f.read())

## Cleaning data

In [3]:
# Every occurrence (duplicates included) of a non-alphanumeric character
# found in the raw texts.
chars = []
for text in raw_texts:
    chars.extend(c for c in text if not c.isalnum())

In [4]:
# Distinct non-alphanumeric characters seen in the raw texts; used below both
# as extra stop tokens and as the characters that clean() turns into spaces.
characters = set(chars)

In [5]:
# Load the two custom stop-word files (one entry per line, surrounding
# whitespace stripped, duplicates dropped) and combine them with NLTK's
# Spanish list, ASCII punctuation and the non-alphanumeric corpus characters.
with open('data/stopwords/spanish_stopwords.txt') as f:
    sp_stopwords = list({line.strip() for line in f})

with open('data/stopwords/my_stopwords.txt') as f:
    my_stopwords = list({line.strip() for line in f})

stop = [
    *stopwords.words('spanish'),
    *sp_stopwords,
    *my_stopwords,
    *punctuation,
    *characters,
]

In [6]:
def clean(s):
    """Normalise one raw document into a space-separated string of tokens.

    The text is lower-cased and stripped, every character in the module-level
    ``characters`` set is turned into a space, three known extraction
    artefacts are repaired, and the result is tokenised with
    ``nltk.word_tokenize``. A token is kept only if it is not in the
    module-level ``stop`` collection, is longer than two characters and is
    not made of decimal digits only.

    Args:
        s: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # A single translate() pass replaces one full-text str.replace() scan per
    # character. Every entry of `characters` is a single character because the
    # set is collected by iterating over the texts character by character.
    to_space = {ord(c): ' ' for c in characters}
    r = s.lower().strip().translate(to_space)

    # Repair artefacts; the hyphen of 'farc-ep' is restored after the
    # replacement above turned it into a space.
    r = r.replace('farc ep', 'farc-ep')
    r = r.replace('confianz a', 'confianza')
    r = r.replace('cons trucción', 'construcción')

    # `stop` is a list; a set gives constant-time membership per token instead
    # of a linear scan.
    stop_words = set(stop)
    rs = [
        w for w in nltk.word_tokenize(r)
        if w not in stop_words and len(w) > 2 and not w.isdecimal()
    ]
    return ' '.join(rs)

In [7]:
# Apply clean() to every raw document, preserving document order.
cleaned_texts = list(map(clean, raw_texts))

In [8]:
# Sanity check: count how often each non-alphanumeric character still occurs
# in the cleaned texts. Characters that no longer occur get no entry.
char_counter = defaultdict(int)
for c in characters:
    occurrences = sum(text.count(c) for text in cleaned_texts)
    if occurrences:
        char_counter[c] = occurrences
char_counter

defaultdict(int, {' ': 49834, '-': 416})

## Preparing data

In [9]:
# Alias only: `documents` and `cleaned_texts` refer to the same list.
documents = cleaned_texts

In [10]:
# All cleaned documents concatenated into one space-separated string.
# NOTE(review): no later cell visible in this notebook reads `document`;
# confirm whether it is still needed.
document = ' '.join(documents)

In [11]:
# Split each document on whitespace and keep only documents with at least 100 words.
texts = [tokens for tokens in map(str.split, documents) if len(tokens) >= 100]

In [12]:
# Number of documents that passed the 100-word minimum.
len(texts)

79

In [13]:
# Build the gensim Dictionary over the tokenised documents (unfiltered).
dictionary = corpora.Dictionary(texts)

In [14]:
# Size of the unfiltered dictionary built from `texts`.
len(dictionary)

9203

## Processing data

In [15]:
%%time

# Grid search over dictionary-filtering settings and topic counts.
# Each stored result is a tuple:
#     (topics, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), transformed corpus)
lsi_topics_lst = []
lda_topics_lst = []
hdp_topics_lst = []

# Measure the unfiltered vocabulary from `texts` instead of reading the
# module-level `dictionary`: the loop below rebinds `dictionary` to a filtered
# copy, so len(dictionary) would shrink the grid whenever this cell is re-run.
full_vocabulary_size = len(corpora.Dictionary(texts))

MIN_DFs = [1, 2, 4, 7]
MAX_DFs = [0.6, 0.7, 0.8, 0.9]
MAX_FTs = [100, 1000, full_vocabulary_size]
N_TOPICSs = [10, len(texts), 100]

total_iterations = len(MIN_DFs) * len(MAX_DFs) * len(MAX_FTs) * len(N_TOPICSs)
iteration = 1
for MIN_DF in MIN_DFs:
    for MAX_DF in MAX_DFs:
        for MAX_FT in MAX_FTs:
            # filter_extremes() modifies the dictionary in place, so every
            # combination starts from a freshly built one.
            dictionary = corpora.Dictionary(texts)
            dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF, keep_n=MAX_FT)
            corpus = [dictionary.doc2bow(text) for text in texts]
            # The TF-IDF corpus is only consumed by the disabled LSI block below.
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]

            # HDP training takes no N_TOPICS argument, so the model is fitted
            # once per filtered corpus instead of once per topic count. With
            # the fixed random_state the repeated fits were expected to be
            # identical anyway; only show_topics() below depends on N_TOPICS.
            hdp = models.HdpModel(corpus, id2word=dictionary, random_state=1)
            corpus_hdp = hdp[corpus]

            for N_TOPICS in N_TOPICSs:
                print('Iteration: {} / {}'.format(iteration, total_iterations),
                      '- MIN_DF:', MIN_DF, '- MAX_DF:', MAX_DF, '- MAX_FT:', MAX_FT, '- N_TOPICS:', N_TOPICS)
                iteration += 1

                # LSI (disabled)
                # lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
                # corpus_lsi = lsi[corpus_tfidf]
                # ts = lsi.show_topics(lsi.num_topics, formatted=False)
                # result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lsi)
                # lsi_topics_lst.append(result)

                # LDA: one model per topic count.
                lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS, random_state=1)
                corpus_lda = lda[corpus]
                ts = sorted(lda.show_topics(lda.num_topics, formatted=False))
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lda)
                lda_topics_lst.append(result)

                # HDP: reuse the fitted model; N_TOPICS only limits how many
                # topics are listed.
                ts = hdp.show_topics(N_TOPICS, num_words=10, formatted=False)
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_hdp)
                hdp_topics_lst.append(result)

Iteration: 1 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 2 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 79
Iteration: 3 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 4 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 5 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 79
Iteration: 6 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 100
Iteration: 7 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 9203 - N_TOPICS: 10
Iteration: 8 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 9203 - N_TOPICS: 79
Iteration: 9 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 9203 - N_TOPICS: 100
Iteration: 10 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 11 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 79
Iteration: 12 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 13 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 14 / 144 - M

Iteration: 109 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 110 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 79
Iteration: 111 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 112 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 113 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 79
Iteration: 114 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 100
Iteration: 115 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 9203 - N_TOPICS: 10
Iteration: 116 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 9203 - N_TOPICS: 79
Iteration: 117 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 9203 - N_TOPICS: 100
Iteration: 118 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 119 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 79
Iteration: 120 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 121 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 10
I

In [16]:
def extract_vocabulary(ts):
    """Assign a column index to every word used by any of the given topics.

    Args:
        ts: Iterable of topics, each an iterable of ``(word, weight)`` pairs.

    Returns:
        Dict mapping each distinct word to its position in the sorted list
        of all distinct words.
    """
    distinct_words = {word for topic in ts for word, _ in topic}
    return {word: index for index, word in enumerate(sorted(distinct_words))}

def get_weights(t):
    """Turn one topic into a dict via ``dict(t)``.

    ``vectorize`` calls this with a topic given as ``(word, weight)`` pairs,
    which yields a word -> weight mapping.
    """
    return dict(t)

def create_vector(vocab, weights):
    """Lay one topic's weights out as a dense list aligned with ``vocab``.

    Args:
        vocab: Dict mapping word -> column index.
        weights: Dict mapping word -> weight; every word must be in ``vocab``.

    Returns:
        A list of ``len(vocab)`` entries holding each word's weight at its
        column index and ``0`` everywhere else.
    """
    vector = [0] * len(vocab)
    for word, weight in weights.items():
        vector[vocab[word]] = weight
    return vector

def vectorize(ts):
    """Convert a list of ``(topic_id, topic)`` pairs into dense weight vectors.

    The topic ids are discarded. All vectors share one vocabulary built from
    the words of every topic, so they have equal length and aligned columns.

    Args:
        ts: Iterable of ``(topic_id, [(word, weight), ...])`` pairs.

    Returns:
        One weight vector (list) per topic, in input order.
    """
    topics = [topic for _, topic in ts]
    vocab = extract_vocabulary(topics)
    return [create_vector(vocab, get_weights(topic)) for topic in topics]

In [17]:
def two_topics_distance(t1, t2):
    """Return the cosine distance between two topic weight vectors."""
    return spatial.distance.cosine(t1, t2)

def multiple_topics_distance(ts):
    """Return the mean distance over all unordered pairs of topic vectors.

    Args:
        ts: List of topic weight vectors.

    Returns:
        ``np.mean`` of ``two_topics_distance`` over every pair ``(i, j)``
        with ``i < j``. With fewer than two vectors there are no pairs and
        the mean of the empty list is returned.
    """
    count = len(ts)
    ds = [
        two_topics_distance(ts[i], ts[j])
        for i in range(count - 1)
        for j in range(i + 1, count)
    ]
    return np.mean(ds)

def best_topics(ts_lst):
    """Pick the stored result whose topics are most spread apart.

    Args:
        ts_lst: List of result tuples whose first element is the list of
            ``(topic_id, [(word, weight), ...])`` topics.

    Returns:
        The ``(score, result)`` pair that sorts first in descending order,
        where ``score`` is the mean pairwise distance between the result's
        topic vectors. Pairs are compared as whole tuples, so equal scores
        are ordered by comparing the results themselves.
    """
    scored = [
        (multiple_topics_distance(vectorize(entry[0])), entry)
        for entry in ts_lst
    ]
    scored.sort(reverse=True)
    return scored[0]

In [18]:
%%time

# LSI selection stays disabled because LSI training is commented out in the
# grid-search cell, so lsi_topics_lst is empty.
# print('LSI')
# lsi_best_topics = best_topics(lsi_topics_lst)
# For each model family keep the (score, result) pair that best_topics ranks
# first, i.e. the configuration with the largest mean pairwise topic distance.
print('LDA')
lda_best_topics = best_topics(lda_topics_lst)
print('HDP')
hdp_best_topics = best_topics(hdp_topics_lst)

LDA
HDP
CPU times: user 48.6 s, sys: 292 ms, total: 48.9 s
Wall time: 48.5 s


## Results

### LSI

#### Topics

In [19]:
# lsi_best_topics[1][0]

#### Association document-topics

In [20]:
# corpus = lsi_best_topics[1][2]
# for doc in corpus:
#     print(doc)

#### Topics sorted by recurrence

In [21]:
# total_weights = defaultdict(float)
# for doc in corpus:
#     for topic, weight in doc:
#         total_weights[topic] += abs(weight)
# total_weights = dict(total_weights)

In [22]:
# Counter(total_weights).most_common()

### LDA

#### Topics

In [23]:
# Topic list of the selected LDA configuration: best_topics returns
# (score, result) and result[0] holds the (topic_id, words) pairs.
lda_best_topics[1][0]

[(0,
  [('personas', 0.054340605),
   ('social', 0.040126812),
   ('mujeres', 0.035297837),
   ('población', 0.034095619),
   ('actores', 0.029941056),
   ('unidas', 0.028695222),
   ('trabajo', 0.025829362),
   ('miembros', 0.0255627),
   ('pública', 0.025284709),
   ('años', 0.025072102)]),
 (1,
  [('comisión', 0.065741092),
   ('verdad', 0.041718908),
   ('subcomisión', 0.03377616),
   ('justicia', 0.032139521),
   ('miembros', 0.031344246),
   ('caso', 0.029626328),
   ('trabajo', 0.029033326),
   ('propósito', 0.02649484),
   ('histórica', 0.025025403),
   ('política', 0.024725597)]),
 (2,
  [('participación', 0.053949397),
   ('reparación', 0.047677197),
   ('hecho', 0.044441342),
   ('organizaciones', 0.034843814),
   ('forma', 0.027645558),
   ('mujeres', 0.025006913),
   ('propuestas', 0.024591701),
   ('delegaciones', 0.022547232),
   ('especial', 0.022386797),
   ('reconocimiento', 0.021511683)]),
 (3,
  [('guerra', 0.095660999),
   ('comisión', 0.048105508),
   ('verdad', 0

#### Association document-topics

In [24]:
# Document-topic weights of the selected LDA configuration (third element of
# the stored result), printed one document per line.
# NOTE: this rebinds the module-level name `corpus` set by the grid-search cell.
corpus = lda_best_topics[1][2]
for doc in corpus:
    print(doc)

[(18, 0.38770822), (97, 0.60810375)]
[(17, 0.98900002)]
[(24, 0.69450033), (62, 0.10637833), (81, 0.19390625)]
[(47, 0.82375038), (52, 0.16044317)]
[(17, 0.91407937), (52, 0.081276126)]
[(82, 0.99074769)]
[(26, 0.85192722), (74, 0.024861116), (97, 0.11945202)]
[(84, 0.9888764)]
[(80, 0.91305679), (88, 0.05533034)]
[(74, 0.98428571)]
[(17, 0.12341655), (47, 0.29506788), (52, 0.18135269), (88, 0.14262256), (99, 0.24170698)]
[(54, 0.17569135), (74, 0.27440244), (78, 0.53125238)]
[(18, 0.064989574), (26, 0.057812355), (28, 0.87155849)]
[(18, 0.10874125), (26, 0.18061376), (47, 0.3432301), (84, 0.33116791), (99, 0.019690556)]
[(88, 0.96192306)]
[(16, 0.068327837), (62, 0.83042675), (95, 0.095539495)]
[(99, 0.96700001)]
[(16, 0.21776897), (62, 0.7532956), (88, 0.023262961)]
[(7, 0.80797863), (28, 0.11022753), (84, 0.076968014)]
[(35, 0.98585719)]
[(54, 0.97585362)]
[(29, 0.24214211), (33, 0.52676696), (53, 0.10212834), (59, 0.11999062)]
[(90, 0.9668988), (99, 0.02726787)]
[(26, 0.47010174), 

#### Topics sorted by recurrence

In [25]:
# Accumulate, per topic id, the absolute weight the topic receives across all
# documents of `corpus`.
total_weights = {}
for doc in corpus:
    for topic, weight in doc:
        total_weights[topic] = total_weights.get(topic, 0.0) + abs(weight)

In [26]:
# LDA topic ids ranked by accumulated weight, largest first.
Counter(total_weights).most_common()

[(84, 5.0802265778183937),
 (64, 3.9276603162288666),
 (62, 3.4696711674332619),
 (52, 3.2865892387926579),
 (26, 2.9114286080002785),
 (18, 2.8646447770297527),
 (17, 2.759033590555191),
 (47, 2.5672200173139572),
 (28, 2.2984330430626869),
 (80, 2.0651719626039267),
 (16, 1.9360956847667694),
 (7, 1.7051036097109318),
 (82, 1.6809869408607483),
 (24, 1.6523680686950684),
 (53, 1.6166069284081459),
 (88, 1.5976390819996595),
 (78, 1.5544845089316368),
 (76, 1.4147489443421364),
 (97, 1.378586933016777),
 (74, 1.2974521601572633),
 (5, 1.2831802070140839),
 (99, 1.2823323234915733),
 (3, 1.1669191271066666),
 (54, 1.1516365855932236),
 (86, 1.1462900787591934),
 (40, 1.1017125844955444),
 (14, 1.0466789864003658),
 (15, 1.0382961463183165),
 (94, 1.0366873145103455),
 (90, 1.0348463952541351),
 (29, 0.9947698563337326),
 (66, 0.99392634630203247),
 (69, 0.99146550893783569),
 (35, 0.98585718870162964),
 (57, 0.98585718870162964),
 (37, 0.98544120788574219),
 (46, 0.98522388935089111),


#### Top 10 topics

In [27]:
# Show the ten LDA topics with the largest accumulated weight, each followed
# by its listed (word, weight) pairs and a blank line.
lda_best_topics_dct = {topic_id: words for topic_id, words in lda_best_topics[1][0]}
for topic_id, weight in Counter(total_weights).most_common()[:10]:
    print(topic_id, weight, sep=' - ')
    pprint(lda_best_topics_dct[topic_id])
    print()

84 - 5.08022657782
[('país', 0.040736727),
 ('social', 0.040395759),
 ('justicia', 0.03205939),
 ('verdad', 0.029511023),
 ('partes', 0.026317887),
 ('construcción', 0.024091218),
 ('años', 0.023968088),
 ('reconciliación', 0.022870962),
 ('medidas', 0.021772444),
 ('debe', 0.020869808)]

64 - 3.92766031623
[('delegaciones', 0.085597225),
 ('marco', 0.067213424),
 ('garantes', 0.038221899),
 ('acompañantes', 0.038130127),
 ('comisión', 0.037031617),
 ('países', 0.036621835),
 ('verdad', 0.034247603),
 ('propuestas', 0.024621991),
 ('representantes', 0.024146853),
 ('agenda', 0.023162525)]

62 - 3.46967116743
[('comisión', 0.11907797),
 ('trabajo', 0.033435173),
 ('subcomisión', 0.028990448),
 ('verdad', 0.025640469),
 ('histórica', 0.023981383),
 ('puntos', 0.023658464),
 ('medidas', 0.022186119),
 ('delegaciones', 0.019357778),
 ('representantes', 0.018764907),
 ('desarrollo', 0.018431358)]

52 - 3.28658923879
[('acuerdos', 0.05179872),
 ('participación', 0.03337758),
 ('hoy', 0.03322

### HDP

#### Topics

In [28]:
# Topic list of the selected HDP configuration: best_topics returns
# (score, result) and result[0] holds the (topic_id, words) pairs.
hdp_best_topics[1][0]

[(0,
  [('verdad', 0.0067678273629236292),
   ('comisión', 0.0063857159473691546),
   ('justicia', 0.0061018133269391608),
   ('reconocimiento', 0.004515592780729473),
   ('participación', 0.0042060987880528306),
   ('satisfacción', 0.0041200504800445407),
   ('reparación', 0.0038784747446529732),
   ('principios', 0.0033802588787005474),
   ('agenda', 0.0030631727025306065),
   ('repetición', 0.0030146326263065871)]),
 (1,
  [('medidas', 0.0062935529128474998),
   ('delegaciones', 0.0037339131631550742),
   ('partes', 0.0032599149312722933),
   ('criterios', 0.0031485856520193827),
   ('pensamiento', 0.0030958027428465381),
   ('selección', 0.0030710460921224373),
   ('menores', 0.0030621804718379587),
   ('confianza', 0.0029714529703338578),
   ('armado', 0.0029206011646252788),
   ('primera', 0.0029057857106824674)]),
 (2,
  [('medidas', 0.0061987677253666981),
   ('reúne', 0.0057190264702405265),
   ('delegaciones', 0.0054951029277734128),
   ('subcomisión', 0.0052469815657871042),

#### Association document-topics

In [29]:
# Document-topic weights of the selected HDP configuration (third element of
# the stored result), printed one document per line.
# NOTE: this rebinds `corpus`, which held the LDA document-topic weights.
corpus = hdp_best_topics[1][2]
for doc in corpus:
    print(doc)

[(0, 0.050198478624534165), (4, 0.94875704080708656)]
[(1, 0.94309001978446716), (12, 0.054378532317609342)]
[(2, 0.99871300157192522)]
[(0, 0.21276092750815781), (24, 0.78297631356671127)]
[(12, 0.99895603795492616)]
[(3, 0.3149711272938861), (9, 0.68217410069194417)]
[(0, 0.99885292662164349)]
[(26, 0.99724076116136207)]
[(1, 0.99306033128861715)]
[(0, 0.20515646152519815), (2, 0.52594490459576548), (3, 0.26471212387840837)]
[(3, 0.04491598052674399), (19, 0.95122481266420422)]
[(1, 0.22960554413101766), (3, 0.36191045940328553), (32, 0.40395324200493687)]
[(0, 0.019124592360632048), (10, 0.97943881686259149)]
[(0, 0.81837227282389002), (1, 0.17940587561773474)]
[(0, 0.14287522904275188), (39, 0.84472963347892094)]
[(3, 0.99862883909846056)]
[(0, 0.35345563066083663), (52, 0.63792491374426719)]
[(3, 0.99875452590630343)]
[(11, 0.99893817050134814)]
[(27, 0.98744579358032858)]
[(1, 0.22327158275882511), (28, 0.77161019282187371)]
[(0, 0.12840766205516432), (18, 0.8693168086673122)]
[(

#### Topics sorted by recurrence

In [30]:
# Accumulate, per topic id, the absolute weight the topic receives across all
# documents of `corpus` (the HDP document-topic weights at this point).
total_weights = {}
for doc in corpus:
    for topic, weight in doc:
        total_weights[topic] = total_weights.get(topic, 0.0) + abs(weight)

In [31]:
# HDP topic ids ranked by accumulated weight, largest first.
Counter(total_weights).most_common()

[(3, 9.0398592934197666),
 (1, 7.4437717351345771),
 (0, 5.708545232744819),
 (9, 4.5496601654155358),
 (2, 4.5212703344979612),
 (5, 3.1893604821450006),
 (10, 2.8760885814208739),
 (20, 2.75199056992275),
 (7, 2.6619842402686973),
 (19, 2.4602146476643316),
 (6, 2.1573103570688197),
 (11, 2.1072717548198026),
 (4, 2.0163117902370424),
 (13, 1.964027313458335),
 (16, 1.9447080000421639),
 (29, 1.8101023477573031),
 (8, 1.417386700202278),
 (12, 1.0533345702725354),
 (17, 0.99829212366573561),
 (26, 0.99724076116136207),
 (53, 0.9930062549368629),
 (37, 0.99280755368690043),
 (30, 0.99146793468515027),
 (14, 0.99130535990169311),
 (27, 0.98744579358032858),
 (15, 0.92940686592556498),
 (72, 0.89560487225693575),
 (18, 0.8693168086673122),
 (39, 0.84472963347892094),
 (87, 0.80938150486931071),
 (24, 0.78297631356671127),
 (28, 0.77161019282187371),
 (21, 0.76770115319087529),
 (22, 0.76134736645351875),
 (43, 0.76098405683261672),
 (52, 0.63792491374426719),
 (25, 0.63630644113939094),

#### Top 10 topics

In [37]:
# This cell carries execution count 37 but sits before the cell that first
# assigns hdp_best_topics_dct, so a fresh top-to-bottom run raised NameError
# here. Build the lookup in place so the cell works on its own.
hdp_best_topics_dct = dict(hdp_best_topics[1][0])
hdp_best_topics_dct

{0: [('verdad', 0.0067678273629236292),
  ('comisión', 0.0063857159473691546),
  ('justicia', 0.0061018133269391608),
  ('reconocimiento', 0.004515592780729473),
  ('participación', 0.0042060987880528306),
  ('satisfacción', 0.0041200504800445407),
  ('reparación', 0.0038784747446529732),
  ('principios', 0.0033802588787005474),
  ('agenda', 0.0030631727025306065),
  ('repetición', 0.0030146326263065871)],
 1: [('medidas', 0.0062935529128474998),
  ('delegaciones', 0.0037339131631550742),
  ('partes', 0.0032599149312722933),
  ('criterios', 0.0031485856520193827),
  ('pensamiento', 0.0030958027428465381),
  ('selección', 0.0030710460921224373),
  ('menores', 0.0030621804718379587),
  ('confianza', 0.0029714529703338578),
  ('armado', 0.0029206011646252788),
  ('primera', 0.0029057857106824674)],
 2: [('medidas', 0.0061987677253666981),
  ('reúne', 0.0057190264702405265),
  ('delegaciones', 0.0054951029277734128),
  ('subcomisión', 0.0052469815657871042),
  ('grupo', 0.00443107279334301

In [32]:
# Show the ten HDP topics with the largest accumulated weight, each followed
# by its listed (word, weight) pairs and a blank line.
hdp_best_topics_dct = dict(hdp_best_topics[1][0])
for topic_id, weight in Counter(total_weights).most_common()[:10]:
    print(topic_id, '-', weight)
    # The grid search stores hdp.show_topics(N_TOPICS, ...), which lists at
    # most N_TOPICS topics, while the ids in the document-topic weights come
    # from inference and can be larger. A top-ranked id may therefore be
    # missing from the lookup; report that instead of raising KeyError.
    if topic_id in hdp_best_topics_dct:
        pprint(hdp_best_topics_dct[topic_id])
    else:
        print('(topic {} is not among the topics listed for this configuration)'.format(topic_id))
    print()

3 - 9.03985929342
[('comisión', 0.013761389011450405),
 ('expertos', 0.011323061209277159),
 ('informe', 0.0085811064463560127),
 ('informes', 0.0073636868240974348),
 ('delegaciones', 0.0059078751871302947),
 ('género', 0.0048454887146030948),
 ('subcomisión', 0.0045539662345677225),
 ('sub', 0.0040961682043965421),
 ('mujeres', 0.0038463491868553808),
 ('entrega', 0.0037221286344120822)]

1 - 7.44377173513
[('medidas', 0.0062935529128474998),
 ('delegaciones', 0.0037339131631550742),
 ('partes', 0.0032599149312722933),
 ('criterios', 0.0031485856520193827),
 ('pensamiento', 0.0030958027428465381),
 ('selección', 0.0030710460921224373),
 ('menores', 0.0030621804718379587),
 ('confianza', 0.0029714529703338578),
 ('armado', 0.0029206011646252788),
 ('primera', 0.0029057857106824674)]

0 - 5.70854523274
[('verdad', 0.0067678273629236292),
 ('comisión', 0.0063857159473691546),
 ('justicia', 0.0061018133269391608),
 ('reconocimiento', 0.004515592780729473),
 ('participación', 0.0042060987