# Tomo 1

In [1]:
import os
from collections import Counter, defaultdict
from pprint import pprint
from string import punctuation

import nltk
import numpy as np
from gensim import corpora, models
from nltk.corpus import stopwords
from scipy import spatial

## Loading data

In [2]:
def key_sort_files(x):
    """Return the integer prefix of a file name, for use as a sort key.

    The last four characters of ``x`` are dropped (the extension, e.g.
    ``.txt``) and the text before the first ``-`` of what remains is
    converted with ``int``.  Raises ``ValueError`` if that prefix is not a
    valid integer literal.
    """
    stem = x[:-4]
    leading, _sep, _rest = stem.partition('-')
    return int(leading)

# Directory holding the plain-text parts to analyse.
path = 'data/aux/biblioteca/text_parts/7/'
unsorted_file_list = [filename for filename in os.listdir(path) if filename.endswith('.txt')]
# os.listdir returns names in arbitrary order, so order them by numeric prefix.
file_list = sorted(unsorted_file_list, key=key_sort_files)

# Read every part fully into memory, in sorted order.
raw_texts = []
for filename in file_list:
    # The encoding is given explicitly so the accented Spanish text decodes the
    # same way on every machine instead of following the locale default.
    # NOTE(review): assumes the .txt parts are stored as UTF-8 — confirm.
    with open(os.path.join(path, filename), encoding='utf-8') as f:
        raw_texts.append(f.read())

## Cleaning data

In [3]:
# Collect every non-alphanumeric character occurrence found in the raw texts
# (duplicates included; they are collapsed into a set afterwards).
chars = []
for text in raw_texts:
    chars.extend(symbol for symbol in text if not symbol.isalnum())

In [4]:
# Distinct non-alphanumeric characters collected in `chars`.
characters = set(chars)

In [5]:
def _read_word_list(filename):
    """Return the distinct, whitespace-stripped lines of a word-list file.

    The file is decoded as UTF-8 explicitly so accented stop words are read
    identically regardless of the machine's locale default.
    NOTE(review): assumes the stop-word files are stored as UTF-8 — confirm.
    """
    with open(filename, encoding='utf-8') as f:
        return list(set(map(str.strip, f)))


sp_stopwords = _read_word_list('data/stopwords/spanish_stopwords.txt')
my_stopwords = _read_word_list('data/stopwords/my_stopwords.txt')

# Everything that must be dropped from the token stream: NLTK's Spanish stop
# words, the two local word lists, ASCII punctuation and every
# non-alphanumeric character found in the corpus.
stop = stopwords.words('spanish') + sp_stopwords + my_stopwords + list(punctuation) + list(characters)

In [6]:
def clean(s):
    """Normalise one raw text into a space-separated string of kept tokens.

    Steps: lower-case and strip ``s``; turn every character listed in the
    module-level ``characters`` collection into a space; repair three known
    split words; tokenise with ``nltk.word_tokenize``; drop tokens that are
    in the module-level ``stop`` list, have two characters or fewer, or
    consist only of decimal digits.

    Args:
        s: the raw text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # One translation pass replaces the per-character str.replace loop; the
    # result is the same because every character maps to a single space.
    # `characters` holds single-character strings, as str.maketrans requires.
    to_space = str.maketrans({c: ' ' for c in characters})
    r = s.lower().strip().translate(to_space)
    # '-' was turned into a space above, so the hyphenated name is restored
    # here; the other two replacements rejoin words that appear split in the
    # source text.
    r = r.replace('farc ep', 'farc-ep')
    r = r.replace('confianz a', 'confianza')
    r = r.replace('cons trucción', 'construcción')
    # Set lookup instead of scanning the `stop` list for every token.
    stop_set = set(stop)
    rs = [w for w in nltk.word_tokenize(r)
          if w not in stop_set and len(w) > 2 and not w.isdecimal()]
    return ' '.join(rs)

In [7]:
# Apply `clean` to every raw text, keeping the original order.
cleaned_texts = list(map(clean, raw_texts))

In [8]:
# Sanity check: count how often each non-alphanumeric character still occurs
# in the cleaned texts. Characters that never occur get no entry.
char_counter = defaultdict(int)
for c in characters:
    occurrences = sum(text.count(c) for text in cleaned_texts)
    if occurrences:
        char_counter[c] = occurrences
char_counter

defaultdict(int, {' ': 53855, '-': 200})

## Preparing data

In [9]:
# Alias: the cleaned texts are the documents used by the following cells.
documents = cleaned_texts

In [10]:
# All documents concatenated into a single space-separated string.
# NOTE(review): no later cell visible in this notebook reads `document` — confirm before removing.
document = ' '.join(documents)

In [11]:
# Keep only the documents that contain 100 words or more, each one as its
# list of whitespace-separated words.
texts = [words for words in map(str.split, documents) if len(words) >= 100]

In [12]:
# Number of documents kept in `texts`.
len(texts)

46

In [13]:
# gensim dictionary built from all tokens in `texts` (token <-> integer id).
dictionary = corpora.Dictionary(texts)

In [14]:
# Number of entries in the unfiltered dictionary.
len(dictionary)

7829

## Processing data

In [15]:
%%time

# lsi_topics_lists = []
lda_topics_lists = []
hdp_topics_lists = []

# Size of the unfiltered vocabulary, computed from `texts` directly rather
# than read from the `dictionary` global: that name is rebound to a filtered
# dictionary inside the loop below, so re-running this cell would otherwise
# silently change the last MAX_FT candidate.
full_vocabulary_size = len(corpora.Dictionary(texts))

# Hyper-parameter grid.
MIN_DFs = [1, 2, 4, 7]                       # filter_extremes(no_below=...)
MAX_DFs = [0.6, 0.7, 0.8, 0.9]               # filter_extremes(no_above=...)
MAX_FTs = [100, 1000, full_vocabulary_size]  # filter_extremes(keep_n=...)
N_TOPICSs = [10, len(texts), 100]            # LdaModel(num_topics=...)

total_iterations = len(MIN_DFs) * len(MAX_DFs) * len(MAX_FTs) * len(N_TOPICSs)
iteration = 1
for MIN_DF in MIN_DFs:
    for MAX_DF in MAX_DFs:
        for MAX_FT in MAX_FTs:
            # Fresh dictionary per configuration: filter_extremes mutates it.
            dictionary = corpora.Dictionary(texts)
            dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF, keep_n=MAX_FT)
            corpus = [dictionary.doc2bow(text) for text in texts]
            # Only the commented-out LSI experiment below consumes the TF-IDF corpus.
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]

            # HDP takes no N_TOPICS argument and is seeded, so it is trained
            # once per dictionary configuration and the same result is
            # recorded for every N_TOPICS value below.
            hdp = models.HdpModel(corpus, id2word=dictionary, random_state=0)
            corpus_hdp = hdp[corpus]
            # num_topics=-1 asks for all topics in a single call.
            hdp_ts = hdp.show_topics(num_topics=-1, num_words=10, formatted=False)

            for N_TOPICS in N_TOPICSs:
                print('Iteration: {} / {}'.format(iteration, total_iterations),
                      '- MIN_DF:', MIN_DF, '- MAX_DF:', MAX_DF, '- MAX_FT:', MAX_FT, '- N_TOPICS:', N_TOPICS)
                iteration += 1

                # LSI (disabled)
                # lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
                # corpus_lsi = lsi[corpus_tfidf]
                # ts = lsi.show_topics(lsi.num_topics, formatted=False)
                # result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lsi)
                # lsi_topics_lists.append(result)

                # LDA
                lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS, random_state=0)
                corpus_lda = lda[corpus]
                ts = sorted(lda.show_topics(lda.num_topics, formatted=False))
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lda)
                lda_topics_lists.append(result)

                # HDP: one entry per N_TOPICS so both result lists keep one
                # record per grid iteration.
                result = (hdp_ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_hdp)
                hdp_topics_lists.append(result)

Iteration: 1 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 2 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 46
Iteration: 3 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 4 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 5 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 46
Iteration: 6 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 100
Iteration: 7 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 7829 - N_TOPICS: 10
Iteration: 8 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 7829 - N_TOPICS: 46
Iteration: 9 / 144 - MIN_DF: 1 - MAX_DF: 0.6 - MAX_FT: 7829 - N_TOPICS: 100
Iteration: 10 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 11 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 46
Iteration: 12 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 13 / 144 - MIN_DF: 1 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 14 / 144 - M

Iteration: 109 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 110 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 46
Iteration: 111 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 112 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 113 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 46
Iteration: 114 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 1000 - N_TOPICS: 100
Iteration: 115 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 7829 - N_TOPICS: 10
Iteration: 116 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 7829 - N_TOPICS: 46
Iteration: 117 / 144 - MIN_DF: 7 - MAX_DF: 0.6 - MAX_FT: 7829 - N_TOPICS: 100
Iteration: 118 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 119 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 46
Iteration: 120 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 100
Iteration: 121 / 144 - MIN_DF: 7 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 10
I

In [16]:
def extract_vocabulary(ts):
    """Build a word -> column-index mapping from a collection of topics.

    Args:
        ts: iterable of topics, each an iterable of ``(word, weight)`` pairs.

    Returns:
        dict mapping every distinct word to its position in the
        alphabetically sorted vocabulary.
    """
    distinct_words = {word for topic in ts for word, _weight in topic}
    return {word: index for index, word in enumerate(sorted(distinct_words))}

def get_weights(t):
    """Return the ``(word, weight)`` pairs of topic ``t`` as a new dict."""
    weights = {}
    weights.update(t)
    return weights

def create_vector(vocab, weights):
    """Lay out a topic's word weights as a dense vector over ``vocab``.

    Args:
        vocab: dict mapping word -> vector position.
        weights: dict mapping word -> weight; every key must be in ``vocab``.

    Returns:
        list of ``len(vocab)`` entries, 0 wherever the topic has no weight.
    """
    vector = [0] * len(vocab)
    for word, weight in weights.items():
        vector[vocab[word]] = weight
    return vector

def vectorize(ts):
    """Turn a list of topics into dense vectors over their shared vocabulary.

    Args:
        ts: iterable of 3-tuples whose third element is the topic's list of
            ``(word, weight)`` pairs; the first two elements are ignored.

    Returns:
        list with one vector (list of weights) per topic, all indexed by the
        same vocabulary built from the words of every topic in ``ts``.
    """
    topic_terms = [terms for _first, _second, terms in ts]
    vocab = extract_vocabulary(topic_terms)
    return [create_vector(vocab, get_weights(terms)) for terms in topic_terms]

In [17]:
def two_topics_distance(t1, t2):
    """Return the cosine distance between topic vectors ``t1`` and ``t2``."""
    return spatial.distance.cosine(t1, t2)

def multiple_topics_distance(ts):
    """Return the mean cosine distance over every unordered pair of vectors.

    Args:
        ts: sequence of topic vectors.

    Returns:
        ``np.mean`` of the pairwise distances; with fewer than two vectors
        the mean is taken over an empty list.
    """
    count = len(ts)
    pairwise = [
        two_topics_distance(ts[first], ts[second])
        for first in range(count - 1)
        for second in range(first + 1, count)
    ]
    return np.mean(pairwise)

def top_n_used(ts_list, n):
    """Select the ``n`` topics that carry the most weight across a corpus.

    Args:
        ts_list: indexable of ``(topics, params, corpus)`` where ``topics``
            is a list of ``(topic_id, terms)`` pairs and ``corpus`` yields,
            per document, ``(topic_id, weight)`` pairs.
        n: how many topics to keep.

    Returns:
        ``(top, params, corpus)`` where ``top`` lists
        ``(total_weight, topic_id, terms)`` in decreasing order of the summed
        absolute weight; ``params`` and ``corpus`` are passed through.
    """
    topics, params, corpus = ts_list[0], ts_list[1], ts_list[2]

    usage = defaultdict(float)
    for doc in corpus:
        for topic_id, weight in doc:
            usage[topic_id] += abs(weight)

    terms_by_topic = dict(topics)
    ranked = Counter(dict(usage)).most_common(n)
    top = [(total, topic_id, terms_by_topic[topic_id]) for topic_id, total in ranked]
    return (top, params, corpus)

def best_topics(ts_lists, n=10):
    """Return the run whose ``n`` most-used topics are the most dissimilar.

    For every run in ``ts_lists`` the ``n`` most-used topics are selected,
    vectorised, and scored by their mean pairwise cosine distance.

    Args:
        ts_lists: list of ``(topics, params, corpus)`` runs.
        n: number of most-used topics compared per run.

    Returns:
        ``(score, top_n_result)`` of the highest-ranked run.  Candidates are
        ordered by comparing the whole ``(score, top_n_result)`` tuples, so
        equal scores fall through to comparing the nested values.  Raises
        ``IndexError`` when ``ts_lists`` is empty.
    """
    scored = []
    for candidate in ts_lists:
        top = top_n_used(candidate, n)
        score = multiple_topics_distance(vectorize(top[0]))
        scored.append((score, top))
    scored.sort(reverse=True)
    return scored[0]

In [18]:
%%time

# print('LSI')
# lsi_best_topics_list = best_topics(lsi_topics_lists)
# For each model family, pick the grid-search run that best_topics ranks
# first (largest mean pairwise cosine distance among its most-used topics).
print('LDA')
lda_best_topics_list = best_topics(lda_topics_lists)
print('HDP')
hdp_best_topics_list = best_topics(hdp_topics_lists)

LDA
HDP
CPU times: user 1min 14s, sys: 1min 14s, total: 2min 28s
Wall time: 38.2 s


## Results

### LSI

In [19]:
# lsi_best_topics_list[1][0]

### LDA

In [20]:
# [1] is the top_n_used result of the winning LDA run; its [0] is the list of
# (total weight, topic id, topic terms) tuples.
lda_best_topics_list[1][0]

[(2.9281185753643513,
  49,
  [('foro', 0.014351286),
   ('pueblos', 0.012354096),
   ('consultas', 0.011176488),
   ('mesas', 0.0096873445),
   ('representantes', 0.0090755615),
   ('comunidades', 0.0084993485),
   ('organización', 0.0078254649),
   ('participantes', 0.0075355857),
   ('étnico', 0.0069663241),
   ('comisión', 0.0066033257)]),
 (2.8275989601388574,
  68,
  [('territorios', 0.025001768),
   ('inicio', 0.022780633),
   ('pedagogía', 0.021334987),
   ('foro', 0.017387316),
   ('territorial', 0.015666567),
   ('capacidades', 0.013118932),
   ('proyecto', 0.012521289),
   ('actores', 0.010379532),
   ('diálogos', 0.010261009),
   ('convivencia', 0.0095848795)]),
 (2.400023702532053,
  10,
  [('pueblos', 0.025529699),
   ('territorios', 0.014066815),
   ('étnica', 0.012898666),
   ('étnico', 0.012449214),
   ('comisión', 0.011285299),
   ('étnicos', 0.011060828),
   ('capítulo', 0.011057509),
   ('afrodescendientes', 0.0080890832),
   ('territoriales', 0.0076786876),
   ('pe

### HDP

In [21]:
# [1] is the top_n_used result of the winning HDP run; its [0] is the list of
# (total weight, topic id, topic terms) tuples.
hdp_best_topics_list[1][0]

[(3.1873839353578641,
  1,
  [('foro', 0.031710169489529053),
   ('mesas', 0.013839864270060094),
   ('participantes', 0.01197594584791448),
   ('universidad', 0.010153316467709743),
   ('sectores', 0.0092518558130842716),
   ('actores', 0.0079997492461898571),
   ('diferentes', 0.0078906784879175986),
   ('movimientos', 0.0075832429399493176),
   ('políticos', 0.0072151437980676678),
   ('importantes', 0.0072063790910216422)]),
 (3.1428975955273675,
  3,
  [('sociedad', 0.010403642591430205),
   ('civil', 0.0077161599356565039),
   ('integral', 0.0076318415226086173),
   ('través', 0.0069911069612856227),
   ('social', 0.0061677876697838986),
   ('agenda', 0.0061386666866666871),
   ('recomendaciones', 0.0061150440848340556),
   ('iniciativas', 0.0060643498978363766),
   ('medio', 0.0056871701251030964),
   ('importantes', 0.0051027157356209737)]),
 (3.0672081836365859,
  2,
  [('comunidades', 0.010200322398191238),
   ('medidas', 0.0097055154085956431),
   ('protección', 0.0094036789