# Tomo 1

In [1]:
import os
from collections import Counter, defaultdict
from string import punctuation

import nltk
from gensim import corpora, models
from nltk.corpus import stopwords

## Loading data

In [2]:
def key_sort_files(x):
    return int(x[:-4].split('-')[0])

path = 'data/aux/biblioteca/text_parts/1/'
unsorted_file_list = [filename for filename in os.listdir(path) if filename.endswith('.txt')]
file_list = sorted(unsorted_file_list, key=key_sort_files)

raw_texts = []
for filename in file_list:
    with open(path + filename) as f:
        raw_texts.append(f.read())

## Cleaning data

In [3]:
chars = []
for text in raw_texts:
    for c in text:
        if not c.isalnum():
            chars.append(c)

In [4]:
characters = set(chars)

In [5]:
with open('data/stopwords/spanish_stopwords.txt') as f:
    sp_stopwords = list(set(map(str.strip, f.readlines())))

with open('data/stopwords/my_stopwords.txt') as f:
    my_stopwords = list(set(map(str.strip, f.readlines())))

stop = stopwords.words('spanish') + sp_stopwords + my_stopwords + list(punctuation) + list(characters)

In [6]:
def clean(s):
    r = s.lower().strip()
    for c in characters:
        r = r.replace(c, ' ')
    r = r.replace('farc ep', 'farc-ep')
    r = r.replace('confianz a', 'confianza')
    r = r.replace('cons trucción', 'construcción')
    rs = [w for w in nltk.word_tokenize(r) if w not in stop and len(w) > 2 and not w.isdecimal()]
    r = ' '.join(rs)
    return r

In [7]:
cleaned_texts = [clean(text) for text in raw_texts]

In [8]:
char_counter = defaultdict(int)
for c in characters:
    for text in cleaned_texts:
        if c in text:
            char_counter[c]+=text.count(c)
char_counter

defaultdict(int, {' ': 37776, '-': 417})

## Preparing data

In [9]:
documents = cleaned_texts

In [10]:
document = ' '.join(documents)

In [11]:
# counter = Counter(document.split())

# def is_too_common(word, n):
#     return all(map(lambda x: x.count(word) >= n, documents))

In [12]:
# Only texts that contain MORE THAN or EQUALS TO 100 words
texts = [[word for word in document.split()] for document in documents if len(document.split()) >= 100]

In [13]:
len(texts)

35

In [14]:
dictionary = corpora.Dictionary(texts)

In [15]:
len(dictionary)

8832

In [16]:
min_df = 5
max_df = 0.6
max_features = 8832
dictionary.filter_extremes(no_below=min_df, no_above=max_df, keep_n=max_features)

In [17]:
print(dictionary)

Dictionary(984 unique tokens: ['segunda', 'veces', 'fuerzas', 'mil', 'número']...)


In [18]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [19]:
tfidf = models.TfidfModel(corpus)

In [20]:
print(tfidf)

TfidfModel(num_docs=35, num_nnz=8357)


In [21]:
corpus_tfidf = tfidf[corpus]

## Topic analysis

### LSI

In [22]:
N_TOPICS = 35

In [23]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]

In [24]:
lsi.num_topics

35

In [25]:
lsi.print_topics(lsi.num_topics)

[(0,
  '0.155*"mesa" + 0.120*"farc" + 0.113*"fase" + 0.105*"agenda" + 0.103*"delegados" + 0.101*"programas" + 0.094*"exploratorio" + 0.093*"general" + 0.089*"víctimas" + 0.079*"participación"'),
 (1,
  '-0.267*"delegados" + -0.196*"pablo" + 0.182*"programas" + -0.175*"traslado" + -0.167*"cicr" + -0.157*"acta" + -0.142*"catatumbo" + -0.125*"secreto" + -0.123*"plenipotenciario" + -0.123*"farc"'),
 (2,
  '0.230*"plenipotenciario" + -0.225*"pablo" + 0.176*"acta" + 0.172*"exploratorio" + -0.160*"catatumbo" + -0.142*"abrazo" + -0.140*"carta" + 0.135*"actasyacuerdos" + -0.131*"amigo" + 0.130*"delegado"'),
 (3,
  '-0.340*"programas" + 0.159*"sergio" + -0.148*"acceso" + -0.139*"mecanismos" + 0.139*"jaramillo" + 0.132*"doctor" + -0.127*"pablo" + 0.118*"presidencia" + -0.118*"planes" + -0.114*"regionales"'),
 (4,
  '-0.170*"farc" + -0.163*"traslado" + 0.147*"mesa" + -0.145*"mundo" + -0.145*"orchila" + 0.143*"pablo" + 0.141*"estable" + 0.140*"duradera" + 0.139*"delegaciones" + -0.132*"cicr"'),
 (5

In [26]:
def get_predominant_topic(doc):
    return sorted(doc, key=lambda x: abs(x[1]), reverse=True)[0][0]

predominant_topics = []
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    predominant_topics.append(get_predominant_topic(doc))
Counter(predominant_topics).most_common()
# result: (topic_id, number of documents)

[(0, 15),
 (1, 3),
 (2, 3),
 (3, 2),
 (12, 2),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (11, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (23, 1)]

### LDA

In [44]:
N_TOPICS = 6

In [45]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS)
corpus_lda = lda[corpus]

In [46]:
lda.num_topics

6

In [47]:
sorted(lda.print_topics(N_TOPICS))

[(0,
  '0.007*"participación" + 0.007*"víctimas" + 0.007*"años" + 0.007*"mesa" + 0.007*"final" + 0.006*"desarrollo" + 0.006*"agenda" + 0.006*"fase" + 0.006*"marco" + 0.006*"general"'),
 (1,
  '0.010*"mesa" + 0.008*"general" + 0.007*"desarrollo" + 0.007*"farc" + 0.006*"delegados" + 0.005*"participación" + 0.005*"agenda" + 0.005*"marco" + 0.005*"noruega" + 0.005*"conversaciones"'),
 (2,
  '0.013*"delegados" + 0.010*"farc" + 0.009*"venezuela" + 0.009*"partes" + 0.007*"negociación" + 0.006*"noruega" + 0.006*"exploratorio" + 0.006*"mesa" + 0.006*"agenda" + 0.006*"víctimas"'),
 (3,
  '0.008*"farc" + 0.007*"mesa" + 0.007*"agenda" + 0.006*"social" + 0.006*"exploratorio" + 0.006*"pueblo" + 0.006*"marco" + 0.006*"participación" + 0.006*"desarrollo" + 0.006*"general"'),
 (4,
  '0.016*"farc" + 0.006*"hoy" + 0.006*"fase" + 0.006*"exploratorio" + 0.005*"zona" + 0.005*"colombianos" + 0.005*"mesa" + 0.005*"general" + 0.005*"seguridad" + 0.005*"víctimas"'),
 (5,
  '0.013*"víctimas" + 0.007*"seguridad" 

In [31]:
def get_predominant_topic(doc):
    return sorted(doc, key=lambda x: abs(x[1]), reverse=True)[0][0]

predominant_topics = []
for doc in corpus_lda: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    predominant_topics.append(get_predominant_topic(doc))
Counter(predominant_topics).most_common()
# result: (topic_id, number of documents)

[(2, 16), (3, 10), (1, 5), (0, 4)]

### HDP

In [32]:
# N_TOPICS = 4

In [33]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]

In [34]:
hdp.print_topics(10)

[(0,
  '0.007*fase + 0.007*agosto + 0.006*general + 0.006*primera + 0.006*jaramillo + 0.006*comandante + 0.006*marzo + 0.006*armado + 0.006*exploratorio + 0.006*conversaciones'),
 (1,
  '0.014*negociación + 0.013*farc + 0.010*víctimas + 0.005*sistema + 0.005*especial + 0.005*apoyo + 0.005*posible + 0.005*general + 0.005*menos + 0.005*paso'),
 (2,
  '0.012*mesa + 0.008*diálogo + 0.007*desarrollo + 0.006*años + 0.006*guerra + 0.006*presidencia + 0.005*ley + 0.005*diálogos + 0.005*armas + 0.004*miembros'),
 (3,
  '0.008*gran + 0.008*hoy + 0.007*colombianos + 0.006*día + 0.006*social + 0.005*nuevo + 0.005*mundo + 0.005*trabajo + 0.005*fuerzas + 0.005*desarrollo'),
 (4,
  '0.007*fase + 0.006*equipo + 0.006*hoy + 0.005*colombianos + 0.005*quiero + 0.005*trabajo + 0.005*conversaciones + 0.005*víctimas + 0.005*general + 0.004*gracias'),
 (5,
  '0.011*mesa + 0.006*colombiano + 0.006*secreto + 0.005*tema + 0.005*jaramillo + 0.005*farc + 0.005*mauricio + 0.005*opinión + 0.005*ahora + 0.005*hablar

In [35]:
def get_predominant_topic(doc):
    return sorted(doc, key=lambda x: abs(x[1]), reverse=True)[0][0]

predominant_topics = []
for doc in corpus_hdp: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    predominant_topics.append(get_predominant_topic(doc))
Counter(predominant_topics).most_common()
# result: (topic_id, number of documents)

[(0, 3),
 (5, 3),
 (4, 2),
 (7, 2),
 (11, 2),
 (13, 2),
 (24, 2),
 (1, 1),
 (2, 1),
 (3, 1),
 (6, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (12, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (42, 1)]