# Tomo 1

In [1]:
import os
import re
from collections import Counter, defaultdict
from string import punctuation

import nltk
import numpy as np
from gensim import corpora, models
from nltk.corpus import stopwords
from scipy import spatial

## Loading data

In [2]:
def key_sort_files(x):
    """Return the integer sort key encoded at the start of a file name.

    The last four characters of ``x`` (the extension, e.g. ``.txt``) are
    dropped, and whatever precedes the first ``-`` in the remainder is
    converted to ``int``.
    """
    stem = x[:-4]
    prefix, _, _ = stem.partition('-')
    return int(prefix)

path = 'data/aux/biblioteca/text_parts/1/'

# Keep only the .txt parts, then order them by their numeric file-name prefix.
unsorted_file_list = list(filter(lambda name: name.endswith('.txt'), os.listdir(path)))
file_list = list(unsorted_file_list)
file_list.sort(key=key_sort_files)

# Read every part fully into memory, preserving the sorted order.
# NOTE(review): files are opened with the platform default encoding — confirm
# that matches how the texts were saved.
raw_texts = []
for filename in file_list:
    with open(path + filename) as f:
        contents = f.read()
    raw_texts.append(contents)

## Cleaning data

In [3]:
# Every non-alphanumeric character occurrence found in the raw texts
# (duplicates included).
chars = [c for text in raw_texts for c in text if not c.isalnum()]

In [4]:
# Distinct non-alphanumeric characters found in the raw texts.
characters = set(chars)

In [5]:
# Each stop-word file is read line by line; surrounding whitespace is stripped
# and duplicates are removed.
with open('data/stopwords/spanish_stopwords.txt') as f:
    sp_stopwords = list({line.strip() for line in f})

with open('data/stopwords/my_stopwords.txt') as f:
    my_stopwords = list({line.strip() for line in f})

# Full exclusion list: NLTK Spanish stop words, both custom lists, ASCII
# punctuation and every non-alphanumeric character seen in the raw texts.
stop = (
    stopwords.words('spanish')
    + sp_stopwords
    + my_stopwords
    + list(punctuation)
    + list(characters)
)

In [6]:
def clean(s):
    """Normalise one raw text into a space-separated string of kept tokens.

    Steps:
      1. lower-case and strip the text;
      2. replace every character in the module-level ``characters`` set
         with a space;
      3. re-join three known fragments that step 2 (or the source text)
         split apart;
      4. tokenise with ``nltk.word_tokenize`` and keep tokens that are not
         in the module-level ``stop`` list, are longer than two characters
         and are not purely decimal digits.

    Returns the kept tokens joined by single spaces.
    """
    r = s.lower().strip()
    for c in characters:
        r = r.replace(c, ' ')
    # Targeted repairs for fragments known to be broken at this point.
    r = r.replace('farc ep', 'farc-ep')
    r = r.replace('confianz a', 'confianza')
    r = r.replace('cons trucción', 'construcción')
    # `stop` is a list; build a set once so each token lookup is O(1)
    # instead of a linear scan of the whole list.
    stop_set = set(stop)
    rs = [
        w for w in nltk.word_tokenize(r)
        if w not in stop_set and len(w) > 2 and not w.isdecimal()
    ]
    return ' '.join(rs)

In [7]:
# Cleaned version of every raw text, in the same order.
cleaned_texts = list(map(clean, raw_texts))

In [8]:
# Count how often each non-alphanumeric character still occurs after cleaning.
# Characters that no longer occur get no entry.
char_counter = defaultdict(int)
for c in characters:
    occurrences = sum(text.count(c) for text in cleaned_texts)
    if occurrences:
        char_counter[c] += occurrences
char_counter

defaultdict(int, {' ': 37776, '-': 417})

## Preparing data

In [9]:
# Alias, not a copy: `documents` refers to the same list as `cleaned_texts`.
documents = cleaned_texts

In [10]:
# All cleaned documents concatenated into one space-separated string.
# NOTE(review): `document` is not referenced again in the visible cells, and the
# same name is reused as a comprehension variable below — confirm it is needed.
document = ' '.join(documents)

In [11]:
# Keep only the documents with at least 100 whitespace-separated tokens;
# each kept document becomes a list of its tokens.
texts = [tokens for tokens in map(str.split, documents) if len(tokens) >= 100]

In [12]:
# Number of documents that passed the >= 100-token filter.
len(texts)

35

In [13]:
# Build a gensim Dictionary from the tokenised documents.
dictionary = corpora.Dictionary(texts)

In [14]:
# Dictionary size before any pruning (filter_extremes is applied in a later cell).
len(dictionary)

8832

## Parameters

In [15]:
def extract_vocabulary(ts):
    """Map every double-quoted word found in the topic strings to an index.

    Parameters
    ----------
    ts : iterable of str
        Topic descriptions such as ``'0.006*"casa" + 0.005*"cocina"'``.

    Returns
    -------
    dict
        ``{word: position}`` where positions follow the sorted order of the
        distinct words.
    """
    words = set()
    for topic in ts:
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        words.update(re.findall(r'"(\w+)"', topic))
    return {word: index for index, word in enumerate(sorted(words))}

def get_weights(t):
    """Parse ``weight*"word"`` terms out of one topic description.

    Parameters
    ----------
    t : str
        Topic description such as ``'0.006*"casa" + -0.005*"cocina"'``.

    Returns
    -------
    dict
        ``{word: weight}`` with weights converted to ``float``.

    The decimal point is matched literally (an unescaped ``.`` would accept
    any character and could hand ``float`` an unparsable string), an optional
    leading minus sign is kept so negative weights are not silently turned
    positive, and any number of integer/fraction digits is accepted.
    """
    pairs = re.findall(r'(-?\d+\.\d+)\*"(\w+)"', t)
    return {word: float(weight) for weight, word in pairs}

def create_vector(vocab, weights):
    """Lay the word weights out as a list indexed by vocabulary position.

    ``vocab`` maps word -> index and ``weights`` maps word -> weight.
    Positions whose word has no weight stay ``0``. A word missing from
    ``vocab`` raises ``KeyError``.
    """
    vector = [0] * len(vocab)
    for word, weight in weights.items():
        vector[vocab[word]] = weight
    return vector

def vectorize(ts):
    """Turn ``(topic_id, description)`` pairs into aligned weight vectors.

    The vocabulary is built from all descriptions together, so every
    returned vector has the same length and column order. One vector is
    returned per topic, in input order.
    """
    descriptions = [description for _, description in ts]
    vocab = extract_vocabulary(descriptions)
    return [
        create_vector(vocab, get_weights(description))
        for description in descriptions
    ]

In [16]:
def two_topics_distance(t1, t2):
    """Return the cosine distance between two topic weight vectors."""
    return spatial.distance.cosine(t1, t2)

def multiple_topics_distance(ts):
    """Return the mean pairwise distance over all unordered pairs in ``ts``.

    Every pair ``(i, j)`` with ``i < j`` is measured once with
    ``two_topics_distance`` and the results are averaged with ``np.mean``.
    """
    n_topics = len(ts)
    pairwise = [
        two_topics_distance(ts[first], ts[second])
        for first in range(n_topics - 1)
        for second in range(first + 1, n_topics)
    ]
    return np.mean(pairwise)

def best_topics(ts_lst):
    """Return the topic set whose topics are, on average, farthest apart.

    Each candidate in ``ts_lst`` is vectorised and scored with
    ``multiple_topics_distance``. The ``(score, candidate)`` tuples are
    sorted in descending order and the candidate of the first tuple is
    returned.
    """
    scored = [(multiple_topics_distance(vectorize(ts)), ts) for ts in ts_lst]
    scored.sort(reverse=True)
    _, winner = scored[0]
    return winner

In [17]:
# First hand-written example topic set: (topic_id, description) pairs.
ts1 = [
    (0, '0.006*"casa" + 0.005*"cocina" + 0.002*"coche"'),
    (1, '0.007*"coche" + 0.006*"motor" + 0.003*"carretera"'),
    (2, '0.007*"colegio" + 0.004*"casa" + 0.003*"coche"'),
]

In [18]:
# Show the weight vectors built from ts1: one row per topic, one column per
# vocabulary word in sorted order.
vectorize(ts1)

[[0, 0.006, 0.002, 0.005, 0, 0],
 [0.003, 0, 0.007, 0, 0, 0.006],
 [0, 0.004, 0.003, 0, 0.007, 0]]

In [19]:
# Second hand-written example topic set: (topic_id, description) pairs.
ts2 = [
    (0, '0.009*"comedor" + 0.003*"cocina" + 0.002*"casa"'),
    (1, '0.007*"coche" + 0.006*"motor" + 0.003*"carretera"'),
    (2, '0.007*"colegio" + 0.004*"aula" + 0.003*"casa"'),
]

In [20]:
# Candidate topic sets to compare with best_topics.
ts_lst = [ts1, ts2]

In [21]:
# Display the candidate with the largest mean pairwise cosine distance between its topics.
best_topics(ts_lst)

[(0, '0.009*"comedor" + 0.003*"cocina" + 0.002*"casa"'),
 (1, '0.007*"coche" + 0.006*"motor" + 0.003*"carretera"'),
 (2, '0.007*"colegio" + 0.004*"aula" + 0.003*"casa"')]

In [22]:
# Pruning thresholds, forwarded to filter_extremes as no_below / no_above / keep_n.
# max_features equals the dictionary size displayed before pruning.
min_df = 5
max_df = 0.6
max_features = 8832
# NOTE(review): the return value is not used, so this appears to prune `dictionary`
# in place; re-running earlier cells that display its size would then show the
# pruned size — confirm and consider pruning a copy.
dictionary.filter_extremes(no_below=min_df, no_above=max_df, keep_n=max_features)

In [23]:
# Summarise the dictionary after the filter_extremes call.
print(dictionary)

Dictionary(984 unique tokens: ['firmado', 'lucha', 'ampliar', 'subsidios', 'dinámica']...)


In [24]:
# Convert each tokenised document with dictionary.doc2bow (bag-of-words form),
# using the dictionary as pruned above.
corpus = [dictionary.doc2bow(text) for text in texts]

In [25]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [26]:
# Summarise the fitted TF-IDF model.
print(tfidf)

TfidfModel(num_docs=35, num_nnz=8357)


In [27]:
# Apply the TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]

## Topic analysis

### LSI

In [28]:
# Number of topics requested from the LSI model below.
N_TOPICS = 35

In [29]:
# Fit LSI on the TF-IDF corpus, then wrap that corpus in the LSI transformation.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]

In [30]:
# Number of topics held by the fitted LSI model.
lsi.num_topics

35

In [42]:
# Sanity check: show the return type of print_topics for the LSI model.
type(lsi.print_topics(lsi.num_topics))

list

In [32]:
def get_predominant_topic(doc):
    """Return the id of the topic with the largest absolute weight.

    ``doc`` is a sequence of ``(topic_id, weight)`` pairs. When several
    topics share the largest absolute weight, the one that appears first
    in ``doc`` is returned. An empty ``doc`` raises ``IndexError``.
    """
    ranked = sorted(doc, key=lambda pair: abs(pair[1]), reverse=True)
    return ranked[0][0]

# Iterating corpus_lsi executes the bow->tfidf and tfidf->lsi transformations on the fly.
predominant_topics = [get_predominant_topic(doc) for doc in corpus_lsi]
# Displayed as (topic_id, number of documents) pairs.
Counter(predominant_topics).most_common()

[(0, 15),
 (1, 3),
 (2, 3),
 (3, 2),
 (12, 2),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (11, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (23, 1)]

### LDA

In [33]:
# Number of topics requested from the LDA model below.
# NOTE(review): this rebinds N_TOPICS, which the LSI section set to 35.
N_TOPICS = 6

In [34]:
# LDA training is stochastic; fix the seed so the topics below are reproducible
# on Restart & Run All.
LDA_RANDOM_STATE = 42
lda = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=N_TOPICS,
    random_state=LDA_RANDOM_STATE,
)
# Wrap the bag-of-words corpus in the LDA transformation.
corpus_lda = lda[corpus]

In [35]:
# Number of topics held by the fitted LDA model.
lda.num_topics

6

In [36]:
# Ask the model for its own topic count instead of the N_TOPICS global, which is
# rebound between sections; this matches how the LSI cell queries its model.
sorted(lda.print_topics(lda.num_topics))

[(0,
  '0.015*"farc" + 0.008*"zona" + 0.007*"general" + 0.006*"alto" + 0.006*"conversaciones" + 0.006*"carlos" + 0.006*"mesa" + 0.006*"marco" + 0.006*"seguridad" + 0.005*"comisionado"'),
 (1,
  '0.010*"mesa" + 0.008*"participación" + 0.007*"agenda" + 0.007*"desarrollo" + 0.006*"marco" + 0.006*"exploratorio" + 0.006*"general" + 0.005*"fase" + 0.005*"negociación" + 0.005*"guerra"'),
 (2,
  '0.012*"farc" + 0.009*"exploratorio" + 0.009*"delegados" + 0.007*"negociación" + 0.006*"víctimas" + 0.006*"armado" + 0.006*"noruega" + 0.006*"agenda" + 0.005*"primera" + 0.005*"marco"'),
 (3,
  '0.007*"agenda" + 0.007*"farc" + 0.007*"mesa" + 0.007*"desarrollo" + 0.006*"participación" + 0.006*"años" + 0.006*"víctimas" + 0.005*"delegados" + 0.005*"colombianos" + 0.005*"seguridad"'),
 (4,
  '0.014*"víctimas" + 0.008*"farc" + 0.007*"mesa" + 0.007*"general" + 0.006*"hoy" + 0.006*"colombianos" + 0.005*"desarrollo" + 0.005*"años" + 0.005*"negociación" + 0.005*"partes"'),
 (5,
  '0.011*"delegados" + 0.007*"ven

In [37]:
# get_predominant_topic is defined once, in the LSI section above, and reused here
# rather than redefined.
# Iterating corpus_lda applies the LDA transformation to each bag-of-words document.
predominant_topics = [get_predominant_topic(doc) for doc in corpus_lda]
# Displayed as (topic_id, number of documents) pairs.
Counter(predominant_topics).most_common()

[(1, 10), (3, 8), (0, 5), (2, 4), (4, 4), (5, 4)]

### HDP

In [38]:
# No topic count is set in this section: the HdpModel below is constructed without one.
# N_TOPICS = 4

In [39]:
# HDP training is stochastic; fix the seed so the topic assignments below are
# reproducible on Restart & Run All.
HDP_RANDOM_STATE = 42
hdp = models.HdpModel(corpus, id2word=dictionary, random_state=HDP_RANDOM_STATE)
# Wrap the bag-of-words corpus in the HDP transformation.
corpus_hdp = hdp[corpus]

In [46]:
# Sanity check: show the return type of print_topics for the HDP model.
type(hdp.print_topics(2))

list

In [41]:
# get_predominant_topic is defined once, in the LSI section above, and reused here
# rather than redefined.
# Iterating corpus_hdp applies the HDP transformation to each bag-of-words document.
predominant_topics = [get_predominant_topic(doc) for doc in corpus_hdp]
# Displayed as (topic_id, number of documents) pairs.
Counter(predominant_topics).most_common()

[(6, 5),
 (0, 3),
 (1, 3),
 (2, 2),
 (7, 2),
 (9, 2),
 (21, 2),
 (3, 1),
 (4, 1),
 (5, 1),
 (8, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (27, 1),
 (29, 1)]