# Tomo 7

In [1]:
from collections import Counter
from string import punctuation

import nltk
from nltk.corpus import stopwords

## Loading data

In [2]:
# Read the whole source text into memory. The encoding is stated explicitly so
# the accented characters decode the same way on every machine instead of
# following the locale default (assumes the file is stored as UTF-8 -- confirm).
with open('data/aux/biblioteca/texts/7.txt', encoding='utf-8') as f:
    text = f.read()

## Cleaning data

In [3]:
# Every character of the raw text that is neither a letter nor a digit
# (duplicates kept, in order of appearance).
chars = [symbol for symbol in text if not symbol.isalnum()]

In [4]:
# Distinct non-alphanumeric characters found in the text; `clean` replaces each
# of them with a space and they are also appended to the stop list.
characters = set(chars)

In [5]:
def _read_stopwords(path):
    """Return the distinct, whitespace-stripped lines of the file at ``path``.

    The result is a list in arbitrary order (it is built from a set).
    """
    # Explicit encoding so accented stop words are decoded identically on
    # every platform (assumes the lists are stored as UTF-8 -- confirm).
    with open(path, encoding='utf-8') as f:
        return list({line.strip() for line in f})


sp_stopwords = _read_stopwords('data/stopwords/spanish_stopwords.txt')
my_stopwords = _read_stopwords('data/stopwords/my_stopwords.txt')

# Full list of tokens to discard: NLTK's Spanish stop words, the two local
# lists, ASCII punctuation, and every non-alphanumeric character seen in the text.
stop = stopwords.words('spanish') + sp_stopwords + my_stopwords + list(punctuation) + list(characters)

In [6]:
def clean(s):
    """Normalise raw text into a space-separated string of kept tokens.

    The text is lower-cased and stripped, every character in the global
    ``characters`` set is replaced by a space, a few known broken words are
    repaired, and the result is tokenised with ``nltk.word_tokenize``.
    A token is dropped when it is in the global ``stop`` list, is shorter
    than three characters, or consists only of decimal digits.

    Args:
        s: Text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # One translation pass instead of a full-string replace per character.
    # str.maketrans needs single-character keys, which is what `characters`
    # holds (it is built by iterating over a string).
    to_space = str.maketrans({c: ' ' for c in characters})
    r = s.lower().strip().translate(to_space)

    # Repairs applied after the substitution: restore the hyphen of "farc-ep"
    # (removed above whenever '-' is in `characters`) and rejoin two words that
    # appear split by a stray space (presumably extraction artefacts).
    r = r.replace('farc ep', 'farc-ep')
    r = r.replace('confianz a', 'confianza')
    r = r.replace('cons trucción', 'construcción')

    # Set membership keeps the per-token stop-word test constant-time; the
    # global `stop` itself is left untouched.
    stop_set = set(stop)
    rs = [
        w for w in nltk.word_tokenize(r)
        if w not in stop_set and len(w) > 2 and not w.isdecimal()
    ]
    return ' '.join(rs)

In [7]:
# Run the cleaning pipeline over the whole text.
cleaned_text = clean(text)

In [8]:
# Sanity check: report each non-alphanumeric character that still occurs in
# the cleaned text, together with how many times it appears.
for c in characters:
    occurrences = cleaned_text.count(c)
    if occurrences:
        print(c, occurrences)

- 218
  56741


## Processing data

In [9]:
# Token frequencies: the cleaned text is split on whitespace and each token counted.
counter = Counter(cleaned_text.split())

In [10]:
# Show the 100 most frequent tokens as (token, count) pairs, most frequent first.
counter.most_common(100)

[('paz', 1026),
 ('mujeres', 854),
 ('nacional', 681),
 ('participación', 602),
 ('mesa', 510),
 ('organizaciones', 501),
 ('género', 474),
 ('foro', 440),
 ('conversaciones', 405),
 ('acuerdo', 368),
 ('víctimas', 359),
 ('gobierno', 338),
 ('conflicto', 305),
 ('derechos', 293),
 ('proceso', 271),
 ('colombia', 269),
 ('propuestas', 261),
 ('enfoque', 233),
 ('construcción', 226),
 ('farc-ep', 218),
 ('subcomisión', 216),
 ('acuerdos', 202),
 ('participantes', 196),
 ('general', 192),
 ('trabajo', 192),
 ('desarrollo', 186),
 ('foros', 177),
 ('implementación', 174),
 ('comunidades', 168),
 ('personas', 165),
 ('pueblos', 164),
 ('habana', 162),
 ('indígenas', 161),
 ('política', 161),
 ('sociedad', 158),
 ('mesas', 156),
 ('fin', 155),
 ('diferentes', 152),
 ('violencia', 151),
 ('especial', 150),
 ('país', 144),
 ('manera', 144),
 ('final', 141),
 ('territorios', 140),
 ('representantes', 138),
 ('universidad', 135),
 ('naciones', 134),
 ('unidas', 132),
 ('sectores', 128),
 ('soci

## Saving data

In [11]:
# Persist the cleaned text. The encoding is explicit so the accented characters
# are written as UTF-8 on every platform rather than in the locale's default
# encoding.
with open('data/out/cleaned_tomo_7.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_text)