In [1]:
import csv
import os
from collections import Counter
from collections import defaultdict
from string import punctuation

import enchant
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams

# Lexicon

In [2]:
def remove_accent_marks(w):
    r = w
    r = r.replace('á', 'a')
    r = r.replace('é', 'e')
    r = r.replace('í', 'i')
    r = r.replace('ó', 'o')
    r = r.replace('ú', 'u')
    return r

In [3]:
lexicon = {}
with open('lexicon/es_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    for row in reader:
        w = remove_accent_marks(row[0])
        lexicon[w] = []
        for i in range(1, len(row[1:]), 2):
            entry = {}
            entry['lemma'] = remove_accent_marks(row[i].lower())
            entry['eagle'] = remove_accent_marks(row[i+1].lower())
            lexicon[w].append(entry)

In [4]:
len(lexicon)

500949

In [5]:
def is_vowel(c):
    return c in 'aeiouAEIOUáéíóúÁÉÍÓÚüÜ'

def next_level(w):
    result = [w]
    if is_vowel(w[0]):
        result.append('h' + w)
    elif w.startswith('h'):
        result.append(w[1:])
    if "q'" in w:
        result.append(w.replace("q'", 'que'))
    for i in range(len(w)):
        if w[i] == 'v':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'b':
            result.append(w[:i] + 'v' + w[i+1:])
            result.append(w[:i] + 'd' + w[i+1:])
        elif w[i] == 'd':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'c' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 's' + w[i+1:])
        elif w[i] == 's' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 'c' + w[i+1:])
    return result

def combinations_aux(visited, result):
    while visited:
        w = visited[0]
        visited = visited[1:]
        if w not in result:
            result.append(w)
            combs = next_level(w)
            visited.extend(combs)        
    return result

def combinations(w):
    return combinations_aux([w], [])

In [6]:
es = enchant.Dict('es_ES')

# Analysis

In [7]:
os.listdir('data')

['ana.csv', 'dago.csv', 'nandi.csv', 'rafa.csv', 'antonio.csv', 'yadira.csv']

In [8]:
d = {}
for filename in os.listdir('data'):
    with open('data/' + filename) as f:
        reader = csv.reader(
            f,
            delimiter=',',
            quotechar='"'
        )
        next(reader)
        for row in reader:
            d[row[0]] = {
                'date': row[1],
                'name': row[2],
                'gender': row[3],
                'age': row[4],
                'city': row[5],
                'drawings': row[6],
                'title': row[7],
                'description': row[8],
            }

In [9]:
len(d)

1028

## Gender

In [10]:
blank = 0
male = 0
female = 0
for k in d:
    gender = d[k]['gender']
    if gender == 'M':
        male += 1
    elif gender == 'F':
        female += 1
    else:
        blank += 1

In [11]:
blank

534

In [12]:
male

146

In [13]:
female

348

## Date

In [14]:
dates = []
for k in d:
    date = d[k]['date']
    if date == '2016/6/10':
        date = '2016/06/10'
    elif date == '2011-2014':
        date = '2011/01/01'
    elif date == '2013/01/01 – 2016/01/01':
        date = '2013/01/01'
    elif date == '2015/1017':
        date = '2015/10/17'
    elif date == '2016/4/28':
        date = '2016/04/28'
    elif date == '2016/04/27 – 2016/03/20':
        date = '2016/03/20'
    elif date == '2015-2016':
        date = '2015/01/01'
    elif date == '2016/4/27':
        date = '2016/04/27'
    elif date == '201210/21':
        date = '2012/10/21'
    elif date == '2015/05':
        date = '2015/05/01'
    dates.append(date.replace('/', '-'))

In [15]:
dates = [da for da in dates if da]

In [16]:
len(dates)

950

In [17]:
workshop_dates = [da for da in dates if '2016-04-19' <= da <= '2016-05-02']

In [18]:
len(workshop_dates)

234

In [19]:
workshop_dates_counter = Counter(workshop_dates)

In [20]:
workshop_dates_counter

Counter({'2016-04-19': 1,
         '2016-04-20': 3,
         '2016-04-21': 5,
         '2016-04-22': 1,
         '2016-04-23': 6,
         '2016-04-24': 10,
         '2016-04-25': 11,
         '2016-04-26': 22,
         '2016-04-27': 34,
         '2016-04-28': 27,
         '2016-04-29': 24,
         '2016-04-30': 54,
         '2016-05-01': 35,
         '2016-05-02': 1})

In [21]:
fact_dates = [da for da in dates if not '2016-04-19' <= da <= '2016-05-02']

In [22]:
len(fact_dates)

716

In [23]:
fact_dates_counter = Counter(fact_dates)

In [24]:
fact_dates_counter

Counter({'1810-03-01': 1,
         '1905-05-06': 1,
         '1905-06-05': 1,
         '1905-06-17': 1,
         '1905-07-07': 1,
         '1916-04-30': 1,
         '1953-01-01': 1,
         '1963-12-01': 1,
         '1973-02-14': 1,
         '1989-09-01': 1,
         '1990-01-01': 2,
         '1993-01-01': 2,
         '1994-08-12': 1,
         '1995-01-01': 2,
         '1995-07-01': 1,
         '1996-02-01': 1,
         '1996-08-15': 1,
         '1996-12-14': 1,
         '1997-01-01': 1,
         '1997-11-11': 1,
         '1998-01-01': 1,
         '1998-11-01': 1,
         '1999-01-01': 3,
         '1999-01-25': 1,
         '2000-01-01': 5,
         '2000-03-01': 1,
         '2000-04-01': 1,
         '2000-04-19': 1,
         '2000-04-28': 1,
         '2001-01-15': 1,
         '2001-02-01': 1,
         '2002-01-01': 2,
         '2002-05-01': 1,
         '2003-01-01': 2,
         '2004-01-01': 4,
         '2004-04-23': 1,
         '2005-01-01': 5,
         '2005-02-01': 1,
         '20

In [25]:
fact_dates_year = [da.split('-')[0] for da in fact_dates]

In [26]:
len(fact_dates_year)

716

In [27]:
fact_dates_year_counter = Counter(fact_dates_year)

In [28]:
fact_dates_year_counter

Counter({'1810': 1,
         '1905': 4,
         '1916': 1,
         '1953': 1,
         '1963': 1,
         '1973': 1,
         '1989': 1,
         '1990': 2,
         '1993': 2,
         '1994': 1,
         '1995': 3,
         '1996': 3,
         '1997': 2,
         '1998': 2,
         '1999': 4,
         '2000': 9,
         '2001': 2,
         '2002': 3,
         '2003': 2,
         '2004': 5,
         '2005': 7,
         '2006': 10,
         '2007': 7,
         '2008': 17,
         '2009': 12,
         '2010': 27,
         '2011': 15,
         '2012': 41,
         '2013': 36,
         '2014': 69,
         '2015': 182,
         '2016': 242,
         '2018': 1})

## Cities

In [29]:
cities = []
for k in d:
    city = d[k]['city']
    cities.append(city)

In [30]:
c = Counter(cities)

In [31]:
# c

In [32]:
c.most_common(10)

[('', 668),
 ('Bogotá', 229),
 ('Neiva', 13),
 ('Bogotá D.C.', 9),
 ('Soacha', 7),
 ('Fusagasugá', 6),
 ('Tunja', 6),
 ('Medellín', 5),
 ('Bogotá D.C', 5),
 ('Cunday Tolima', 4)]

## Ages

In [33]:
ages = []
for k in d:
    age = d[k]['age']
    ages.append(age)

In [34]:
a = Counter(ages)

In [35]:
# a

In [36]:
a.most_common(11)

[('', 671),
 ('16', 24),
 ('14', 22),
 ('12', 18),
 ('15', 17),
 ('11', 17),
 ('10', 14),
 ('19', 14),
 ('13', 14),
 ('17', 13),
 ('20', 13)]

In [37]:
children = 0
teenagers = 0
young_adults = 0
adults = 0
for k in d:
    try:
        if d[k]['age']:
            age = int(d[k]['age'])
            if 0 <= age <= 12:
                children += 1
            elif 13 <= age <= 18:
                teenagers += 1
            elif 19 <= age <= 25:
                young_adults += 1
            else:
                adults += 1
    except: # '16, 19'
        teenagers += 1

In [38]:
children

67

In [39]:
teenagers

100

In [40]:
young_adults

68

In [41]:
adults

122

## Drawings

In [42]:
drawing_words = []
for k in d:
    drawings = d[k]['drawings'].lower().replace(';', ',')
    words = [x.strip() for x in drawings.split(',')]
    drawing_words.extend(words)

In [43]:
dw = Counter(drawing_words)

In [44]:
# dw

In [45]:
dw.most_common(11)

[('', 906),
 ('corazón', 84),
 ('cara feliz', 28),
 ('carita feliz', 11),
 ('niño', 11),
 ('nube', 7),
 ('cara sonriente', 5),
 ('flor', 4),
 ('perro', 4),
 ('sol', 3),
 ('cruz', 2)]

## Words

In [46]:
punctuation = punctuation + '¡¿“…'
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~¡¿“…'

In [47]:
def normalize(text):
    text = text.lower()
    text = remove_accent_marks(text)
    for p in punctuation:
        text = text.replace(p, '')
    return text

In [48]:
def total_text(r):
    return normalize(r['title'] + ' ' + r['description'])

In [49]:
total_words = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    total_words.extend(words)

In [50]:
len(total_words)

57051

In [51]:
total_words_freq = Counter(total_words)

In [52]:
total_words_freq.most_common(10)

[('y', 2477),
 ('de', 2460),
 ('la', 2089),
 ('que', 1738),
 ('a', 1514),
 ('en', 1502),
 ('el', 1282),
 ('un', 1004),
 ('con', 888),
 ('mi', 874)]

In [53]:
stop_words = [remove_accent_marks(w) for w in stopwords.words('spanish')] + ['xxx']

In [54]:
total_nonstop_words = [w for w in total_words if w not in stop_words]

In [55]:
len(total_nonstop_words)

27314

In [56]:
unique_nonstop_words = set(total_nonstop_words)

In [57]:
len(unique_nonstop_words)

7619

## Spelling

In [58]:
correct_spelling = {}

for w in total_words_freq:
    if w not in correct_spelling:
        if w in lexicon:
            correct_spelling[w] = w
        else:
            combs = combinations(w)
            freqs = [(c, total_words_freq[c]) for c in combs if c in total_words_freq and c in lexicon]
            if freqs:
                freqs.sort(key=lambda x: x[1], reverse=True)
                correct_spelling[w] = freqs[0][0]
            else:
                freqs = [(c, total_words_freq[c]) for c in es.suggest(w) if c in total_words_freq]
                if freqs:
                    freqs.sort(key=lambda x: x[1], reverse=True)
                    correct_spelling[w] = freqs[0][0]
                else:
                    correct_spelling[w] = w

In [59]:
len(correct_spelling)

7805

In [60]:
len(total_words_freq)

7805

In [61]:
spelling_map = defaultdict(list)
for key in correct_spelling:
    v = correct_spelling[key]
    spelling_map[v].append((key, total_words_freq[key]))
spelling_map = dict(spelling_map)

In [62]:
len(spelling_map)

7058

In [63]:
correct_spelling['aser']

'hacer'

In [64]:
spelling_map['hacer']

[('aser', 3), ('hacer', 90), ('acer', 1), ('haser', 1)]

In [65]:
mispellings = [w for w in spelling_map if w not in stop_words and len(spelling_map[w]) > 1]
mispellings.sort()

In [66]:
mispellings[:10]

['abandonados',
 'abrasamos',
 'abuelas',
 'abuelo',
 'abuelos',
 'abusar',
 'acabo',
 'aceptacion',
 'aceptar',
 'aceptarnos']

In [67]:
spelling_map['abandonados']

[('abandonados', 3), ('habandonados', 1)]

In [68]:
def all_correct(spellings):
    return all(map(lambda x: x in lexicon or es.check(x), [x for x, y in spellings]))

for w in spelling_map:
    if len(w) > 2 and len(spelling_map[w]) > 1 and not all_correct(spelling_map[w]):
        incorrect_sum = sum(n for (p, n) in spelling_map[w] if p != w)
#         correct_sum = sum(n for (p, n) in spelling_map[w] if p == w)
#         total_sum = incorrect_sum + correct_sum
        if incorrect_sum > 1:
            print(w, spelling_map[w])

pelearan [('pelearan', 3), ('peliaran', 2), ('peliabaran', 1)]
era [('era', 115), ('tra', 1), ('sra', 2), ('dra', 1), ('hera', 1)]
violencia [('violencia', 57), ('laviolencia', 1), ('violence', 1)]
peleado [('peleado', 7), ('peliado', 2)]
igual [('igual', 10), ('igal', 1), ('igualda', 1)]
acabo [('acabo', 2), ('boyaca', 3)]
necesita [('necesita', 14), ('nesecita', 1), ('nevesita', 1)]
niña [('niña', 30), ('ninña', 1), ('nina', 1)]
respeto [('respeto', 63), ('irrespeto', 1), ('resoeto', 1), ('trespeto', 1)]
viene [('viene', 3), ('biene', 2)]
sed [('sed', 1), ('sedi', 2)]
amistad [('amistad', 27), ('amitad', 1), ('amisgyad', 1)]
once [('onces', 2), ('once', 3)]
aveces [('aveces', 2), ('abeses', 3)]
triste [('trizte', 1), ('triste', 18), ('tristesa', 1)]
casi [('casi', 12), ('casasi', 1), ('cali', 1)]
peleamos [('peleamos', 9), ('peliamos', 4)]
habia [('abia', 1), ('havia', 5), ('habia', 63)]
gato [('gato', 17), ('gatito', 4)]
mucho [('muchor', 1), ('mucho', 84), ('muchso', 1), ('much', 2

### Common errors

#### h
ahora [('haora', 4), ('ahora', 52), ('ahoran', 1)]<br>
asi [('asi', 97), ('haci', 6)]<br>
era [('tra', 1), ('hera', 1), ('dra', 1), ('sra', 2), ('era', 115)]<br>
errores [('herrores', 1), ('errores', 15), ('erroresñ', 1)]<br>
eso [('etso', 1), ('iso', 1), ('heso', 1), ('edo', 1), ('eso', 70)]<br>
haber [('haver', 1), ('aver', 2), ('haber', 15)]<br>
habia [('habia', 63), ('abia', 1), ('havia', 5)]<br>
hace [('haci', 6), ('face', 1), ('hace', 66)]<br>
hacer [('hacer', 90), ('aser', 3), ('haser', 1), ('acer', 1)]<br>
hagan [('haygan', 1), ('hagan', 2), ('agan', 1), ('haigan', 1)]<br>
hermana [('hermana', 34), ('ermana', 2), ('miermana', 1)]<br>
hermano [('hermanodi', 1), ('hermanito', 2), ('ermano', 1), ('hermano', 48), ('permano', 1), ('hermanao', 1), ('mermano', 1)]<br>
hija [('hija', 30), ('hiba', 1), ('hiva', 2)] # iba<br>
hice [('hice', 22), ('hise', 2), ('ise', 2)]<br>
hicieron [('isieron', 1), ('hisieron', 1), ('hicieron', 11)]<br>
honestidad [('onestidad', 1), ('honestidad', 3), ('autohonestidad', 1)]<br>
humanos [('humanos', 17), ('umanos', 1), ('humanso', 1)]<br>
iban [('hiban', 1), ('iban', 6), ('ivan', 2)]<br>

#### s/c/z
aveces [('abeses', 3), ('aveces', 2)]<br>
bicicleta [('bicicleta', 1), ('bicilecto', 1), ('bisicleta', 1)]<br>
ceder [('ceder', 4), ('seder', 1), ('1ceder', 1)]<br>
consientes [('consientes', 1), ('concientes', 2)]<br>
decidimos [('desidimos', 4), ('decidimos', 32)]<br>
decirle [('decirles', 1), ('decirlo', 2), ('desirle', 3), ('decirle', 5), ('decirme', 1)]<br>
decision [('desicion', 6), ('decision', 19)]<br>
diciendo [('diciendo', 2), ('disiendo', 2)]<br>
discapacidad [('discapasidades', 1), ('discap', 1), ('discapacidad', 4)]<br>
empezamos [('empezamos', 7), ('empesamos', 2)]<br>
empezar [('empezar', 10), ('empeze', 1), ('empesar', 1)]<br>
entonces [('entoses', 1), ('entoces', 1), ('entonces', 41)]<br>
hace [('haci', 6), ('face', 1), ('hace', 66)]<br>
hacer [('hacer', 90), ('aser', 3), ('haser', 1), ('acer', 1)]<br>
hecho [('hecho', 111), ('hechoz', 1), ('decho', 1)]<br>
hice [('hice', 22), ('hise', 2), ('ise', 2)]<br>
hicieron [('isieron', 1), ('hisieron', 1), ('hicieron', 11)]<br>
hizo [('hize', 2), ('hiso', 3), ('hizo', 33)]<br>
hubo [('hubo', 14), ('hugo', 2), ('huvo', 1)]<br>
necesita [('necesita', 14), ('nevesita', 1), ('nesecita', 1)]<br>
necesitamos [('necesitamos', 6), ('nesecitamos', 1), ('nesesitamos', 1)]<br>
paciencia [('paciencia', 8), ('pasiencia', 3), ('pazciencia', 1)]<br>
paz [('paz', 825), ('paiz', 1), ('pau', 1)]<br>
reconciliacion [('reconsiliacion', 6), ('reconciliacion', 49)]<br>
reconciliamos [('reconsiliamos', 1), ('reconcialiamos', 1), ('reconciliamos', 7)]<br>
sed [('sed', 1), ('sedi', 2)] # cedí<br>
señor [('senor', 1), ('señor', 18), ('seño', 1), ('señir', 1), ('ceñor', 1)]<br>
sientas [('sientas', 2), ('siertas', 2)]<br>
triste [('tristesa', 1), ('triste', 18), ('trizte', 1)]<br>

#### b/v
aveces [('abeses', 3), ('aveces', 2)]<br>
bien [('bien', 52), ('biem', 1), ('boen', 1), ('vien', 1)]<br>
brava [('braba', 4), ('brava', 2)]<br>
estaba [('estaba', 111), ('estava', 5), ('staba', 1)]<br>
estuvimos [('estuvimos', 4), ('estubimos', 2)]<br>
haber [('haver', 1), ('aver', 2), ('haber', 15)]<br>
habia [('habia', 63), ('abia', 1), ('havia', 5)]<br>
hija [('hija', 30), ('hiba', 1), ('hiva', 2)] # iba<br>
ibamos [('ibamos', 7), ('hibamos', 2), ('ivamos', 2)]<br>
iban [('hiban', 1), ('iban', 6), ('ivan', 2)]<br>
tuve [('tube', 6), ('tuve', 39)]<br>
viene [('biene', 2), ('viene', 3)]<br>

#### g/j
mejores [('megores', 1), ('mejores', 25), ('amejores', 1)]<br>

#### haber
haber [('haver', 1), ('aver', 2), ('haber', 15)]<br>
habia [('habia', 63), ('abia', 1), ('havia', 5)]<br>
hagan [('haygan', 1), ('hagan', 2), ('agan', 1), ('haigan', 1)]<br>
hay [('cay', 2), ('hay', 74), ('hqy', 1), ('hai', 1), ('ahy', 1), ('hiy', 1)]<br>
hubo [('hubo', 14), ('hugo', 2), ('huvo', 1)]<br>

#### joined words
partir [('partir', 13), ('apartir', 2)]<br>
aveces [('abeses', 3), ('aveces', 2)]<br>
los [('los', 687), ('alos', 3), ('loq', 2)]<br>
encontrar [('encontrat', 1), ('encontra', 1), ('encontrar', 17)] # en contra<br>
hermana [('hermana', 34), ('ermana', 2), ('miermana', 1)]<br>
hermano [('hermanodi', 1), ('hermanito', 2), ('ermano', 1), ('hermano', 48), ('permano', 1), ('hermanao', 1), ('mermano', 1)]<br>
partir [('apartir', 2), ('partir', 13)]<br>
pelea [('pelea', 51), ('pelia', 1), ('lapelea', 1)]<br>
repente [('reoente', 1), ('derrepente', 1), ('repente', 9)]<br>
violencia [('violence', 1), ('violencia', 57), ('laviolencia', 1)]<br>

#### i/e
diferencias [('diferencias', 33), ('diferents', 1), ('difirencias', 1)]<br>
pelea [('pelea', 51), ('pelia', 1), ('lapelea', 1)]<br>
peleaba [('peliaba', 2), ('peleaba', 3)]<br>
peleado [('peleado', 7), ('peliado', 2)]<br>
peleamos [('peleamos', 9), ('peliamos', 4)]<br>
pelear [('peliar', 11), ('pelear', 29)]<br>
pelearan [('pelearan', 3), ('peliabaran', 1), ('peliaran', 2)]<br>
pelee [('pelie', 13), ('pelee', 4)]<br>
peleo [('pelio', 2), ('peleo', 4)]<br>

#### order
aunque [('aunque', 19), ('auqnue', 2)]<br>
haciendo [('haciendo', 17), ('haciedno', 1), ('baciendo', 1)]<br>
humanos [('humanos', 17), ('umanos', 1), ('humanso', 1)]<br>
nuestra [('nuestra', 63), ('nuetsra', 1), ('nuestrar', 1)]<br>
nuestro [('nuesto', 1), ('nuetsro', 1), ('nuestro', 45)]<br>
nuestros [('nuestros', 31), ('nuetsros', 1), ('cuestros', 1)]<br>
que [('qeu', 1), ('aue', 1), ('qu', 3), ('qye', 2), ('q´', 1), ('que', 1738), ('quñe', 1)]<br>

#### y/ll
ayudo [('ayudo', 16), ('alludo', 2)]<br>

#### ñ
compañera [('comañera', 1), ('companera', 1), ('compañera', 24), ('campañero', 1)]<br>
compañeros [('compañeros', 55), ('companeros', 2), ('compañeritos', 1)]<br>
niña [('nina', 1), ('ninña', 1), ('niña', 30)]<br>
niños [('ninños', 1), ('nigos', 1), ('niños', 99), ('niñes', 1), ('ninos', 1)]<br>
señor [('senor', 1), ('señor', 18), ('seño', 1), ('señir', 1), ('ceñor', 1)]<br>

#### d/b
problema [('problema', 28), ('prolema', 1), ('prodlema', 1)]<br>

#### others
amistad [('amitad', 1), ('amistad', 27), ('amisgyad', 1)]<br>
campesinos [('canpesinos', 1), ('compesinosas', 1), ('campesinosas', 2), ('campesinos', 1)]<br>
colombiano [('colombiano', 7), ('colombbia', 1), ('colombia', 58)]<br>
comunidad [('cmunidad', 1), ('comunida', 1), ('comunidad', 36), ('comunidar', 1)]<br>
consciente [('conciente', 2), ('consciente', 3)]<br>
demos [('demos', 2), ('bemos', 3)]<br>
fue [('fuy', 2), ('fur', 1), ('fuel', 1), ('fue', 172)]<br>
guerra [('guerr', 1), ('gerra', 1), ('guerra', 36)]<br>
oportunidad [('oportunidad', 31), ('poprtunidades', 1), ('oportunidade', 1), ('oportunida', 1)]<br>
padre [('pradre', 2), ('padre', 45)]<br>
pego [('pego', 12), ('pege', 2)]<br>
perra [('perra', 4), ('pegra', 2)]<br>
persona [('person', 2), ('pesona', 1), ('personita', 1), ('persona', 73)]<br>
queremos [('quieremos', 1), ('querñuamos', 1), ('queremos', 5)]<br>
rugby [('rurby', 2), ('rugby', 1)]

## n-grams

In [69]:
def all_stopwords(ngrm):
    return all(map(lambda x: x in stop_words, ngrm))

In [70]:
def my_ngrams(tokens):
    ngrms = []
    for i in range(1, 3 + 1):
        ngrms_aux = [ngrm for ngrm in ngrams(tokens, i) if not all_stopwords(ngrm)]
        ngrms.extend(ngrms_aux)
    return ngrms

In [71]:
total_unigrams = []
total_bigrams = []
total_trigrams = []
for k in d:
    clean_title_words = [correct_spelling[w] for w in word_tokenize(normalize(d[k]['title']))]
    clean_description_words = [correct_spelling[w] for w in word_tokenize(normalize(d[k]['description']))]
    title_ngrams = my_ngrams(clean_title_words)
    description_ngrams = my_ngrams(clean_description_words)
    text_ngrams = title_ngrams + description_ngrams
    total_unigrams.extend(ngrm for ngrm in text_ngrams if len(ngrm) == 1)
    total_bigrams.extend(ngrm for ngrm in text_ngrams if len(ngrm) == 2)
    total_trigrams.extend(ngrm for ngrm in text_ngrams if len(ngrm) == 3)

In [72]:
tuni = Counter(total_unigrams)

In [73]:
tuni.most_common(200)

[(('paz',), 827),
 (('dia',), 303),
 (('familia',), 183),
 (('vida',), 170),
 (('personas',), 165),
 (('ser',), 161),
 (('amor',), 128),
 (('mejor',), 123),
 (('hecho',), 113),
 (('asi',), 103),
 (('niños',), 103),
 (('solo',), 102),
 (('perdon',), 101),
 (('despues',), 101),
 (('años',), 100),
 (('colegio',), 98),
 (('cada',), 98),
 (('casa',), 96),
 (('hacer',), 95),
 (('siempre',), 88),
 (('tiempo',), 86),
 (('mama',), 78),
 (('año',), 78),
 (('persona',), 77),
 (('amigos',), 75),
 (('cosas',), 67),
 (('hace',), 67),
 (('demas',), 66),
 (('respeto',), 66),
 (('colombiano',), 66),
 (('perdonar',), 64),
 (('hogar',), 64),
 (('tener',), 63),
 (('mal',), 63),
 (('pais',), 61),
 (('momento',), 60),
 (('dos',), 60),
 (('violencia',), 59),
 (('dias',), 58),
 (('corazon',), 58),
 (('compañeros',), 58),
 (('hoy',), 57),
 (('ahora',), 57),
 (('trabajo',), 57),
 (('dije',), 56),
 (('cuenta',), 56),
 (('reconciliacion',), 55),
 (('hablar',), 55),
 (('hermano',), 55),
 (('amiga',), 55),
 (('mund

In [74]:
tbi = Counter(total_bigrams)

In [75]:
tbi.most_common(200)

[(('la', 'paz'), 447),
 (('de', 'paz'), 158),
 (('un', 'dia'), 141),
 (('paz', 'es'), 81),
 (('las', 'personas'), 73),
 (('mi', 'familia'), 72),
 (('paz', 'en'), 64),
 (('hecho', 'de'), 64),
 (('la', 'vida'), 62),
 (('despues', 'de'), 59),
 (('en', 'paz'), 58),
 (('mi', 'mama'), 55),
 (('paz', 'y'), 52),
 (('los', 'demas'), 50),
 (('los', 'niños'), 49),
 (('personas', 'que'), 44),
 (('mi', 'hermano'), 44),
 (('la', 'familia'), 44),
 (('la', 'calle'), 43),
 (('asi', 'que'), 42),
 (('el', 'colegio'), 40),
 (('le', 'dije'), 39),
 (('mi', 'vida'), 36),
 (('un', 'grupo'), 36),
 (('grupo', 'de'), 35),
 (('el', 'amor'), 33),
 (('un', 'hecho'), 33),
 (('la', 'violencia'), 33),
 (('ayudar', 'a'), 33),
 (('el', 'perdon'), 32),
 (('medio', 'de'), 32),
 (('el', 'año'), 32),
 (('mis', 'compañeros'), 32),
 (('parte', 'de'), 31),
 (('paz', 'con'), 31),
 (('mi', 'hermana'), 31),
 (('el', 'dia'), 31),
 (('cuenta', 'que'), 29),
 (('ese', 'dia'), 29),
 (('mi', 'padre'), 28),
 (('mi', 'casa'), 28),
 (('mi

In [76]:
ttri = Counter(total_trigrams)

In [77]:
ttri.most_common(200)

[(('la', 'paz', 'es'), 59),
 (('hecho', 'de', 'paz'), 52),
 (('a', 'la', 'paz'), 36),
 (('en', 'el', 'colegio'), 33),
 (('la', 'paz', 'en'), 32),
 (('un', 'grupo', 'de'), 28),
 (('a', 'los', 'demas'), 26),
 (('de', 'la', 'paz'), 23),
 (('para', 'la', 'paz'), 23),
 (('que', 'la', 'paz'), 22),
 (('las', 'personas', 'que'), 22),
 (('en', 'la', 'calle'), 20),
 (('un', 'hecho', 'de'), 20),
 (('a', 'las', 'personas'), 20),
 (('en', 'el', 'año'), 19),
 (('con', 'mi', 'hermano'), 19),
 (('mi', 'hecho', 'de'), 19),
 (('de', 'la', 'calle'), 18),
 (('con', 'mi', 'mama'), 18),
 (('con', 'mi', 'familia'), 18),
 (('por', 'medio', 'de'), 17),
 (('a', 'los', 'niños'), 17),
 (('la', 'paz', 'se'), 17),
 (('me', 'di', 'cuenta'), 17),
 (('la', 'paz', 'y'), 16),
 (('le', 'dije', 'que'), 16),
 (('a', 'traves', 'de'), 16),
 (('de', 'paz', 'es'), 16),
 (('la', 'oportunidad', 'de'), 16),
 (('todos', 'los', 'dias'), 16),
 (('la', 'paz', 'desde'), 15),
 (('en', 'ese', 'momento'), 15),
 (('en', 'medio', 'de'), 15

## Animales

In [78]:
animales = ['animal', 'animalito', 'animales', 'animalitos',
            'perro', 'perra', 'perros', 'perras', 'perrito', 'perrita', 'perritos', 'perritas',
            'gato', 'gata', 'gatos', 'gatas', 'gatito', 'gatita', 'gatitos', 'gatitas',
            'mascota', 'ovejita', 'pez']

In [79]:
# animales = ['caballo', 'yegua', 'caballito', 'burro', 'burrito', 'oveja',
#             'cabra', 'vaquita', 'ovejita', 'vaca', 'cordero', 'corderito',
#             'pez', 'pececito', 'cerdo', 'cerdito', 'puerco', 'gallina', 'pollo', 'gallo',
#             'pesesito', 'pes']

In [80]:
total_animales = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            total_animales.append(a)

In [81]:
len(total_animales)

169

In [82]:
ta = Counter(total_animales)

In [83]:
ta

Counter({'animal': 17,
         'animales': 36,
         'animalito': 2,
         'animalitos': 1,
         'gata': 4,
         'gatita': 1,
         'gatito': 4,
         'gatitos': 1,
         'gato': 17,
         'gatos': 6,
         'mascota': 7,
         'ovejita': 1,
         'perra': 4,
         'perrita': 10,
         'perrito': 24,
         'perritos': 5,
         'perro': 14,
         'perros': 13,
         'pez': 2})

In [84]:
total_animales_m = []
total_animales_f = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            if d[k]['gender'] == 'M':
                total_animales_m.append(a)
            elif d[k]['gender'] == 'F':
                total_animales_f.append(a)

In [85]:
len(total_animales_m)

28

In [86]:
len(total_animales_f)

66

In [87]:
Counter(total_animales_m).most_common()

[('perrito', 5),
 ('perro', 5),
 ('gato', 5),
 ('perrita', 3),
 ('animales', 3),
 ('perritos', 3),
 ('mascota', 2),
 ('perros', 1),
 ('gata', 1)]

In [88]:
Counter(total_animales_f).most_common()

[('animales', 18),
 ('animal', 10),
 ('gato', 9),
 ('perros', 6),
 ('perrito', 5),
 ('perro', 3),
 ('animalito', 2),
 ('perrita', 2),
 ('perra', 2),
 ('mascota', 2),
 ('gatito', 2),
 ('animalitos', 1),
 ('gatos', 1),
 ('gata', 1),
 ('gatitos', 1),
 ('gatita', 1)]

In [89]:
total_animales_age = []
total_animales_noage = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            if d[k]['age']:
                total_animales_age.append(a)
            else:
                total_animales_noage.append(a)

In [90]:
len(total_animales_age)

74

In [91]:
len(total_animales_noage)

95

In [92]:
total_animales_ch = []
total_animales_te = []
total_animales_ya = []
total_animales_ad = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            try:
                if d[k]['age']:
                    age = int(d[k]['age'])
                    if 0 <= age <= 12:
                        total_animales_ch.append(a)
                    elif 13 <= age <= 18:
                        total_animales_te.append(a)
                    elif 19 <= age <= 25:
                        total_animales_ya.append(a)
                    else:
                        total_animales_ad.append(a)
            except: # '16, 19'
                total_animales_te.append(a)

In [93]:
len(total_animales_ch)

32

In [94]:
Counter(total_animales_ch).most_common()

[('gato', 10),
 ('perrito', 5),
 ('perro', 4),
 ('animal', 3),
 ('animales', 3),
 ('perrita', 3),
 ('mascota', 2),
 ('animalito', 1),
 ('perritos', 1)]

In [95]:
len(total_animales_te)

17

In [96]:
Counter(total_animales_te).most_common()

[('animales', 5),
 ('perrito', 3),
 ('animal', 3),
 ('perrita', 1),
 ('perros', 1),
 ('animalito', 1),
 ('gato', 1),
 ('mascota', 1),
 ('gatito', 1)]

In [97]:
len(total_animales_ya)

13

In [98]:
Counter(total_animales_ya).most_common()

[('animales', 4),
 ('perrito', 2),
 ('animal', 2),
 ('perra', 2),
 ('gatitos', 1),
 ('gato', 1),
 ('perrita', 1)]

In [99]:
len(total_animales_ad)

12

In [100]:
Counter(total_animales_ad).most_common()

[('animales', 4),
 ('perros', 2),
 ('gata', 2),
 ('gato', 1),
 ('animal', 1),
 ('gatita', 1),
 ('mascota', 1)]

In [101]:
total_cards_with_animales = 0
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            total_cards_with_animales += 1
            break

In [102]:
total_cards_with_animales

83