In [1]:
import csv
import os
from collections import Counter
from collections import defaultdict
from string import punctuation

import enchant
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams

# Lexicon

In [2]:
def remove_accent_marks(w):
    r = w
    r = r.replace('á', 'a')
    r = r.replace('é', 'e')
    r = r.replace('í', 'i')
    r = r.replace('ó', 'o')
    r = r.replace('ú', 'u')
    return r

In [3]:
lexicon = {}
with open('lexicon/es_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    for row in reader:
        w = remove_accent_marks(row[0])
        lexicon[w] = []
        for i in range(1, len(row[1:]), 2):
            entry = {}
            entry['lemma'] = remove_accent_marks(row[i].lower())
            entry['eagle'] = remove_accent_marks(row[i+1].lower())
            lexicon[w].append(entry)

In [4]:
len(lexicon)

500949

In [5]:
def is_vowel(c):
    return c in 'aeiouAEIOUáéíóúÁÉÍÓÚüÜ'

def next_level(w):
    result = [w]
    if is_vowel(w[0]):
        result.append('h' + w)
    if "q'" in w:
        result.append(w.replace("q'", 'que'))
    for i in range(len(w)):
        if w[i] == 'v':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'b':
            result.append(w[:i] + 'v' + w[i+1:])
            result.append(w[:i] + 'd' + w[i+1:])
        elif w[i] == 'd':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'c' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 's' + w[i+1:])
        elif w[i] == 's' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 'c' + w[i+1:])
    return result

def combinations_aux(visited, result):
    while visited:
        w = visited[0]
        visited = visited[1:]
        if w not in result:
            result.append(w)
            combs = next_level(w)
            visited.extend(combs)        
    return result

def combinations(w):
    return combinations_aux([w], [])

In [6]:
es = enchant.Dict('es_ES')

# Analysis

In [7]:
os.listdir('data')

['ana.csv', 'dago.csv', 'nandi.csv', 'rafa.csv', 'final.csv', 'yadira.csv']

In [8]:
d = {}
for filename in os.listdir('data'):
    with open('data/' + filename) as f:
        reader = csv.reader(
            f,
            delimiter=',',
            quotechar='"'
        )
        next(reader)
        for row in reader:
            d[row[0]] = {
                'date': row[1],
                'name': row[2],
                'gender': row[3],
                'age': row[4],
                'city': row[5],
                'drawings': row[6],
                'title': row[7],
                'description': row[8],
            }

In [9]:
len(d)

883

## Gender

In [10]:
blank = 0
male = 0
female = 0
for k in d:
    gender = d[k]['gender']
    if gender == 'M':
        male += 1
    elif gender == 'F':
        female += 1
    else:
        blank += 1

In [11]:
blank

471

In [12]:
male

116

In [13]:
female

296

## Cities

In [14]:
cities = []
for k in d:
    city = d[k]['city']
    cities.append(city)

In [15]:
c = Counter(cities)

In [16]:
# c

In [17]:
c.most_common(10)

[('', 607),
 ('Bogotá', 171),
 ('Neiva', 13),
 ('Bogotá D.C.', 9),
 ('Soacha', 7),
 ('Fusagasugá', 6),
 ('Bogotá D.C', 5),
 ('Bucaramanga', 4),
 ('Ibagué', 4),
 ('Cali', 3)]

## Ages

In [18]:
ages = []
for k in d:
    age = d[k]['age']
    ages.append(age)

In [19]:
a = Counter(ages)

In [20]:
# a

In [21]:
a.most_common(11)

[('', 602),
 ('16', 21),
 ('14', 17),
 ('10', 13),
 ('12', 12),
 ('19', 12),
 ('13', 12),
 ('11', 12),
 ('21', 11),
 ('15', 11),
 ('20', 11)]

In [22]:
children = 0
teenagers = 0
young_adults = 0
adults = 0
for k in d:
    try:
        if d[k]['age']:
            age = int(d[k]['age'])
            if 0 <= age <= 12:
                children += 1
            elif 13 <= age <= 18:
                teenagers += 1
            elif 19 <= age <= 25:
                young_adults += 1
            else:
                adults += 1
    except: # '16, 19'
        teenagers += 1

In [23]:
children

55

In [24]:
teenagers

81

In [25]:
young_adults

58

In [26]:
adults

87

## Drawings

In [27]:
drawing_words = []
for k in d:
    drawings = d[k]['drawings'].lower().replace(';', ',')
    words = [x.strip() for x in drawings.split(',')]
    drawing_words.extend(words)

In [28]:
dw = Counter(drawing_words)

In [29]:
# dw

In [30]:
dw.most_common(11)

[('', 768),
 ('corazón', 84),
 ('cara feliz', 28),
 ('niño', 11),
 ('carita feliz', 11),
 ('nube', 7),
 ('cara sonriente', 5),
 ('perro', 4),
 ('flor', 4),
 ('sol', 3),
 ('cruz', 2)]

## Words

In [31]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
def normalize(text):
    text = text.lower()
    text = remove_accent_marks(text)
    for p in punctuation:
        text = text.replace(p, '')
    return text

In [33]:
def total_text(r):
    return normalize(r['title'] + ' ' + r['description'])

In [34]:
total_words = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    total_words.extend(words)

In [35]:
len(total_words)

48621

In [36]:
total_words_freq = Counter(total_words)

In [37]:
total_words_freq.most_common(10)

[('de', 2112),
 ('y', 2092),
 ('la', 1797),
 ('que', 1456),
 ('en', 1302),
 ('a', 1293),
 ('el', 1087),
 ('un', 871),
 ('con', 755),
 ('mi', 729)]

In [38]:
stop_words = [remove_accent_marks(w) for w in stopwords.words('spanish')]

In [39]:
total_nonstop_words = [w for w in total_words if w not in stop_words]

In [40]:
len(total_nonstop_words)

23427

In [41]:
unique_nonstop_words = set(total_nonstop_words)

In [42]:
len(unique_nonstop_words)

6937

## Spelling

In [43]:
correct_spelling = {}

for w in total_words_freq:
    if w not in correct_spelling:
        if w in lexicon:
            correct_spelling[w] = w
        else:
            combs = combinations(w)
            freqs = [(c, total_words_freq[c]) for c in combs if c in total_words_freq and c in lexicon]
            if freqs:
                freqs.sort(key=lambda x: x[1], reverse=True)
                correct_spelling[w] = freqs[0][0]
            else:
                freqs = [(c, total_words_freq[c]) for c in es.suggest(w) if c in total_words_freq]
                if freqs:
                    freqs.sort(key=lambda x: x[1], reverse=True)
                    correct_spelling[w] = freqs[0][0]
                else:
                    correct_spelling[w] = w

In [44]:
len(correct_spelling)

7121

In [45]:
len(total_words_freq)

7121

In [46]:
spelling_map = defaultdict(list)
for key in correct_spelling:
    v = correct_spelling[key]
    spelling_map[v].append((key, total_words_freq[key]))
spelling_map = dict(spelling_map)

In [47]:
len(spelling_map)

6513

In [48]:
correct_spelling['aser']

'hacer'

In [49]:
spelling_map['hacer']

[('haser', 1), ('acer', 1), ('hacer', 74), ('aser', 3)]

In [50]:
mispellings = [w for w in spelling_map if w not in stop_words and len(spelling_map[w]) > 1]
mispellings.sort()

In [51]:
mispellings[:10]

['abandonados',
 'abrasamos',
 'abuelas',
 'abuelo',
 'abuelos',
 'abusar',
 'acabo',
 'aceptar',
 'aceptaron',
 'adoptar']

In [52]:
spelling_map['abandonados']

[('abandonados', 3), ('habandonados', 1)]

## n-grams

In [53]:
def all_stopwords(ngrm):
    return all(map(lambda x: x in stop_words, ngrm))

In [54]:
def my_ngrams(tokens):
    ngrms = []
    for i in range(1, 3 + 1):
        ngrms_aux = [ngrm for ngrm in ngrams(tokens, i) if not all_stopwords(ngrm)]
        ngrms.extend(ngrms_aux)
    return ngrms

In [55]:
total_unigrams = []
total_bigrams = []
total_trigrams = []
for k in d:
    clean_title_words = [correct_spelling[w] for w in word_tokenize(normalize(d[k]['title']))]
    clean_description_words = [correct_spelling[w] for w in word_tokenize(normalize(d[k]['description']))]
    title_ngrams = my_ngrams(clean_title_words)
    description_ngrams = my_ngrams(clean_description_words)
    text_ngrams = title_ngrams + description_ngrams
    total_unigrams.extend(ngrm for ngrm in text_ngrams if len(ngrm) == 1)
    total_bigrams.extend(ngrm for ngrm in text_ngrams if len(ngrm) == 2)
    total_trigrams.extend(ngrm for ngrm in text_ngrams if len(ngrm) == 3)

In [56]:
tuni = Counter(total_unigrams)

In [57]:
tuni.most_common(100)

[(('paz',), 707),
 (('dia',), 257),
 (('familia',), 162),
 (('personas',), 146),
 (('ser',), 140),
 (('vida',), 130),
 (('amor',), 114),
 (('hecho',), 95),
 (('mejor',), 93),
 (('cada',), 92),
 (('años',), 92),
 (('niños',), 89),
 (('despues',), 88),
 (('colegio',), 87),
 (('asi',), 85),
 (('perdon',), 84),
 (('solo',), 83),
 (('casa',), 82),
 (('hacer',), 79),
 (('tiempo',), 74),
 (('año',), 73),
 (('siempre',), 72),
 (('persona',), 67),
 (('hace',), 65),
 (('amigos',), 64),
 (('mama',), 62),
 (('hogar',), 57),
 (('cosas',), 56),
 (('dos',), 56),
 (('respeto',), 56),
 (('colombiano',), 55),
 (('trabajo',), 55),
 (('demas',), 55),
 (('violencia',), 53),
 (('mundo',), 53),
 (('mal',), 52),
 (('compañeros',), 52),
 (('pais',), 52),
 (('ayudar',), 52),
 (('ahora',), 52),
 (('hermano',), 51),
 (('social',), 50),
 (('hoy',), 50),
 (('dias',), 50),
 (('tener',), 48),
 (('corazon',), 48),
 (('bien',), 47),
 (('perdonar',), 46),
 (('calle',), 46),
 (('hablar',), 46),
 (('dije',), 46),
 (('ayud

In [58]:
tbi = Counter(total_bigrams)

In [59]:
tbi.most_common(100)

[(('la', 'paz'), 378),
 (('de', 'paz'), 131),
 (('un', 'dia'), 117),
 (('paz', 'es'), 69),
 (('las', 'personas'), 67),
 (('mi', 'familia'), 60),
 (('hecho', 'de'), 56),
 (('paz', 'en'), 56),
 (('despues', 'de'), 53),
 (('la', 'vida'), 51),
 (('en', 'paz'), 50),
 (('paz', 'y'), 44),
 (('mi', 'mama'), 43),
 (('los', 'niños'), 42),
 (('la', 'calle'), 40),
 (('los', 'demas'), 40),
 (('la', 'familia'), 40),
 (('mi', 'hermano'), 40),
 (('personas', 'que'), 35),
 (('el', 'colegio'), 34),
 (('asi', 'que'), 33),
 (('grupo', 'de'), 33),
 (('un', 'grupo'), 32),
 (('el', 'amor'), 31),
 (('le', 'dije'), 31),
 (('la', 'violencia'), 30),
 (('parte', 'de'), 30),
 (('ayudar', 'a'), 30),
 (('el', 'año'), 30),
 (('mi', 'hermana'), 29),
 (('el', 'perdon'), 28),
 (('un', 'hecho'), 28),
 (('mi', 'padre'), 27),
 (('mis', 'compañeros'), 27),
 (('medio', 'de'), 25),
 (('paz', 'desde'), 25),
 (('ese', 'dia'), 24),
 (('el', 'dia'), 24),
 (('paz', 'con'), 24),
 (('familia', 'y'), 23),
 (('mi', 'papa'), 23),
 (('m

In [60]:
ttri = Counter(total_trigrams)

In [61]:
ttri.most_common(100)

[(('la', 'paz', 'es'), 51),
 (('hecho', 'de', 'paz'), 45),
 (('a', 'la', 'paz'), 32),
 (('en', 'el', 'colegio'), 29),
 (('un', 'grupo', 'de'), 26),
 (('la', 'paz', 'en'), 26),
 (('a', 'los', 'demas'), 23),
 (('para', 'la', 'paz'), 21),
 (('las', 'personas', 'que'), 20),
 (('en', 'el', 'año'), 19),
 (('un', 'hecho', 'de'), 19),
 (('a', 'las', 'personas'), 19),
 (('en', 'la', 'calle'), 18),
 (('de', 'la', 'calle'), 18),
 (('con', 'mi', 'hermano'), 17),
 (('mi', 'hecho', 'de'), 17),
 (('de', 'la', 'paz'), 17),
 (('que', 'la', 'paz'), 17),
 (('la', 'paz', 'se'), 17),
 (('con', 'mi', 'familia'), 16),
 (('la', 'paz', 'desde'), 15),
 (('con', 'mi', 'mama'), 15),
 (('la', 'oportunidad', 'de'), 15),
 (('todos', 'los', 'dias'), 14),
 (('un', 'dia', 'en'), 14),
 (('con', 'mi', 'hermana'), 14),
 (('me', 'di', 'cuenta'), 14),
 (('la', 'paz', 'y'), 14),
 (('por', 'medio', 'de'), 14),
 (('a', 'traves', 'de'), 13),
 (('por', 'la', 'paz'), 13),
 (('la', 'paz', 'no'), 13),
 (('a', 'los', 'niños'), 12),


## Animales

In [62]:
animales = ['animal', 'animalito', 'animales', 'animalitos',
            'perro', 'perra', 'perros', 'perras', 'perrito', 'perrita', 'perritos', 'perritas',
            'gato', 'gata', 'gatos', 'gatas', 'gatito', 'gatita', 'gatitos', 'gatitas',
            'mascota', 'ovejita', 'pez']

In [63]:
# animales = ['caballo', 'yegua', 'caballito', 'burro', 'burrito', 'oveja',
#             'cabra', 'vaquita', 'ovejita', 'vaca', 'cordero', 'corderito',
#             'pez', 'pececito', 'cerdo', 'cerdito', 'puerco', 'gallina', 'pollo', 'gallo',
#             'pesesito', 'pes']

In [64]:
total_animales = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            total_animales.append(a)

In [65]:
len(total_animales)

149

In [66]:
ta = Counter(total_animales)

In [67]:
ta

Counter({'animal': 16,
         'animales': 31,
         'animalito': 2,
         'animalitos': 1,
         'gata': 3,
         'gatito': 4,
         'gatitos': 1,
         'gato': 14,
         'gatos': 6,
         'mascota': 6,
         'ovejita': 1,
         'perra': 4,
         'perrita': 7,
         'perrito': 22,
         'perritos': 5,
         'perro': 13,
         'perros': 11,
         'pez': 2})

In [68]:
total_animales_m = []
total_animales_f = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            if d[k]['gender'] == 'M':
                total_animales_m.append(a)
            elif d[k]['gender'] == 'F':
                total_animales_f.append(a)

In [69]:
len(total_animales_m)

20

In [70]:
len(total_animales_f)

59

In [71]:
Counter(total_animales_m).most_common()

[('perrito', 5),
 ('perro', 4),
 ('perritos', 3),
 ('gato', 3),
 ('mascota', 2),
 ('animales', 2),
 ('perros', 1)]

In [72]:
Counter(total_animales_f).most_common()

[('animales', 17),
 ('animal', 9),
 ('gato', 8),
 ('perros', 6),
 ('perro', 3),
 ('perrito', 3),
 ('gatito', 2),
 ('perrita', 2),
 ('perra', 2),
 ('animalito', 2),
 ('animalitos', 1),
 ('mascota', 1),
 ('gatitos', 1),
 ('gata', 1),
 ('gatos', 1)]

In [73]:
total_animales_age = []
total_animales_noage = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            if d[k]['age']:
                total_animales_age.append(a)
            else:
                total_animales_noage.append(a)

In [74]:
len(total_animales_age)

59

In [75]:
len(total_animales_noage)

90

In [76]:
total_animales_ch = []
total_animales_te = []
total_animales_ya = []
total_animales_ad = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            try:
                if d[k]['age']:
                    age = int(d[k]['age'])
                    if 0 <= age <= 12:
                        total_animales_ch.append(a)
                    elif 13 <= age <= 18:
                        total_animales_te.append(a)
                    elif 19 <= age <= 25:
                        total_animales_ya.append(a)
                    else:
                        total_animales_ad.append(a)
            except: # '16, 19'
                total_animales_te.append(a)

In [77]:
len(total_animales_ch)

27

In [78]:
Counter(total_animales_ch).most_common()

[('gato', 8),
 ('perrito', 5),
 ('perro', 4),
 ('animales', 3),
 ('animal', 3),
 ('mascota', 2),
 ('perritos', 1),
 ('animalito', 1)]

In [79]:
len(total_animales_te)

16

In [80]:
Counter(total_animales_te).most_common()

[('animales', 4),
 ('perrito', 3),
 ('animal', 3),
 ('perros', 1),
 ('gatito', 1),
 ('mascota', 1),
 ('gato', 1),
 ('animalito', 1),
 ('perrita', 1)]

In [81]:
len(total_animales_ya)

10

In [82]:
Counter(total_animales_ya).most_common()

[('animales', 3),
 ('animal', 2),
 ('perra', 2),
 ('perrita', 1),
 ('gatitos', 1),
 ('gato', 1)]

In [83]:
len(total_animales_ad)

6

In [84]:
Counter(total_animales_ad).most_common()

[('animales', 4), ('gata', 1), ('perros', 1)]

In [85]:
total_cards_with_animales = 0
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            total_cards_with_animales += 1
            break

In [86]:
total_cards_with_animales

70