In [61]:
import csv
import os
from collections import Counter
from collections import defaultdict
from string import punctuation

import enchant
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams

# Lexicon

In [139]:
def remove_accent_marks(w):
    r = w
    r = r.replace('á', 'a')
    r = r.replace('é', 'e')
    r = r.replace('í', 'i')
    r = r.replace('ó', 'o')
    r = r.replace('ú', 'u')
    return r

In [140]:
lexicon = {}
with open('lexicon/es_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    for row in reader:
        w = remove_accent_marks(row[0])
        lexicon[w] = []
        for i in range(1, len(row[1:]), 2):
            entry = {}
            entry['lemma'] = remove_accent_marks(row[i].lower())
            entry['eagle'] = remove_accent_marks(row[i+1].lower())
            lexicon[w].append(entry)

In [3]:
len(lexicon)

556210

In [4]:
def is_vowel(c):
    return c in 'aeiouAEIOUáéíóúÁÉÍÓÚüÜ'

def next_level(w):
    result = [w]
    if is_vowel(w[0]):
        result.append('h' + w)
    if "q'" in w:
        result.append(w.replace("q'", 'que'))
    for i in range(len(w)):
        if w[i] == 'v':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'b':
            result.append(w[:i] + 'v' + w[i+1:])
            result.append(w[:i] + 'd' + w[i+1:])
        elif w[i] == 'd':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'c' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 's' + w[i+1:])
        elif w[i] == 's' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 'c' + w[i+1:])
    return result

def combinations_aux(visited, result):
    while visited:
        w = visited[0]
        visited = visited[1:]
        if w not in result:
            result.append(w)
            combs = next_level(w)
            visited.extend(combs)        
    return result

def combinations(w):
    return combinations_aux([w], [])

In [75]:
es = enchant.Dict('es_ES')

# Analysis

In [5]:
os.listdir('data')

['ana.csv', 'dago.csv', 'rafa.csv', 'final.csv', 'yadira.csv']

In [6]:
d = {}
for filename in os.listdir('data'):
    with open('data/' + filename) as f:
        reader = csv.reader(
            f,
            delimiter=',',
            quotechar='"'
        )
        next(reader)
        for row in reader:
            d[row[0]] = {
                'date': row[1],
                'name': row[2],
                'gender': row[3],
                'age': row[4],
                'city': row[5],
                'drawings': row[6],
                'title': row[7],
                'description': row[8],
            }

In [7]:
len(d)

826

## Gender

In [8]:
blank = 0
male = 0
female = 0
for k in d:
    gender = d[k]['gender']
    if gender == 'M':
        male += 1
    elif gender == 'F':
        female += 1
    else:
        blank += 1

In [9]:
blank

449

In [10]:
male

108

In [11]:
female

269

## Cities

In [12]:
cities = []
for k in d:
    city = d[k]['city']
    cities.append(city)

In [13]:
c = Counter(cities)

In [14]:
# c

In [15]:
c.most_common(10)

[('', 582),
 ('Bogotá', 155),
 ('Bogotá D.C.', 9),
 ('Neiva', 7),
 ('Fusagasugá', 6),
 ('Bogotá D.C', 5),
 ('Soacha', 5),
 ('Bucaramanga', 4),
 ('Medellín', 3),
 ('Cali', 3)]

## Ages

In [16]:
ages = []
for k in d:
    age = d[k]['age']
    ages.append(age)

In [17]:
a = Counter(ages)

In [18]:
# a

In [19]:
a.most_common(11)

[('', 575),
 ('14', 16),
 ('16', 13),
 ('10', 12),
 ('19', 12),
 ('13', 11),
 ('11', 11),
 ('20', 11),
 ('12', 10),
 ('21', 9),
 ('17', 9)]

In [20]:
children = 0
teenagers = 0
young_adults = 0
adults = 0
for k in d:
    try:
        if d[k]['age']:
            age = int(d[k]['age'])
            if 0 <= age <= 12:
                children += 1
            elif 13 <= age <= 18:
                teenagers += 1
            elif 19 <= age <= 25:
                young_adults += 1
            else:
                adults += 1
    except: # '16, 19'
        teenagers += 1

In [21]:
children

51

In [22]:
teenagers

65

In [23]:
young_adults

54

In [24]:
adults

81

## Drawings

In [25]:
drawing_words = []
for k in d:
    drawings = d[k]['drawings'].lower().replace(';', ',')
    words = [x.strip() for x in drawings.split(',')]
    drawing_words.extend(words)

In [26]:
dw = Counter(drawing_words)

In [27]:
# dw

In [28]:
dw.most_common(11)

[('', 715),
 ('corazón', 84),
 ('cara feliz', 28),
 ('niño', 11),
 ('carita feliz', 11),
 ('nube', 7),
 ('cara sonriente', 5),
 ('perro', 4),
 ('sol', 3),
 ('ojos', 2),
 ('cruz', 2)]

## Words

In [29]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [141]:
def total_text(r):
    return remove_accent_marks((r['title'] + ' ' + r['description']).lower())

In [223]:
total_words = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    total_words.extend(words)

In [224]:
len(total_words)

45801

In [144]:
total_words_freq = Counter(total_words)

In [145]:
total_words_freq.most_common(10)

[('de', 1996),
 ('y', 1953),
 ('la', 1684),
 ('que', 1365),
 ('en', 1224),
 ('a', 1193),
 ('el', 1032),
 ('un', 813),
 ('con', 692),
 ('mi', 668)]

In [146]:
stop_words = [remove_accent_marks(w) for w in stopwords.words('spanish')]

In [147]:
total_nonstop_words = [w for w in total_words if w not in stopwords_no_accents]

In [148]:
len(total_nonstop_words)

22240

In [149]:
unique_nonstop_words = set(total_nonstop_words)

In [150]:
len(unique_nonstop_words)

6705

## Spelling

In [71]:
def correct(w):
    if w in lexicon:
        r = w
    else:
        combs = combinations(w)
        freqs = [(w, total_words_freq[w]) for w in combs if w in lexicon]
        freqs.sort(key=lambda x: x[1], reverse=True)
        r = freqs[0][0]
    return r

In [151]:
correct_spelling = {}

for w in total_words_freq:
    if w not in correct_spelling:
        if w in lexicon:
            correct_spelling[w] = w
        else:
            combs = combinations(w)
            freqs = [(c, total_words_freq[c]) for c in combs if c in total_words_freq and c in lexicon]
            if freqs:
                freqs.sort(key=lambda x: x[1], reverse=True)
                correct_spelling[w] = freqs[0][0]
            else:
                freqs = [(c, total_words_freq[c]) for c in es.suggest(w) if c in total_words_freq]
                if freqs:
                    freqs.sort(key=lambda x: x[1], reverse=True)
                    correct_spelling[w] = freqs[0][0]
                else:
                    correct_spelling[w] = w

In [152]:
len(correct_spelling)

6888

In [153]:
len(total_words_freq)

6888

In [167]:
spelling_map = defaultdict(list)
for key in correct_spelling:
    v = correct_spelling[key]
    spelling_map[v].append((key, total_words_freq[key]))
spelling_map = dict(spelling_map)

In [173]:
len(spelling_map)

6327

In [170]:
correct_spelling['aser']

'hacer'

In [171]:
spelling_map['hacer']

[('hacer', 71), ('aser', 3), ('haser', 1), ('acer', 1)]

In [184]:
mispellings = [w for w in spelling_map if w not in stop_words and len(spelling_map[w]) > 1]
mispellings.sort()

In [186]:
mispellings[:10]

['abandonados',
 'abrasamos',
 'abuelas',
 'abuelo',
 'abuelos',
 'abusar',
 'acabo',
 'aceptar',
 'aceptaron',
 'adoptar']

In [195]:
spelling_map['abandonados']

[('abandonados', 3), ('habandonados', 1)]

## n-grams

In [None]:
def my_ngrams(ingredient):
    ngrms = []
    tokens = nltk.word_tokenize(ingredient)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

## Animales

In [197]:
animales = ['animal', 'animalito', 'animales', 'animalitos',
            'perro', 'perra', 'perros', 'perras', 'perrito', 'perrita', 'perritos', 'perritas',
            'gato', 'gata', 'gatos', 'gatas', 'gatito', 'gatita', 'gatitos', 'gatitas',
            'mascota', 'ovejita', 'pez']

In [198]:
# animales = ['caballo', 'yegua', 'caballito', 'burro', 'burrito', 'oveja',
#             'cabra', 'vaquita', 'ovejita', 'vaca', 'cordero', 'corderito',
#             'pez', 'pececito', 'cerdo', 'cerdito', 'puerco', 'gallina', 'pollo', 'gallo',
#             'pesesito', 'pes']

In [248]:
total_animals = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            total_animals.append(a)

In [249]:
len(total_animals)

136

In [250]:
ta = Counter(total_animales)

In [251]:
ta

Counter({'animal': 13,
         'animales': 26,
         'animalito': 1,
         'animalitos': 1,
         'gata': 3,
         'gatito': 3,
         'gato': 13,
         'gatos': 6,
         'mascota': 6,
         'ovejita': 1,
         'perra': 4,
         'perrita': 6,
         'perrito': 22,
         'perritos': 5,
         'perro': 13,
         'perros': 11,
         'pez': 2})

In [253]:
total_animales_m = []
total_animales_f = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            if d[k]['gender'] == 'M':
                total_animales_m.append(a)
            elif d[k]['gender'] == 'F':
                total_animales_f.append(a)

In [254]:
len(total_animales_m)

20

In [255]:
len(total_animales_f)

52

In [256]:
Counter(total_animales_m).most_common()

[('perrito', 5),
 ('perro', 4),
 ('gato', 3),
 ('perritos', 3),
 ('mascota', 2),
 ('animales', 2),
 ('perros', 1)]

In [257]:
Counter(total_animales_f).most_common()

[('animales', 15),
 ('animal', 8),
 ('gato', 7),
 ('perros', 6),
 ('perrito', 3),
 ('perro', 3),
 ('perra', 2),
 ('perrita', 2),
 ('gata', 1),
 ('mascota', 1),
 ('gatos', 1),
 ('gatito', 1),
 ('animalito', 1),
 ('animalitos', 1)]

In [258]:
total_animales_age = []
total_animales_noage = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            if d[k]['age']:
                total_animales_age.append(a)
            else:
                total_animales_noage.append(a)

In [259]:
len(total_animales_age)

51

In [260]:
len(total_animales_noage)

85

In [262]:
total_animales_ch = []
total_animales_te = []
total_animales_ya = []
total_animales_ad = []
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            try:
                if d[k]['age']:
                    age = int(d[k]['age'])
                    if 0 <= age <= 12:
                        total_animales_ch.append(a)
                    elif 13 <= age <= 18:
                        total_animales_te.append(a)
                    elif 19 <= age <= 25:
                        total_animales_ya.append(a)
                    else:
                        total_animales_ad.append(a)
            except: # '16, 19'
                total_animales_te.append(a)

In [263]:
len(total_animales_ch)

27

In [264]:
Counter(total_animales_ch).most_common()

[('gato', 8),
 ('perrito', 5),
 ('perro', 4),
 ('animal', 3),
 ('animales', 3),
 ('mascota', 2),
 ('perritos', 1),
 ('animalito', 1)]

In [265]:
len(total_animales_te)

12

In [266]:
Counter(total_animales_te).most_common()

[('animal', 3),
 ('animales', 3),
 ('perrito', 3),
 ('gato', 1),
 ('mascota', 1),
 ('perros', 1)]

In [267]:
len(total_animales_ya)

6

In [268]:
Counter(total_animales_ya).most_common()

[('perra', 2), ('animales', 2), ('animal', 1), ('perrita', 1)]

In [269]:
len(total_animales_ad)

6

In [270]:
Counter(total_animales_ad).most_common()

[('animales', 4), ('gata', 1), ('perros', 1)]

In [271]:
total_cards_with_animales = 0
for k in d:
    text = total_text(d[k])
    words = [w for w in word_tokenize(text) if w not in punctuation]
    for a in words:
        if a in animales:
            total_cards_with_animales += 1
            break

In [272]:
total_cards_with_animales

63