In [1]:
import csv
import os
from collections import Counter
from string import punctuation

import enchant
from nltk import word_tokenize
from nltk.corpus import stopwords

# Lexicon

In [77]:
lexicon = {}
with open('lexicon/es_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    for row in reader:
        lexicon[row[0]] = []
        for i in range(1, len(row[1:]), 2):
            entry = {}
            entry['lemma'] = row[i].lower()
            entry['eagle'] = row[i+1].lower()
            lexicon[row[0]].append(entry)

In [78]:
len(lexicon)

556210

In [126]:
def is_vowel(c):
    return c in 'aeiouAEIOUáéíóúÁÉÍÓÚüÜ'

def next_level(w):
    result = [w]
    if is_vowel(w[0]):
        result.append('h' + w)
    if "q'" in w:
        result.append(w.replace("q'", 'que'))
    for i in range(len(w)):
        if w[i] == 'v':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'b':
            result.append(w[:i] + 'v' + w[i+1:])
            result.append(w[:i] + 'd' + w[i+1:])
        elif w[i] == 'd':
            result.append(w[:i] + 'b' + w[i+1:])
        elif w[i] == 'c' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 's' + w[i+1:])
        elif w[i] == 's' and i < len(w) - 1 and w[i+1] in 'ei':
            result.append(w[:i] + 'c' + w[i+1:])
    return result

def combinations_aux(visited, result):
    while visited:
        w = visited[0]
        visited = visited[1:]
        if w not in result:
            result.append(w)
            combs = next_level(w)
            visited.extend(combs)        
    return result

def combinations(w):
    return combinations_aux([w], [])

# Analysis

In [2]:
os.listdir('data')

['ana.csv', 'final.csv', 'yadira.csv', 'rafa.csv', 'dago.csv']

In [3]:
d = {}
for filename in os.listdir('data'):
    with open('data/' + filename) as f:
        reader = csv.reader(
            f,
            delimiter=',',
            quotechar='"'
        )
        next(reader)
        for row in reader:
            d[row[0]] = {
                'date': row[1],
                'name': row[2],
                'gender': row[3],
                'age': row[4],
                'city': row[5],
                'drawings': row[6],
                'title': row[7],
                'description': row[8],
            }

In [4]:
len(d)

826

## Gender

In [5]:
blank = 0
male = 0
female = 0
for k in d:
    gender = d[k]['gender']
    if gender == 'M':
        male += 1
    elif gender == 'F':
        female += 1
    else:
        blank += 1

In [6]:
blank

449

In [7]:
male

108

In [8]:
female

269

## Cities

In [9]:
cities = []
for k in d:
    city = d[k]['city']
    cities.append(city)

In [10]:
c = Counter(cities)

In [11]:
# c

In [12]:
c.most_common(10)

[('', 582),
 ('Bogotá', 155),
 ('Bogotá D.C.', 9),
 ('Neiva', 7),
 ('Fusagasugá', 6),
 ('Soacha', 5),
 ('Bogotá D.C', 5),
 ('Bucaramanga', 4),
 ('Medellín', 3),
 ('Cali', 3)]

## Ages

In [13]:
ages = []
for k in d:
    age = d[k]['age']
    ages.append(age)

In [14]:
a = Counter(ages)

In [15]:
# a

In [16]:
a.most_common(11)

[('', 575),
 ('14', 16),
 ('16', 13),
 ('19', 12),
 ('10', 12),
 ('13', 11),
 ('20', 11),
 ('11', 11),
 ('12', 10),
 ('21', 9),
 ('17', 9)]

In [17]:
children = 0
teenagers = 0
young_adults = 0
adults = 0
for k in d:
    try:
        if d[k]['age']:
            age = int(d[k]['age'])
            if 0 <= age <= 12:
                children += 1
            elif 13 <= age <= 18:
                teenagers += 1
            elif 19 <= age <= 25:
                young_adults += 1
            else:
                adults += 1
    except: # '16, 19'
        teenagers += 1

In [18]:
children

51

In [19]:
teenagers

65

In [20]:
young_adults

54

In [21]:
adults

81

## Drawings

In [22]:
drawing_words = []
for k in d:
    drawings = d[k]['drawings'].lower().replace(';', ',')
    words = [x.strip() for x in drawings.split(',')]
    drawing_words.extend(words)

In [23]:
dw = Counter(drawing_words)

In [24]:
# dw

In [25]:
dw.most_common(11)

[('', 715),
 ('corazón', 84),
 ('cara feliz', 28),
 ('niño', 11),
 ('carita feliz', 11),
 ('nube', 7),
 ('cara sonriente', 5),
 ('perro', 4),
 ('sol', 3),
 ('cruz', 2),
 ('flor', 2)]

## Words

In [120]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [121]:
def total_text(r):
    text = (r['title'] + ' ' + r['description']).lower()
    text = text.replace('á', 'a')
    text = text.replace('é', 'e')
    text = text.replace('í', 'i')
    text = text.replace('ó', 'o')
    text = text.replace('ú', 'u')
    return text

In [122]:
total_words = []
for k in d:
    text = total_text(d[k])
    no_stopwords = ' '.join([w for w in word_tokenize(text) if w not in punctuation])
    words = word_tokenize(no_stopwords)
    total_words.extend(words)

In [123]:
len(total_words)

45801

In [124]:
total_words_freq = Counter(total_words)

In [125]:
total_words_freq.most_common(10)

[('de', 1996),
 ('y', 1953),
 ('la', 1684),
 ('que', 1365),
 ('en', 1224),
 ('a', 1193),
 ('el', 1032),
 ('un', 813),
 ('con', 692),
 ('mi', 668)]

In [130]:
stopwords_no_accents = []
for w in stopwords.words('spanish'):
    w = w.replace('á', 'a')
    w = w.replace('é', 'e')
    w = w.replace('í', 'i')
    w = w.replace('ó', 'o')
    w = w.replace('ú', 'u')
    stopwords_no_accents.append(w)
stopwords_no_accents.extend(stopwords.words('spanish'))

In [131]:
total_nonstop_words = [w for w in total_words if w not in stopwords_no_accents]

In [132]:
len(total_nonstop_words)

22240

In [133]:
unique_nonstop_words = set(total_nonstop_words)

In [134]:
len(unique_nonstop_words)

6705

In [33]:
total_words2 = []
for k2 in d:
    text2 = total_text(d[k2])
    no_stopwords2 = ' '.join([w for w in word_tokenize(text2) if True])
    words2 = word_tokenize(no_stopwords2)
    total_words2.extend(words2)

In [34]:
len(total_words2)

49008

In [35]:
tw = Counter(total_words)

In [36]:
# tw

In [37]:
tw.most_common(12)

[(',', 1596),
 ('.', 1223),
 ('paz', 664),
 ('dia', 240),
 ('familia', 152),
 ('ser', 133),
 ('personas', 132),
 ('vida', 117),
 ('amor', 102),
 ('años', 88),
 ('cada', 88),
 ('mejor', 87)]

In [38]:
undia = 0
eldia = 0
dedia = 0
otros = 0
otras = 0
for k in d:
    text = total_text(d[k])
    if 'un dia' in text or 'unos dias' in text:
        undia += 1
    elif 'el dia' in text or 'los dias' in text:
        eldia += 1
    elif 'de dia' in text:
        dedia += 1
    else:
        words = word_tokenize(text)
        if 'dia' in words or 'dias' in words:
            otros += 1
#             print(text)
#             print()

In [39]:
undia

113

In [40]:
eldia

49

In [41]:
dedia

4

In [42]:
otros

69

In [43]:
tw['dia']

240

In [44]:
animales = ['animal', 'animalito', 'animales', 'animalitos',
            'perro', 'perra', 'perros', 'perras', 'perrito', 'perrita', 'perritos', 'perritas',
            'gato', 'gata', 'gatos', 'gatas', 'gatito', 'gatita', 'gatitos', 'gatitas',
            'mascota', 'ovejita', 'pez']

In [45]:
# animales = ['caballo', 'yegua', 'caballito', 'burro', 'burrito', 'oveja',
#             'cabra', 'vaquita', 'ovejita', 'vaca', 'cordero', 'corderito',
#             'pez', 'pececito', 'cerdo', 'cerdito', 'puerco', 'gallina', 'pollo', 'gallo',
#             'pesesito', 'pes']

In [46]:
total_animales = []
for k in d:
    text = total_text(d[k])
    no_stopwords = ' '.join([w for w in word_tokenize(text) if w not in sw_no_accents])
    words = word_tokenize(no_stopwords)
    for a in animales:
        if a in words:
            total_animales.append(a)

In [47]:
len(total_animales)

100

In [48]:
ta = Counter(total_animales)

In [49]:
ta

Counter({'animal': 10,
         'animales': 21,
         'animalito': 1,
         'animalitos': 1,
         'gata': 2,
         'gatito': 3,
         'gato': 8,
         'gatos': 4,
         'mascota': 5,
         'ovejita': 1,
         'perra': 3,
         'perrita': 5,
         'perrito': 13,
         'perritos': 3,
         'perro': 10,
         'perros': 9,
         'pez': 1})

In [50]:
total_animales_m = []
total_animales_f = []
for k in d:
    text = total_text(d[k])
    no_stopwords = ' '.join([w for w in word_tokenize(text) if w not in sw_no_accents])
    words = word_tokenize(no_stopwords)
    for an in animales:
        if an in words:
            if d[k]['gender'] == 'M':
                total_animales_m.append(an)
            elif d[k]['gender'] == 'F':
                total_animales_f.append(an)

In [51]:
len(total_animales_m)

14

In [52]:
len(total_animales_f)

41

In [53]:
Counter(total_animales_m).most_common()

[('perrito', 3),
 ('perro', 3),
 ('animales', 2),
 ('mascota', 2),
 ('perritos', 2),
 ('perros', 1),
 ('gato', 1)]

In [54]:
Counter(total_animales_f).most_common()

[('animales', 12),
 ('animal', 6),
 ('gato', 5),
 ('perros', 4),
 ('perro', 3),
 ('perrita', 2),
 ('perrito', 2),
 ('perra', 1),
 ('mascota', 1),
 ('animalito', 1),
 ('gata', 1),
 ('gatito', 1),
 ('gatos', 1),
 ('animalitos', 1)]

In [55]:
total_animales_a = []
total_animales_na = []
for k in d:
    text = total_text(d[k])
    no_stopwords = ' '.join([w for w in word_tokenize(text) if w not in sw_no_accents])
    words = word_tokenize(no_stopwords)
    for an in animales:
        if an in words:
            if d[k]['age']:
                total_animales_a.append(an)
            else:
                total_animales_na.append(an)

In [56]:
len(total_animales_a)

39

In [57]:
len(total_animales_na)

61

In [58]:
total_animales_ch = []
total_animales_te = []
total_animales_ya = []
total_animales_ad = []
for k in d:
    text = total_text(d[k])
    no_stopwords = ' '.join([w for w in word_tokenize(text) if w not in sw_no_accents])
    words = word_tokenize(no_stopwords)
    for an in animales:
        if an in words:
            try:
                if d[k]['age']:
                    age = int(d[k]['age'])
                    if 0 <= age <= 12:
                        total_animales_ch.append(an)
                    elif 13 <= age <= 18:
                        total_animales_te.append(an)
                    elif 19 <= age <= 25:
                        total_animales_ya.append(an)
                    else:
                        total_animales_ad.append(an)
            except: # '16, 19'
                total_animales_te.append(an)

In [59]:
len(total_animales_ch)

19

In [60]:
Counter(total_animales_ch).most_common()

[('gato', 4),
 ('animales', 3),
 ('perrito', 3),
 ('perro', 3),
 ('animal', 2),
 ('mascota', 2),
 ('animalito', 1),
 ('perritos', 1)]

In [61]:
len(total_animales_te)

10

In [62]:
Counter(total_animales_te).most_common()

[('animales', 3),
 ('perrito', 2),
 ('animal', 2),
 ('mascota', 1),
 ('gato', 1),
 ('perros', 1)]

In [63]:
len(total_animales_ya)

5

In [64]:
Counter(total_animales_ya).most_common()

[('animales', 2), ('perra', 1), ('animal', 1), ('perrita', 1)]

In [65]:
len(total_animales_ad)

5

In [66]:
Counter(total_animales_ad).most_common()

[('animales', 3), ('gata', 1), ('perros', 1)]

In [67]:
total_cards_with_animales = 0
for k in d:
    text = total_text(d[k])
    no_stopwords = ' '.join([w for w in word_tokenize(text) if w not in sw_no_accents])
    words = word_tokenize(no_stopwords)
    for a in animales:
        if a in words:
            total_cards_with_animales += 1
            break

In [68]:
total_cards_with_animales

63

In [79]:
lexicon['a']

[{'eagle': 'ncfs000', 'lemma': 'a'}, {'eagle': 'sps00', 'lemma': 'a'}]

In [71]:
q=enchant.Dict('es_ES')

In [72]:
q.check('avuela')

False

In [73]:
q.check('abuela')

True

In [76]:
q.suggest('habia')

['había',
 'haba',
 'habita',
 'habida',
 'rabia',
 'sabia',
 'habla',
 'labia',
 'hacia',
 'habiz',
 'Babia']