In [1]:
import csv
import pickle
import string

import nltk
from nltk.corpus import stopwords

In [2]:
# Load the stories produced by an earlier step of the pipeline.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at files generated by this project.
with open('data/out/stories.pickle', 'rb') as stories_file:
    stories = pickle.load(stories_file)

In [3]:
# Sanity check: display how many stories were loaded from the pickle.
len(stories)

98

In [4]:
# Build `lexicon`: a flat list of {'flexion', 'lemma', 'eagle'} dicts.
#
# Each row of the space-delimited file is read as: an inflected form in the
# first field, followed by alternating (lemma, tag) fields. One entry is
# emitted per complete (lemma, tag) pair; a trailing unpaired field is ignored.
# All three values are lowercased. The 'eagle' value is presumably an EAGLES
# part-of-speech tag -- TODO confirm against the lexicon's documentation.
#
# The encoding is stated explicitly so that accented forms decode the same way
# on every platform (the default depends on the locale), and newline='' is
# what the csv module asks for when it is handed a file object.
# NOTE(review): assumes the lexicon file is UTF-8 encoded -- confirm.
with open('data/aux/es_lexicon.csv', encoding='utf-8', newline='') as f:
    lexicon = []
    for row in csv.reader(f, delimiter=' '):
        # A usable row needs the inflected form plus at least one full pair.
        if len(row) < 3:
            continue
        flexion = row[0].lower()
        # row[1::2] are the lemmas, row[2::2] the tags; zip drops a dangling
        # lemma that has no tag after it.
        for lemma, tag in zip(row[1::2], row[2::2]):
            lexicon.append({
                'flexion': flexion,
                'lemma': lemma.lower(),
                'eagle': tag.lower(),
            })

In [5]:
# Collect, in a single pass over the lexicon:
#   all_words -- every inflected form, whatever its tag;
#   nouns     -- the inflected forms that have at least one tag starting with 'n'.
all_words = set()
nouns = set()
for entry in lexicon:
    form = entry['flexion']
    all_words.add(form)
    if entry['eagle'].startswith('n'):
        nouns.add(form)

In [6]:
# Seed the stop word list from the project's own file, one entry per line with
# surrounding whitespace stripped. `stop` must stay a list: later statements
# extend it in place with `+=`.
#
# The encoding is stated explicitly so accented stop words decode identically
# on every platform instead of depending on the locale default.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open('data/aux/stopwords-es.txt', encoding='utf-8') as f:
    stop = [line.strip() for line in f]

In [7]:
# ASCII punctuation, split into a list of single characters.
punctuation = list(string.punctuation)
# Additional characters to strip from the texts: digits, typographic and
# currency symbols, combining marks, zero-width / formatting code points,
# arrows, geometric shapes and emoji.
# NOTE(review): presumably collected by inspecting the corpus -- confirm before
# adding or removing entries.
weird = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '¡', '¢', '£', '¤', '¥', '¨', '©', '¬', '\xad',
    '°', '±', '³', '´', '·', '¾', '¿', '×', '́', '̃', '͜', '͡', '๏', '\u200b', '‒', '–', '—', '―', '•',
    '…', '\u202a', '※', '€', '™', '↑', '→', '↓', '↘', '↙', '∆', '√', '∞', '∴', '∵', '⊙', '╥', '▁',
    '▂', '▪', '▬', '▶', '◀', '◆', '◇', '○', '●', '◔', '◡', '☀', '★', '☆', '☺', '☻', '♠', '♡', '♢',
    '♣', '♤', '♥', '♦', '♧', '⚜', '⚫', '✅', '✋', '✌', '✎', '✔', '✰', '✿', '❎', '❓', '❝', '❞', '❤',
    '❥', '⬆', '⬇', '⭕', '《', '》', '️', '︿', '﹏', '\ufeff', '｡', '�', '🌈', '🌊', '🌚', '🌟', '🌷',
    '🌸', '🌹', '🌼', '🎁', '🎄', '🎉', '🎊', '🎶', '🏄', '🏻', '🏼', '🏽', '🐺', '🐻', '👀', '👉', '👊',
    '👋', '👌', '👍', '👏', '👑', '👻', '💁', '💋', '💕', '💖', '💘', '💙', '💚', '💛', '💜', '💞', '💠',
    '💩', '📖', '📚', '🔥', '🔫', '🖊', '🖐', '🖕', '😀', '😁', '😂', '😃', '😄', '😅', '😆', '😇', '😈',
    '😉', '😊', '😋', '😌', '😍', '😎', '😏', '😐', '😓', '😔', '😕', '😖', '😘', '😙', '😚', '😜', '😝',
    '😟', '😢', '😣', '😥', '😨', '😫', '😬', '😭', '😰', '😱', '😳', '😵', '😷', '😼', '🙂', '🙄', '🙇',
    '🙈', '🙊', '🙏', '🤓', '🤗'
]
# Every string in `others` is deleted from the text by clean() and is also
# appended to the stop word list below.
others = punctuation + weird
# adverbs = [
#     'ahora', 'antes', 'después', 'tarde', 'luego', 'ayer', 'temprano', 'ya', 'todavía', 'anteayer', 'aún',
#     'pronto', 'hoy', 'aquí', 'ahí', 'allí', 'cerca', 'lejos', 'fuera', 'dentro', 'alrededor', 'aparte',
#     'encima', 'debajo', 'delante', 'detrás', 'así', 'bien', 'mal', 'despacio', 'deprisa', 'como', 'mucho',
#     'poco', 'muy', 'casi', 'todo', 'nada', 'algo', 'medio', 'demasiado', 'bastante', 'más', 'menos', 'además',
#     'incluso', 'también', 'sí', 'también', 'asimismo', 'no', 'tampoco', 'jamás', 'nunca', 'acaso', 'quizá',
#     'quizás', 'tal', 'vez', 'tan'
# ]
# prepositions = [
#     'a', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde', 'en', 'entre', 'hacia', 'hasta', 'para',
#     'por', 'según', 'sin', 'so', 'sobre', 'tras', 'durante', 'mediante', 'excepto', 'salvo', 'incluso',
#     'más', 'menos',
# ]
# Grow the stop list in place with NLTK's English and Spanish stop words and
# with every entry of `others`. The commented-out `adverbs` and `prepositions`
# lists are currently not included.
for extra in (stopwords.words('english'), stopwords.words('spanish'), others):
    stop.extend(extra)

# Corpus-specific additions.
my_own_stop_words = ['dije']
stop.extend(my_own_stop_words)

def remove_accent_marks(s):
    """Return ``s`` with lowercase acute-accented vowels replaced by plain vowels.

    Only the five characters 'á', 'é', 'í', 'ó' and 'ú' are mapped (to 'a',
    'e', 'i', 'o' and 'u'). Every other character -- including uppercase
    accented letters, 'ñ' and 'ü' -- is returned unchanged.
    """
    return s.translate(str.maketrans('áéíóú', 'aeiou'))

# Append an accent-stripped copy of every stop word (presumably so that stop
# words written without their accents are filtered too). The copies are fully
# materialised into a list before extending: feeding `stop` a lazy iterator
# over itself would never terminate.
stop.extend(list(map(remove_accent_marks, stop)))

def clean(s, only_nouns=True, no_accent_marks=True):
    """Reduce a raw text to a space-separated string of filtered tokens.

    Processing steps, in order:

    1. lowercase the text;
    2. delete every string listed in the module-level ``others``;
    3. tokenize with ``nltk.word_tokenize``;
    4. drop tokens that are in ``stop``, are numeric, or are one character long;
    5. if ``only_nouns``, keep only tokens that are in ``nouns`` or that are
       absent from ``all_words`` (i.e. unknown to the lexicon);
    6. join the tokens with single spaces;
    7. if ``no_accent_marks``, pass the result through ``remove_accent_marks``.

    Args:
        s: the text to clean (a ``str``).
        only_nouns: apply the noun / unknown-word filter of step 5.
        no_accent_marks: apply the accent stripping of step 7.

    Returns:
        The cleaned text as a single string; empty if no token survives.
    """
    # `stop` is a list, so `w not in stop` would rescan it for every token.
    # Building a set once per call makes each lookup constant time without
    # changing which tokens are filtered.
    stop_words = set(stop)

    r = s.lower()
    for p in others:
        r = r.replace(p, '')

    words = [
        w for w in nltk.word_tokenize(r)
        if len(w) > 1 and not w.isnumeric() and w not in stop_words
    ]
    if only_nouns:
        # Words unknown to the lexicon are kept alongside the known nouns.
        words = [w for w in words if w in nouns or w not in all_words]

    r = ' '.join(words)
    if no_accent_marks:
        r = remove_accent_marks(r)
    return r

def join_texts(d):
    """Concatenate all values of mapping ``d`` into one string.

    The values are taken in the mapping's iteration order and separated by a
    single space. An empty mapping yields an empty string.
    """
    separator = ' '
    pieces = d.values()
    return separator.join(pieces)

In [8]:
%%time

# Replace each story's texts with their cleaned versions (in place) and store
# the space-joined concatenation of the cleaned texts under 'text'.
for story in stories:
    cleaned = {name: clean(raw) for name, raw in story['texts'].items()}
    story['texts'] = cleaned
    story['text'] = join_texts(cleaned)

CPU times: user 2min 28s, sys: 40 ms, total: 2min 29s
Wall time: 2min 29s


In [9]:
# Persist the cleaned stories to a separate pickle file.
with open('data/out/clean_stories.pickle', 'wb') as clean_stories_file:
    pickle.dump(stories, clean_stories_file)