In [1]:
import collections
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import spacy
import nltk
from nltk.stem import WordNetLemmatizer

from utils import (
    drop_spam_rows,
    remove_digits,
    remove_prefixed_words,
    contract_spaces,
    remove_word,
    remove_single_characters,
    remove_special_characters,
)



### Load spaCy Spanish trained pipeline.

In [3]:
try:
    sp = spacy.load("es_core_news_sm")
except OSError:
    !python3 -m spacy download es_core_news_sm
    sp = spacy.load("es_core_news_sm")

In [4]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /home/robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/robert/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/robert/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Load texts

In [5]:
dataset_path = "data/balcones_2020.csv"
dataset = pd.read_csv(dataset_path)
texts = dataset["text"]

### Remove spam

In [7]:
spam_texts = ["El Magazin del Balcón Segoviano", "Viva María Auxiliadora"]
texts = drop_spam_rows(text_series=texts, spam_messages=spam_texts)

### Preprocessing

In [83]:
PUNCTUATION_MARKS = string.punctuation + "¿¡" + "..." + "…" + " "
STOP_WORDS = nltk.corpus.stopwords.words("spanish")
UNDESIRED_WORDS = [
    "balcón",
    "balcones",
    "balcon",
    "si",
    "haber",
    "ser",
    "quedateencasa",
    "yomequedoencasa",
    "parir"
]
UNDESIRED_PREFIXES = ["@", "#", "http", "jaj", "xd", "xD", "XD"]


def tokenize(text):
    return [
        token.lemma_ for token in sp(text)
        if token.text not in PUNCTUATION_MARKS
        and token.text not in STOP_WORDS + UNDESIRED_WORDS
    ]
    
# def tokenize(text):
#     return [
#         WordNetLemmatizer().lemmatize(word)
#         for word in text.split()
#         if word not in PUNCTUATION_MARKS
#         and word not in STOP_WORDS + UNDESIRED_WORDS
#     ]

def preprocess_text(text):

    # Regex filters.
    for prefix in UNDESIRED_PREFIXES:
            text = remove_prefixed_words(prefix, text)
    text = remove_special_characters(text)
    text = remove_single_characters(text)
    text = remove_digits(text)
    text = contract_spaces(text)
    text = text.lower()

    # Lemmatization
    return tokenize(text)

In [85]:
for i in texts[:1000].apply(lambda x: preprocess_text(x)): print(i)

['preparado', 'móvil', 'poner', 'foto', 'hacer', 'mes', 'medio', 'decir', 'mal', 'hacer', 'gente', 'cambiado', 'estrategia', 'buena', 'noche', 'amargadito']
['semana', 'solo', 'calle', 'observar', 'ver', 'gente', 'pasar', 'confinamiento', 'salir', 'cojón', 'azotea', 'ver', 'reunión', 'chavala', 'gente', 'hacer', 'barbacoas', 'gente', 'calle', 'tal', 'cosa', 'junto', 'charla', 'niño']
['madre', 'acordándome', 'pequeño', 'tirar', 'globo', 'agua', 'policía', 'llevar', 'dentro', 'eh']
['resumen', 'vida', 'inteligente', 'psoe', 'pasar', 'voto', 'cs', 'gente', 'consciente', 'venir', 'gestión', 'tanto', 'nefasta', 'seguir', 'baile']
['querer', 'acabar', 'cuanto', 'volver', 'ánimo']
['problema', 'vez', 'microsoft', 'offizar', 'mierda', 'oleeeeeeir', 'tirar']
['gran', 'madre', 'poder', 'nunca', 'quejar', 'mañana', 'pasar', 'verte', 'avisar', 'salga', 'lanzarte', 'ver', 'carita']
['servir', 'consuelo', 'acristalado', 'espacio', 'dar', 'sol', 'sumir', 'odio', 'ig', 'lleno', 'tb']
['hacer', 'calor