## Limpieza de _tweets_

In [1]:
import pandas as pd
import numpy as np

import re
from textblob import TextBlob

import langid  
from langdetect import detect 

import string

### Leer datos

In [2]:
# Load the tweets table from a Feather file (path is relative to the notebook).
tweets_es = pd.read_feather('../data/tweets_es')
# Keep only the 'texto' column as a Series; the cells below index and clean it.
tweets = tweets_es['texto']
# Bare last expression so the notebook displays the Series.
tweets

0        #promotoresods les desea feliz año. Nuestro lu...
1        Aplicación de un riego de agua depurada con en...
2        ??Es lunes de estreno ????????\nLos esperamos ...
3        A una década para cumplir con los objetivos de...
4        ???Nº 329\n??#HomenajeAlMunicipalismo en el @S...
                               ...                        
25360    Documento #CEPAL examina las tendencias económ...
25361    Ámsterdam demuestra que no es ninguna quimera ...
25362    ¡Aprovecha tu tiempo de confinamiento! En el b...
25363    Un pilar del mandato de la Fundación es promov...
25364    Casa Valle Imperial                           ...
Name: texto, Length: 25365, dtype: object

### Detectar idioma

In [3]:
# Pick a single tweet to try the language detectors on.
tweet = tweets[25361]
tweet

'Ámsterdam demuestra que no es ninguna quimera lo de salir de manera diferente a como entramos en esta crisis, a nivel económico y de #sostenibilidad, ¡toca romper con el modelo actual de consumo! https://t.co/lDA5sLHTsu\n#COVID19'

In [4]:
# Language detection, option 1: TextBlob.
# NOTE(review): detect_language() is deprecated in recent TextBlob releases
# (it relied on an external translation service) — confirm it exists in the
# installed version before re-running.
TextBlob(tweet).detect_language()

'es'

In [5]:
# Language detection, option 2: langid. Only element 0 of the result (the
# language code) is kept; the remaining element is ignored here.
langid.classify(tweet)[0]

'es'

In [6]:
# Language detection, option 3: langdetect, which returns the language code directly.
# NOTE(review): langdetect can give varying answers on short texts unless its
# DetectorFactory.seed is fixed — confirm if reproducibility matters here.
detect(tweet)

'es'

### Limpiar tweets

[Python NLTK: Twitter Sentiment Analysis [Natural Language Processing (NLP)]](https://blog.chapagain.com.np/python-nltk-twitter-sentiment-analysis-natural-language-processing-nlp/)

1. Remove stock market tickers like $GE
1. Remove retweet text “RT”
1. Remove hyperlinks
1. Remove hashtags (only the hashtag # and not the word)
1. Remove stop words like a, and, the, is, are, etc.
1. Remove emoticons like :), :D, :(, :-), etc.
1. Remove punctuation like full-stop, comma, exclamation sign, etc.
1. Convert words to Stem/Base words using Porter Stemming Algorithm. E.g. words like ‘working’, ‘works’, and ‘worked’ will be converted to their base/stem word “work”.

In [7]:
# Switch the sample to a tweet with mentions, links and several kinds of
# hashtag (camel case, acronyms, digits) for the cleaning examples below.
tweet = tweets[4]
tweet

'???Nº 329\n??#HomenajeAlMunicipalismo en el @Senadoesp, #40AñosDeDemocraciaLocal. \n??La #FEMP en #COP25\n??Poniendo cara a los #ODS, casos prácticos: #ODS2.\n\n??https://t.co/HiejHrKUOZ https://t.co/fAeD4UUfEx'

Funciones cogidas de [Word embeddings with code2vec, GloVe, and spaCy](https://towardsdatascience.com/word-embeddings-with-code2vec-glove-and-spacy-5b26420bf632)

In [8]:
def camel_case(example):
    """Tell whether ``example`` looks like a camel-case word.

    A word qualifies when it contains no character from
    ``string.punctuation`` and its interior (everything except the first and
    last character) mixes upper-case with non-upper-case characters.

    Args:
        example: The word to inspect, e.g. a hashtag without its leading ``#``.

    Returns:
        bool: ``True`` for words such as ``'HomenajeAlMunicipalismo'``;
        ``False`` for all-caps acronyms, all-lowercase words, words containing
        punctuation, and words too short to have a mixed interior.
    """
    has_punctuation = any(mark in example for mark in string.punctuation)
    if has_punctuation:
        return False

    # Only the interior is inspected: a capital in first or last position
    # alone ("Madrid") does not make a word camel case.
    interior_upper = [char.isupper() for char in example[1:-1]]
    return any(interior_upper) and not all(interior_upper)

def camel_case_split(word):
    """Split a word at its upper/lower case transitions.

    Args:
        word: The string to split, e.g. ``'HomenajeAlMunicipalismo'``.

    Returns:
        list[str]: The non-empty pieces in their original order, e.g.
        ``['Homenaje', 'Al', 'Municipalismo']``. A run of capitals followed by
        a capitalised word stays together (``'HTTPResponse'`` ->
        ``['HTTP', 'Response']``). An empty string yields ``[]``.
    """
    upper_flags = [char.isupper() for char in word]

    boundaries = [0]
    for pos in range(len(word) - 1):
        current_upper, next_upper = upper_flags[pos], upper_flags[pos + 1]
        if current_upper and not next_upper:
            # Upper followed by non-upper: a piece starts at this capital.
            boundaries.append(pos)
        elif next_upper and not current_upper:
            # Non-upper followed by upper: a piece starts at the next capital.
            boundaries.append(pos + 1)
    boundaries.append(len(word))

    pieces = []
    for start, stop in zip(boundaries, boundaries[1:]):
        # Repeated boundaries would produce empty slices; skip them.
        if start < stop:
            pieces.append(word[start:stop])
    return pieces

In [9]:
# Hashtag bodies (the text after '#'). The pattern only accepts ASCII letters,
# so '#COP25' yields 'COP' and a hashtag starting with a digit is not matched.
[word[1:] for word in re.findall(r'#[a-zA-Z]+', tweet)]

['HomenajeAlMunicipalismo', 'FEMP', 'COP', 'ODS', 'ODS']

In [10]:
# Camel-case verdict for each hashtag body, in the same order as above.
[camel_case(word[1:]) for word in re.findall(r'#[a-zA-Z]+', tweet)]

[True, False, False, False, False]

In [11]:
# Keep only the hashtag bodies that pass the camel-case test.
[word[1:] for word in re.findall(r'#[a-zA-Z]+', tweet) if camel_case(word[1:])]

['HomenajeAlMunicipalismo']

In [12]:
# Rewrite each camel-case hashtag body as space-separated words.
[' '.join(camel_case_split(word[1:])) for word in re.findall(r'#[a-zA-Z]+', tweet) if camel_case(word[1:])]

['Homenaje Al Municipalismo']

In [13]:
import string
import re
 
import spacy
from spacy.lang.es.stop_words import STOP_WORDS

# Load the Spanish spaCy pipeline by its package name. The bare "es" shortcut
# only resolves on spaCy 2.x installs that created a shortcut link; spaCy 3
# removed shortcut links, so spacy.load("es") fails there.
# NOTE(review): assumes the "es" link pointed at es_core_news_sm (what
# `python -m spacy download es` installed) — confirm if another model was linked.
nlp = spacy.load("es_core_news_sm")
 
# Emoticons treated as "happy".
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons treated as "sad".
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every emoticon, regardless of mood (happy + sad).
emoticons = emoticons_happy | emoticons_sad

# ASCII punctuation plus the Spanish opening marks, kept as one string.
punctuation = string.punctuation + '¿¡'

# Extend the Spanish stop-word list with a corpus-specific term.
# stopwords_spanish is the same set object as the imported STOP_WORDS, so the
# addition is made in place on that imported set.
stopwords_spanish = STOP_WORDS
stopwords_spanish.add('covid19')

# Noise vocabulary: stop words, emoticons and every single punctuation character.
# NOTE(review): no active code in the visible notebook reads noise_words.
noise_words = stopwords_spanish | emoticons | set(punctuation)

def _split_camel_hashtag(match):
    """Return a hashtag body without '#', split into words when it is camel case."""
    body = match.group(1)
    if camel_case(body):
        return ' '.join(camel_case_split(body))
    return body


def clean_tweets(tweet):
    """Normalise a raw tweet into a lower-cased string of lemmas.

    Steps, in order: drop ``$TICKER`` symbols, a leading ``RT``, URLs and
    ``@user`` mentions; split camel-case hashtags into words and strip the
    ``#`` signs; drop isolated ASCII letters; collapse whitespace; finally run
    the spaCy pipeline ``nlp`` and keep the lemma of every token that is
    neither a stop word nor tagged as punctuation.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The kept lemmas joined by single spaces, lower-cased and stripped.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks: match the URL only. A greedy `.*` after the scheme
    # would also delete every word that follows a link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # remove users: handles may contain underscores, so match \w instead of
    # [a-zA-Z0-9]; otherwise "@user_name" leaves a stray "_name" behind.
    tweet = re.sub(r'@\w+', '', tweet)

    # split camel case hashtags in place. The pattern accepts any Unicode
    # letter (so accented Spanish hashtags are not cut at the accent) but no
    # digits or underscores, and only the hashtag itself is rewritten rather
    # than every occurrence of the same text elsewhere in the tweet.
    tweet = re.sub(r'#([^\W\d_]+)', _split_camel_hashtag, tweet)

    # remove hashtags
    # only removing the hash # sign from the word (covers hashtags the
    # pattern above does not match, e.g. ones starting with a digit)
    tweet = re.sub(r'#', '', tweet)

    # remove single letters
    tweet = re.sub(r"\b[a-zA-Z]\b", "", tweet)

    # remove multiple spaces (newlines included)
    tweet = re.sub(r"\s+", " ", tweet)

    # tokenize tweets with the spaCy pipeline
    tweet_tokens = nlp(tweet)

    # keep lemmas of tokens that are neither stop words nor punctuation
    tweets_clean = [
        token.lemma_
        for token in tweet_tokens
        if not token.is_stop and token.pos_ != 'PUNCT'
    ]

    return ' '.join(tweets_clean).lower().strip()

In [14]:
# Sanity check: clean the sample tweet selected earlier and inspect the result.
clean_tweets(tweet)

'nº 329 homenaje municipalismo 40añosdedemocracialocal femp cop25 poniendo caro ods caso práctico ods2'

In [15]:
%%time
# Clean every tweet. clean_tweets runs the spaCy pipeline once per tweet, which
# is why this cell is slow (about 3 minutes in the recorded run).
tweets_limpios = tweets.apply(clean_tweets)

CPU times: user 3min, sys: 133 ms, total: 3min
Wall time: 3min 1s


In [16]:
# Preview the cleaned tweets.
tweets_limpios

0        promotoresods desear feliz año necesitar esper...
1        aplicación regar aguar depurar enzima natural ...
2        lunes estrenar esperar 19:30 _ 13c viajar para...
3        década cumplir objetivo agenda2030 preparar en...
4        nº 329 homenaje municipalismo 40añosdedemocrac...
                               ...                        
25360    documento cepal examinar tendencia económico s...
25361    ámsterdam demostrar quimera salir entrar crisi...
25362    aprovecha confinamiento blog encontrar 40 artí...
25363    pilar mandato fundación promover debatir ue al...
25364    casa valle imperial proyectar integración tecn...
Name: texto, Length: 25365, dtype: object

In [17]:
# Persist the cleaned text as a single-column ('text') Feather file. Building
# the frame from .values drops the Series index and gives it a fresh default one.
pd.DataFrame(tweets_limpios.values, columns=['text']).to_feather('../data/tweets_limpios')

### Modelado de categorías

[Ultimate Guide to Understand and Implement Natural Language Processing (with codes in Python)](https://www.analyticsvidhya.com/blog/2017/01/ultimate-guide-to-understand-implement-natural-language-processing-codes-in-python/)

In [18]:
import gensim
from gensim import corpora

In [19]:
# Whitespace-tokenise every cleaned tweet into a list of terms.
doc_clean = [doc.split() for doc in tweets_limpios]
# Build the gensim vocabulary (term <-> integer id) from the tokenised tweets.
dictionary = corpora.Dictionary(doc_clean)

In [20]:
# Bag-of-words corpus: one doc2bow result per tokenised tweet.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [21]:
# Short alias for gensim's LDA model class.
Lda = gensim.models.ldamodel.LdaModel

In [22]:
%%time
ldamodel = Lda(doc_term_matrix, num_topics=17, id2word = dictionary, passes=50)

CPU times: user 3min 42s, sys: 5.94 ms, total: 3min 42s
Wall time: 3min 42s


In [23]:
# List all 17 topics with their highest-weighted terms.
ldamodel.show_topics(num_topics=17)

[(0,
  '0.047*"empresa" + 0.047*"rsc" + 0.043*"sostenibilidad" + 0.037*"ods" + 0.024*"virtual" + 0.018*"relacionar" + 0.017*"nacional" + 0.016*"gracia" + 0.016*"compromiso" + 0.016*"rse"'),
 (1,
  '0.113*"_" + 0.079*"ods" + 0.052*"agenda2030" + 0.032*"igualdad" + 0.027*"mujer" + 0.022*"género" + 0.012*"niño" + 0.012*"foro" + 0.011*"gracias" + 0.011*"importancia"'),
 (2,
  '0.080*"ods" + 0.076*"sostenible" + 0.057*"desarrollo" + 0.053*"agenda2030" + 0.037*"objetivos" + 0.022*"planeta" + 0.021*"desarrollar" + 0.020*"objetivo" + 0.014*"sociedad" + 0.013*"lograr"'),
 (3,
  '0.085*"sostenibilidad" + 0.027*"sostenible" + 0.025*"jornada" + 0.021*"organizar" + 0.018*"sector" + 0.012*"producto" + 0.012*"naturaleza" + 0.012*"the" + 0.011*"centro" + 0.011*"educativo"'),
 (4,
  '0.106*"sostenibilidad" + 0.023*"casa" + 0.023*"sostenible" + 0.020*"innovación" + 0.015*"_" + 0.013*"social" + 0.011*"tecnología" + 0.010*"movilidad" + 0.010*"verde" + 0.010*"congreso"'),
 (5,
  '0.039*"educación" + 0.032*