In [1]:
import warnings

import pandas as pd
import numpy as np
import pickle
import operator
import re
import gc
import gensim

from localsettings import DATA_PATH 
import json

from os.path import join, exists

In [3]:
# Load the tweet texts as a mapping of candidate -> iterable of tweet strings
# (that is how the loops further down consume it).
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding, which would garble accented characters on
# systems whose default is not UTF-8.
with open(join(DATA_PATH, 'texto_tweets_seguidores_cands'), encoding='utf-8') as f:
    tweets = json.load(f)

In [48]:
def preprocess(doc, remove_hashtags=False, remove_accents=False):
    """Normalize a raw tweet text.

    Steps, in order: replace URLs with a space, lowercase, optionally strip
    accents, replace punctuation/quote characters with spaces, remove
    @mentions, optionally remove hashtags, then collapse runs of whitespace
    and trim both ends.

    Parameters
    ----------
    doc : str
        Raw text.
    remove_hashtags : bool, optional
        When True, ``#hashtag`` tokens are removed. Hashtags may contain any
        word characters (accented letters, digits, underscores) as long as
        they include at least one letter.
    remove_accents : bool, optional
        When True, accents are stripped with ``gensim.utils.deaccent``.

    Returns
    -------
    str
        The normalized text (possibly empty).
    """
    # remove URLs
    pre_doc = re.sub(
        r"https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ", doc)

    # lowercase
    pre_doc = pre_doc.lower()

    # strip accents
    if remove_accents:
        pre_doc = gensim.utils.deaccent(pre_doc)

    # replace punctuation and quote characters with spaces
    pre_doc = re.sub(r"\'|\"|\\|…|\/|\-|\||\(|\)|\.|\,|\!|\?|\:|\;|“|”|’|—", " ", pre_doc)

    # remove mentions
    pre_doc = re.sub(r"\@\w+", " ", pre_doc)

    # remove hashtags
    if remove_hashtags:
        # \w (Unicode-aware on str patterns) instead of [a-zA-Z], so hashtags
        # with accented letters, digits or underscores are removed as well;
        # [^\W\d_] demands at least one letter so a bare "#1" is left alone.
        pre_doc = re.sub(r"\B(\#\w*[^\W\d_]\w*\b)(?!;)", " ", pre_doc)

    # normalize whitespace: collapse runs, then trim the single leading/trailing space
    pre_doc = re.sub(r"\s+", " ", pre_doc)
    pre_doc = re.sub(r"(^\s)|(\s$)", "", pre_doc)

    return pre_doc

In [50]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.data import load
from nltk.stem import SnowballStemmer
from string import punctuation

# Punkt sentence tokenizer for Spanish
spanish_tokenizer = load('tokenizers/punkt/spanish.pickle')

# Spanish stopwords
spanish_stopwords = stopwords.words('spanish')

# Spanish Snowball stemmer (built once; a second identical instantiation
# further down was redundant)
stemmer = SnowballStemmer('spanish')

# punctuation to remove
non_words = list(punctuation)

# add the Spanish opening marks and the digits 0-9
non_words.extend(['¿', '¡'])
non_words.extend(map(str, range(10)))

def trystem(t):
    """Return the stem of token ``t``, or ``t`` itself if stemming fails.

    Stemming is best effort: any exception raised while stemming is
    swallowed and the token is handed back unchanged.
    """
    try:
        return stemmer.stem(t)
    except Exception:
        # best effort: keep the original token
        return t

def tokenize(text, stem=False, remove_stopwords=True):
    """Turn a raw tweet into a flat list of word tokens.

    The text is cleaned with ``preprocess`` (hashtags removed), split into
    sentences with ``spanish_tokenizer`` and each sentence is word-tokenized.
    Tokens shorter than two characters are always dropped.

    Parameters
    ----------
    text : str
        Raw text.
    stem : bool, optional
        When True, every kept token is passed through ``trystem``.
    remove_stopwords : bool, optional
        When True, tokens present in ``spanish_stopwords`` are dropped.

    Returns
    -------
    list of str
        Tokens of all sentences, in order of appearance.
    """
    text = preprocess(text, remove_hashtags=True)
    result = []

    for sentence in spanish_tokenizer.tokenize(text):
        # Tokenize only the current sentence. Tokenizing the whole `text` here
        # would append every token once per detected sentence.
        tokens = word_tokenize(sentence)

        if remove_stopwords:
            tokens = [t for t in tokens if t not in spanish_stopwords]

        # keep tokens of at least 2 characters
        tokens = [t for t in tokens if len(t) > 1]

        # stem
        if stem:
            tokens = [trystem(t) for t in tokens]

        result.extend(tokens)

    return result

# 150 palabras más frecuentes

In [59]:
from collections import defaultdict
import codecs

# For each candidate: count token frequencies over all of their tweets and
# write the 150 most frequent tokens, one "token: count" per line, to
# top_palabras_<candidate>.txt under DATA_PATH.
for cand, tws in tweets.items():
    token_tws = list(map(tokenize, tws))

    # tally occurrences across every tokenized tweet
    counts = defaultdict(int)
    for token in (tok for tokens in token_tws for tok in tokens):
        counts[token] += 1

    # most frequent first; the sort is stable, so ties keep first-seen order
    ranking = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    fpath = join(DATA_PATH, 'top_palabras_%s.txt' % cand.lower())
    with codecs.open(fpath, 'w', encoding='utf8') as f:
        for word, freq in ranking[:150]:
            f.write("%s: %d\n" % (word, freq))

# 150 Hashtags más frecuentes

In [61]:
def hashtags(t):
    """Return all hashtags found in ``t``, each including its leading ``#``.

    A hashtag is ``#`` followed by word characters (Unicode letters, digits,
    underscore) with at least one letter, so tags such as ``#elección2015``
    are found. A ``#`` glued to a preceding word character, or a match
    immediately followed by ``;`` (as in ``&#123;``), is ignored.

    Parameters
    ----------
    t : str
        Text to scan.

    Returns
    -------
    list of str
        Hashtags in order of appearance (duplicates kept).
    """
    return re.findall(r"\B(\#\w*[^\W\d_]\w*\b)(?!;)", t)

In [60]:
# For each candidate: extract the hashtags of every preprocessed (hence
# lowercased) tweet, count them, and write the 150 most frequent ones, one
# "hashtag: count" per line, to top_hashtags_<candidate>.txt under DATA_PATH.
for cand, tws in tweets.items():
    hashtags_cand = list(map(hashtags, map(preprocess, tws)))

    # tally occurrences across every tweet
    counts = defaultdict(int)
    for tag in (ht for hts in hashtags_cand for ht in hts):
        counts[tag] += 1

    # most frequent first; the sort is stable, so ties keep first-seen order
    ranking = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    fpath = join(DATA_PATH, 'top_hashtags_%s.txt' % cand.lower())
    with codecs.open(fpath, 'w', encoding='utf8') as f:
        for tag, freq in ranking[:150]:
            f.write("%s: %d\n" % (tag, freq))