In [1]:
import warnings

import pandas as pd
import numpy as np
import pickle
import operator
import re
import gc
import gensim

import json

In [2]:
# TM_MODELS_PATH = '/media/pablo/data/Tesis/models/old/tm_feats/'
from os.path import join

In [3]:
# Leer todos los posts de los 3 candidatos

In [4]:
posts_text = []

In [5]:
for cand in ['hectorbaldassi', 'martinllaryoraoficial', 'pablocarrook']:
    df = pd.read_csv('/home/pablo/GDrive/BigSocialData/facebook/%s/posts.csv' % cand)
    posts_text += list(df.message[df.message.notnull()].values)

In [6]:
len(posts_text)

1519

In [7]:
corpus = posts_text

In [8]:
def preprocess(doc):
    pre_doc = doc
        
    # remover URLs
    pre_doc = re.sub(
        r"https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ", pre_doc)
    
    # minúsculas
    pre_doc = pre_doc.lower()

    # volar acentos
    pre_doc = gensim.utils.deaccent(pre_doc)

    # remove bullshit
    pre_doc = re.sub(r"\@|\'|\"|\\|…|\/|\-|\||\(|\)|\.|\,|\!|\?|\:|\;|“|”|’|—", " ", pre_doc)
    
    # contraer vocales
    for v in 'aeiou':
        pre_doc = re.sub(r"[%s]+" % v, v, pre_doc)    
    
    # normalizar espacio en blanco
    pre_doc = re.sub(r"\s+", " ", pre_doc)
    pre_doc = re.sub(r"(^\s)|(\s$)", "", pre_doc)
    
    return pre_doc

In [9]:
from collections import defaultdict

In [10]:
UNSTEMMED = {}

In [11]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.data import load
from nltk.stem import SnowballStemmer
from string import punctuation

spanish_tokenizer = load('tokenizers/punkt/spanish.pickle')

# stopwords en español
spanish_stopwords = stopwords.words('spanish')

# spanish stemmer
stemmer = SnowballStemmer('spanish')

# punctuation to remove
non_words = list(punctuation)

# we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str, range(10)))

stemmer = SnowballStemmer('spanish')

def trystem(t):
    try:
        s = stemmer.stem(t)
        if s not in UNSTEMMED:
            UNSTEMMED[s] = defaultdict(int)
        UNSTEMMED[s][t] += 1
        return s
    except Exception:
        return t

def tokenize(text, stem=True, remove_stopwords=False):
    text = text.lower()
    result = []
    
    for sentence in spanish_tokenizer.tokenize(text):
        # remover puntuación
        text = ''.join([c for c in sentence if c not in non_words])
        
        # tokenize
        tokens = word_tokenize(text)

        if remove_stopwords:
            tokens = [t for t in tokens if t not in spanish_stopwords]

        # tokens de al menos 2 letras
        tokens = [t for t in tokens if len(t) > 1]
            
        # stem
        if stem:
            tokens = [trystem(t) for t in tokens]
        
        result += tokens
        
    return result

In [12]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [13]:
class get_docs(object):
    def __init__(self, corpus):
        self.corpus = corpus

    def __iter__(self):
        for doc in self.corpus:
            tokens = tokenize(preprocess(doc), remove_stopwords=True)
            yield tokens

In [14]:
from gensim.models.phrases import Phrases

In [15]:
phrases = Phrases(get_docs(corpus), min_count=5)
# bigram = Phraser(phrases)
# trigram = Phrases(bigram[get_docs(corpus)], min_count=5)
# dictionary = gensim.corpora.Dictionary(trigram[get_docs(corpus)])

In [16]:
dictionary = gensim.corpora.Dictionary(phrases[get_docs(corpus)])
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)

In [17]:
len(dictionary.dfs)

1240

In [18]:
bow = [dictionary.doc2bow(doc) for doc in phrases[get_docs(corpus)]]

In [19]:
from math import ceil
iters = 100
passes = 10
workers = 7
n_topics = 20
chunksize = int(ceil(len(bow)*1.0/workers))

In [20]:
model = gensim.models.LdaMulticore(
        corpus=bow,
        id2word=dictionary,
        num_topics=n_topics,
        iterations=iters,
        alpha=0.001,
        passes=passes,
        chunksize=chunksize,
        workers=workers
)

In [21]:
import pyLDAvis.gensim
import pyLDAvis

In [22]:
def unstemw(w):
    if w not in UNSTEMMED:
        return w
    else:
        return sorted(UNSTEMMED[w].items(), key=lambda x: -x[1])[0][0] # más frecuente

In [23]:
def unstemp(p):
    words = p.split('_')
    
    return '_'.join([unstemw(w) or w for w in words])

In [24]:
model.id2word.token2id = {unstemp(p): i  for p, i in model.id2word.token2id.items()}

In [25]:
viz = pyLDAvis.gensim.prepare(model, [v for v in bow], model.id2word)

In [26]:
pyLDAvis.display(viz)