In [14]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# fixes weird issue with pyLDAvis (?) warnings

import pandas as pd
import numpy as np
import pickle
import operator
import re
import gc
import gensim
# from gensim.similarities import WmdSimilarity

import pyLDAvis
import pyLDAvis.gensim

# wtf
warnings.filterwarnings("ignore", category=DeprecationWarning)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from tw_dataset.dbmodels import *
from random import sample
import json

In [37]:
s = open_session()
all_tweets_es = [t for t in s.query(Tweet).all() if t.lang == 'es']
all_tweets_text_es = [t.text for t in all_tweets_es]

# with open('all_tweets_text_es.json', 'w') as f:
#     json.dump(all_tweets_text_es, f)

In [42]:
all_tweets_es_ids = [t.id for t in all_tweets_es]

In [15]:
with open('all_tweets_text_es.json') as f:
    all_tweets_text_es = json.load(f)

In [16]:
len(all_tweets_text_es)    

109040

In [17]:
tweets = all_tweets_text_es

# OK, let's finally do some topic modelling

In [18]:
from tokenizer import tokenize

In [19]:
def preprocess(doc):
    pre_doc = doc
        
    # remove URLs
    pre_doc = re.sub(
        r"https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ", pre_doc)
        
    # find and split hashtags
    # very simple splitting (TODO: come up with something wittier)
    # split on capital letters, but only if hashtag longer than 5
    # → conditional is to avoid splitting abbr. like "IoT" or "NSA"
    pre_doc = re.sub(r"(?:^|\s)[＃#]{1}(\w+)", 
            lambda s: re.sub(r"([A-Z])", r" \1", s.group(0)) if len(s.group(0)) > 5 else s.group(0), 
            pre_doc)
    pre_doc = re.sub(r"＃|#", " ", pre_doc)
    
    # lowercase everything
    pre_doc = pre_doc.lower()
        
    # remove bullshit
    pre_doc = re.sub(r"\@|\'|\"|\\|…|\/|\-|\||\(|\)|\.|\,|\!|\?|\:|\;|“|”|’|—", " ", pre_doc)
    
    # normalize whitespaces
    pre_doc = re.sub(r"\s+", " ", pre_doc)
    pre_doc = re.sub(r"(^\s)|(\s$)", "", pre_doc)
    
    return pre_doc

In [20]:
class get_docs(object):
    def __init__(self, corpus):
        self.corpus = corpus

    def __iter__(self):
        for doc in self.corpus:
            tokens = tokenize(preprocess(doc), remove_stopwords=True)
            yield tokens

In [23]:
dictionary = gensim.corpora.Dictionary(get_docs(tweets))
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=None)
dictionary.save("tweets_es.dict")

2017-04-19 10:32:46,389 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-04-19 10:33:30,676 : INFO : adding document #10000 to Dictionary(14876 unique tokens: [u'cobij', u'recoj', u'argenpython', u'concienci', u'colgueiiissss']...)
2017-04-19 10:34:20,995 : INFO : adding document #20000 to Dictionary(22420 unique tokens: [u'playstor', u'gauchit', u'lacocinadlmied', u'woods', u'insolit']...)
2017-04-19 10:35:04,397 : INFO : adding document #30000 to Dictionary(28380 unique tokens: [u'playstor', u'gauchit', u'gah', u'lacocinadlmied', u'vani']...)
2017-04-19 10:35:48,124 : INFO : adding document #40000 to Dictionary(33861 unique tokens: [u'playstor', u'gauchit', u'gah', u'lacocinadlmied', u'vani']...)
2017-04-19 10:36:35,681 : INFO : adding document #50000 to Dictionary(38796 unique tokens: [u'playstor', u'gauchit', u'gah', u'lacocinadlmied', u'\U0001f1f0\U0001f1f7\u2026']...)
2017-04-19 10:37:18,562 : INFO : adding document #60000 to Dictionary(43228 unique tokens: [u'

In [5]:
dictionary = gensim.corpora.Dictionary.load("tweets_es.dict")

2017-04-01 20:06:22,327 : INFO : loading Dictionary object from tweets_es.dict


In [24]:
bow = [dictionary.doc2bow(doc) for doc in get_docs(tweets)]
with open('tweets_es_bow.pickle', 'wb') as f:
    pickle.dump(bow, f)

In [3]:
with open('tweets_es_bow.pickle', 'rb') as f:
    bow = pickle.load(f)

In [25]:
from math import ceil

In [32]:
n_topics = 30
iters = 100
passes = 10
workers = 8
chunksize = int(ceil(len(bow)*1.0/workers))

In [33]:
model = gensim.models.LdaMulticore(
        corpus=bow,
        id2word=dictionary,
        num_topics=n_topics,
        iterations=iters,
        alpha=0.001,
        passes=passes,
        chunksize=chunksize,
        workers=workers
)
model.save("tweets_es_%dtopics.lda" % n_topics)

2017-04-19 11:17:21,412 : INFO : using serial LDA version on this node
2017-04-19 11:17:24,773 : INFO : running online LDA training, 30 topics, 10 passes over the supplied corpus of 109040 documents, updating every 109040 documents, evaluating every ~109040 documents, iterating 100x with a convergence threshold of 0.001000
2017-04-19 11:17:24,775 : INFO : training LDA model using 8 processes
2017-04-19 11:17:24,985 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #13630/109040, outstanding queue size 1
2017-04-19 11:17:25,111 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #27260/109040, outstanding queue size 2
2017-04-19 11:17:25,272 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #40890/109040, outstanding queue size 3
2017-04-19 11:17:25,430 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #54520/109040, outstanding queue size 4
2017-04-19 11:17:25,589 : INFO : PROGRESS: pass 0, dispatched chunk #4 = documents up to 

In [7]:
model = gensim.models.LdaModel.load("tweets_es_%dtopics.lda" % n_topics)

2017-04-01 20:40:06,205 : INFO : loading LdaModel object from tweets_es_15topics.lda
2017-04-01 20:40:06,222 : INFO : setting ignored attribute state to None
2017-04-01 20:40:06,223 : INFO : setting ignored attribute dispatcher to None
2017-04-01 20:40:06,224 : INFO : loading LdaModel object from tweets_es_15topics.lda.state


In [35]:
viz = pyLDAvis.gensim.prepare(model, [v for v in bow], model.id2word)
pickle.dump(viz, open("tweets_es_%d.viz" % n_topics,'wb'))

In [None]:
viz = pickle.load(open("tweets_es_%d.viz" % n_topics,'rb'))

In [36]:
pyLDAvis.display(viz)

In [34]:
# Comparamos con el de 15 temas
viz = pickle.load(open("tweets_es_15.viz",'rb'))
pyLDAvis.display(viz)

# Extracción de features LDA sobre tweets

In [50]:
tweets_lda = [model[t_bow] for t_bow in bow]

In [52]:
def rows_to_csc(rows):
    data = []
    row_ind = []
    col_ind = []
    for i, r in enumerate(rows):
        for j, d in r:
            row_ind.append(i)
            col_ind.append(j)
            data.append(d)
    return csc_matrix((data, (row_ind, col_ind)))

In [54]:
from scipy.sparse import csc, csc_matrix

In [55]:
X_tweets_lda = rows_to_csc(tweets_lda)

In [58]:
X_tweets_lda_df = pd.DataFrame(index=all_tweets_es_ids, data=X_tweets_lda.todense())

In [59]:
X_tweets_lda_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
1811555246,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.423934,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1835487964,0.000000,0.122243,0.000000,0.197152,0.000000,0.123709,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2096734273,0.000000,0.000000,0.000000,0.000000,0.000000,0.994235,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2259703503,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.995191
2409970666,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2465636950,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2692541035,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.389451,0.000000,0.000000,0.000000,0.000000
3710651980,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.116177,0.000000,0.000000,0.374736,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6787658239,0.000000,0.000000,0.681572,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.115661,0.000000,0.000000,0.000000,0.000000,0.000000
8814169669,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [60]:
X_tweets_lda_df.to_pickle("alltweets_es_lda30.pickle")