In [1]:
from tqdm import tqdm
import json
import re
import bz2
import datetime
import pandas as pd

from gensim import models
from gensim.corpora import Dictionary, MmCorpus

import pyLDAvis.gensim as gensimvis
import pyLDAvis

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# Load data

In [3]:
# year, month, day, hour, minute
data_time_start = datetime.datetime(2016, 7, 7, 0, 0)
data_time_end = datetime.datetime(2016, 7, 7, 7, 59)

def time_range(start_time, end_time):
    while start_time <= end_time:
        start_time += datetime.timedelta(minutes=1)
        yield start_time

In [None]:
# Process data from raw files

tweets_json = []

for cur_time in tqdm(time_range(data_time_start, data_time_end)):
    try:
        with bz2.BZ2File('/media/phamthuonghai/DATA/twitter-data/%s.json.bz2' 
                         % cur_time.strftime('%Y/%m/%d/%H/%M'), 'r') as f:
            data_lines = f.readlines()
            tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
            punc = set('!$%^&*()_-+=\|{}[]:;"\'<>,.?/')

            for data_line in data_lines:
                tmp = json.loads(data_line)
                if 'text' in tmp and ('lang' in tmp and tmp['lang'] == 'en'):
                    tmp['text'] = [w for w in tknzr.tokenize(tmp['text'].lower()) if re.match('^#?\w+$', w)]
                    tweets_json.append([tmp['text'], tmp['lang'], tmp['user']['id']])
    except:
        pass

tweets = pd.DataFrame(tweets_json, columns=['text', 'lang', 'user'])

tweets.to_pickle('./data_%s_%s.pkl' % (data_time_start.strftime('%Y-%m-%d-%H-%M'),
                                              data_time_end.strftime('%Y-%m-%d-%H-%M')))

In [4]:
# Load processed data

tweets = pd.read_pickle('./data_%s_%s.pkl' % (data_time_start.strftime('%Y-%m-%d-%H-%M'),
                                              data_time_end.strftime('%Y-%m-%d-%H-%M')))

In [5]:
tweets = tweets.groupby(['user']).agg(lambda l: [item for sublist in l for item in sublist])

In [6]:
print len(tweets.index)
tweets.head()

263759


Unnamed: 0_level_0,text,lang
user,Unnamed: 1_level_1,Unnamed: 2_level_1
76,"[good, morning, twitpics]","[e, n, e, n]"
246,"[hmm, matter, unless, doing, concurrent, editi...","[e, n]"
850,"[rt, i, saw, the, best, minds, of, a, generati...","[e, n]"
1084,"[rt, wait, until, we, have, all, the, informat...","[e, n]"
1508,"[there, is, no, better, group, communication, ...","[e, n, e, n]"


# Prepare corpus

In [7]:
# Prepare corpus
stop_words = set(stopwords.words('english'))
stop_words.add(u'rt')

dictionary = Dictionary(tweets['text'])
stopword_ids = map(dictionary.token2id.get, stop_words)
dictionary.filter_tokens(stopword_ids)
dictionary.compactify()
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
dictionary.compactify()

corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]

MmCorpus.serialize('news_gr.mm', corpus)
dictionary.save('news_gr.dict')

# LDA Training

In [8]:
%%time
# Training
lda = models.ldamulticore.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=30, passes=20, workers=3)

lda.save('news_30_lda_gr.model')

CPU times: user 49min 9s, sys: 2min 20s, total: 51min 30s
Wall time: 53min 52s


In [5]:
# or load them up
corpus = MmCorpus('./data/lda_sep_corpus.mm')
dictionary = Dictionary.load('./data/lda_sep_dict.dict')
model = models.ldamulticore.LdaMulticore.load('./data/lda_sep_model.model')

# LDA Visualization

In [6]:
# Visualize result
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)