In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords

from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LsiModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils

from collections import defaultdict
import pickle

Load Twitter batch. File has 6 tab delimited columns

In [2]:
path = "/Users/Orest/Desktop/Big Data and Text Analytics/FP/"
filename = "twitter_out.txt"
filepath = path + filename
tweet_df = pd.read_table(filepath, delimiter = '\t', header = None)

In [3]:
# Rename Columns
tweet_df.columns = ["id", "language", "date", "user", "location", "text"]
tweet_df.head(5)

Unnamed: 0,id,language,date,user,location,text
0,878033865981521920,en,Thu Jun 22 23:36:01 +0000 2017,NewMetaphor00,United States,RT @HdxAcademy: We congratulate @UChicago &amp...
1,878035512405114880,en,Thu Jun 22 23:42:33 +0000 2017,MORPaleo,"Bozeman, MT",Great visit from @UChicago students #PaulSeren...
2,878035797345054722,en,Thu Jun 22 23:43:41 +0000 2017,hbcfl,Los Angeles,#thursdaythought: “Shadowing attorneys this we...
3,878036950170468352,en,Thu Jun 22 23:48:16 +0000 2017,GET_AWAY_TRIKE,ブルースター首都ユニオン,RT @MORPaleo: Great visit from @UChicago stude...
4,878039761994227712,en,Thu Jun 22 23:59:27 +0000 2017,C_Ghillie,"New York, USA",@AriDavidPaul Curious if u knew @NateSilver538...


In [4]:
# Check if foreign language tweets are present
tweet_df.language.describe()

count     34372
unique        1
top          en
freq      34372
Name: language, dtype: object

All tweets are in English so no action is necessary here. Also notice that count of tweets is ~35K.

Text preparation/cleanup for analysis

In [5]:
tweet_list_raw = tweet_df.text.tolist()

In [6]:
# Text cleaning
# Remove http
link_pttrn = r"http\S+"
tweet_list = [re.sub(link_pttrn, "", tweet) for tweet in tweet_list_raw]

# Remove non-letter sequences
nl_pttrn = r"[^a-zA-Z']+"
tweet_list = [re.sub(nl_pttrn, " ", tweet) for tweet in tweet_list]

# Remove white space at beginning and end of document
ws_pttrn = r"(^ | $)"
tweet_list = [re.sub(ws_pttrn, "", tweet) for tweet in tweet_list]

In [13]:
# Pre-processing
tweet_list_raw[500]

'RT @UChicago: Exploring the legacy of #UChicago scholar Maria Goeppert Mayer, winner of 1963 @NobelPrize in Physics: https://t.co/KjuVi7VqLa'

In [14]:
# Post-processing
tweet_list[500]

'RT UChicago Exploring the legacy of UChicago scholar Maria Goeppert Mayer winner of NobelPrize in Physics'

In [15]:
def tokenize(tweet):
    return [token for token in simple_preprocess(tweet) if token not in STOPWORDS]

In [16]:
tweets = [tokenize(tweet) for tweet in tweet_list]

In [17]:
tweets[500]

['rt',
 'uchicago',
 'exploring',
 'legacy',
 'uchicago',
 'scholar',
 'maria',
 'goeppert',
 'mayer',
 'winner',
 'nobelprize',
 'physics']

In [18]:
frequency = defaultdict(int)
for tweet in tweets:
    for token in tweet:
        frequency[token] += 1
freq_series = pd.Series(frequency)

In [19]:
freq_series.sort_values(ascending=False)[0:10]

uchicago        32597
rt              23076
thaler           9449
nobelprize       5355
chicagobooth     4699
richard          4632
news             4322
prize            4265
economic         4177
sciences         4171
dtype: int64

From previous runs, calendar related items were found to confuse the algorithm. In addition two words are overly frequent and not helpful: 'rt' and 'uchicago'

In [20]:
days = ['mon','tue','wed','thu','fri','sat','sun']
months = ['jan', 'feb', 'mar','apr','may','jun',\
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

irrel = ["rt", "uchicago", "amp"]
uncommon_words = [w[0] for w in frequency.items() if w[1]<5]
two_letter = [w[0] for w in frequency.items() if len(w[0])< 3]

removed = days + months + irrel + uncommon_words + two_letter

In [36]:
# Process does take about 10-15 minutes, so the output is saved and loaded
tweets_clean = [[token for token in tweet if token not in removed] for tweet in tweets]
pickle_out = open("clean_final.pickle", "wb")
pickle.dump(tweets_clean, pickle_out)
pickle_out.close()

In [22]:
tweets_clean = pickle.load(open( "clean_final.pickle", "rb" ) )

In [25]:
tweets_clean[500]

['exploring',
 'legacy',
 'scholar',
 'maria',
 'goeppert',
 'mayer',
 'winner',
 'nobelprize',
 'physics']

In [26]:
dictionary = corpora.Dictionary(tweets_clean)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets_clean]

Run batch LDA and cross fingers

In [7]:
# Runtime about 5 mins
tweet_topics = LdaModel(corpus = corpus,
                       id2word = dictionary,
                       num_topics = 7,
                       passes= 10)
pickle_out2 = open("ffinalmodel.pickle", "wb")
pickle.dump(tweet_topics, pickle_out2)
pickle_out2.close()

In [27]:
tweet_topics = pickle.load(open( "ffinalmodel.pickle", "rb" ) )

In [28]:
for i, topic in enumerate(tweet_topics.print_topics(15)):
    print (i, topic)

0 (0, '0.017*"nuclear" + 0.017*"today" + 0.014*"berniesanders" + 0.012*"argonne" + 0.009*"fellow" + 0.009*"thanks" + 0.009*"reaction" + 0.008*"social" + 0.008*"fermilab" + 0.008*"chicago"')
1 (1, '0.016*"new" + 0.014*"professor" + 0.011*"prof" + 0.010*"happy" + 0.009*"renato" + 0.009*"mariotti" + 0.009*"human" + 0.008*"time" + 0.007*"explains" + 0.006*"crime"')
2 (2, '0.165*"thaler" + 0.088*"nobelprize" + 0.084*"chicagobooth" + 0.081*"richard" + 0.079*"prize" + 0.078*"sciences" + 0.078*"news" + 0.077*"economic" + 0.075*"awarded" + 0.075*"breaking"')
3 (3, '0.017*"chicago" + 0.014*"uchicagolaw" + 0.013*"cancer" + 0.011*"research" + 0.011*"new" + 0.010*"makes" + 0.009*"robert" + 0.008*"data" + 0.007*"uchicagopress" + 0.007*"debate"')
4 (4, '0.067*"uchicagogsu" + 0.044*"grad" + 0.035*"union" + 0.019*"yesgsu" + 0.019*"support" + 0.019*"final" + 0.017*"yes" + 0.017*"students" + 0.017*"room" + 0.017*"worker"')
5 (5, '0.017*"chicago" + 0.011*"student" + 0.011*"new" + 0.009*"school" + 0.009*"p

In [29]:
# Package is in detached-head mode. Clone from Github, commit latest changes and run setup.py
# https://github.com/bmabey/pyLDAvis
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [30]:
vis_data = gensimvis.prepare(tweet_topics, corpus, dictionary)
pyLDAvis.show(vis_data)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8898/    [Ctrl-C to exit]


127.0.0.1 - - [10/Dec/2017 00:39:24] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2017 00:39:24] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2017 00:39:25] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2017 00:39:25] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


Determine time evolution of tweets

In [31]:
# extract all document-topic distritbutions to dictionnary
document_key = list(tweet_df.date)
document_topic = {}

In [32]:
for doc_id in range(len(corpus)):
    docbok = corpus[doc_id]
    doc_topics = tweet_topics.get_document_topics(docbok, 0)
    tmp = []
    for topic_id, topic_prob in doc_topics:
        tmp.append(topic_prob)
    document_topic[document_key[doc_id]] = tmp

In [33]:
df = pd.DataFrame.from_dict(document_topic, orient='index')

In [34]:
df.columns = ["Nuclear", "Professor_Crime", "Thaler_Nobel", "Cancer_Research", "Unionization", "Evening_Program", "Zimmer_FreeSpeech"]

In [35]:
df.head(5)

Unnamed: 0,Nuclear,Professor_Crime,Thaler_Nobel,Cancer_Research,Unionization,Evening_Program,Zimmer_FreeSpeech
Thu Jun 22 23:36:01 +0000 2017,0.015873,0.015903,0.460265,0.198421,0.015873,0.015873,0.277791
Thu Jun 22 23:42:33 +0000 2017,0.921976,0.012991,0.012987,0.013001,0.013017,0.013012,0.013016
Thu Jun 22 23:43:41 +0000 2017,0.263168,0.017885,0.017857,0.517804,0.017908,0.017959,0.147419
Thu Jun 22 23:48:16 +0000 2017,0.928487,0.011908,0.011905,0.011916,0.01193,0.011926,0.011929
Thu Jun 22 23:59:27 +0000 2017,0.028571,0.028571,0.028571,0.828571,0.028571,0.028571,0.028571


In [36]:
df.to_csv("/Users/Orest/Desktop/Big Data and Text Analytics/FP/timed_tweets.csv")

Graphs are produced with ggplot2 package in R.