# LDA for TimeSets

*Apply LDA to find topics for the TimeSets paper.*

In [1]:
import json

from gensim.models.ldamodel import LdaModel
from gensim import corpora

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

from collections import Counter
from itertools import chain

import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

## Data Preprocessing

#### Load the data

In [2]:
def prepare_documents_facebook(filename):
    '''Return a list of documents ready for topic modelling.

    `filename` is the path to a JSON file holding a sequence of posts.
    The result is the 'message' text of every post that has one, in file
    order; posts without a 'message' key are skipped.
    '''
    # Read as UTF-8 explicitly: without it open() falls back to the
    # platform's locale encoding, which can mis-decode or reject the
    # non-ASCII text (curly quotes, dashes, ...) found in the posts.
    with open(filename, encoding='utf-8') as f:
        posts = json.load(f)
    return [post['message'] for post in posts if 'message' in post]

In [3]:
# Load the message text of every post in the Facebook dataset.
data = prepare_documents_facebook('../data/facebook.json')

#### Clean the data

In [4]:
def preprocess_documents(docs):
    '''Turn raw documents into cleaned token lists, one list per document.

    Cleaning removes punctuation and English stopwords and lemmatizes the
    remaining words; `clean` performs the per-document steps.
    '''
    english_stopwords = set(stopwords.words('english'))
    # ASCII punctuation plus the typographic quotes and dash used in the posts.
    unwanted_chars = set(string.punctuation + '“”’—')
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for doc in docs:
        cleaned.append(clean(doc, english_stopwords, unwanted_chars, lemmatizer))
    return cleaned

def clean(doc, stop, exclude, lemma):
    '''Return the lemmatized tokens of one document.

    Characters in `exclude` are deleted, the text is lower-cased and split on
    whitespace, words in `stop` or shorter than three characters are dropped,
    and each remaining word is passed through `lemma.lemmatize`.
    '''
    stripped = ''.join(filter(lambda ch: ch not in exclude, doc))

    tokens = []
    for word in stripped.lower().split():
        if word in stop or len(word) < 3:
            continue
        tokens.append(lemma.lemmatize(word))
    return tokens

In [5]:
docs = preprocess_documents(data)
# Show the first few documents next to their cleaned tokens. Pairing slices
# (rather than indexing 0..9) avoids an IndexError when the dataset holds
# fewer than ten documents.
for raw_text, tokens in zip(data[:10], docs[:10]):
    print(raw_text, tokens)
    print()

The Syrian military declared today that the U.S.-Russia brokered cease-fire is over, blaming rebel groups for violating the agreement. ['syrian', 'military', 'declared', 'today', 'usrussia', 'brokered', 'ceasefire', 'blaming', 'rebel', 'group', 'violating', 'agreement']

Rose Pak, an influential community activist who turned San Francisco's Asian-American population into a political power in the city, passes away at 68. ['rose', 'pak', 'influential', 'community', 'activist', 'turned', 'san', 'franciscos', 'asianamerican', 'population', 'political', 'power', 'city', 'pass', 'away']

Warplanes target the besieged Syrian city of Aleppo for the first time since the ceasefire went into effect last week. ['warplane', 'target', 'besieged', 'syrian', 'city', 'aleppo', 'first', 'time', 'since', 'ceasefire', 'went', 'effect', 'last', 'week']

Using Skittles to make a point about “our Syrian refugee problem” didn’t go over too well with Mars, Incorporated—but the candy maker’s rebuke of Donald Tr

#### Look at word frequency

In [6]:
# Count every token across all cleaned documents.
word_dict = Counter(word for tokens in docs for word in tokens)
# most_common() lists the (word, count) pairs by descending count.
word_counts = word_dict.most_common()
word_counts[:10]

[('trump', 204),
 ('donald', 165),
 ('clinton', 110),
 ('hillary', 95),
 ('say', 80),
 ('president', 74),
 ('debate', 63),
 ('presidential', 51),
 ('obama', 47),
 ('first', 40)]

## Topic Modelling with LDA

#### Find topics of the whole corpus

In [7]:
def build_lda(corpus, num_topics=10, passes=10, alpha='symmetric', eta=None, id2word=None):
    '''Return an LDA model trained on the given doc-term matrix.

    corpus     -- bag-of-words documents, as produced by `Dictionary.doc2bow`.
    num_topics -- number of topics to fit.
    passes     -- forwarded to `LdaModel` as its `passes` argument.
    alpha, eta -- priors, forwarded unchanged to `LdaModel`.
    id2word    -- id-to-word mapping for the corpus. When omitted, the
                  notebook-level `dictionary` is used, as before.

    The random state is fixed at 0 to keep runs repeatable.
    '''
    if id2word is None:
        # Backward-compatible default: fall back to the global built alongside
        # the corpus. Passing id2word explicitly removes that hidden dependency.
        id2word = dictionary
    return LdaModel(corpus, num_topics=num_topics, id2word=id2word, passes=passes,
                    alpha=alpha, eta=eta, random_state=0)

def get_model_topics(lda):
    '''Return, for each topic, the terms from `lda.get_topic_terms` as (word, probability) pairs.

    Term ids are mapped to words through `lda.id2word`; probabilities are
    rendered as strings with three decimals.
    '''
    topics = []
    for topic_id in range(lda.num_topics):
        terms = []
        for term_id, prob in lda.get_topic_terms(topic_id):
            terms.append((lda.id2word[term_id], format(prob, '.3f')))
        topics.append(terms)
    return topics

In [None]:
# Build the token <-> integer id mapping from the cleaned documents.
dictionary = corpora.Dictionary(docs)
# Encode every document in bag-of-words form.
corpus = list(map(dictionary.doc2bow, docs))

In [19]:
# Fit a 10-topic model on the whole corpus and list each topic's terms.
lda = build_lda(corpus, num_topics=10)
get_model_topics(lda)

[[('trump', '0.030'),
  ('donald', '0.020'),
  ('debate', '0.015'),
  ('presidential', '0.011'),
  ('republican', '0.010'),
  ('clinton', '0.009'),
  ('hillary', '0.009'),
  ('first', '0.007'),
  ('cnn', '0.006'),
  ('country', '0.006')],
 [('trump', '0.014'),
  ('donald', '0.013'),
  ('say', '0.012'),
  ('presidential', '0.009'),
  ('week', '0.007'),
  ('news', '0.007'),
  ('abc', '0.007'),
  ('republican', '0.007'),
  ('hillary', '0.006'),
  ('clinton', '0.006')],
 [('trump', '0.028'),
  ('president', '0.020'),
  ('donald', '0.019'),
  ('debate', '0.012'),
  ('clinton', '0.011'),
  ('obama', '0.010'),
  ('hillary', '0.010'),
  ('say', '0.008'),
  ('week', '0.006'),
  ('republican', '0.006')],
 [('trump', '0.024'),
  ('donald', '0.018'),
  ('like', '0.013'),
  ('democrat', '0.009'),
  ('occupy', '0.008'),
  ('page', '0.008'),
  ('first', '0.008'),
  ('get', '0.008'),
  ('say', '0.007'),
  ('new', '0.007')],
 [('donald', '0.022'),
  ('trump', '0.022'),
  ('clinton', '0.017'),
  ('hilla

I couldn't tell what the topics are about! Is it because term probability alone is not a good measure here? Let's use pyLDAvis to look at other measures.

In [20]:
# Build the pyLDAvis visualisation of the fitted model; as the last expression it is displayed inline.
pyLDAvis.gensim.prepare(lda, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


What is this dataset about? Should I exclude high-frequency terms?

#### Filter out high-frequency terms

#### Get topics associated with documents

In [74]:
def get_topics_for_documents(lda, corpus):
    '''Return, for each document in `corpus`, its five largest topic probabilities.'''
    per_document = []
    for bow in corpus:
        per_document.append(get_topics_for_one_document(lda, bow))
    return per_document

def get_topics_for_one_document(lda, doc):
    '''Return up to five of the document's topic probabilities, highest first.

    Only the probabilities are returned (rounded to three decimals); the
    topic ids are discarded.
    '''
    topic_probs = lda.get_document_topics(doc, minimum_probability=10**-6)
    ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)
    return [float(format(prob, '.3f')) for _, prob in ranked[:5]]

In [75]:
# Top topic probabilities for every document; preview the first ten.
output_topics = get_topics_for_documents(lda, corpus)
output_topics[:10]

[[0.931, 0.008, 0.008, 0.008, 0.008],
 [0.944, 0.006, 0.006, 0.006, 0.006],
 [0.94, 0.007, 0.007, 0.007, 0.007],
 [0.959, 0.005, 0.005, 0.005, 0.005],
 [0.975, 0.003, 0.003, 0.003, 0.003],
 [0.925, 0.008, 0.008, 0.008, 0.008],
 [0.936, 0.007, 0.007, 0.007, 0.007],
 [0.95, 0.006, 0.006, 0.006, 0.006],
 [0.961, 0.004, 0.004, 0.004, 0.004],
 [0.94, 0.007, 0.007, 0.007, 0.007]]

#### Find terms associated with topics 

In [76]:
def get_terms_for_topics(lda):
    '''Return, for each topic of the model, the probabilities of its top 5 terms.'''
    per_topic = []
    for topic_id in range(lda.num_topics):
        per_topic.append(get_terms_for_one_topic(lda, topic_id))
    return per_topic

def get_terms_for_one_topic(lda, topic_id):
    '''Return the probabilities of the topic's top 5 terms, rounded to three decimals.

    The term ids are discarded; only the probabilities are kept, in the
    order `lda.get_topic_terms` yields them.
    '''
    rounded = []
    for _, prob in lda.get_topic_terms(topic_id, topn=5):
        rounded.append(float(format(prob, '.3f')))
    return rounded

In [77]:
# Top-5 term probabilities for every topic of the fitted model.
output_terms = get_terms_for_topics(lda)
output_terms

[[0.042, 0.036, 0.025, 0.021, 0.014],
 [0.015, 0.013, 0.012, 0.011, 0.007],
 [0.016, 0.011, 0.008, 0.008, 0.008],
 [0.019, 0.018, 0.014, 0.01, 0.008],
 [0.016, 0.016, 0.008, 0.008, 0.007],
 [0.022, 0.017, 0.013, 0.01, 0.01],
 [0.018, 0.013, 0.013, 0.01, 0.009],
 [0.013, 0.009, 0.008, 0.008, 0.007],
 [0.021, 0.014, 0.014, 0.014, 0.012],
 [0.029, 0.028, 0.028, 0.023, 0.013]]

## Export Model Data

In [78]:
def export_model_data(lda, corpus):
    '''Bundle the model's parameters and probability matrices into a dictionary.'''
    # Only the first value of each prior is exported: all elements are
    # assumed to share the same value.
    alpha_value = float(lda.alpha[0])
    beta_value = float(lda.eta[0])

    exported = {'alpha': alpha_value, 'beta': beta_value}
    exported['num_topics'] = lda.num_topics
    exported['doc_topics'] = get_topics_for_documents(lda, corpus)
    exported['topic_terms'] = get_terms_for_topics(lda)
    return exported

In [79]:
def save_file(data, filename):
    '''Serialize `data` as JSON into `filename`, replacing any existing content.'''
    with open(filename, 'w') as out:
        json.dump(data, out)

#### Different alpha values

In [80]:
def export_data_for_alphas(corpus, alphas, filename):
    '''Fit one LDA model per alpha value and save all of their exports to `filename`.'''
    exports = []
    for alpha in alphas:
        model = build_lda(corpus, alpha=alpha)
        exports.append(export_model_data(model, corpus))
    save_file(exports, filename)

In [81]:
# Sweep alpha from 0.01 to 10 and save every model's data to one JSON file.
export_data_for_alphas(corpus, [0.01, 0.03, 0.1, 0.3, 1, 3, 10], '../data/facebook-alphas.json')