# LDA Topic Models with Varying Parameters

*Apply LDA with different values of alpha and beta parameters. Export the models for visualisation.*

In [3]:
import json

from gensim.models.ldamodel import LdaModel
from gensim import corpora

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

from collections import Counter
from itertools import chain

## Data Preprocessing

#### Load the data

In [4]:
def load_data(filename):
    """Read a text file and return its lines as a list of documents.

    The file is decoded as ISO-8859-1 and every line (including its
    trailing newline, if any) becomes one document.
    """
    with open(filename, encoding='ISO-8859-1') as source:
        documents = source.readlines()
    return documents

In [5]:
# Load the corpus; load_data returns one list element per line of the file.
data = load_data('../data/lee.txt')
# Display the number of documents and the first raw document as a sanity check.
len(data), data[0]

(300,
 'Hundreds of people have been forced to vacate their homes in the Southern Highlands of New South Wales as strong winds today pushed a huge bushfire towards the town of Hill Top. A new blaze near Goulburn, south-west of Sydney, has forced the closure of the Hume Highway. At about 4:00pm AEDT, a marked deterioration in the weather as a storm cell moved east across the Blue Mountains forced authorities to make a decision to evacuate people from homes in outlying streets at Hill Top in the New South Wales southern highlands. An estimated 500 residents have left their homes for nearby Mittagong. The New South Wales Rural Fire Service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around Hill Top are optimistic of defending all properties. As more than 100 blazes burn on New Year\'s Eve in New South Wales, fire crews have been called to new fire at Gunning, south of Goulburn. While few details are availabl

#### Clean the data

In [6]:
def preprocess_documents(docs):
    """Return one list of cleaned tokens per document.

    Each document is passed to ``clean`` together with the English
    stop-word list, the set of punctuation characters to strip, and a
    WordNet lemmatizer.  The punctuation set is ``string.punctuation`` plus
    the typographic double quotes, right single quote and em dash.
    """
    stop = set(stopwords.words('english'))
    # The typographic characters are written as Unicode escapes so the set
    # holds the intended code points regardless of how this source file is
    # encoded or re-encoded (a pasted literal had turned into mojibake).
    exclude = set(string.punctuation + '\u201c\u201d\u2019\u2014')
    lemma = WordNetLemmatizer()
    return [clean(doc, stop, exclude, lemma) for doc in docs]

def clean(doc, stop, exclude, lemma):
    """Return the cleaned, lemmatized tokens of a single document.

    Characters found in ``exclude`` are dropped, the remaining text is
    lower-cased and split on whitespace, words in ``stop`` and words
    shorter than three characters are discarded, and every surviving word
    is passed through ``lemma.lemmatize``.
    """
    kept_chars = [ch for ch in doc if ch not in exclude]
    tokens = []
    for word in ''.join(kept_chars).lower().split():
        if word in stop or len(word) < 3:
            continue
        tokens.append(lemma.lemmatize(word))
    return tokens

In [7]:
# Turn every raw document into a list of cleaned, lemmatized tokens.
docs = preprocess_documents(data)
# Display the first raw document next to its token list for comparison.
data[0], docs[0]

('Hundreds of people have been forced to vacate their homes in the Southern Highlands of New South Wales as strong winds today pushed a huge bushfire towards the town of Hill Top. A new blaze near Goulburn, south-west of Sydney, has forced the closure of the Hume Highway. At about 4:00pm AEDT, a marked deterioration in the weather as a storm cell moved east across the Blue Mountains forced authorities to make a decision to evacuate people from homes in outlying streets at Hill Top in the New South Wales southern highlands. An estimated 500 residents have left their homes for nearby Mittagong. The New South Wales Rural Fire Service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around Hill Top are optimistic of defending all properties. As more than 100 blazes burn on New Year\'s Eve in New South Wales, fire crews have been called to new fire at Gunning, south of Goulburn. While few details are available at t

## Topic Modelling with LDA

#### Find topics of the whole corpus

In [8]:
def build_lda(corpus, num_topics=10, passes=10, alpha='symmetric', eta=None, id2word=None):
    """Return an LDA model trained on the given bag-of-words corpus.

    Parameters
    ----------
    corpus : documents in the form produced by ``Dictionary.doc2bow``.
    num_topics : number of topics to fit.
    passes : number of training passes over the corpus.
    alpha : value forwarded to ``LdaModel`` as ``alpha``.
    eta : value forwarded to ``LdaModel`` as ``eta`` (called "beta" elsewhere
        in this notebook).
    id2word : id-to-token mapping used by the model.  When omitted, the
        module-level ``dictionary`` is used, which keeps existing calls
        working unchanged.

    ``random_state`` is pinned to 0 so that repeated runs use the same seed.
    """
    if id2word is None:
        # Backward-compatible fallback to the notebook-level global; pass
        # id2word explicitly to avoid depending on cell execution order.
        id2word = dictionary
    return LdaModel(corpus, num_topics=num_topics, id2word=id2word, passes=passes,
                    alpha=alpha, eta=eta, random_state=0)

def get_model_topics(lda):
    """Return, for every topic, its terms as ``(word, 'p.ppp')`` pairs.

    Terms come from ``lda.get_topic_terms`` with its default ``topn``; term
    ids are translated through ``lda.id2word`` and probabilities are
    rendered as strings with three decimals.
    """
    all_topics = []
    for topic_id in range(lda.num_topics):
        rows = []
        for term_id, prob in lda.get_topic_terms(topic_id):
            rows.append((lda.id2word[term_id], format(prob, '.3f')))
        all_topics.append(rows)
    return all_topics

In [9]:
# Build the gensim dictionary from the cleaned token lists.  build_lda reads
# this module-level `dictionary`, so it must exist before the call below.
dictionary = corpora.Dictionary(docs)
# Convert each token list to its bag-of-words form via doc2bow.
corpus = [dictionary.doc2bow(doc) for doc in docs]    
# Fit a 10-topic model with alpha=10 and eta=0.01, then show its topics.
lda = build_lda(corpus, alpha=10, eta=0.01, num_topics=10)
get_model_topics(lda)

  diff = np.log(self.expElogbeta)


[[('palestinian', '0.020'),
  ('arafat', '0.015'),
  ('said', '0.015'),
  ('israeli', '0.013'),
  ('say', '0.011'),
  ('minister', '0.008'),
  ('gaza', '0.007'),
  ('israel', '0.006'),
  ('west', '0.006'),
  ('attack', '0.006')],
 [('say', '0.021'),
  ('new', '0.011'),
  ('south', '0.011'),
  ('fire', '0.010'),
  ('australia', '0.009'),
  ('said', '0.007'),
  ('people', '0.006'),
  ('firefighter', '0.005'),
  ('two', '0.005'),
  ('year', '0.004')],
 [('said', '0.017'),
  ('say', '0.017'),
  ('year', '0.008'),
  ('australian', '0.006'),
  ('day', '0.004'),
  ('new', '0.004'),
  ('river', '0.004'),
  ('could', '0.003'),
  ('take', '0.003'),
  ('people', '0.003')],
 [('said', '0.016'),
  ('say', '0.009'),
  ('metre', '0.008'),
  ('pakistan', '0.006'),
  ('two', '0.005'),
  ('minister', '0.005'),
  ('last', '0.005'),
  ('palestinian', '0.005'),
  ('australian', '0.005'),
  ('attack', '0.005')],
 [('said', '0.019'),
  ('say', '0.016'),
  ('force', '0.010'),
  ('afghanistan', '0.009'),
  ('g

#### Get topics associated with documents

In [10]:
def get_topics_for_documents(lda, corpus):
    """Return the top-5 topic probabilities of each document in ``corpus``.

    The result has one entry per document, in corpus order; each entry is
    whatever ``get_topics_for_one_document`` returns for that document.
    """
    per_document = []
    for bow in corpus:
        per_document.append(get_topics_for_one_document(lda, bow))
    return per_document

def get_topics_for_one_document(lda, doc):
    """Return the five largest topic probabilities for ``doc``.

    Topics are requested with a minimum probability of 10**-6, ordered from
    most to least probable, and only the probabilities (rounded to three
    decimals, as plain floats) are returned; topic ids are dropped.
    """
    topic_probs = lda.get_document_topics(doc, minimum_probability=10**-6)
    ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)
    return [float(format(prob, '.3f')) for _, prob in ranked[:5]]

In [11]:
# Top-5 topic probabilities for every document of the corpus.
output_topics = get_topics_for_documents(lda, corpus)
# Show the first ten documents only.
output_topics[:10]

[[0.394, 0.156, 0.074, 0.071, 0.063],
 [0.154, 0.137, 0.12, 0.113, 0.094],
 [0.125, 0.116, 0.109, 0.105, 0.1],
 [0.195, 0.113, 0.106, 0.103, 0.1],
 [0.17, 0.13, 0.128, 0.108, 0.08],
 [0.218, 0.115, 0.095, 0.095, 0.094],
 [0.259, 0.17, 0.124, 0.078, 0.071],
 [0.141, 0.118, 0.116, 0.107, 0.091],
 [0.464, 0.079, 0.067, 0.065, 0.063],
 [0.294, 0.087, 0.086, 0.084, 0.084]]

#### Find terms associated with topics 

In [12]:
def get_terms_for_topics(lda):
    """Return the top-5 term probabilities of every topic in the model.

    The result has one entry per topic id in ``range(lda.num_topics)``,
    each produced by ``get_terms_for_one_topic``.
    """
    per_topic = []
    for topic_id in range(lda.num_topics):
        per_topic.append(get_terms_for_one_topic(lda, topic_id))
    return per_topic

def get_terms_for_one_topic(lda, topic_id):
    """Return the probabilities of the five top terms of one topic.

    Only the probabilities are kept (term ids are dropped), each rounded to
    three decimals and converted to a plain float.
    """
    top_terms = lda.get_topic_terms(topic_id, topn=5)
    probabilities = []
    for _, prob in top_terms:
        probabilities.append(float(format(prob, '.3f')))
    return probabilities

In [13]:
# Top-5 term probabilities for every topic of the fitted model.
output_terms = get_terms_for_topics(lda)
output_terms

[[0.02, 0.015, 0.015, 0.013, 0.011],
 [0.021, 0.011, 0.011, 0.01, 0.009],
 [0.017, 0.017, 0.008, 0.006, 0.004],
 [0.016, 0.009, 0.008, 0.006, 0.005],
 [0.019, 0.016, 0.01, 0.009, 0.008],
 [0.018, 0.01, 0.01, 0.009, 0.007],
 [0.01, 0.009, 0.009, 0.009, 0.007],
 [0.02, 0.013, 0.007, 0.006, 0.005],
 [0.017, 0.01, 0.009, 0.006, 0.006],
 [0.015, 0.014, 0.006, 0.006, 0.005]]

## Export Model Data

In [14]:
def export_model_data(corpus, alpha, beta):
    """Fit one LDA model and return its parameters and probability matrices.

    ``beta`` is passed to ``build_lda`` as ``eta``.  The returned dictionary
    has the keys ``alpha``, ``beta``, ``num_topics``, ``doc_topics`` (top
    topic probabilities per document) and ``topic_terms`` (top term
    probabilities per topic).
    """
    model = build_lda(corpus, alpha=alpha, eta=beta)
    record = {'alpha': alpha, 'beta': beta, 'num_topics': model.num_topics}
    record['doc_topics'] = get_topics_for_documents(model, corpus)
    record['topic_terms'] = get_terms_for_topics(model)
    return record

In [15]:
def save_file(data, filename):
    """Serialize ``data`` as JSON into ``filename``, overwriting any existing file."""
    with open(filename, 'w') as out_file:
        json.dump(data, out_file)

#### Different alpha and beta values

In [16]:
def export_data(corpus, alphas, betas, filename):
    """Fit a model for every (alpha, beta) combination and save them as JSON.

    Combinations are generated with ``alphas`` as the outer loop and
    ``betas`` as the inner loop; the resulting list of model records is
    written to ``filename`` via ``save_file``.
    """
    models = []
    for alpha in alphas:
        for beta in betas:
            models.append(export_model_data(corpus, alpha, beta))
    save_file(models, filename)

In [17]:
# Parameter grid: 4 alpha values x 4 beta values = 16 models.
alphas = [0.01, 0.1, 1, 10]
betas = [0.001, 0.01, 0.1, 1]
# Fit one model per (alpha, beta) pair and write them all to a JSON file.
export_data(corpus, alphas, betas, '../data/lee-params.json')

  diff = np.log(self.expElogbeta)
