# LDA Topic Models with Varying Parameters

*Apply LDA with different values of alpha and beta parameters. Export the models for visualisation.*

In [1]:
import nlp
import json

### Load and clean the data

In [2]:
# Load the raw documents and run them through the project's preprocessing helper.
# `data` and `docs` are notebook-level names reused by later cells.
data = nlp.load_file('../data/lee.txt')
docs = nlp.preprocess(data)
# Display the number of raw documents, the first raw document, and its preprocessed form.
len(data), data[0], docs[0]

(300,
 'Hundreds of people have been forced to vacate their homes in the Southern Highlands of New South Wales as strong winds today pushed a huge bushfire towards the town of Hill Top. A new blaze near Goulburn, south-west of Sydney, has forced the closure of the Hume Highway. At about 4:00pm AEDT, a marked deterioration in the weather as a storm cell moved east across the Blue Mountains forced authorities to make a decision to evacuate people from homes in outlying streets at Hill Top in the New South Wales southern highlands. An estimated 500 residents have left their homes for nearby Mittagong. The New South Wales Rural Fire Service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around Hill Top are optimistic of defending all properties. As more than 100 blazes burn on New Year\'s Eve in New South Wales, fire crews have been called to new fire at Gunning, south of Goulburn. While few details are availabl

### Test with LDA 

In [4]:
# Build a 10-topic LDA model from the preprocessed documents and display its topics.
# `lda` is reused by the document-topic and topic-term cells further down.
lda = nlp.build_lda(docs, num_topics=10)
lda.print_topics()

[(0,
  '0.009*"say" + 0.006*"wicket" + 0.006*"said" + 0.006*"australia" + 0.006*"south" + 0.005*"day" + 0.005*"fire" + 0.005*"test" + 0.005*"new" + 0.005*"pakistan"'),
 (1,
  '0.013*"said" + 0.012*"south" + 0.009*"fire" + 0.009*"new" + 0.009*"say" + 0.008*"sydney" + 0.008*"area" + 0.006*"state" + 0.005*"day" + 0.005*"wind"'),
 (2,
  '0.016*"say" + 0.007*"centre" + 0.007*"said" + 0.007*"australia" + 0.007*"australian" + 0.006*"year" + 0.005*"detainee" + 0.005*"rate" + 0.004*"detention" + 0.004*"south"'),
 (3,
  '0.019*"said" + 0.007*"say" + 0.006*"bin" + 0.006*"taliban" + 0.005*"force" + 0.005*"laden" + 0.005*"kandahar" + 0.004*"attack" + 0.004*"state" + 0.004*"also"'),
 (4,
  '0.020*"say" + 0.012*"afghanistan" + 0.012*"said" + 0.010*"government" + 0.010*"australian" + 0.009*"force" + 0.007*"australia" + 0.006*"afghan" + 0.006*"bin" + 0.006*"people"'),
 (5,
  '0.020*"palestinian" + 0.015*"said" + 0.014*"say" + 0.012*"israeli" + 0.012*"arafat" + 0.007*"attack" + 0.007*"israel" + 0.006*"h

### Compute and export data with a varying number of topics
Data output: a list of models
 - each model has a list of its topics
   - each topic has a list of top 100 terms (term, probability)

In [45]:
import math

def mydiff(p, q):
    """Return the Hellinger distance between two discrete distributions.

    Computes ``sqrt(sum((sqrt(p_i) - sqrt(q_i)) ** 2)) / sqrt(2)``.

    Args:
        p: Sequence of non-negative numbers (e.g. probabilities).
        q: Sequence of non-negative numbers with the same length as ``p``.

    Returns:
        The distance as a float; 0.0 when ``p`` and ``q`` are identical
        (including when both are empty).

    Raises:
        ValueError: If ``p`` and ``q`` have different lengths, or (raised by
            ``math.sqrt``) if any entry is negative.
    """
    # Comparing element-wise only makes sense for equally sized inputs; without
    # this check a longer ``q`` would be silently truncated.
    if len(p) != len(q):
        raise ValueError(
            'p and q must have the same length, got %d and %d' % (len(p), len(q))
        )

    total = 0
    for p_i, q_i in zip(p, q):
        total += (math.sqrt(p_i) - math.sqrt(q_i)) ** 2
    return math.sqrt(total) / math.sqrt(2)

Test `mydiff` on a few sample vectors.

In [47]:
# Sample vectors for a quick sanity check of mydiff.
# Only p1 sums to 1; p2 sums to 1.2 and p3 to 1.11.
p1 = [.1, .2, .3, .4]
p2 = [.15, .25, .35, .45]
p3 = [.01, .8, .1, .2]
# Print the three pairwise distances on one line.
print(mydiff(p1, p2), mydiff(p1, p3), mydiff(p3, p2))

0.07495072700475026 0.4090592012983077 0.4265837569566741


In [70]:
def get_topic_terms(lda, topic_id, topn):
    'Return (term, probability) pairs for the topn terms of one topic, probabilities rounded to 3 decimals.'
    pairs = []
    for term_id, prob in lda.get_topic_terms(topic_id, topn=topn):
        # Map the term id back to its string form through the model's id2word lookup.
        term = lda.id2word[term_id]
        pairs.append((term, float('{:.3f}'.format(prob))))
    return pairs
    
def get_topics(corpus, dictionary, num_topics, topn=100):
    """Build an LDA model and return the most probable terms of each topic.

    Args:
        corpus: Corpus passed through to ``nlp.build_lda_with_corpus``.
        dictionary: Dictionary passed through to ``nlp.build_lda_with_corpus``.
        num_topics: Number of topics requested for the model.
        topn: How many terms to keep per topic. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        A list with one entry per topic of the built model; each entry is a
        list of (term, probability) pairs as produced by ``get_topic_terms``.
    """
    lda = nlp.build_lda_with_corpus(corpus, dictionary, num_topics=num_topics)
    # Progress marker: building many models in a row can take a while.
    print(str(num_topics) + ' topics: done')
    return [get_topic_terms(lda, i, topn) for i in range(lda.num_topics)]

def compute_and_export(corpus, dictionary, topic_range, outname):
    'Compute the topics for every topic count in topic_range and write the resulting list to outname as JSON.'
    models = []
    for k in topic_range:
        models.append(get_topics(corpus, dictionary, k))

    # All models are computed first; the output file is only opened afterwards.
    with open(outname, 'w') as out:
        json.dump(models, out)

In [71]:
# Build the corpus and dictionary from the preprocessed documents; both are reused by the cells below.
corpus, dictionary = nlp.build_corpus_dictionary(docs)

In [72]:
# Export the top terms per topic for models with 1 to 20 topics (range(1, 21) excludes 21).
compute_and_export(corpus, dictionary, range(1, 21), '../data/lee-k-20topics.json')

1 topics: done
2 topics: done
3 topics: done
4 topics: done
5 topics: done
6 topics: done
7 topics: done
8 topics: done
9 topics: done
10 topics: done
11 topics: done
12 topics: done
13 topics: done
14 topics: done
15 topics: done
16 topics: done
17 topics: done
18 topics: done
19 topics: done
20 topics: done


#### Get topics associated with documents

In [10]:
def get_topics_for_documents(lda, corpus):
    'For each document in the given corpus, return its top 5 topic probabilities.'
    per_document = []
    for doc in corpus:
        per_document.append(get_topics_for_one_document(lda, doc))
    return per_document

def get_topics_for_one_document(lda, doc):
    'Return the 5 largest topic probabilities of doc, highest first, rounded to 3 decimals.'
    # Topic ids are dropped in the result; only the probabilities are kept.
    doc_topics = lda.get_document_topics(doc, minimum_probability=10**-6)
    ranked = sorted(doc_topics, key=lambda pair: -pair[1])
    strongest = ranked[:5]
    return [float('{:.3f}'.format(prob)) for (_, prob) in strongest]

In [11]:
# Top-5 topic probabilities per document for the `lda` model; display the first 10 documents only.
output_topics = get_topics_for_documents(lda, corpus)
output_topics[:10]

[[0.394, 0.156, 0.074, 0.071, 0.063],
 [0.154, 0.137, 0.12, 0.113, 0.094],
 [0.125, 0.116, 0.109, 0.105, 0.1],
 [0.195, 0.113, 0.106, 0.103, 0.1],
 [0.17, 0.13, 0.128, 0.108, 0.08],
 [0.218, 0.115, 0.095, 0.095, 0.094],
 [0.259, 0.17, 0.124, 0.078, 0.071],
 [0.141, 0.118, 0.116, 0.107, 0.091],
 [0.464, 0.079, 0.067, 0.065, 0.063],
 [0.294, 0.087, 0.086, 0.084, 0.084]]

#### Find terms associated with topics 

In [12]:
def get_terms_for_topics(lda):
    'For each topic of the model, return the probabilities of its top 5 terms.'
    per_topic = []
    for topic_id in range(lda.num_topics):
        per_topic.append(get_terms_for_one_topic(lda, topic_id))
    return per_topic

def get_terms_for_one_topic(lda, topic_id):
    'Return the probabilities of the 5 top terms of one topic, rounded to 3 decimals.'
    rounded = []
    # Term ids are discarded; only the probability of each term is kept.
    for _, prob in lda.get_topic_terms(topic_id, topn=5):
        rounded.append(float('{:.3f}'.format(prob)))
    return rounded

In [13]:
# Top-5 term probabilities for every topic of the `lda` model; display all of them.
output_terms = get_terms_for_topics(lda)
output_terms

[[0.02, 0.015, 0.015, 0.013, 0.011],
 [0.021, 0.011, 0.011, 0.01, 0.009],
 [0.017, 0.017, 0.008, 0.006, 0.004],
 [0.016, 0.009, 0.008, 0.006, 0.005],
 [0.019, 0.016, 0.01, 0.009, 0.008],
 [0.018, 0.01, 0.01, 0.009, 0.007],
 [0.01, 0.009, 0.009, 0.009, 0.007],
 [0.02, 0.013, 0.007, 0.006, 0.005],
 [0.017, 0.01, 0.009, 0.006, 0.006],
 [0.015, 0.014, 0.006, 0.006, 0.005]]

## Export Model Data

In [14]:
def export_model_data(corpus, alpha, beta):
    'Build an LDA model with the given alpha and beta and return a dictionary of its parameters and probability matrices.'
    # NOTE(review): `build_lda` is called unqualified, but no cell visible in this
    # notebook defines or imports it (other cells go through `nlp.`) — confirm it
    # is in scope after a fresh "Restart & Run All".
    lda = build_lda(corpus, alpha=alpha, eta=beta)

    model_data = {'alpha': alpha, 'beta': beta, 'num_topics': lda.num_topics}
    # Document topics are computed before topic terms, and keys are inserted in
    # the same order as they appear in the exported JSON.
    model_data['doc_topics'] = get_topics_for_documents(lda, corpus)
    model_data['topic_terms'] = get_terms_for_topics(lda)
    return model_data

In [15]:
def save_file(data, filename):
    'Serialise data as JSON into filename, overwriting any existing file.'
    with open(filename, 'w') as out:
        json.dump(data, out)

#### Different alpha and beta values

In [16]:
def export_data(corpus, alphas, betas, filename):
    'Export the model data for every (alpha, beta) combination to filename.'
    models = []
    # alpha is the outer loop and beta the inner one, which fixes the order of models in the output.
    for alpha in alphas:
        for beta in betas:
            models.append(export_model_data(corpus, alpha, beta))
    save_file(models, filename)

In [17]:
# Parameter grid: every (alpha, beta) pair is exported, i.e. 4 x 4 = 16 models.
alphas = [0.01, 0.1, 1, 10]
betas = [0.001, 0.01, 0.1, 1]
export_data(corpus, alphas, betas, '../data/lee-params.json')

  diff = np.log(self.expElogbeta)


---

In [19]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded before each cell runs.
# NOTE(review): this cell sits at the end of the notebook — presumably it is meant to pick up edits
# to the local `nlp` module; consider moving it next to the imports so it runs first.
%load_ext autoreload
%autoreload 2