In [62]:
import pickle
import json
import codecs
import nltk
from nltk.corpus import stopwords
from pywsd.utils import lemmatize_sentence

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

stopWords = set(stopwords.words('english'))

# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download('stopwords')

In [63]:
data = json.load(codecs.open('raw_modules_data.json', 'r', 'utf-8-sig'))
d = {}
for datum in data:
    try:
        d[datum['ModuleCode']] = datum['ModuleDescription']
    except:
        pass
d

{'ACC1002': "The course provides an introduction to financial accounting. It examines accounting from an external user's perspective: an external user being an investor or a creditor. Such users would need to understand financial accounting in order to make investing or lending decisions. However, to attain a good understanding, it is also necessary to be familiar with how the information is derived. Therefore, students would learn how to prepare the reports or statements resulting from financial accounting and how to use them for decision-making.",
 'ACC1002X': "The course provides an introduction to financial accounting. It examines accounting from an external user's perspective: an external user being an investor or a creditor. Such users would need to understand financial accounting in order to make investing or lending decisions. However, to attain a good understanding, it is also necessary to be familiar with how the information are derived. Therefore, students would learn how to

In [64]:
for key in d.keys():
    tokens = list(set(lemmatize_sentence(d[key])) - stopWords)
    dirty_tags = nltk.pos_tag(tokens)
    tags = [t[0] for t in dirty_tags if t[1] in ['NN','JJ','VBG','JJR','JJS','NNS','NP','NPS']]
    d[key] = tags
d

{'ACC1002': ['information',
  'statement',
  'investing',
  'external',
  'familiar',
  'derive',
  'need',
  'accounting',
  'learn',
  'result',
  'good',
  'report',
  'perspective',
  'decision-making',
  'lending',
  'understand',
  'investor',
  'course',
  'user',
  'account',
  'necessary',
  'introduction',
  'financial',
  'make',
  'understanding',
  'attain',
  'creditor',
  'decision',
  'order',
  'examine'],
 'ACC1002X': ['information',
  'statement',
  'investing',
  'external',
  'familiar',
  'derive',
  'need',
  'accounting',
  'learn',
  'result',
  'good',
  'report',
  'perspective',
  'decision-making',
  'lending',
  'understand',
  'investor',
  'course',
  'user',
  'account',
  'necessary',
  'introduction',
  'financial',
  'make',
  'understanding',
  'attain',
  'creditor',
  'decision',
  'order',
  'examine'],
 'ACC1006': ['enhance',
  'operation',
  'information',
  'business',
  'role',
  'streamline',
  'chain',
  'student',
  'supply',
  'accounting

In [68]:
id2word = corpora.Dictionary(d.values())
corpus = [id2word.doc2bow(text) for text in d.values()]
freq = [[(id2word[id], freq) for id, freq in cp] for cp in corpus]
freq_d = {}
for f in freq:
    for f2 in f:
        if f2[0] in freq_d:
            freq_d[f2[0]] += f2[1]
        else:
            freq_d[f2[0]] = f2[1]
freq_d

{'account': 83,
 'accounting': 42,
 'attain': 14,
 'course': 1529,
 'creditor': 9,
 'decision': 237,
 'decision-making': 49,
 'derive': 38,
 'examine': 648,
 'external': 53,
 'familiar': 47,
 'financial': 205,
 'good': 270,
 'information': 249,
 'introduction': 349,
 'investing': 7,
 'investor': 27,
 'learn': 453,
 'lending': 11,
 'make': 81,
 'necessary': 140,
 'need': 104,
 'order': 163,
 'perspective': 306,
 'report': 160,
 'result': 105,
 'statement': 39,
 'understand': 322,
 'understanding': 683,
 'user': 43,
 'accounting/business': 2,
 'advantage': 32,
 'aim': 653,
 'application': 761,
 'area': 372,
 'business': 412,
 'chain': 60,
 'competitive': 54,
 'cycle': 69,
 'different': 461,
 'enhance': 112,
 'finance/accounting': 2,
 'functional': 77,
 'hr/management': 2,
 'innovative': 50,
 'marketing': 106,
 'operation': 197,
 'particular': 255,
 'role': 460,
 'streamline': 2,
 'student': 2999,
 'supply': 71,
 'system': 837,
 'use': 945,
 'various': 638,
 'aware': 13,
 'book-keeping': 

In [81]:
topd = {}
for k,v in d.items():
    innerd = {}
    for word in v:
        innerd[word] = freq_d[word]
    words = list(dict(sorted(innerd.items(), key=lambda x:x[1])).keys())[:5]
    topd[k] = words

    
with open('topics.pickle', 'wb') as handle:
    pickle.dump(topd, handle)
    
topd

{'ACC1002': ['investing', 'creditor', 'lending', 'attain', 'investor'],
 'ACC1002X': ['investing', 'creditor', 'lending', 'attain', 'investor'],
 'ACC1006': ['streamline',
  'hr/management',
  'finance/accounting',
  'accounting/business',
  'advantage'],
 'ACC1701': ['book-keeping', 'aware', 'viewpoint', 'reporting', 'investor'],
 'ACC1701X': ['book-keeping', 'aware', 'viewpoint', 'reporting', 'investor'],
 'ACC2002': ['evolves',
  'usefulness',
  'organisational',
  'accounting',
  'managerial'],
 'ACC2706': ['evolves',
  'usefulness',
  'organisational',
  'accounting',
  'managerial'],
 'ACC2707': ['post-balance-sheet',
  'intangible',
  'impairment',
  'interim',
  'revenue'],
 'ACC2708': ['share-based', 'lease', 'compensation', 'tax', 'employee'],
 'ACC2709': ['streamline',
  'hr/management',
  'finance/accounting',
  'accounting/business',
  'advantage'],
 'ACC3603': ['attestation', 'assurance', 'profession', 'audit', 'attitude'],
 'ACC3601': ['consolidated', 'leasing', 'off-bal

In [34]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [35]:
doc_lda = lda_model[corpus]
lda_model.print_topics()

[(0,
  '0.025*"presentation" + 0.021*"perform" + 0.018*"scientific" + 0.018*"week" + 0.017*"implication" + 0.016*"science" + 0.015*"quality" + 0.015*"basis" + 0.014*"effective" + 0.013*"security"'),
 (1,
  '0.037*"social" + 0.017*"society" + 0.017*"contemporary" + 0.015*"cultural" + 0.014*"culture" + 0.014*"relation" + 0.014*"way" + 0.014*"history" + 0.013*"community" + 0.012*"historical"'),
 (2,
  '0.015*"science" + 0.013*"application" + 0.010*"target" + 0.009*"phenomenon" + 0.008*"principle" + 0.008*"cover" + 0.008*"intervention" + 0.008*"fundamental" + 0.007*"equation" + 0.007*"function"'),
 (3,
  '0.028*"life" + 0.025*"economic" + 0.021*"singapore" + 0.020*"explore" + 0.017*"international" + 0.017*"examine" + 0.015*"real" + 0.014*"value" + 0.012*"country" + 0.012*"law"'),
 (4,
  '0.024*"course" + 0.015*"analysis" + 0.014*"concept" + 0.014*"cover" + 0.012*"use" + 0.010*"problem" + 0.010*"system" + 0.010*"topic" + 0.010*"basic" + 0.009*"method"'),
 (5,
  '0.064*"module" + 0.044*"stud

In [36]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [38]:
pyLDAvis.save_html(vis, 'topicmodelingplot.html')