# Topic Modelling 
modified with 
*  https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html
* https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2

In [1]:
#visualisation 
!pip install pyldavis --q

## Import Modules




In [2]:
from pprint import pprint 

#logging 
import logging 
logging.basicConfig(format = '%(asctime)s: %(levelname)s : %(message)s',level= logging.INFO)

In [3]:
#pre-process data
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from gensim.parsing.preprocessing import strip_numeric,strip_multiple_whitespaces, strip_punctuation, strip_short, strip_tags

2021-08-09 22:47:09,318: INFO : 'pattern' package not found; tag filters are not available for English


In [4]:
#form ngrams
from gensim.models import Phrases
from gensim.corpora import Dictionary 
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#LDA Model
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

#tfidf model 
from gensim.models import TfidfModel

In [6]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

2021-08-09 22:47:09,640: INFO : Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
2021-08-09 22:47:09,672: INFO : Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt
  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True):
Deprecated in NumPy 1.20; for more d

### Load Data

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
df = pd.read_csv('drive/MyDrive/bbc_1807_1906_novideos.csv',encoding = 'utf-8')
df.head()

Unnamed: 0,Month,link,text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1901,https://www.bbc.co.uk/news/health-46638419,My name is Tim and I'm a cheese addict. But wh...,,,,
1,1901,https://www.bbc.co.uk/news/health-46720303,Children in the UK exceed the maximum recommen...,,,,
2,1901,https://www.bbc.co.uk/news/health-46746552,A clinical trial has been launched to see if a...,,,,
3,1901,https://www.bbc.co.uk/news/health-46739905,When 12-year-old Matthew Carter saw how much s...,,,,
4,1901,https://www.bbc.co.uk/news/stories-46427960,When Jessica Share bought sperm from a sperm b...,,,,


In [10]:
texts = list(df.loc[:, "text"].values)
texts[0]

'My name is Tim and I\'m a cheese addict. But what I\'ve been discovering recently has shaken me to the core. I can barely look a Babybel in the face. A half-eaten halloumi squeaklessly lies yellowing in the fridge. My cheese dreams are shattering. For, after a lifetime of unfettered devotion, could it possibly be that cheese is more foe than friend? That I am addicted to something that is not so good for my body? That cheese should be toast? These are questions that began surfacing a couple of months ago when I began making an episode for my new podcast for the BBC, All Hail Kale, looking into whether dairy was scary.  For some time, I\'d increasingly been questioning the logic of adults drinking milk.  While milk and dairy products, such as cheese and yoghurt, are good sources of protein and calcium and can form part of a healthy, balanced diet, as Dr Michael Greger, from NutritionFacts.org, put it to me: "There\'s no animal on the planet that drinks milk after weaning - and then to 

### Pre-process, tokenise, and lemmatisation

In [11]:
new_sw = ['people','say','says','said','could','one','would','take','want','have','she','he','give','told','know','but','health','month','day','year', 'minute','second','hundred','thousand','however']
stop_words = STOPWORDS.union(set(new_sw))

In [12]:
pubmed_stoplist = ['a','about','again','all','almost', 'also','although','always',\
                   'among','an','and','another','any','are','as','at','be','because',\
                   'been','before','being','between','both','but','by','can','could',\
                   'did','do','does','done','due','during','each','either','enough', \
                   'especially','etc','for','found', 'from','further','had','is','it',\
                   'has','have','having','here','how','however','i','if','in','into',\
                   'its','itself','just','kg','km', 'made','mainly','make','may','mg',\
                   'might','ml','mm','most','mostly','must','nearly','neither','no',\
                   'nor','obtained','of','often','on','our','overall','perhaps','pmid',\
                   'quite','rather','really','regarding','seem','seen','several',\
                   'should','show','showed','shown','shows','significantly','since',\
                   'so','some','such','than','that','the','their','theirs','them',\
                   'then','there','therefore','these','they','this','those','through',\
                   'thus','to','upon','use','used','using','various','very', 'was','we',\
                   'were','what','when', 'which','while','with','within','without','would']

In [13]:
stop_words = STOPWORDS.union(set(pubmed_stoplist))

In [14]:
clean_article = []

for text in texts:
  #remove new line character and punctuations 
  clean = []
  text = strip_multiple_whitespaces(text)
  text = strip_numeric(text)
  text = strip_punctuation(text)
  text = strip_tags(text)
  text = strip_short(text,minsize=3)
  text = remove_stopwords(text)
  # lower case, de-accents.  Convert the texts into a list of tokens
  for token in gensim.utils.simple_preprocess(text, deacc=True):
      clean.append(token)
  clean_article.append(clean)

In [15]:
cleaned_texts = np.array(clean_article)

  """Entry point for launching an IPython kernel.


In [16]:
print(cleaned_texts.shape)

(712,)


In [17]:
print(cleaned_texts[0:2])

[list(['tim', 'cheese', 'addict', 'but', 'discovering', 'recently', 'shaken', 'core', 'barely', 'look', 'babybel', 'face', 'half', 'eaten', 'halloumi', 'squeaklessly', 'lies', 'yellowing', 'fridge', 'cheese', 'dreams', 'shattering', 'for', 'lifetime', 'unfettered', 'devotion', 'possibly', 'cheese', 'foe', 'friend', 'that', 'addicted', 'good', 'body', 'that', 'cheese', 'toast', 'these', 'questions', 'began', 'surfacing', 'couple', 'months', 'ago', 'began', 'making', 'episode', 'new', 'podcast', 'bbc', 'all', 'hail', 'kale', 'looking', 'dairy', 'scary', 'for', 'time', 'increasingly', 'questioning', 'logic', 'adults', 'drinking', 'milk', 'while', 'milk', 'dairy', 'products', 'cheese', 'yoghurt', 'good', 'sources', 'protein', 'calcium', 'form', 'healthy', 'balanced', 'diet', 'michael', 'greger', 'nutritionfacts', 'org', 'there', 'animal', 'planet', 'drinks', 'milk', 'weaning', 'drink', 'milk', 'species', 'sense', 'reeled', 'series', 'studies', 'showing', 'life', 'shortening', 'potential', 

In [18]:
# WORDNET LEMMATIZER (with appropriate pos tags)
  
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

  formatvalue=lambda value: "")[1:-1]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from .mio5_utils import VarReader5


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [19]:
lemmatizer = WordNetLemmatizer()
  
# Define function to lemmatize each word with its POS tag
  
# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(tagged_list):
  tag_list = []
  for (word,tag) in tagged_list:
    if tag.startswith('J'):
        tag_list.append((word, wordnet.ADJ))
    elif tag.startswith('V'):
        tag_list.append((word, wordnet.VERB))
    elif tag.startswith('N'):
        tag_list.append((word, wordnet.NOUN))
    elif tag.startswith('R'):
        tag_list.append((word, wordnet.ADV))
    else:          
        tag_list.append((word, None))
  return tag_list

In [20]:
pos_list=[]
for article in cleaned_texts:
  pos_article = []
  for token in article:
    pos_article.extend(nltk.pos_tag(nltk.word_tokenize(token)))
  pos_list.append(pos_article)  

In [21]:
print(pos_list[0])

[('tim', 'NN'), ('cheese', 'NN'), ('addict', 'NN'), ('but', 'CC'), ('discovering', 'VBG'), ('recently', 'RB'), ('shaken', 'NNS'), ('core', 'NN'), ('barely', 'RB'), ('look', 'NN'), ('babybel', 'NN'), ('face', 'NN'), ('half', 'NN'), ('eaten', 'VB'), ('halloumi', 'NN'), ('squeaklessly', 'RB'), ('lies', 'NNS'), ('yellowing', 'VBG'), ('fridge', 'NN'), ('cheese', 'NN'), ('dreams', 'NNS'), ('shattering', 'VBG'), ('for', 'IN'), ('lifetime', 'NN'), ('unfettered', 'JJ'), ('devotion', 'NN'), ('possibly', 'RB'), ('cheese', 'NN'), ('foe', 'NN'), ('friend', 'NN'), ('that', 'IN'), ('addicted', 'VBN'), ('good', 'JJ'), ('body', 'NN'), ('that', 'IN'), ('cheese', 'NN'), ('toast', 'NN'), ('these', 'DT'), ('questions', 'NNS'), ('began', 'VBD'), ('surfacing', 'VBG'), ('couple', 'NN'), ('months', 'NNS'), ('ago', 'RB'), ('began', 'VBD'), ('making', 'VBG'), ('episode', 'NN'), ('new', 'JJ'), ('podcast', 'NN'), ('bbc', 'NN'), ('all', 'DT'), ('hail', 'NN'), ('kale', 'NN'), ('looking', 'VBG'), ('dairy', 'NN'), ('s

In [22]:
lem_list = []
for article in pos_list:
  new_list = pos_tagger(article)
  lem_list.append(new_list)

In [23]:
print(lem_list[0])

[('tim', 'n'), ('cheese', 'n'), ('addict', 'n'), ('but', None), ('discovering', 'v'), ('recently', 'r'), ('shaken', 'n'), ('core', 'n'), ('barely', 'r'), ('look', 'n'), ('babybel', 'n'), ('face', 'n'), ('half', 'n'), ('eaten', 'v'), ('halloumi', 'n'), ('squeaklessly', 'r'), ('lies', 'n'), ('yellowing', 'v'), ('fridge', 'n'), ('cheese', 'n'), ('dreams', 'n'), ('shattering', 'v'), ('for', None), ('lifetime', 'n'), ('unfettered', 'a'), ('devotion', 'n'), ('possibly', 'r'), ('cheese', 'n'), ('foe', 'n'), ('friend', 'n'), ('that', None), ('addicted', 'v'), ('good', 'a'), ('body', 'n'), ('that', None), ('cheese', 'n'), ('toast', 'n'), ('these', None), ('questions', 'n'), ('began', 'v'), ('surfacing', 'v'), ('couple', 'n'), ('months', 'n'), ('ago', 'r'), ('began', 'v'), ('making', 'v'), ('episode', 'n'), ('new', 'a'), ('podcast', 'n'), ('bbc', 'n'), ('all', None), ('hail', 'n'), ('kale', 'n'), ('looking', 'v'), ('dairy', 'n'), ('scary', 'a'), ('for', None), ('time', 'n'), ('increasingly', 'r'

In [24]:
lemmatized_list = []
for article in lem_list:
  lemmatized_article = []
  for (word, tag)in article:
    if tag is None:
        # if there is no available tag, append the token as is
        lemmatized_article.append(word)
    else:        
        # else use the tag to lemmatize the token
        lemmatized_article.append(lemmatizer.lemmatize(word,tag))
  lemmatized_list.append(lemmatized_article)

In [25]:
print(lemmatized_list[0])
print(lemmatized_list[1])

['tim', 'cheese', 'addict', 'but', 'discover', 'recently', 'shaken', 'core', 'barely', 'look', 'babybel', 'face', 'half', 'eat', 'halloumi', 'squeaklessly', 'lie', 'yellow', 'fridge', 'cheese', 'dream', 'shatter', 'for', 'lifetime', 'unfettered', 'devotion', 'possibly', 'cheese', 'foe', 'friend', 'that', 'addict', 'good', 'body', 'that', 'cheese', 'toast', 'these', 'question', 'begin', 'surface', 'couple', 'month', 'ago', 'begin', 'make', 'episode', 'new', 'podcast', 'bbc', 'all', 'hail', 'kale', 'look', 'dairy', 'scary', 'for', 'time', 'increasingly', 'question', 'logic', 'adult', 'drinking', 'milk', 'while', 'milk', 'dairy', 'product', 'cheese', 'yoghurt', 'good', 'source', 'protein', 'calcium', 'form', 'healthy', 'balance', 'diet', 'michael', 'greger', 'nutritionfacts', 'org', 'there', 'animal', 'planet', 'drink', 'milk', 'wean', 'drink', 'milk', 'specie', 'sense', 'reel', 'series', 'study', 'show', 'life', 'shorten', 'potential', 'drinking', 'hormonal', 'stew', 'blithely', 'assume'

## bigram 

In [26]:
bigram = gensim.models.Phrases(lemmatized_list, min_count=5, threshold=100) 
bigram_2 = gensim.models.phrases.Phraser(bigram)
bigram_words = [bigram_2[text] for text in lemmatized_list]

2021-08-09 22:48:33,056: INFO : collecting all words and their counts
2021-08-09 22:48:33,058: INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-08-09 22:48:33,565: INFO : collected 166705 word types from a corpus of 228519 words (unigram + bigrams) and 712 sentences
2021-08-09 22:48:33,567: INFO : using 166705 counts as vocab in Phrases<0 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2021-08-09 22:48:33,568: INFO : source_vocab length 166705
2021-08-09 22:48:35,333: INFO : Phraser built with 443 phrasegrams


In [27]:
print(bigram_words[:5])

[['tim', 'cheese', 'addict', 'but', 'discover', 'recently', 'shaken', 'core', 'barely', 'look', 'babybel', 'face', 'half', 'eat', 'halloumi', 'squeaklessly', 'lie', 'yellow', 'fridge', 'cheese', 'dream', 'shatter', 'for', 'lifetime', 'unfettered', 'devotion', 'possibly', 'cheese', 'foe', 'friend', 'that', 'addict', 'good', 'body', 'that', 'cheese', 'toast', 'these', 'question', 'begin', 'surface', 'couple', 'month', 'ago', 'begin', 'make', 'episode', 'new', 'podcast', 'bbc', 'all', 'hail', 'kale', 'look', 'dairy', 'scary', 'for', 'time', 'increasingly', 'question', 'logic', 'adult', 'drinking', 'milk', 'while', 'milk', 'dairy', 'product', 'cheese', 'yoghurt', 'good', 'source', 'protein', 'calcium', 'form', 'healthy', 'balance', 'diet', 'michael', 'greger', 'nutritionfacts', 'org', 'there', 'animal', 'planet', 'drink', 'milk', 'wean', 'drink', 'milk', 'specie', 'sense', 'reel', 'series', 'study', 'show', 'life', 'shorten', 'potential', 'drinking', 'hormonal', 'stew', 'blithely', 'assume

Remove common and rare words

In [28]:
dictionary = Dictionary (bigram_words)

2021-08-09 22:48:36,030: INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-08-09 22:48:36,273: INFO : built Dictionary(14282 unique tokens: ['able', 'abood', 'absorb', 'accord', 'actual']...) from 712 documents (total 222821 corpus positions)


In [29]:
dictionary.filter_extremes(no_below=5, no_above=0.7)

2021-08-09 22:48:36,310: INFO : discarding 9960 tokens: [('abood', 1), ('babybel', 1), ('benign', 3), ('bewildering', 2), ('blithely', 1), ('brie', 1), ('butterfat', 1), ('confess', 4), ('controversially', 3), ('core', 4)]...
2021-08-09 22:48:36,313: INFO : keeping 4322 tokens which were in no less than 5 and no more than 498 (=70.0%) documents
2021-08-09 22:48:36,328: INFO : resulting dictionary: Dictionary(4322 unique tokens: ['able', 'absorb', 'accord', 'actual', 'addict']...)


Build bag of words 

In [30]:
# vectorizer = TfidfVectorizer(lowercase=False, ngram_range = (1,3))
# vector = [vectorizer.fit_transform(text) for text in lemmatized_list]

In [31]:
# idf = vectorizer.idf_
# dic = dict(zip(vectorizer.get_feature_names(), idf))
# pprint(dic)

In [32]:
corpus = [dictionary.doc2bow(text) for text in bigram_words]

In [33]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 4322
Number of documents: 712


In [34]:
# df = pd.DataFrame(vector[0].T.todense(),index = vectorizer.get_feature_names(),columns=['tfidf'])
# df.sort_values(by['tfidf'],ascending = False)

In [35]:
# Human readable format of corpus (term-frequency)
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 1),
  ('absorb', 1),
  ('accord', 1),
  ('actual', 1),
  ('addict', 2),
  ('addiction', 1),
  ('addictive', 1),
  ('adult', 2),
  ('ago', 1),
  ('agree', 1),
  ('all', 3),
  ('and', 1),
  ('animal', 1),
  ('answer', 1),
  ('apparently', 2),
  ('assume', 1),
  ('away', 2),
  ('bacteria', 1),
  ('balance', 1),
  ('barely', 1),
  ('bbc', 2),
  ('begin', 2),
  ('beneficial', 1),
  ('body', 1),
  ('bother', 1),
  ('bread', 1),
  ('break', 2),
  ('but', 3),
  ('calcium', 1),
  ('carry', 1),
  ('centre', 1),
  ('certainly', 1),
  ('change', 1),
  ('cheese', 21),
  ('chemical', 1),
  ('childhood', 1),
  ('combination', 1),
  ('concentrate', 1),
  ('confirm', 1),
  ('consensus', 2),
  ('consider', 1),
  ('consume', 1),
  ('contact', 1),
  ('contain', 1),
  ('couple', 1),
  ('crack', 1),
  ('daily', 1),
  ('dairy', 10),
  ('damage', 1),
  ('david', 1),
  ('day', 3),
  ('decide', 1),
  ('definitive', 1),
  ('denial', 1),
  ('diagnosis', 1),
  ('diet', 2),
  ('digest', 1),
  ('dinner', 

# TFIDF

In [36]:
tfidf =  gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for i in corpus_tfidf[0]:
  print(i)

2021-08-09 22:48:36,607: INFO : collecting document frequencies
2021-08-09 22:48:36,609: INFO : PROGRESS: processing document #0
2021-08-09 22:48:36,654: INFO : calculating IDF weights for 712 documents and 4321 features (119289 matrix non-zeros)


(0, 0.010734260135446887)
(1, 0.03388493634791294)
(2, 0.010649175293790846)
(3, 0.03084035418339577)
(4, 0.07511510837275956)
(5, 0.02944398008615281)
(6, 0.03647254109903107)
(7, 0.028143514308151707)
(8, 0.015101681704095003)
(9, 0.021913549125061455)
(10, 0.05989824183973659)
(11, 0.00775947150589179)
(12, 0.024958131289578626)
(13, 0.022998562212410166)
(14, 0.07103098345853676)
(15, 0.030347747113301318)
(16, 0.029913158263061988)
(17, 0.027545736040696763)
(18, 0.024480021151309164)
(19, 0.03192536727074447)
(20, 0.02563840293697651)
(21, 0.025749332079399143)
(22, 0.032527533683325525)
(23, 0.009506178950689859)
(24, 0.03551549172926838)
(25, 0.033177922956332075)
(26, 0.04191299951059751)
(27, 0.011022744974110153)
(28, 0.03755755418637978)
(29, 0.012494136996632698)
(30, 0.017463688174045703)
(31, 0.026588686670934062)
(32, 0.007641708167329578)
(33, 0.7887086379139754)
(34, 0.025466142209087324)
(35, 0.024715560197666013)
(36, 0.024480021151309164)
(37, 0.033177922956332075)

In [37]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

2021-08-09 22:48:36,697: INFO : using symmetric alpha at 0.1
2021-08-09 22:48:36,705: INFO : using symmetric eta at 0.1
2021-08-09 22:48:36,709: INFO : using serial LDA version on this node
2021-08-09 22:48:36,724: INFO : running online LDA training, 10 topics, 2 passes over the supplied corpus of 712 documents, updating every 8000 documents, evaluating every ~712 documents, iterating 50x with a convergence threshold of 0.001000
2021-08-09 22:48:36,732: INFO : training LDA model using 4 processes
2021-08-09 22:48:37,435: INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #712/712, outstanding queue size 1
2021-08-09 22:48:39,054: INFO : topic #6 (0.100): 0.002*"heart" + 0.002*"woman" + 0.002*"patient" + 0.002*"hiv" + 0.002*"cancer" + 0.002*"test" + 0.002*"boy" + 0.002*"child" + 0.001*"food" + 0.001*"brain"
2021-08-09 22:48:39,058: INFO : topic #9 (0.100): 0.002*"child" + 0.002*"sleep" + 0.002*"parent" + 0.002*"woman" + 0.002*"baby" + 0.002*"sugar" + 0.002*"study" + 0.001*"m

Topic: 0 Word: 0.003*"drug" + 0.003*"hiv" + 0.003*"cannabis" + 0.002*"cancer" + 0.002*"sex" + 0.002*"blood" + 0.002*"product" + 0.002*"men" + 0.001*"sexual" + 0.001*"meat"
Topic: 1 Word: 0.003*"cancer" + 0.003*"baby" + 0.002*"woman" + 0.002*"vaccine" + 0.002*"care" + 0.002*"mental" + 0.002*"child" + 0.002*"brain" + 0.002*"drug" + 0.002*"service"
Topic: 2 Word: 0.003*"baby" + 0.002*"child" + 0.002*"cannabis" + 0.002*"doctor" + 0.002*"woman" + 0.002*"drug" + 0.002*"patient" + 0.002*"cancer" + 0.002*"she" + 0.002*"twin"
Topic: 3 Word: 0.002*"hpv" + 0.002*"pill" + 0.001*"woman" + 0.001*"insulin" + 0.001*"menopause" + 0.001*"programme" + 0.001*"product" + 0.001*"plasma" + 0.001*"vaping" + 0.001*"child"
Topic: 4 Word: 0.003*"cancer" + 0.002*"bowel" + 0.002*"cigarette" + 0.002*"swim" + 0.002*"fibre" + 0.001*"asthma" + 0.001*"rod" + 0.001*"surgery" + 0.001*"air_pollution" + 0.001*"injury"
Topic: 5 Word: 0.004*"vaccination" + 0.003*"vaccine" + 0.002*"measles" + 0.002*"mental" + 0.002*"outbreak"

In [38]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_ldatfidf = CoherenceModel(model=lda_model_tfidf, texts=bigram_words, dictionary=dictionary, coherence='c_v')
coherence_ldatfidf = coherence_model_ldatfidf.get_coherence()
print('\nCoherence Score: ', coherence_ldatfidf)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt


Perplexity:  -8.968132818629408


2021-08-09 22:48:45,743: INFO : WordOccurrenceAccumulator accumulated stats from 8000 documents
2021-08-09 22:48:45,773: INFO : WordOccurrenceAccumulator accumulated stats from 9000 documents
2021-08-09 22:48:45,800: INFO : WordOccurrenceAccumulator accumulated stats from 10000 documents
2021-08-09 22:48:45,827: INFO : WordOccurrenceAccumulator accumulated stats from 11000 documents
2021-08-09 22:48:45,856: INFO : WordOccurrenceAccumulator accumulated stats from 12000 documents
2021-08-09 22:48:45,886: INFO : WordOccurrenceAccumulator accumulated stats from 13000 documents
2021-08-09 22:48:45,921: INFO : WordOccurrenceAccumulator accumulated stats from 14000 documents
2021-08-09 22:48:45,955: INFO : WordOccurrenceAccumulator accumulated stats from 15000 documents
2021-08-09 22:48:45,987: INFO : WordOccurrenceAccumulator accumulated stats from 16000 documents
2021-08-09 22:48:46,018: INFO : WordOccurrenceAccumulator accumulated stats from 17000 documents
2021-08-09 22:48:46,048: INFO : 


Coherence Score:  0.32798938687976087


In [39]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model_tfidf, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


## Find optimal number of topics 

In [40]:
coherence_values = []
model_list = []
for num_topics in range(20, 50, 5):
    model = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, workers=2, passes=2, iterations = 100)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=bigram_words, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

2021-08-09 22:48:53,654: INFO : using symmetric alpha at 0.05
2021-08-09 22:48:53,658: INFO : using symmetric eta at 0.05
2021-08-09 22:48:53,662: INFO : using serial LDA version on this node
2021-08-09 22:48:53,681: INFO : running online LDA training, 20 topics, 2 passes over the supplied corpus of 712 documents, updating every 4000 documents, evaluating every ~712 documents, iterating 100x with a convergence threshold of 0.001000
2021-08-09 22:48:53,687: INFO : training LDA model using 2 processes
2021-08-09 22:48:54,335: INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #712/712, outstanding queue size 1
2021-08-09 22:48:55,993: INFO : topic #3 (0.050): 0.002*"air_pollution" + 0.002*"attack" + 0.002*"cancer" + 0.002*"obesity" + 0.002*"child" + 0.002*"service" + 0.002*"drug" + 0.002*"cheese" + 0.002*"pregnancy" + 0.002*"meat"
2021-08-09 22:48:55,998: INFO : topic #1 (0.050): 0.002*"blood" + 0.002*"sugar" + 0.002*"woman" + 0.002*"cancer" + 0.002*"donor" + 0.002*"sex" + 0.

KeyboardInterrupt: ignored

In [None]:
# Print the coherence scores
for m, cv in zip(range(20,50,5), coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Show graph
plt.plot(range(20,50,5), coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

## Optimal Model


In [None]:
optimal_model = model_list[2]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
pyLDAvis.gensim_models.prepare(optimal_model, corpus, dictionary)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', optimal_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_ldatfidf = CoherenceModel(model=optimal_model, texts=lemmatized_list, dictionary=dictionary, coherence='c_v')
coherence_ldatfidf = coherence_model_ldatfidf.get_coherence()
print('\nCoherence Score: ', coherence_ldatfidf)


## -----------------------Rewrite 




# Dominant topic in each sentence 

In [None]:
def format_topics_sentences(ldamodel=lda_model_tfidf, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

# Find the most representative document for each topic

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf= pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf.head()

## Topic distribution across documents

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics


# Trigram 

In [None]:
trigram = gensim.models.Phrases(bigram[bigram_words], threshold=100) 
trigram_2 = gensim.models.phrases.Phraser(trigram)
trigram_words = [trigram_2[text] for text in bigram_words]

In [None]:
lemmatized_texts_trigram= [[lemmatizer.lemmatize(element) for element in text] for text in trigram_words]

print(lemmatized_texts_trigram[0])

In [None]:
# Create Dictionary
dictionary_trigram = Dictionary(lemmatized_texts_trigram)
dictionary_trigram.filter_extremes(no_below=5, no_above=0.7)

# Create Corpus
corpus_trigram = [dictionary_trigram.doc2bow(text) for text in lemmatized_texts_trigram]
print('Number of unique tokens: %d' % len(dictionary_trigram))
print('Number of documents: %d' % len(corpus_trigram))

# Term Document Frequency
corpus_trigram = [dictionary_trigram.doc2bow(text) for text in lemmatized_texts_trigram]

# View
print(corpus_trigram[:1])

In [None]:
[[(dictionary_trigram[id], freq) for id, freq in cp] for cp in corpus_trigram[:1]]

In [None]:
tfidf_trigram =  gensim.models.TfidfModel(corpus_trigram)
corpus_tfidf_trigram = tfidf_trigram[corpus_trigram]
for i in corpus_tfidf_trigram[0]:
  print(i)

In [None]:
lda_model_trigram = gensim.models.LdaMulticore(corpus_tfidf_trigram, num_topics=30, id2word=dictionary_trigram, workers=2, passes=2, iterations = 100)

In [None]:
pprint(lda_model_trigram.print_topics())

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_trigram.log_perplexity(corpus_trigram))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_tri = CoherenceModel(model=lda_model_trigram, texts=lemmatized_texts_trigram, dictionary=dictionary_trigram, coherence='c_v')
coherence_lda_tri = coherence_model_lda_tri.get_coherence()
print('\nCoherence Score: ', coherence_lda_tri)

In [None]:
pyLDAvis.gensim_models.prepare(lda_model_trigram, corpus_trigram, dictionary_trigram)

## Find optimal number of topics 

In [None]:
# coherence_values_trigram = []
# model_list_trigram= []
# for num_topics in range(2, 20, 2):
#     model_trigram = gensim.models.LdaMulticore(corpus_tfidf_trigram, num_topics=num_topics, id2word=dictionary_trigram, workers=2, passes=2, iterations = 100)
#     model_list_trigram.append(model_trigram)
#     coherencemodel_trigram = CoherenceModel(model=model_trigram, texts=lemmatized_list, dictionary=dictionary_trigram, coherence='c_v')
#     coherence_values_trigram.append(coherencemodel_trigram.get_coherence())

In [None]:
# Print the coherence scores
# for m, cv in zip(range(2,20,2), coherence_values_trigram):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Show graph
# plt.plot(range(2,20,2), coherence_values_trigram)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

# Hyperparameters Tuning

In [48]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           workers=2,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=bigram_words, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
import tqdm

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 10
max_topics = 61
step_size = 5
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus_tfidf)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus_tfidf, num_of_docs*0.75), 
               corpus_tfidf]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    graph = pd.read_csv('lda_tuning_results.csv')
    graph.head()
    pbar.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2021-08-10 04:54:38,960: INFO : PROGRESS: pass 5, dispatched chunk #0 = documents up to #100/712, outstanding queue size 1
2021-08-10 04:54:39,224: INFO : PROGRESS: pass 5, dispatched chunk #1 = documents up to #200/712, outstanding queue size 2
2021-08-10 04:54:39,516: INFO : PROGRESS: pass 5, dispatched chunk #2 = documents up to #300/712, outstanding queue size 2
2021-08-10 04:54:39,612: INFO : merging changes from 200 documents into a model of 712 documents
2021-08-10 04:54:39,677: INFO : topic #13 (0.310): 0.000*"vegan" + 0.000*"breakfast" + 0.000*"dialysis" + 0.000*"cheese" + 0.000*"yoghurt" + 0.000*"statin" + 0.000*"mosquito" + 0.000*"knife" + 0.000*"mould" + 0.000*"stillbirth"
2021-08-10 04:54:39,683: INFO : topic #26 (0.310): 0.000*"breakfast" + 0.000*"dialysis" + 0.000*"cheese" + 0.000*"statin" + 0.000*"knife" + 0.000*"mosquito" + 0.000*"mould" + 0.000*"stillbirth" + 0.000*"vaping" + 0.000*"smile"
2021-08-10 04:

https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [None]:
graph = pd.read_csv('lda_tuning_results.csv')
graph.head()