# Topic Modelling 
* LDA
* BERT

Ref: 

* https://becominghuman.ai/news-topic-classification-using-lstm-a1e8a38781fe
* https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24


### Load Packages

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer('english')

import nltk
nltk.download('wordnet')
from nltk.stem.porter import *

import numpy as np
np.random.seed(2018)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Load Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('drive/MyDrive/bbc_test_novideos_v2.csv',encoding = 'latin1')
df.head

Mounted at /content/drive


<bound method NDFrame.head of                                                  link  ...                                               text
0          https://www.bbc.co.uk/news/health-47749964  ...  Psychiatrists are being urged to ask children ...
1              https://www.bbc.co.uk/news/uk-47751285  ...  The number of adults seeking help to cope with...
2          https://www.bbc.co.uk/news/health-47735103  ...  The current system of checking newborns for hi...
3          https://www.bbc.co.uk/news/health-47735108  ...  Calorie-filled Easter eggs are being sold in s...
4          https://www.bbc.co.uk/news/health-47749025  ...  "She is so desperate to end it all, she curren...
..                                                ...  ...                                                ...
95         https://www.bbc.co.uk/news/health-48256759  ...  Glucosamine supplements, better known as a rem...
96  https://www.bbc.co.uk/news/world-us-canada-482...  ...  Alabama has become the latest 

In [3]:
med = list(df.loc[:, "text"].values)

### Preprocess Data

In [4]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

#remove html tags, emails, urls, numbers, stop words and words less than 3 chars 
def preprocess(text):
    result = []
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('\S*@\S*\s?', '', text)
    text = re.sub('https?://[A-Za-z0-9]','',text)
    text = re.sub('[^a-zA-Z]',' ',text)
    
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [5]:
doc_sample = med[0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)

print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Psychiatrists', 'are', 'being', 'urged', 'to', 'ask', 'children', 'with', 'mental', 'health', 'issues', 'how', 'long', 'they', 'spend', 'online', 'and', 'what', 'they', 'use', 'social', 'media', 'for.\nQuestions', 'about', 'technology', 'should', 'be', 'a', 'routine', 'part', 'of', 'assessments,', 'the', 'Royal', 'College', 'of', 'Psychiatrists', 'says.', '\nIt', 'is', 'concerned', 'about', 'how', 'time', 'spent', 'online', 'impacts', 'on', 'mood,', 'sleep,', 'diet', 'and', 'behaviour.\nThe', 'government', 'is', 'expected', 'to', 'announce', 'plans', 'to', 'regulate', 'social', 'media', 'companies', 'soon.\nThe', "College's", 'advice', 'comes', 'as', 'evidence', 'grows', 'of', 'a', 'possible', 'link', 'between', 'harmful', 'content', 'or', 'time', 'spent', 'online,', 'and', 'poor', 'mental', 'health.\nIt', 'is', 'planning', 'to', 'publish', 'a', 'report', 'later', 'this', 'year', 'about', 'its', 'stance', 'on', 'technology', 'use', 'and', "children's", 'mental', '

In [6]:
all_cleaned_texts = np.array([preprocess(m) for m in med])
print(all_cleaned_texts.shape)
print(all_cleaned_texts[0:5])

(100,)
[list(['psychiatrist', 'urg', 'children', 'mental', 'health', 'issu', 'long', 'spend', 'onlin', 'social', 'media', 'question', 'technolog', 'routin', 'assess', 'royal', 'colleg', 'psychiatrist', 'say', 'concern', 'time', 'spend', 'onlin', 'impact', 'mood', 'sleep', 'diet', 'behaviour', 'govern', 'expect', 'announc', 'plan', 'regul', 'social', 'media', 'compani', 'soon', 'colleg', 'advic', 'come', 'evid', 'grow', 'possibl', 'link', 'harm', 'content', 'time', 'spend', 'onlin', 'poor', 'mental', 'health', 'plan', 'publish', 'report', 'later', 'year', 'stanc', 'technolog', 'children', 'mental', 'health', 'includ', 'recommend', 'parent', 'children', 'doctor', 'assess', 'children', 'psychiatrist', 'advis', 'think', 'royal', 'colleg', 'psychiatrist', 'recommend', 'children', 'stop', 'technolog', 'hour', 'go', 'avoid', 'technolog', 'mealtim', 'view', 'echo', 'chief', 'medic', 'offic', 'recent', 'guidanc', 'social', 'media', 'group', 'recent', 'call', 'social', 'media', 'compani', 'profi

## Bags of Words 

In [7]:
dictionary = gensim.corpora.Dictionary(all_cleaned_texts)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break



0 access
1 activ
2 add
3 address
4 adolesc
5 advic
6 advis
7 affect
8 announc
9 anxieti
10 assess


In [8]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 activ
1 add
2 affect
3 avoid
4 call
5 care
6 caus
7 chief
8 child
9 children
10 colleg


In [9]:
#For each document we create a dictionary reporting how many words and how many times those words appear. 
bow_corpus = [dictionary.doc2bow(doc) for doc in all_cleaned_texts]
bow_corpus[0]

[(0, 1),
 (1, 2),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 2),
 (8, 1),
 (9, 7),
 (10, 3),
 (11, 3),
 (12, 1),
 (13, 2),
 (14, 1),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 3),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 2),
 (27, 1),
 (28, 2),
 (29, 7),
 (30, 1),
 (31, 1),
 (32, 2),
 (33, 1),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 2),
 (40, 3),
 (41, 1),
 (42, 1),
 (43, 10),
 (44, 1),
 (45, 10),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 4),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 3),
 (58, 1),
 (59, 2),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 10),
 (64, 3),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 9)]

In [10]:
bow_doc_0 = bow_corpus[0]
for i in range(len(bow_doc_0)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_0[i][0], 
                                               dictionary[bow_doc_0[i][0]], 
bow_doc_0[i][1]))

Word 0 ("activ") appears 1 time.
Word 1 ("add") appears 2 time.
Word 2 ("affect") appears 1 time.
Word 3 ("avoid") appears 1 time.
Word 4 ("call") appears 1 time.
Word 5 ("care") appears 1 time.
Word 6 ("caus") appears 2 time.
Word 7 ("chief") appears 2 time.
Word 8 ("child") appears 1 time.
Word 9 ("children") appears 7 time.
Word 10 ("colleg") appears 3 time.
Word 11 ("compani") appears 3 time.
Word 12 ("compar") appears 1 time.
Word 13 ("concern") appears 2 time.
Word 14 ("condit") appears 1 time.
Word 15 ("consid") appears 2 time.
Word 16 ("director") appears 1 time.
Word 17 ("doctor") appears 1 time.
Word 18 ("england") appears 1 time.
Word 19 ("evid") appears 1 time.
Word 20 ("exampl") appears 1 time.
Word 21 ("execut") appears 1 time.
Word 22 ("expect") appears 3 time.
Word 23 ("feel") appears 1 time.
Word 24 ("find") appears 1 time.
Word 25 ("go") appears 1 time.
Word 26 ("govern") appears 2 time.
Word 27 ("group") appears 1 time.
Word 28 ("grow") appears 2 time.
Word 29 ("harm

## LDA with Bag of Words 

In [11]:
#train model 
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

# For each topic, we will explore the words occuring in that topic and its relative weight
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.033*"children" + 0.020*"babi" + 0.018*"public" + 0.018*"hospit" + 0.017*"surgeri" + 0.016*"diseas" + 0.016*"treat" + 0.015*"earli" + 0.015*"treatment" + 0.014*"case"
Topic: 1 
Words: 0.025*"women" + 0.022*"babi" + 0.018*"case" + 0.014*"countri" + 0.013*"research" + 0.013*"death" + 0.013*"kill" + 0.012*"world" + 0.012*"report" + 0.012*"treatment"
Topic: 2 
Words: 0.015*"suggest" + 0.014*"hear" + 0.014*"children" + 0.013*"harm" + 0.013*"parent" + 0.013*"organ" + 0.012*"compani" + 0.012*"think" + 0.012*"right" + 0.012*"govern"
Topic: 3 
Words: 0.022*"brain" + 0.021*"patient" + 0.020*"research" + 0.016*"children" + 0.016*"prof" + 0.015*"social" + 0.015*"parent" + 0.014*"medic" + 0.014*"media" + 0.013*"think"
Topic: 4 
Words: 0.017*"patient" + 0.016*"medic" + 0.016*"univers" + 0.013*"lose" + 0.013*"start" + 0.013*"month" + 0.013*"condit" + 0.013*"life" + 0.013*"leav" + 0.012*"surgeri"
Topic: 5 
Words: 0.032*"research" + 0.031*"brain" + 0.029*"studi" + 0.024*"diseas" + 0.0

## Create TF-IDF Model Object 
then apply to the whole corpus


In [12]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.04499913625193515),
 (1, 0.03813541225146295),
 (2, 0.03125774052782196),
 (3, 0.04351049540769138),
 (4, 0.023759077724102972),
 (5, 0.03125774052782196),
 (6, 0.040318443085434495),
 (7, 0.09316776620837153),
 (8, 0.03404053248128401),
 (9, 0.18543160416934554),
 (10, 0.11496536779619443),
 (11, 0.13053148622307412),
 (12, 0.046583883104185764),
 (13, 0.06430149996675939),
 (14, 0.03307746576365352),
 (15, 0.08421393497289102),
 (16, 0.036087972366373476),
 (17, 0.024413917963289882),
 (18, 0.03125774052782196),
 (19, 0.03125774052782196),
 (20, 0.04351049540769138),
 (21, 0.046583883104185764),
 (22, 0.10212159744385203),
 (23, 0.032150749983379696),
 (24, 0.036087972366373476),
 (25, 0.021301523024756134),
 (26, 0.07903966873321916),
 (27, 0.036087972366373476),
 (28, 0.07217594473274695),
 (29, 0.3149939537635461),
 (30, 0.040779343964744974),
 (31, 0.040779343964744974),
 (32, 0.051556877284992766),
 (33, 0.03307746576365352),
 (34, 0.03707924020329568),
 (35, 0.0279788700

Run LDA with TF-IDF

In [13]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.017*"treatment" + 0.014*"fall" + 0.013*"clinic" + 0.012*"hour" + 0.011*"news" + 0.010*"cost" + 0.010*"brain" + 0.010*"infect" + 0.010*"human" + 0.009*"diseas"
Topic: 1 Word: 0.016*"brain" + 0.016*"countri" + 0.012*"world" + 0.011*"reduc" + 0.011*"find" + 0.011*"stori" + 0.010*"cancer" + 0.010*"scientist" + 0.008*"group" + 0.008*"abl"
Topic: 2 Word: 0.014*"woman" + 0.013*"doctor" + 0.013*"hear" + 0.013*"state" + 0.012*"children" + 0.012*"bodi" + 0.011*"surgeri" + 0.011*"parent" + 0.010*"drug" + 0.009*"care"
Topic: 3 Word: 0.021*"organ" + 0.018*"plan" + 0.015*"right" + 0.013*"suggest" + 0.013*"care" + 0.011*"control" + 0.011*"longer" + 0.010*"decis" + 0.009*"base" + 0.009*"design"
Topic: 4 Word: 0.017*"babi" + 0.013*"children" + 0.012*"diseas" + 0.011*"women" + 0.011*"studi" + 0.010*"research" + 0.010*"child" + 0.008*"risk" + 0.008*"surgeri" + 0.008*"treatment"
Topic: 5 Word: 0.024*"women" + 0.024*"drug" + 0.018*"test" + 0.013*"cancer" + 0.013*"harm" + 0.012*"campaign" +

# Performance Evaluation 

In [14]:
#Original 
print(all_cleaned_texts[0])

['psychiatrist', 'urg', 'children', 'mental', 'health', 'issu', 'long', 'spend', 'onlin', 'social', 'media', 'question', 'technolog', 'routin', 'assess', 'royal', 'colleg', 'psychiatrist', 'say', 'concern', 'time', 'spend', 'onlin', 'impact', 'mood', 'sleep', 'diet', 'behaviour', 'govern', 'expect', 'announc', 'plan', 'regul', 'social', 'media', 'compani', 'soon', 'colleg', 'advic', 'come', 'evid', 'grow', 'possibl', 'link', 'harm', 'content', 'time', 'spend', 'onlin', 'poor', 'mental', 'health', 'plan', 'publish', 'report', 'later', 'year', 'stanc', 'technolog', 'children', 'mental', 'health', 'includ', 'recommend', 'parent', 'children', 'doctor', 'assess', 'children', 'psychiatrist', 'advis', 'think', 'royal', 'colleg', 'psychiatrist', 'recommend', 'children', 'stop', 'technolog', 'hour', 'go', 'avoid', 'technolog', 'mealtim', 'view', 'echo', 'chief', 'medic', 'offic', 'recent', 'guidanc', 'social', 'media', 'group', 'recent', 'call', 'social', 'media', 'compani', 'profit', 'say', 'f

LDA with Bag of words

In [15]:
for index, score in sorted(lda_model[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5709567070007324	 
Topic: 0.022*"brain" + 0.021*"patient" + 0.020*"research" + 0.016*"children" + 0.016*"prof" + 0.015*"social" + 0.015*"parent" + 0.014*"medic" + 0.014*"media" + 0.013*"think"

Score: 0.4235626459121704	 
Topic: 0.015*"suggest" + 0.014*"hear" + 0.014*"children" + 0.013*"harm" + 0.013*"parent" + 0.013*"organ" + 0.012*"compani" + 0.012*"think" + 0.012*"right" + 0.012*"govern"


LDA with TF-IDF

In [16]:
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9506382346153259	 
Topic: 0.017*"babi" + 0.013*"children" + 0.012*"diseas" + 0.011*"women" + 0.011*"studi" + 0.010*"research" + 0.010*"child" + 0.008*"risk" + 0.008*"surgeri" + 0.008*"treatment"

Score: 0.043881453573703766	 
Topic: 0.014*"woman" + 0.013*"doctor" + 0.013*"hear" + 0.013*"state" + 0.012*"children" + 0.012*"bodi" + 0.011*"surgeri" + 0.011*"parent" + 0.010*"drug" + 0.009*"care"


## Tests


In [17]:
unseen_document = 'I have lung cancer, what I can do?'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5499820709228516	 Topic: 0.081*"drug" + 0.062*"cancer" + 0.021*"patient" + 0.014*"research" + 0.014*"treatment"
Score: 0.050004590302705765	 Topic: 0.017*"patient" + 0.016*"medic" + 0.016*"univers" + 0.013*"lose" + 0.013*"start"
Score: 0.05000389367341995	 Topic: 0.025*"women" + 0.022*"babi" + 0.018*"case" + 0.014*"countri" + 0.013*"research"
Score: 0.05000351369380951	 Topic: 0.032*"research" + 0.031*"brain" + 0.029*"studi" + 0.024*"diseas" + 0.019*"risk"
Score: 0.050002146512269974	 Topic: 0.026*"case" + 0.025*"diseas" + 0.025*"children" + 0.020*"risk" + 0.015*"think"
Score: 0.05000169202685356	 Topic: 0.022*"brain" + 0.021*"patient" + 0.020*"research" + 0.016*"children" + 0.016*"prof"
Score: 0.050000760704278946	 Topic: 0.029*"doctor" + 0.025*"countri" + 0.024*"patient" + 0.024*"medic" + 0.020*"report"
Score: 0.05000066012144089	 Topic: 0.026*"babi" + 0.025*"famili" + 0.023*"child" + 0.019*"children" + 0.018*"doctor"
Score: 0.05000060796737671	 Topic: 0.015*"suggest" + 0.01