## 1a. Import the libraries and sample data

Credits: [Selva Prabhakaran](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

Download [`newsgroups.json` here provided by @selva86](https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json)

Ensure that spacy `en_core_web_sm` is installed. You can do so by invoking `python -m spacy download en_core_web_sm` in the terminal.

In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Set stopwords
sw = stopwords.words('english')
sw.extend(['from', 'subject', 're', 'edu', 'use'])

# Import Dataset
df = pd.read_json('newsgroups.json')
print(df.target_names.unique())
df.head()

  and should_run_async(code)


['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


## 1b. Clean-up sentences and Tokenize words

In [7]:
# Remove emails and newline characters
data = df.content.values.tolist()
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]

# Tokenize words and clean-up text - deacc=True removes punctuations
data_words = [gensim.utils.simple_preprocess(str(s), deacc=True) for s in sent_to_words(data)]

  and should_run_async(code)
  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]


## 1c. Remove Stopwords, make Bigrams and Lemmatize

In [11]:
# Remove Stopwords
data_words_nostops = [[word for word in simple_preprocess(str(doc)) if word not in sw] for doc in data_words]

# Form Bigrams - Higher threshold fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) 
bigram_mod = gensim.models.phrases.Phraser(bigram)

data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

# Use tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization and keep only noun, adj, vb, adv
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = [[token.lemma_ for token in nlp(' '.join(s)) if token.pos_ in allowed_postags] for s in data_words_bigrams]

  and should_run_async(code)


## 1d. Create the Dictionary and Corpus

In [12]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

  and should_run_async(code)


## 2. Build the LDA model

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

  and should_run_async(code)


## 3a. Display keywords in topics

To get a sense, for each topic,

<img src="https://www.machinelearningplus.com/wp-content/uploads/2018/03/Inferring-Topic-from-Keywords-1200x781.png?ezimgfmt=ng:webp/ngcb4" width="600" height="auto" />

(Source: [Selva Prabhakaran, 2018](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/))

In [76]:
from IPython.display import display, HTML
import pandas as pd

top_topics_rank = {t[0]: r + 1 for r, t in enumerate(lda_model.show_topics())}
topics_df = pd.DataFrame([
    [no, top_topics_rank.get(no, '-'), kws.split(' + ')] for no, kws in lda_model.show_topics(num_topics=20)
], columns=['Topic No.', 'Rank', 'Top 10 Keywords (a + b)'])

display(HTML(topics_df.to_html(index=False)))

  and should_run_async(code)


Topic No.,Rank,Top 10 Keywords (a + b)
0,1,"[0.065*""data"", 0.034*""enable"", 0.011*""microsoft"", 0.003*""efficiently"", 0.000*""textual"", 0.000*""slave"", 0.000*""jumper"", 0.000*""cp_ut"", 0.000*""master_slave"", 0.000*""latch""]"
1,4,"[0.065*""scsi"", 0.064*""mb"", 0.058*""ide"", 0.054*""headache"", 0.044*""gateway"", 0.029*""water"", 0.028*""oil"", 0.025*""nuclear"", 0.023*""heat"", 0.022*""cylinder""]"
2,-,"[0.046*""gun"", 0.029*""whole"", 0.024*""bike"", 0.020*""black"", 0.019*""draw"", 0.019*""carry"", 0.017*""white"", 0.015*""police"", 0.015*""ride"", 0.014*""safety""]"
3,-,"[0.055*""year"", 0.049*""team"", 0.048*""game"", 0.035*""play"", 0.033*""win"", 0.016*""season"", 0.015*""fan"", 0.015*""last"", 0.015*""hit"", 0.014*""first""]"
4,-,"[0.053*""reality"", 0.049*""distribution_na"", 0.036*""concept"", 0.033*""boy"", 0.030*""poor"", 0.030*""parent"", 0.029*""door"", 0.028*""assumption"", 0.025*""benefit"", 0.022*""blood""]"
5,6,"[0.039*""number"", 0.032*""list"", 0.023*""include"", 0.023*""copy"", 0.023*""player"", 0.019*""name"", 0.016*""return"", 0.015*""section"", 0.015*""appreciate"", 0.015*""tape""]"
6,-,"[0.052*""kill"", 0.039*""soldier"", 0.035*""village"", 0.024*""turk"", 0.023*""turkish"", 0.023*""listen"", 0.022*""murder"", 0.021*""terrorism"", 0.020*""armenian"", 0.015*""girl""]"
7,-,"[0.096*""suggest"", 0.066*""community"", 0.048*""united_state"", 0.034*""impact"", 0.030*""weight"", 0.023*""legitimate"", 0.022*""role"", 0.022*""consideration"", 0.021*""approve"", 0.020*""progress""]"
8,9,"[0.035*""people"", 0.027*""say"", 0.018*""believe"", 0.016*""evidence"", 0.015*""reason"", 0.011*""fact"", 0.011*""claim"", 0.010*""think"", 0.010*""mean"", 0.009*""law""]"
9,-,"[0.133*""space"", 0.041*""launch"", 0.035*""earth"", 0.034*""mission"", 0.033*""corporation"", 0.033*""orbit"", 0.030*""satellite"", 0.027*""moon"", 0.023*""flight"", 0.017*""fuel""]"


## 3b. Compute Perplexity and Coherence scores

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is

In [77]:
# Compute Perplexity - a measure of how good the model is. lower the better.
print('Perplexity: ', lda_model.log_perplexity(corpus)) 

# Compute Coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('Coherence Score: ', coherence_lda)

  and should_run_async(code)


Perplexity:  -13.974043489993846 


Coherence Score:  0.4619199087319347


## 3d. Visualize the topics-keywords 

In [80]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)
