# Topic Modeling Using Gensim

This notebook applies Latent Dirichlet Allocation (LDA) using scikit-learn's LDA implementation to extract topics from the `all-the-news-25k.csv` dataset.

### Install packages

In [21]:
# Install packages
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import string
import en_core_web_sm
import gensim
%matplotlib inline

### Load the data

In [2]:
# Load data
df = pd.read_csv('./data/all-the-news-25k.csv')
df.head()

Unnamed: 0,date,year,month,day,title,article,section,publication
0,2018-05-02 17:09:00,2018,5.0,2,You Can Trick Your Brain Into Being More Focused,If only every day could be like this. You can’...,healthcare,Vice
1,2019-06-23 00:00:00,2019,6.0,23,Hudson's Bay's chairman's buyout bid pits reta...,(Reuters) - The success of Hudson’s Bay Co Exe...,business,Reuters
2,2018-12-28 00:00:00,2018,12.0,28,Wells Fargo to pay $575 million in settlement ...,NEW YORK (Reuters) - Wells Fargo & Co (WFC.N) ...,business,Reuters
3,2019-05-21 00:00:00,2019,5.0,21,Factbox: Investments by automakers in the U.S....,(Reuters) - Major automakers have announced a ...,business,Reuters
4,2019-02-05 00:00:00,2019,2.0,5,Exclusive: Britain's financial heartland unbow...,LONDON (Reuters) - Britain’s financial service...,business,Reuters


### Prepare the data (tokenize, remove stop words, lemmatize)

#### Tokenization

In [3]:
# tokenize articles using gensim simple_preprocess
def sent_to_words(sentences, deacc=True): # deacc=True removes punctuations
    for sentence in sentences:
        yield(simple_preprocess(str(sentence)))

In [4]:
# Convert to list
data = df.article.values.tolist()
data_words = list(sent_to_words(data))

In [5]:
print(data_words[:1])

[['if', 'only', 'every', 'day', 'could', 'be', 'like', 'this', 'you', 'can', 'put', 'your', 'finger', 'on', 'why', 'maybe', 'you', 'had', 'just', 'the', 'right', 'amount', 'of', 'sleep', 'maybe', 'the', 'stars', 'are', 'somehow', 'aligned', 'in', 'your', 'favor', 'whatever', 'the', 'reason', 'you', 're', 'cooking', 'on', 'gas', 'hours', 'fly', 'by', 'like', 'minutes', 'you', 're', 'feeling', 'great', 'and', 'before', 'you', 'know', 'it', 'it', 'pm', 'and', 'your', 'to', 'do', 'list', 'is', 'done', 'this', 'feeling', 'of', 'flow', 'or', 'being', 'in', 'the', 'zone', 'is', 'something', 'that', 'most', 'of', 'us', 'have', 'experienced', 'at', 'some', 'point', 'or', 'other', 'although', 'not', 'as', 'often', 'as', 'we', 'might', 'like', 'it', 'mental', 'state', 'that', 'elite', 'athletes', 'seem', 'to', 'have', 'at', 'their', 'beck', 'and', 'call', 'for', 'us', 'mere', 'mortals', 'though', 'it', 'hardly', 'ever', 'shows', 'up', 'when', 'we', 'need', 'it', 'since', 'the', 'psychologist', 'm

#### Remove Stopwords

In [8]:
from nltk.corpus import stopwords # Import the stop word list
stop_words= stopwords.words('english') + list(string.punctuation) # add punctuation to stop words

In [9]:
# Create a function to remove stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)


#### Apply Lemmatization

In [None]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # disable Named Entity Recognition for speed

In [16]:
# Create a function to lemmatize words
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [17]:
# Lemmatize words
data_lemmatized = lemmatization(data_words_nostops)
print(data_lemmatized[:1])

[['day', 'like', 'put', 'finger', 'maybe', 'right', 'amount', 'sleep', 'maybe', 'star', 'somehow', 'align', 'favor', 'reason', 'cook', 'gas', 'hour', 'fly', 'minute', 'feel', 'great', 'know', 'pm', 'list', 'do', 'feel', 'flow', 'zone', 'experience', 'point', 'often', 'like', 'mental', 'state', 'elite', 'athlete', 'seem', 'beck', 'call', 'mere', 'mortal', 'hardly', 'ever', 'show', 'need', 'first', 'describe', 'zone', 'call', 'flow', 'neuroscientist', 'try', 'figure', 'make', 'show', 'demand', 'close', 'secret', 'zone', 'truth', 'emerge', 'think', 'zone', 'actually', 'many', 'mental', 'state', 'person', 'work', 'particular', 'kind', 'think', 'master', 'several', 'flow', 'zoneto', 'state', 'work', 'well', 'make', 'sense', 'first', 'consider', 'know', 'original', 'zone', 'thing', 'definitely', 'know', 'feel', 'great', 'describe', 'optimal', 'experience', 'achieve', 'true', 'happiness', 'explanation', 'happen', 'feel', 'good', 'represent', 'perfect', 'match', 'activity', 'brain', 'network',

### Create Dictionary and Corpus for Gensim

In [18]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

# View
print(corpus[:1])

[[(0, 1), (1, 3), (2, 2), (3, 2), (4, 1), (5, 1), (6, 1), (7, 8), (8, 1), (9, 4), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 3), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 7), (27, 1), (28, 2), (29, 1), (30, 2), (31, 2), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 5), (45, 1), (46, 1), (47, 1), (48, 12), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 4), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 6), (66, 1), (67, 4), (68, 1), (69, 1), (70, 1), (71, 1), (72, 6), (73, 1), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 2), (80, 1), (81, 1), (82, 2), (83, 3), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 1), (93, 2), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 4), (100, 1), (101, 1), (102, 1), (103, 1), (104, 3), (105, 1), (106, 2), (107, 1), (108, 1), (109, 1), (110, 1

In [19]:
# human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ability', 1),
  ('able', 3),
  ('accord', 2),
  ('achieve', 2),
  ('activate', 1),
  ('active', 1),
  ('actively', 1),
  ('activity', 8),
  ('actual', 1),
  ('actually', 4),
  ('adapt', 1),
  ('add', 1),
  ('admit', 1),
  ('adrenaline', 1),
  ('alert', 1),
  ('align', 1),
  ('allow', 1),
  ('almost', 1),
  ('also', 3),
  ('altogether', 1),
  ('amount', 2),
  ('area', 1),
  ('around', 1),
  ('ask', 1),
  ('athlete', 1),
  ('attend', 2),
  ('attention', 7),
  ('author', 1),
  ('away', 2),
  ('ay', 1),
  ('back', 2),
  ('background', 2),
  ('bad', 2),
  ('balance', 1),
  ('baseline', 1),
  ('beat', 1),
  ('beck', 1),
  ('behavior', 1),
  ('believe', 1),
  ('bind', 1),
  ('biscuit', 1),
  ('bit', 1),
  ('blue', 1),
  ('bodily', 2),
  ('body', 5),
  ('bonker', 1),
  ('border', 1),
  ('bore', 1),
  ('brain', 12),
  ('break', 1),
  ('breathing', 2),
  ('bring', 1),
  ('burst', 1),
  ('business', 1),
  ('buy', 1),
  ('call', 4),
  ('calm', 1),
  ('carry', 1),
  ('challenging', 1),
  ('chan

### Build the Topic Model

In [22]:
# Build a LDA model using gensim
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, \
    num_topics=10, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

In [25]:
# Print the 10 topics
lda_model.print_topics()

[(0,
  '0.020*"play" + 0.014*"get" + 0.012*"go" + 0.012*"good" + 0.011*"make" + 0.009*"year" + 0.009*"com" + 0.008*"screen" + 0.008*"come" + 0.007*"first"'),
 (1,
  '0.088*"producer" + 0.069*"globe" + 0.024*"plane" + 0.015*"flight" + 0.014*"fly" + 0.014*"finalize" + 0.013*"hudson" + 0.011*"candid" + 0.011*"retaliation" + 0.010*"pilot"'),
 (2,
  '0.056*"film" + 0.029*"movie" + 0.026*"actor" + 0.023*"star" + 0.016*"actress" + 0.013*"oscar" + 0.012*"character" + 0.011*"show" + 0.010*"theater" + 0.008*"scene"'),
 (3,
  '0.021*"song" + 0.020*"premiere" + 0.019*"live" + 0.019*"tv" + 0.016*"award" + 0.016*"host" + 0.015*"music" + 0.015*"reynold" + 0.012*"year" + 0.012*"video"'),
 (4,
  '0.026*"say" + 0.014*"source" + 0.012*"sexual" + 0.011*"statement" + 0.011*"twitter" + 0.011*"woman" + 0.011*"tell" + 0.011*"accord" + 0.009*"claim" + 0.009*"weinstein"'),
 (5,
  '0.021*"trailer" + 0.016*"use" + 0.014*"people" + 0.011*"drug" + 0.011*"treatment" + 0.011*"health" + 0.011*"facebook" + 0.011*"rape"

In [26]:
# Evaluate the model using perplexity and coherence
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence() # a measure of how coherent the topics are. higher the better.
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.884673846595373

Coherence Score:  0.4246168479084279


In [28]:
pyLDAvis.enable_notebook()
vis=gensimvis.prepare(lda_model,corpus,id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  o

### Hyperparameter Tuning - Number of Topics and Alpha

In [29]:
def compute_coherence_values(dictionary, corpus, texts, num_topics_range,alpha_range):
    coherence_values=[]
    model_list=[]
    for alpha in alpha_range:
        for num_topics in num_topics_range:
            lda_model= gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, alpha=alpha,num_topics=num_topics,\
                                                      per_word_topics=True)
            model_list.append(lda_model)
            coherencemodel=CoherenceModel(model=lda_model,texts=texts,dictionary=dictionary,coherence='c_v')
            coherence_values.append((alpha,num_topics,coherencemodel.get_coherence()))
    return model_list,coherence_values

In [30]:
# Build models across a range of topics and alpha values
num_topics_range=range(5,15)
alpha_range=[0.01,0.1,0.5,1]
model_list,coherence_values=compute_coherence_values(dictionary=id2word,corpus=corpus,texts=data_lemmatized,\
                                                        num_topics_range=num_topics_range,alpha_range=alpha_range)

In [31]:
coherence_df = pd.DataFrame(coherence_values, columns=['alpha', 'num_topics', 'coherence_value'])
coherence_df

Unnamed: 0,alpha,num_topics,coherence_value
0,0.01,5,0.431496
1,0.01,6,0.415166
2,0.01,7,0.45643
3,0.01,8,0.420097
4,0.01,9,0.411664
5,0.01,10,0.407424
6,0.01,11,0.439925
7,0.01,12,0.450565
8,0.01,13,0.418852
9,0.01,14,0.42468


In [40]:
# Build LDA model with alpha=0.1 and 7 topics
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, \
    num_topics=7, random_state=100, update_every=1, chunksize=100, passes=10, alpha=0.1, per_word_topics=True)

In [42]:
from pprint import pprint as pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.021*"film" + 0.017*"star" + 0.014*"movie" + 0.010*"play" + 0.009*"get" + '
  '0.008*"make" + 0.008*"good" + 0.007*"go" + 0.007*"also" + 0.007*"first"'),
 (1,
  '0.026*"device" + 0.020*"car" + 0.020*"new" + 0.016*"engagement" + '
  '0.014*"company" + 0.013*"vehicle" + 0.013*"use" + 0.012*"product" + '
  '0.012*"phone" + 0.011*"design"'),
 (2,
  '0.026*"say" + 0.018*"people" + 0.012*"tell" + 0.010*"go" + 0.010*"get" + '
  '0.009*"actor" + 0.009*"time" + 0.009*"year" + 0.009*"actress" + '
  '0.008*"love"'),
 (3,
  '0.022*"award" + 0.016*"year" + 0.011*"first" + 0.010*"son" + 0.009*"time" + '
  '0.009*"globe" + 0.008*"host" + 0.008*"last" + 0.007*"spot" + '
  '0.007*"ceremony"'),
 (4,
  '0.021*"say" + 0.011*"statement" + 0.009*"sexual" + 0.009*"accord" + '
  '0.008*"weinstein" + 0.008*"claim" + 0.008*"report" + 0.007*"allegation" + '
  '0.007*"also" + 0.006*"producer"'),
 (5,
  '0.016*"use" + 0.016*"drug" + 0.015*"user" + 0.015*"facebook" + '
  '0.013*"custody" + 0.012*"reply" + 

In [48]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print(f'U_Mass Coherence Score: ', coherence_lda)

U_Mass Coherence Score:  -1.9628898822306227


In [49]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'C_V Coherence Score: ', coherence_lda)

C_V Coherence Score:  0.45128091716795204


I was expecting a lower u_mass score than the baseline LDA the team did using scikit-learn. Is it possible that I need to continue to refine the stop words to eliminate all non-meaningful words? Coud I Spacy to create a tokenized list that contains only noun, verbs, adverbs, and adjectives? Would this help to improve performance?