## Tutorial Taken from:
https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [4]:
"""Import libraries"""
import nltk
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jcm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
"""Prepare stopwords"""

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#Say appears frequently
stop_words.append("say")

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [23]:
"""Load dataset"""
# LoadDataset
file_path = "./enr-processed.jsonl"

df = pd.read_json(file_path, lines=True)
print(f"Number of articles:  {df.shape[0]}")
df.head()

Number of articles:  43105


Unnamed: 0,url,title,text,publish_date,keywords,meta_keywords,meta_description
0,https://www.enr.com/articles/58087-autodesk-re...,"Autodesk Releases Informed Design, an Inventor...","On February 7, Autodesk released Informed Desi...","February 7, 2024","[News, Tech, Building Information Modeling (BIM)]","[BIM, Manufacturing in Construction, PLM and BIM]","Autodesk has released Informed Design, add-ons..."
1,https://www.enr.com/articles/58107-constructio...,"Construction Economics for February 12, 2024",The shifting market landscape is transforming ...,"February 7, 2024","[Projects, Costs, Construction Economics, 2024]","[Economics, Materials Prices]","ENR’s 20-city average cost indexes, wages and ..."
2,https://www.enr.com/articles/58108-776m-broadw...,$776M Broadway Curve Job Among Projects Revamp...,The Arizona Dept. of Transportation will conti...,"February 7, 2024","[Projects, Southwest, Southwest Construction N...","[Transportation, Highways/Bridges]",Projects totaling more than $1.3 billion are r...
3,https://www.enr.com/articles/58106-mccarthy-st...,McCarthy Standardizes its Payments Using Oracl...,As cloud-based construction payments continue ...,"February 7, 2024","[Business, Tech, Companies, Information techno...","[McCarthy Building Cos., Payments, Textura]",McCarthy Building Cos. has standardized its su...
4,https://www.enr.com/articles/58109-in-final-ru...,"In Final Rule, EPA Tightens Air Quality Standa...",In a move it says will improve the health of m...,"February 7, 2024","[News, Business, Government]","[EPA, soot, Environmental Justice, NAAQS]",The tighter air pollution standard was long so...


In [25]:
"""Data cleaning:  Remove emails and newlines"""
# Convert to list 
data_full = df.text.values.tolist()  
# Remove new line characters 
data_full = [re.sub('\s+', ' ', sent) for sent in data_full]  
# Remove distracting single quotes 
data_full = [re.sub("\'", "", sent) for sent in data_full]
print(len(data_full))

  data_full = [re.sub('\s+', ' ', sent) for sent in data_full]


43105


In [26]:
"""Filter based on topics"""
keywords = ["mental health", "suicide prevention", "substance abuse", "substance use disorder", "peoplefirst culture", "narcan", 
            "health care", "drug addiction", "relevance", "illnesses", "mental and behavioral health services", 
            "mental illness and substance abuse issues", "addiction", "psychiatric and addiction treatment", "opioid abuse", 
            "brockton behavioral health center", "burnout", "addiction services", "public psychiatric facility", "pandemic", 
            "wellness", "overdose", "overdoses", "addiction", "suicide", "health", "suicides", "substance", "csdz", "disorders"]
#pprint(data[:1])
data = [sent for sent in data_full if any(keyword in sent.lower() for keyword in keywords)]
print(len(data))

7998


In [27]:
"""Tokenization and cleaning up text"""
def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))            #deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['the', 'arizona', 'dept', 'of', 'transportation', 'will', 'continue', 'or', 'begin', 'several', 'phoenix', 'area', 'projects', 'in', 'including', 'two', 'new', 'widening', 'contracts', 'along', 'separate', 'sections', 'of', 'the', 'heavily', 'traveled', 'loop', 'we', 've', 'been', 'very', 'busy', 'in', 'arizona', 'even', 'during', 'the', 'pandemic', 'years', 'and', 'this', 'year', 'is', 'no', 'different', 'says', 'greg', 'byres', 'state', 'engineer', 'for', 'adot', 'noting', 'several', 'large', 'freeway', 'projects', 'in', 'the', 'phoenix', 'area', 'the', 'biggest', 'of', 'these', 'is', 'the', 'million', 'interstate', 'broadway', 'curve', 'improvement', 'project', 'between', 'the', 'split', 'and', 'loop', 'in', 'chandler', 'representing', 'adot', 'largest', 'freeway', 'reconstruction', 'project', 'to', 'date', 'the', 'project', 'is', 'being', 'led', 'by', 'broadway', 'curve', 'contractors', 'joint', 'venture', 'of', 'pulice', 'construction', 'scottsdale', 'ariz', 'fnf', 'construction

In [28]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['the', 'arizona', 'dept', 'of', 'transportation', 'will', 'continue', 'or', 'begin', 'several', 'phoenix', 'area', 'projects', 'in', 'including', 'two', 'new', 'widening', 'contracts', 'along', 'separate', 'sections', 'of', 'the', 'heavily_traveled', 'loop', 'we', 've', 'been', 'very', 'busy', 'in', 'arizona', 'even', 'during', 'the', 'pandemic', 'years', 'and', 'this', 'year', 'is', 'no', 'different', 'says', 'greg', 'byres', 'state', 'engineer', 'for', 'adot', 'noting', 'several', 'large', 'freeway', 'projects', 'in', 'the', 'phoenix', 'area', 'the', 'biggest', 'of', 'these', 'is', 'the', 'million', 'interstate', 'broadway_curve', 'improvement', 'project', 'between', 'the', 'split', 'and', 'loop', 'in', 'chandler', 'representing', 'adot', 'largest', 'freeway', 'reconstruction', 'project', 'to', 'date', 'the', 'project', 'is', 'being', 'led', 'by', 'broadway_curve', 'contractors', 'joint_venture', 'of', 'pulice', 'construction', 'scottsdale_ariz', 'fnf', 'construction', 'tempe_ariz',

In [29]:
"""Remove stopwords, make bigrams, lemmatize"""
# Define function for stopwords, bigrams, trigrams and lemmatization
import math
len_segments = 100

def remove_stopwords(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"removing stopwords from {start_i} through {end_i}, segment {i}/{num_segments}")
        return_ls.extend([[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text_segment])
    return return_ls

def make_bigrams(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"making bigrams from {start_i} through {end_i}, segment {i}/{num_segments}")
        return_ls.extend([bigram_mod[doc] for doc in text_segment])
    
    return return_ls

def make_trigrams(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"making trigrams from {start_i} through {end_i}, ")
        return_ls.extend([bigram_mod[doc] for doc in text_segment])
    
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    for i, sent in enumerate(texts):
        if i % len_segments == 0:
            print(f"lemmztization {i} through {i + len_segments}, segment {int(i/len_segments)}/{num_segments}")
        
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [30]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

removing stopwords from 0 through 100, segment 0/80
removing stopwords from 100 through 200, segment 1/80
removing stopwords from 200 through 300, segment 2/80
removing stopwords from 300 through 400, segment 3/80
removing stopwords from 400 through 500, segment 4/80
removing stopwords from 500 through 600, segment 5/80
removing stopwords from 600 through 700, segment 6/80
removing stopwords from 700 through 800, segment 7/80
removing stopwords from 800 through 900, segment 8/80
removing stopwords from 900 through 1000, segment 9/80
removing stopwords from 1000 through 1100, segment 10/80
removing stopwords from 1100 through 1200, segment 11/80
removing stopwords from 1200 through 1300, segment 12/80
removing stopwords from 1300 through 1400, segment 13/80
removing stopwords from 1400 through 1500, segment 14/80
removing stopwords from 1500 through 1600, segment 15/80
removing stopwords from 1600 through 1700, segment 16/80
removing stopwords from 1700 through 1800, segment 17/80
remov

making bigrams from 7200 through 7300, segment 72/80
making bigrams from 7300 through 7400, segment 73/80
making bigrams from 7400 through 7500, segment 74/80
making bigrams from 7500 through 7600, segment 75/80
making bigrams from 7600 through 7700, segment 76/80
making bigrams from 7700 through 7800, segment 77/80
making bigrams from 7800 through 7900, segment 78/80
making bigrams from 7900 through 8000, segment 79/80
lemmztization 0 through 100, segment 0/80
lemmztization 100 through 200, segment 1/80
lemmztization 200 through 300, segment 2/80
lemmztization 300 through 400, segment 3/80
lemmztization 400 through 500, segment 4/80
lemmztization 500 through 600, segment 5/80
lemmztization 600 through 700, segment 6/80
lemmztization 700 through 800, segment 7/80
lemmztization 800 through 900, segment 8/80
lemmztization 900 through 1000, segment 9/80
lemmztization 1000 through 1100, segment 10/80
lemmztization 1100 through 1200, segment 11/80
lemmztization 1200 through 1300, segment 12

In [31]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 6), (2, 1), (3, 6), (4, 1), (5, 2), (6, 1), (7, 6), (8, 1), (9, 1), (10, 4), (11, 1), (12, 1), (13, 2), (14, 1), (15, 2), (16, 3), (17, 1), (18, 1), (19, 1), (20, 2), (21, 3), (22, 2), (23, 2), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 6), (32, 1), (33, 1), (34, 1), (35, 4), (36, 1), (37, 3), (38, 1), (39, 1), (40, 3), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 4), (52, 1), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 2), (65, 3), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 5), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 3), (82, 6), (83, 3), (84, 1), (85, 2), (86, 1), (87, 2), (88, 9), (89, 2), (90, 1), (91, 1), (92, 2), (93, 1), (94, 2), (95, 2), (96, 7), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 5), (104, 1), (105, 2), (106, 1), (107, 1), (108, 1), (109, 5), (110, 1)

In [32]:
"""Readable dictionary"""
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ability', 1),
  ('add', 6),
  ('additionally', 1),
  ('adot', 6),
  ('also', 1),
  ('anthem', 2),
  ('anticipate', 1),
  ('area', 6),
  ('award', 1),
  ('baseline', 1),
  ('begin', 4),
  ('benefit', 1),
  ('big', 1),
  ('black', 2),
  ('boulevard', 1),
  ('bridge', 2),
  ('broadway_curve', 3),
  ('broomfield_colo', 1),
  ('build', 1),
  ('business', 1),
  ('busy', 2),
  ('byre', 3),
  ('canyon', 2),
  ('capacity', 2),
  ('carry', 1),
  ('challenge', 1),
  ('chandler', 1),
  ('city', 2),
  ('community', 1),
  ('commuter', 1),
  ('complete', 1),
  ('completion', 6),
  ('configuration', 1),
  ('connection', 1),
  ('construct', 1),
  ('construction', 4),
  ('continue', 1),
  ('contract', 3),
  ('contractor', 1),
  ('convert', 1),
  ('crew', 3),
  ('critical', 1),
  ('cross', 1),
  ('date', 1),
  ('day', 1),
  ('deliver', 1),
  ('design', 1),
  ('diamond', 1),
  ('difference', 1),
  ('different', 1),
  ('direct', 1),
  ('direction', 4),
  ('dividend', 1),
  ('drive', 1),
  ('elevated', 

In [41]:
"""Build topic model"""
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [42]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.039*"company" + 0.032*"firm" + 0.019*"employee" + 0.018*"work" + '
  '0.017*"business" + 0.014*"service" + 0.014*"client" + 0.010*"management" + '
  '0.009*"also" + 0.009*"people"'),
 (1,
  '0.025*"state" + 0.024*"bill" + 0.023*"fund" + 0.021*"program" + 0.020*"say" '
  '+ 0.018*"federal" + 0.015*"plan" + 0.013*"transportation" + 0.013*"include" '
  '+ 0.012*"infrastructure"'),
 (2,
  '0.075*"construction" + 0.045*"year" + 0.025*"building" + 0.022*"start" + '
  '0.020*"increase" + 0.019*"decline" + 0.015*"report" + 0.014*"month" + '
  '0.014*"total" + 0.013*"housing"'),
 (3,
  '0.037*"project" + 0.033*"design" + 0.026*"construction" + 0.024*"owner" + '
  '0.024*"use" + 0.018*"cost" + 0.018*"process" + 0.018*"risk" + 0.016*"model" '
  '+ 0.015*"technology"'),
 (4,
  '0.085*"say" + 0.016*"work" + 0.015*"go" + 0.015*"get" + 0.012*"make" + '
  '0.011*"take" + 0.011*"year" + 0.011*"time" + 0.010*"need" + 0.009*"people"'),
 (5,
  '0.048*"water" + 0.029*"plant" + 0.022*"power" + 0.0

In [43]:
"""Evaluate topic models"""
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.948520474168237

Coherence Score:  0.4688607455525243


In [44]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow st