## Tutorial Taken from:
https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [1]:
"""Import libraries"""
import nltk
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

nltk.download('stopwords')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as  pd
  "class": algorithms.Blowfish,
[nltk_data] Downloading package stopwords to /home/jcm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
"""Prepare stopwords"""

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#Say appears frequently
stop_words.append("say")

In [3]:
"""Load dataset"""
# LoadDataset
file_path = "./enr-processed.jsonl"

df = pd.read_json(file_path, lines=True)
print(f"Number of articles:  {df.shape[0]}")
df.head()

Number of articles:  43105


Unnamed: 0,url,title,text,publish_date,keywords,meta_keywords,meta_description
0,https://www.enr.com/articles/58087-autodesk-re...,"Autodesk Releases Informed Design, an Inventor...","On February 7, Autodesk released Informed Desi...","February 7, 2024","[News, Tech, Building Information Modeling (BIM)]","[BIM, Manufacturing in Construction, PLM and BIM]","Autodesk has released Informed Design, add-ons..."
1,https://www.enr.com/articles/58107-constructio...,"Construction Economics for February 12, 2024",The shifting market landscape is transforming ...,"February 7, 2024","[Projects, Costs, Construction Economics, 2024]","[Economics, Materials Prices]","ENR’s 20-city average cost indexes, wages and ..."
2,https://www.enr.com/articles/58108-776m-broadw...,$776M Broadway Curve Job Among Projects Revamp...,The Arizona Dept. of Transportation will conti...,"February 7, 2024","[Projects, Southwest, Southwest Construction N...","[Transportation, Highways/Bridges]",Projects totaling more than $1.3 billion are r...
3,https://www.enr.com/articles/58106-mccarthy-st...,McCarthy Standardizes its Payments Using Oracl...,As cloud-based construction payments continue ...,"February 7, 2024","[Business, Tech, Companies, Information techno...","[McCarthy Building Cos., Payments, Textura]",McCarthy Building Cos. has standardized its su...
4,https://www.enr.com/articles/58109-in-final-ru...,"In Final Rule, EPA Tightens Air Quality Standa...",In a move it says will improve the health of m...,"February 7, 2024","[News, Business, Government]","[EPA, soot, Environmental Justice, NAAQS]",The tighter air pollution standard was long so...


In [4]:
"""Data cleaning:  Remove emails and newlines"""
# Convert to list 
data_full = df.text.values.tolist()  
# Remove new line characters 
data_full = [re.sub('\s+', ' ', sent) for sent in data_full]  
# Remove distracting single quotes 
data_full = [re.sub("\'", "", sent) for sent in data_full]
print(len(data_full))

  data_full = [re.sub('\s+', ' ', sent) for sent in data_full]


43105


In [5]:
"""Filter based on topics"""
keywords_all = ["mental health", "suicide prevention", "substance abuse", "substance use disorder", "peoplefirst culture", "narcan", 
            "health care", "drug addiction", "relevance", "illnesses", "mental and behavioral health services", 
            "mental illness and substance abuse issues", "addiction", "psychiatric and addiction treatment", "opioid abuse", 
            "brockton behavioral health center", "burnout", "addiction services", "public psychiatric facility", "pandemic", 
            "wellness", "overdose", "overdoses", "addiction", "suicide", "health", "suicides", "substance", "csdz", "disorders"]

keyword_mental_health = ["mental health"]

keywords = keyword_mental_health


#pprint(data[:1])
data = [sent for sent in data_full if any(keyword in sent.lower() for keyword in keywords)]
print(len(data))

241


In [6]:
"""Tokenization and cleaning up text"""
def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))            #deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['planning', 'arizona', 'exact', 'sciences', 'is', 'planning', 'to', 'build', 'cancer', 'detection', 'facility', 'in', 'phoenix', 'the', 'sq', 'ft', 'building', 'will', 'contain', 'lab', 'office', 'manufacturing', 'and', 'warehouse', 'space', 'on', 'acre', 'site', 'at', 'sky', 'harbor', 'center', 'the', 'project', 'is', 'valued', 'at', 'million', 'or', 'more', 'exact', 'sciences', 'charmany', 'drive', 'madison', 'wis', 'dr', 'georgia', 'kinetic', 'fiber', 'internet', 'provider', 'plans', 'to', 'carry', 'out', 'public', 'private', 'partnership', 'broadband', 'project', 'in', 'in', 'colquitt', 'county', 'having', 'already', 'laid', 'miles', 'of', 'fiber', 'optic', 'cable', 'in', 'the', 'county', 'this', 'next', 'phase', 'entails', 'laying', 'an', 'additional', 'miles', 'kinetic', 'will', 'use', 'about', 'million', 'in', 'state', 'grant', 'money', 'from', 'the', 'federal', 'government', 'obtained', 'from', 'the', 'coronavirus', 'state', 'and', 'local', 'fiscal', 'recovery', 'funds', 'pro

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['planning', 'arizona', 'exact', 'sciences', 'is', 'planning', 'to', 'build', 'cancer', 'detection', 'facility', 'in', 'phoenix', 'the', 'sq_ft_building', 'will', 'contain', 'lab', 'office', 'manufacturing', 'and', 'warehouse', 'space', 'on', 'acre_site', 'at', 'sky_harbor', 'center', 'the', 'project', 'is', 'valued', 'at', 'million', 'or', 'more', 'exact', 'sciences', 'charmany', 'drive', 'madison', 'wis', 'dr', 'georgia', 'kinetic', 'fiber', 'internet', 'provider', 'plans', 'to', 'carry_out', 'public', 'private', 'partnership', 'broadband', 'project', 'in', 'in', 'colquitt', 'county', 'having', 'already', 'laid', 'miles', 'of', 'fiber', 'optic', 'cable', 'in', 'the', 'county', 'this', 'next', 'phase', 'entails', 'laying', 'an', 'additional', 'miles', 'kinetic', 'will', 'use', 'about', 'million', 'in', 'state', 'grant', 'money', 'from', 'the', 'federal', 'government', 'obtained', 'from', 'the', 'coronavirus', 'state', 'and', 'local', 'fiscal', 'recovery', 'funds', 'program', 'part', '

In [8]:
"""Remove stopwords, make bigrams, lemmatize"""
# Define function for stopwords, bigrams, trigrams and lemmatization
import math
len_segments = 100

def remove_stopwords(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"removing stopwords from {start_i} through {end_i}, segment {i}/{num_segments}")
        return_ls.extend([[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text_segment])
    return return_ls

def make_bigrams(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"making bigrams from {start_i} through {end_i}, segment {i}/{num_segments}")
        return_ls.extend([bigram_mod[doc] for doc in text_segment])
    
    return return_ls

def make_trigrams(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"making trigrams from {start_i} through {end_i}, ")
        return_ls.extend([bigram_mod[doc] for doc in text_segment])
    
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    for i, sent in enumerate(texts):
        if i % len_segments == 0:
            print(f"lemmztization {i} through {i + len_segments}, segment {int(i/len_segments)}/{num_segments}")
        
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

removing stopwords from 0 through 100, segment 0/3
removing stopwords from 100 through 200, segment 1/3
removing stopwords from 200 through 300, segment 2/3
making bigrams from 0 through 100, segment 0/3
making bigrams from 100 through 200, segment 1/3
making bigrams from 200 through 300, segment 2/3
lemmztization 0 through 100, segment 0/3
lemmztization 100 through 200, segment 1/3
lemmztization 200 through 300, segment 2/3
[['plan', 'exact', 'science', 'plan', 'build', 'cancer', 'detection', 'facility', 'build', 'contain', 'lab', 'office', 'manufacturing', 'warehouse', 'space', 'sky_harbor', 'center', 'project', 'value', 'exact', 'drive', 'fiber', 'internet', 'provider', 'plan', 'carry', 'public', 'private', 'partnership', 'broadband', 'already', 'lay', 'mile', 'fiber', 'optic', 'cable', 'county', 'next', 'phase', 'entail', 'lay', 'additional', 'mile', 'kinetic', 'use', 'state', 'grant', 'money', 'federal', 'government', 'obtain', 'coronavirus', 'state', 'local', 'fiscal', 'recovery'

In [10]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 3), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 6), (25, 4), (26, 1), (27, 1), (28, 1), (29, 1), (30, 3), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 3), (39, 1), (40, 3), (41, 4), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 2), (54, 4), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 1), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 2), (80, 1), (81, 1), (82, 2), (83, 3), (84, 1), (85, 3), (86, 2), (87, 3), (88, 1), (89, 1), (90, 7), (91, 3), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 2), (100, 1), (101, 1), (102, 2), (103, 1), (104, 1), (105, 1), (106, 2), (107, 1), (108, 1), (109, 1), (110, 1)

In [11]:
"""Readable dictionary"""
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 1),
  ('access', 2),
  ('acre', 1),
  ('acre_site', 1),
  ('action', 1),
  ('additional', 3),
  ('agency', 2),
  ('agreement', 2),
  ('agricultural', 1),
  ('alarm', 1),
  ('already', 1),
  ('also', 1),
  ('animal', 1),
  ('architecture', 1),
  ('array', 1),
  ('assist', 1),
  ('associate', 1),
  ('auditorium', 1),
  ('base', 1),
  ('begin', 2),
  ('benefit', 1),
  ('bhdp', 1),
  ('blvd', 1),
  ('broadband', 1),
  ('build', 6),
  ('building', 4),
  ('cable', 1),
  ('campus', 1),
  ('cancer', 1),
  ('capacity', 1),
  ('carry', 3),
  ('center', 2),
  ('cherrywood', 1),
  ('choose', 1),
  ('closet', 1),
  ('communication', 1),
  ('community', 1),
  ('compact', 1),
  ('complete', 3),
  ('conduct', 1),
  ('consist', 3),
  ('construction', 4),
  ('contain', 2),
  ('contaminate', 1),
  ('convey', 1),
  ('copper', 1),
  ('corner', 1),
  ('coronavirus', 1),
  ('cost', 1),
  ('county', 1),
  ('cover', 2),
  ('covid', 1),
  ('credible', 1),
  ('current', 2),
  ('dam', 4),
  ('date', 1)

In [12]:
"""Build topic model"""
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [13]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.007*"ilfi" + 0.007*"net_positive" + 0.007*"potable" + 0.007*"reclaim" + '
  '0.005*"content" + 0.005*"leed_platinum" + 0.005*"gassman" + '
  '0.004*"digital_twin" + 0.004*"website" + 0.004*"petal"'),
 (1,
  '0.000*"say" + 0.000*"work" + 0.000*"firm" + 0.000*"project" + 0.000*"also" '
  '+ 0.000*"year" + 0.000*"construction" + 0.000*"employee" + 0.000*"market" + '
  '0.000*"many"'),
 (2,
  '0.002*"kluber" + 0.001*"leopardo" + 0.001*"warmth" + 0.001*"kitchenette" + '
  '0.001*"replicate" + 0.000*"say" + 0.000*"work" + 0.000*"year" + '
  '0.000*"tuberculosis" + 0.000*"project"'),
 (3,
  '0.025*"say" + 0.019*"water" + 0.010*"health" + 0.009*"city" + 0.009*"base" '
  '+ 0.009*"system" + 0.009*"safety" + 0.008*"lead" + 0.008*"environmental" + '
  '0.008*"engineer"'),
 (4,
  '0.029*"worker" + 0.027*"opioid" + 0.017*"union" + 0.017*"construction" + '
  '0.016*"addiction" + 0.015*"trade" + 0.014*"program" + 0.013*"drug" + '
  '0.012*"recovery" + 0.012*"walsh"'),
 (5,
  '0.041*"say" + 

In [14]:
"""Evaluate topic models"""
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.819133528805613

Coherence Score:  0.3531431702196435


In [15]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow st