## Tutorial Taken from:
https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [2]:
"""Import libraries"""
import nltk
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

nltk.download('stopwords')

  "class": algorithms.Blowfish,
[nltk_data] Downloading package stopwords to /home/jcm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
"""Prepare stopwords"""

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [6]:
"""Load dataset"""
# LoadDataset
file_path = "./enr-processed.jsonl"

df = pd.read_json(file_path, lines=True)
print(f"Number of articles:  {df.shape[0]}")
df.head()

Number of articles:  43105


Unnamed: 0,url,title,text,publish_date,keywords,meta_keywords,meta_description
0,https://www.enr.com/articles/58087-autodesk-re...,"Autodesk Releases Informed Design, an Inventor...","On February 7, Autodesk released Informed Desi...","February 7, 2024","[News, Tech, Building Information Modeling (BIM)]","[BIM, Manufacturing in Construction, PLM and BIM]","Autodesk has released Informed Design, add-ons..."
1,https://www.enr.com/articles/58107-constructio...,"Construction Economics for February 12, 2024",The shifting market landscape is transforming ...,"February 7, 2024","[Projects, Costs, Construction Economics, 2024]","[Economics, Materials Prices]","ENR’s 20-city average cost indexes, wages and ..."
2,https://www.enr.com/articles/58108-776m-broadw...,$776M Broadway Curve Job Among Projects Revamp...,The Arizona Dept. of Transportation will conti...,"February 7, 2024","[Projects, Southwest, Southwest Construction N...","[Transportation, Highways/Bridges]",Projects totaling more than $1.3 billion are r...
3,https://www.enr.com/articles/58106-mccarthy-st...,McCarthy Standardizes its Payments Using Oracl...,As cloud-based construction payments continue ...,"February 7, 2024","[Business, Tech, Companies, Information techno...","[McCarthy Building Cos., Payments, Textura]",McCarthy Building Cos. has standardized its su...
4,https://www.enr.com/articles/58109-in-final-ru...,"In Final Rule, EPA Tightens Air Quality Standa...",In a move it says will improve the health of m...,"February 7, 2024","[News, Business, Government]","[EPA, soot, Environmental Justice, NAAQS]",The tighter air pollution standard was long so...


In [7]:
"""Data cleaning:  Remove emails and newlines"""
# Convert to list 
data = df.text.values.tolist()  
# Remove new line characters 
data = [re.sub('\s+', ' ', sent) for sent in data]  
# Remove distracting single quotes 
data = [re.sub("\'", "", sent) for sent in data]  
pprint(data[:1])

  data = [re.sub('\s+', ' ', sent) for sent in data]


['On February 7, Autodesk released Informed Design, a pair of add-ons for the '
 'companys building information modeling and product design authoring tools '
 'intended to better automate the process of product design and placement of '
 'mechanical assemblies and other complex products into 3D building and '
 'infrastructure models. Autodesk says Informed Design connects design and '
 'manufacturing workflows, and will allow designers to specify products and '
 'assemblies that were created in Inventor, the companys manufacturing '
 'authoring tool, into Revit, its parametric BIM tool. This is a functionality '
 'that Autodesk says architects and engineers have been demanding for a long '
 'time. "Informed Design improves certainty," said Ryan McMahon, general '
 'manager manufacturing/Informed Design at Autodesk during a press event '
 'announcing the product. "Productization is the key concept that makes this '
 'possible. Fabricators and subcontractors will define their products '


In [8]:
"""Tokenization and cleaning up text"""
def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))            #deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['on', 'february', 'autodesk', 'released', 'informed', 'design', 'pair', 'of', 'add', 'ons', 'for', 'the', 'companys', 'building', 'information', 'modeling', 'and', 'product', 'design', 'authoring', 'tools', 'intended', 'to', 'better', 'automate', 'the', 'process', 'of', 'product', 'design', 'and', 'placement', 'of', 'mechanical', 'assemblies', 'and', 'other', 'complex', 'products', 'into', 'building', 'and', 'infrastructure', 'models', 'autodesk', 'says', 'informed', 'design', 'connects', 'design', 'and', 'manufacturing', 'workflows', 'and', 'will', 'allow', 'designers', 'to', 'specify', 'products', 'and', 'assemblies', 'that', 'were', 'created', 'in', 'inventor', 'the', 'companys', 'manufacturing', 'authoring', 'tool', 'into', 'revit', 'its', 'parametric', 'bim', 'tool', 'this', 'is', 'functionality', 'that', 'autodesk', 'says', 'architects', 'and', 'engineers', 'have', 'been', 'demanding', 'for', 'long', 'time', 'informed', 'design', 'improves', 'certainty', 'said', 'ryan', 'mcmaho

In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['on', 'february', 'autodesk', 'released', 'informed', 'design', 'pair', 'of', 'add_ons', 'for', 'the', 'companys', 'building', 'information_modeling', 'and', 'product', 'design', 'authoring_tools', 'intended', 'to', 'better', 'automate', 'the', 'process', 'of', 'product', 'design', 'and', 'placement', 'of', 'mechanical', 'assemblies', 'and', 'other', 'complex', 'products', 'into', 'building', 'and', 'infrastructure', 'models', 'autodesk', 'says', 'informed', 'design', 'connects', 'design', 'and', 'manufacturing', 'workflows', 'and', 'will', 'allow', 'designers', 'to', 'specify', 'products', 'and', 'assemblies', 'that', 'were', 'created', 'in', 'inventor', 'the', 'companys', 'manufacturing', 'authoring_tool', 'into', 'revit', 'its', 'parametric', 'bim', 'tool', 'this', 'is', 'functionality', 'that', 'autodesk', 'says', 'architects', 'and', 'engineers', 'have', 'been', 'demanding', 'for', 'long', 'time', 'informed', 'design', 'improves', 'certainty', 'said', 'ryan', 'mcmahon', 'general'

In [10]:
"""Remove stopwords, make bigrams, lemmatize"""
# Define function for stopwords, bigrams, trigrams and lemmatization
import math
len_segments = 100

def remove_stopwords(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"removing stopwords from {start_i} through {end_i}, segment {i}/{num_segments}")
        return_ls.extend([[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text_segment])
    return return_ls

def make_bigrams(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"making bigrams from {start_i} through {end_i}, segment {i}/{num_segments}")
        return_ls.extend([bigram_mod[doc] for doc in text_segment])
    
    return return_ls

def make_trigrams(texts):
    return_ls = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    
    for i in range(num_segments):
        start_i = i*len_segments
        end_i = start_i + len_segments
        text_segment = texts[start_i : end_i]
        
        print(f"making trigrams from {start_i} through {end_i}, ")
        return_ls.extend([bigram_mod[doc] for doc in text_segment])
    
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    num_segments = int(math.ceil(len(texts)/len_segments))
    for i, sent in enumerate(texts):
        if i % len_segments == 0:
            print(f"lemmztization {i} through {i + len_segments}, segment {i}/{num_segments}")
        
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

removing stopwords from 0 through 100
removing stopwords from 100 through 200
removing stopwords from 200 through 300
removing stopwords from 300 through 400
removing stopwords from 400 through 500
removing stopwords from 500 through 600
removing stopwords from 600 through 700
removing stopwords from 700 through 800
removing stopwords from 800 through 900
removing stopwords from 900 through 1000
removing stopwords from 1000 through 1100
removing stopwords from 1100 through 1200
removing stopwords from 1200 through 1300
removing stopwords from 1300 through 1400
removing stopwords from 1400 through 1500
removing stopwords from 1500 through 1600
removing stopwords from 1600 through 1700
removing stopwords from 1700 through 1800
removing stopwords from 1800 through 1900
removing stopwords from 1900 through 2000
removing stopwords from 2000 through 2100
removing stopwords from 2100 through 2200
removing stopwords from 2200 through 2300
removing stopwords from 2300 through 2400
removing stop

removing stopwords from 19400 through 19500
removing stopwords from 19500 through 19600
removing stopwords from 19600 through 19700
removing stopwords from 19700 through 19800
removing stopwords from 19800 through 19900
removing stopwords from 19900 through 20000
removing stopwords from 20000 through 20100
removing stopwords from 20100 through 20200
removing stopwords from 20200 through 20300
removing stopwords from 20300 through 20400
removing stopwords from 20400 through 20500
removing stopwords from 20500 through 20600
removing stopwords from 20600 through 20700
removing stopwords from 20700 through 20800
removing stopwords from 20800 through 20900
removing stopwords from 20900 through 21000
removing stopwords from 21000 through 21100
removing stopwords from 21100 through 21200
removing stopwords from 21200 through 21300
removing stopwords from 21300 through 21400
removing stopwords from 21400 through 21500
removing stopwords from 21500 through 21600
removing stopwords from 21600 th

removing stopwords from 38200 through 38300
removing stopwords from 38300 through 38400
removing stopwords from 38400 through 38500
removing stopwords from 38500 through 38600
removing stopwords from 38600 through 38700
removing stopwords from 38700 through 38800
removing stopwords from 38800 through 38900
removing stopwords from 38900 through 39000
removing stopwords from 39000 through 39100
removing stopwords from 39100 through 39200
removing stopwords from 39200 through 39300
removing stopwords from 39300 through 39400
removing stopwords from 39400 through 39500
removing stopwords from 39500 through 39600
removing stopwords from 39600 through 39700
removing stopwords from 39700 through 39800
removing stopwords from 39800 through 39900
removing stopwords from 39900 through 40000
removing stopwords from 40000 through 40100
removing stopwords from 40100 through 40200
removing stopwords from 40200 through 40300
removing stopwords from 40300 through 40400
removing stopwords from 40400 th

making bigrams from 15900 through 16000
making bigrams from 16000 through 16100
making bigrams from 16100 through 16200
making bigrams from 16200 through 16300
making bigrams from 16300 through 16400
making bigrams from 16400 through 16500
making bigrams from 16500 through 16600
making bigrams from 16600 through 16700
making bigrams from 16700 through 16800
making bigrams from 16800 through 16900
making bigrams from 16900 through 17000
making bigrams from 17000 through 17100
making bigrams from 17100 through 17200
making bigrams from 17200 through 17300
making bigrams from 17300 through 17400
making bigrams from 17400 through 17500
making bigrams from 17500 through 17600
making bigrams from 17600 through 17700
making bigrams from 17700 through 17800
making bigrams from 17800 through 17900
making bigrams from 17900 through 18000
making bigrams from 18000 through 18100
making bigrams from 18100 through 18200
making bigrams from 18200 through 18300
making bigrams from 18300 through 18400


making bigrams from 37700 through 37800
making bigrams from 37800 through 37900
making bigrams from 37900 through 38000
making bigrams from 38000 through 38100
making bigrams from 38100 through 38200
making bigrams from 38200 through 38300
making bigrams from 38300 through 38400
making bigrams from 38400 through 38500
making bigrams from 38500 through 38600
making bigrams from 38600 through 38700
making bigrams from 38700 through 38800
making bigrams from 38800 through 38900
making bigrams from 38900 through 39000
making bigrams from 39000 through 39100
making bigrams from 39100 through 39200
making bigrams from 39200 through 39300
making bigrams from 39300 through 39400
making bigrams from 39400 through 39500
making bigrams from 39500 through 39600
making bigrams from 39600 through 39700
making bigrams from 39700 through 39800
making bigrams from 39800 through 39900
making bigrams from 39900 through 40000
making bigrams from 40000 through 40100
making bigrams from 40100 through 40200


removing make trigram from 1220000 through 1220100
removing make trigram from 1230000 through 1230100
removing make trigram from 1240000 through 1240100
removing make trigram from 1250000 through 1250100
removing make trigram from 1260000 through 1260100
removing make trigram from 1270000 through 1270100
removing make trigram from 1280000 through 1280100
removing make trigram from 1290000 through 1290100
removing make trigram from 1300000 through 1300100
removing make trigram from 1310000 through 1310100
removing make trigram from 1320000 through 1320100
removing make trigram from 1330000 through 1330100
removing make trigram from 1340000 through 1340100
removing make trigram from 1350000 through 1350100
removing make trigram from 1360000 through 1360100
removing make trigram from 1370000 through 1370100
removing make trigram from 1380000 through 1380100
removing make trigram from 1390000 through 1390100
removing make trigram from 1400000 through 1400100
removing make trigram from 1410

removing make trigram from 2830000 through 2830100
removing make trigram from 2840000 through 2840100
removing make trigram from 2850000 through 2850100
removing make trigram from 2860000 through 2860100
removing make trigram from 2870000 through 2870100
removing make trigram from 2880000 through 2880100
removing make trigram from 2890000 through 2890100
removing make trigram from 2900000 through 2900100
removing make trigram from 2910000 through 2910100
removing make trigram from 2920000 through 2920100
removing make trigram from 2930000 through 2930100
removing make trigram from 2940000 through 2940100
removing make trigram from 2950000 through 2950100
removing make trigram from 2960000 through 2960100
removing make trigram from 2970000 through 2970100
removing make trigram from 2980000 through 2980100
removing make trigram from 2990000 through 2990100
removing make trigram from 3000000 through 3000100
removing make trigram from 3010000 through 3010100
removing make trigram from 3020

In [12]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 3), (10, 3), (11, 1), (12, 2), (13, 3), (14, 7), (15, 2), (16, 3), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 5), (25, 5), (26, 1), (27, 1), (28, 1), (29, 4), (30, 1), (31, 1), (32, 1), (33, 1), (34, 3), (35, 1), (36, 1), (37, 1), (38, 4), (39, 1), (40, 1), (41, 2), (42, 1), (43, 5), (44, 1), (45, 1), (46, 1), (47, 1), (48, 6), (49, 1), (50, 1), (51, 1), (52, 3), (53, 1), (54, 3), (55, 2), (56, 1), (57, 3), (58, 25), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 3), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 4), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 2), (99, 10), (100, 1), (101, 1), (102, 4), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 

In [13]:
"""Readable dictionary"""
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 1),
  ('add', 2),
  ('add_on', 2),
  ('advisory', 1),
  ('aeco', 1),
  ('aesthetic', 1),
  ('allow', 5),
  ('also', 1),
  ('announce', 1),
  ('architect', 3),
  ('assembly', 3),
  ('assessment', 1),
  ('author', 2),
  ('authoring_tool', 3),
  ('autodesk', 7),
  ('automate', 2),
  ('available', 3),
  ('base', 1),
  ('basic', 1),
  ('begin', 1),
  ('bidirectional', 1),
  ('bill', 1),
  ('body', 1),
  ('bring', 1),
  ('build', 5),
  ('building', 5),
  ('capability', 1),
  ('category', 1),
  ('certainty', 1),
  ('charge', 4),
  ('choice', 1),
  ('clash_detection', 1),
  ('cloud', 1),
  ('collaboration', 1),
  ('company', 3),
  ('compatible', 1),
  ('complement', 1),
  ('complex', 1),
  ('component', 4),
  ('concept', 1),
  ('confirm', 1),
  ('connect', 2),
  ('constraint', 1),
  ('construction', 5),
  ('consultant', 1),
  ('consumer', 1),
  ('coordinate', 1),
  ('corporation', 1),
  ('create', 6),
  ('creativity', 1),
  ('curb', 1),
  ('curtainwall', 1),
  ('customer', 3),
  ('c

In [14]:
"""Build topic model"""
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [15]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.049*"year" + 0.040*"construction" + 0.037*"price" + 0.028*"increase" + '
  '0.023*"market" + 0.016*"cost" + 0.013*"last" + 0.011*"industry" + '
  '0.011*"rate" + 0.010*"report"'),
 (1,
  '0.082*"firm" + 0.050*"company" + 0.040*"market" + 0.025*"business" + '
  '0.021*"revenue" + 0.021*"base" + 0.020*"year" + 0.018*"say" + 0.017*"top" + '
  '0.016*"contractor"'),
 (2,
  '0.057*"water" + 0.021*"engineer" + 0.018*"corps" + 0.018*"damage" + '
  '0.017*"system" + 0.016*"area" + 0.014*"repair" + 0.011*"emergency" + '
  '0.010*"protection" + 0.009*"storm"'),
 (3,
  '0.097*"say" + 0.022*"work" + 0.015*"go" + 0.014*"time" + 0.014*"get" + '
  '0.013*"year" + 0.013*"take" + 0.012*"make" + 0.009*"see" + 0.009*"add"'),
 (4,
  '0.034*"rule" + 0.032*"claim" + 0.026*"contractor" + 0.018*"law" + '
  '0.017*"insurance" + 0.016*"court" + 0.015*"case" + 0.014*"pay" + '
  '0.013*"issue" + 0.013*"file"'),
 (5,
  '0.046*"power" + 0.039*"plant" + 0.036*"energy" + 0.020*"utility" + '
  '0.018*"unit" 

In [16]:
"""Evaluate topic models"""
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.579978397292168

Coherence Score:  0.48233085153359045


In [17]:
import pandas as pd

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow st