LDA
data: A Million News Headlines

In [40]:
import zipfile
import pandas as pd
import os


Loading data

In [41]:
data = pd.read_csv('D:/ds/ds/S3/NLP_LDA/data/abcnews-date-text.csv', error_bad_lines=False, parse_dates=['publish_date'])
data = data.rename(columns={'publish_date':'date','headline_text':'text'})
data = data.groupby(['date'], as_index = False).agg({'text': ' '.join})
data = data.drop_duplicates()
data.head(3)
#Sorting And Feature Engineering
data['year']         = pd.DatetimeIndex(data['date']).year
data['month']        = pd.DatetimeIndex(data['date']).month
data['day']          = pd.DatetimeIndex(data['date']).day
yearly = data.groupby(['year'], as_index = False).agg({'text': ' '.join})
monthly = data.groupby(['month'], as_index = False).agg({'text': ' '.join})



  exec(code_obj, self.user_global_ns, self.user_ns)


In [42]:
data.head()

Unnamed: 0,date,text,year,month,day
0,2003-02-19,aba decides against community broadcasting lic...,2003,2,19
1,2003-02-20,15 dead in rebel bombing raid philippines army...,2003,2,20
2,2003-02-21,accc too timid in petrol price investigations ...,2003,2,21
3,2003-02-22,86 confirmed dead after us nightclub fire act ...,2003,2,22
4,2003-02-23,accused people smuggler to face darwin court a...,2003,2,23


In [43]:
data_date = data.loc[(data.year >= 2019)&(data.year <= 2021)]
data_text = data_date[['text']]
data_text['index'] = data_text.index
papers = data_text

# Print head
data_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


Unnamed: 0,text,index
5787,after expo ministers approved an artificial is...,5787
5788,adelaide man arrested over police officer hit ...,5788
5789,adapt potentially fatal impulse to rescue some...,5789
5790,alexandria ocasio cortez dancing video viral s...,5790
5791,1960s amphicar designed to run on road and wat...,5791


In [44]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1095 entries, 5787 to 6881
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1095 non-null   object
 1   index   1095 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 25.7+ KB


Data Processing

In [45]:
# Load the regular expression library
import re

# Remove punctuation
papers['headline_text_processed'] = papers['text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
papers['pheadline_text_processed'] = papers['headline_text_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
papers['headline_text_processed'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers['headline_text_processed'] = papers['text'].map(lambda x: re.sub('[,\.!?]', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers['pheadline_text_processed'] = papers['headline_text_processed'].map(lambda x: x.lower())


5787    after expo ministers approved an artificial is...
5788    adelaide man arrested over police officer hit ...
5789    adapt potentially fatal impulse to rescue some...
5790    alexandria ocasio cortez dancing video viral s...
5791    1960s amphicar designed to run on road and wat...
Name: headline_text_processed, dtype: object


Step 2: Data Cleaning

In [46]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data = papers.headline_text_processed.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:30])

['after', 'expo', 'ministers', 'approved', 'an', 'artificial', 'island', 'alyssa', 'healy', 'named', 'worlds', 'best', 'womens', 'player', 'for', 'australia', 'called', 'free', 'rider', 'on', 'tackling', 'climate', 'change', 'howard', 'australia', 'still', 'has', 'no', 'us', 'ambassador']


Step 3: Phrase Modeling: Bigram and Trigram Models

In [47]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

Remove Stopwords, Make Bigrams and Lemmatize

In [48]:
# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [50]:

import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0][:30])

['expo', 'minister', 'approve', 'artificial', 'island', 'name', 'world', 'well', 'women', 'call', 'free', 'rider', 'tackle', 'still', 'ambassador', 'bangladesh', 'rule', 'coalition', 'declare', 'winner', 'dispute', 'vote', 'toll', 'northern_territory', 'horror', 'road', 'cabinet', 'documents_reveal', 'pine', 'gap']



Step 4: Data transformation: Corpus and Dictionary

In [51]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 4), (27, 1), (28, 1), (29, 1)]


5. base model

In [52]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [53]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.004*"woman" + 0.003*"man" + 0.002*"government" + 0.002*"say" + '
  '0.002*"charge" + 0.002*"police" + 0.002*"arrest" + 0.002*"protest" + '
  '0.002*"murder" + 0.002*"change"'),
 (1,
  '0.007*"police" + 0.007*"australian" + 0.006*"bushfire" + 0.006*"say" + '
  '0.006*"man" + 0.006*"fire" + 0.005*"woman" + 0.004*"new" + 0.004*"find" + '
  '0.004*"charge"'),
 (2,
  '0.012*"election" + 0.003*"presidential" + 0.003*"vote" + 0.002*"win" + '
  '0.002*"victory" + 0.002*"state" + 0.002*"attack" + 0.002*"race" + '
  '0.002*"labor" + 0.002*"public_house"'),
 (3,
  '0.003*"fire" + 0.003*"laker" + 0.002*"emmys" + 0.002*"kobe_bryant" + '
  '0.002*"international_student" + 0.001*"bushfire" + 0.001*"memorial" + '
  '0.001*"pocket" + 0.001*"open" + 0.001*"crash"'),
 (4,
  '0.017*"covid" + 0.012*"say" + 0.012*"new" + 0.009*"case" + '
  '0.008*"australian" + 0.007*"vaccine" + 0.006*"record" + 0.006*"lockdown" + '
  '0.005*"call" + 0.005*"government"'),
 (5,
  '0.001*"tyson_fury" + 0.001*"frock"

Compute Model Perplexity and Coherence Score

In [54]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.3403542561408782



Step 6: Hyperparameter tuning

In [55]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [56]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('D:/ds/ds/S3/NLP_LDA/result/lda_tuning_results.csv', index=False)
    pbar.close()

 24%|██▎       | 127/540 [55:49<2:52:48, 25.10s/it]


Step 7: Final Model

In [None]:
num_topics = 8

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.01,
                                           eta=0.9)

In [None]:

from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.006*"high" + 0.004*"govt" + 0.004*"investigation" + 0.004*"price" + '
  '0.004*"move" + 0.003*"claim" + 0.003*"war" + 0.003*"protect" + '
  '0.003*"korean" + 0.003*"police"'),
 (1,
  '0.004*"rule" + 0.003*"profit" + 0.003*"record" + 0.003*"arsenal" + '
  '0.003*"power" + 0.003*"federal" + 0.003*"scrap" + 0.003*"brawl" + '
  '0.003*"cairn" + 0.003*"injure"'),
 (2,
  '0.004*"day" + 0.003*"charge" + 0.003*"rain" + 0.003*"force" + 0.003*"air" + '
  '0.003*"price" + 0.003*"visit" + 0.003*"upset" + 0.003*"fire" + '
  '0.003*"crash"'),
 (3,
  '0.006*"govt" + 0.005*"police" + 0.004*"raid" + 0.004*"work" + 0.003*"new" + '
  '0.003*"aid" + 0.003*"fire" + 0.003*"appeal" + 0.003*"danger" + '
  '0.003*"claim"'),
 (4,
  '0.005*"plane" + 0.004*"war" + 0.004*"woman" + 0.004*"man" + 0.003*"race" + '
  '0.003*"kill" + 0.003*"deny" + 0.003*"ethanol" + 0.003*"fuel" + '
  '0.003*"miss"'),
 (5,
  '0.005*"police" + 0.005*"action" + 0.004*"probe" + 0.004*"consider" + '
  '0.003*"man" + 0.003*"win" +

In [None]:
topic_word = lda_model.components_

print("Shape of Lda Components :",topic_word.shape)


Step 8: Visualize Results

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_data_filepath = os.path.join('D:/ds/ds/S3/NLP_LDA/result/ldavis_tuned_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, 'D:/ds/ds/S3/NLP_LDA/result/ldavis_tuned_'+ str(num_topics) +'.html')

LDAvis_prepared

  default_term_info = default_term_info.sort_values(
