This notebook and data are derived from a Kaggle tutorial:

https://www.kaggle.com/code/nilaychauhan/topic-modeling-of-news-articles-lda/notebook

In [26]:
import os
import pandas as pd

#Gensim and nltk libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from pprint import pprint
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/omostrander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/omostrander/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
#print items in working directory
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/.DS_Store
./data/abcnews-date-text.csv


In [12]:
#Load the dataset from the csv and save it to 'data_text'
data = pd.read_csv('./data/abcnews-date-text.csv')

# we only need to headlines from the data
data_text = data[:300000][['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [13]:
#Get the total number of documents
print(len(documents), "\n")
documents.head()

300000 



Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


### Data Formatting
- Tokenization
  - Sentences -> words
  - Lowercase all words
  - Remove punctuation 
- Remove stopwords
- Lemmatization
  - 3rd person -> to 1st person
  - Verbs: past and future tenses converted to present tense
- Stem words - reducing them to their root forms
    

In [14]:
#Functions to perform the pre-processing steps on the entire dataset

stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and Lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [17]:
#preview document after pre-processing

document_num = 4310
doc_sample = documents[documents['index'] == document_num].values[0][0]

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))
print("\n", documents.head())

Original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


Tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

                                        headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


Let's now preprocess all the news headlines we have. To do that, let's use the map function from pandas to apply preprocess() to the headline_text column.

In [18]:
#preprocess all the headlines, saving the list of results as 'processed_docs'
processed_docs = documents['headline_text'].map(preprocess)

#preview processed docs
processed_docs.head()

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

### Bag of words on the dataset
Now let's create a dictionary from 'processed_docs' containing the number of times a word appears in the training set. To do that, let's pass processed_docs to gensim.corpora.Dictionary() and call it 'dictionary'.

In [19]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [20]:
#Verify the dictionary creation
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


### Gensim filter_extremes 
- filter_extremes(no_below=5, no_above=0.5, keep_n=100000) 
- Filter out tokens that appear in 
- less than no_below documents (absolute number) or
  more than no_above documents (fraction of total corpus size, not absolute number).
  after (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).

In the example below, we will remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents


In [21]:
#Remove very rare and very common words
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

#### Gensim doc2bow

doc2bow(document)

- Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further    preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [22]:
#Convert document into the bag-of-words format
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [23]:
#Check Bag-of-Words corpus for our sample document --> (token_id, token_count)
bow_corpus[document_num]

[(154, 1), (228, 1), (276, 1), (563, 1), (806, 1), (3175, 1), (3176, 1)]

### Preview BOW for our sample preprocessed document

Here document_num is document number 4310 which we have checked in Step 2

In [24]:
#Preview BOW for our sample pre-processed document
bow_doc_4310 = bow_corpus[document_num]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 154 ("govt") appears 1 time.
Word 228 ("group") appears 1 time.
Word 276 ("vote") appears 1 time.
Word 563 ("local") appears 1 time.
Word 806 ("want") appears 1 time.
Word 3175 ("compulsori") appears 1 time.
Word 3176 ("ratepay") appears 1 time.


#### TF-IDF on our document set
While performing TF-IDF on the corpus is not necessary for LDA implemention using the gensim model, it is recemmended. TF-IDF expects a bag-of-words (integer values) training corpus during initialization. During transformation, it will take a vector and return another vector of the same dimensionality.

- TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)
- IDF(w) = log_e(Total number of documents / Number of documents with term w in it).

In [28]:
#Create tf-idf model object using models.TfidfModel on 'bow_corpus' and save it to 'tfidf'
tfidf = models.TfidfModel(bow_corpus)

#Apply transformation to the entire corpus and call it 'corpus_tfidf'
corpus_tfidf = tfidf[bow_corpus]

#Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5959919082495837),
 (1, 0.3920069955308767),
 (2, 0.48532280284497653),
 (3, 0.5055550788930631)]


### Running LDA using Bag of Words
We are going for 10 topics in the document corpus.

We will be running LDA using all CPU cores to parallelize and speed up model training.

Some of the parameters we will be tweaking are:

- number of topics
- id2word (determine vocabulary size)
- workers
- alpha (per document, topic distribution: higher they are more similar, less and the documents are less similar)
- eta (per word distribution)
- passes (number of training passes)

In [29]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore 

#Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'

lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=10, 
                                       id2word = dictionary, 
                                       passes = 2, 
                                       workers=2)

In [30]:
#For each topic, we will explore the words occuring in that topic and its relative weight
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic, idx ))
    print("\n")

Topic: 0.023*"open" + 0.019*"win" + 0.015*"time" + 0.015*"leav" + 0.012*"australian" + 0.012*"rescu" + 0.011*"award" + 0.011*"nuclear" + 0.011*"back" + 0.011*"damag" 
Words: 0


Topic: 0.036*"charg" + 0.031*"polic" + 0.022*"miss" + 0.019*"murder" + 0.017*"attack" + 0.015*"woman" + 0.014*"search" + 0.014*"arrest" + 0.014*"home" + 0.014*"drug" 
Words: 1


Topic: 0.032*"report" + 0.026*"iraq" + 0.021*"forc" + 0.017*"kill" + 0.016*"test" + 0.013*"leader" + 0.013*"troop" + 0.012*"releas" + 0.010*"work" + 0.010*"hick" 
Words: 2


Topic: 0.024*"urg" + 0.021*"health" + 0.019*"servic" + 0.018*"worker" + 0.017*"govt" + 0.016*"help" + 0.015*"opposit" + 0.013*"jail" + 0.013*"communiti" + 0.013*"say" 
Words: 3


Topic: 0.062*"polic" + 0.036*"crash" + 0.032*"death" + 0.026*"investig" + 0.020*"road" + 0.016*"probe" + 0.015*"year" + 0.015*"break" + 0.013*"driver" + 0.010*"fatal" 
Words: 4


Topic: 0.046*"plan" + 0.034*"council" + 0.019*"govt" + 0.019*"water" + 0.015*"group" + 0.014*"chang" + 0.014*"cl

## Running LDA using TF-IDF

In [31]:
# Define lda model using corpus_tfidf, again using gensim.models.LdaMulticore()
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                       num_topics=10, 
                                       id2word = dictionary, 
                                       passes = 2, 
                                       workers=2)

In [32]:
#For each topic, we will explore the words occuring in that topic and its relative weight
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.010*"hick" + 0.009*"price" + 0.009*"liber" + 0.008*"govt" + 0.008*"drink" + 0.008*"sale" + 0.007*"drive" + 0.007*"say" + 0.007*"water" + 0.006*"iemma"


Topic: 1 Word: 0.008*"grower" + 0.008*"toll" + 0.008*"export" + 0.007*"miner" + 0.006*"week" + 0.006*"coal" + 0.006*"theft" + 0.006*"afghanistan" + 0.006*"revamp" + 0.006*"violenc"


Topic: 2 Word: 0.023*"crash" + 0.021*"kill" + 0.016*"polic" + 0.014*"miss" + 0.012*"search" + 0.011*"die" + 0.010*"iraq" + 0.010*"investig" + 0.009*"attack" + 0.009*"dead"


Topic: 3 Word: 0.046*"closer" + 0.008*"fish" + 0.007*"scientist" + 0.007*"cancer" + 0.006*"lake" + 0.006*"central" + 0.006*"doubt" + 0.006*"illeg" + 0.005*"monitor" + 0.005*"confirm"


Topic: 4 Word: 0.010*"market" + 0.007*"solomon" + 0.007*"eas" + 0.007*"hill" + 0.007*"takeov" + 0.006*"break" + 0.006*"rate" + 0.006*"rat" + 0.006*"council" + 0.006*"rise"


Topic: 5 Word: 0.010*"councillor" + 0.009*"indonesia" + 0.008*"speed" + 0.008*"steal" + 0.008*"retir" + 0.007*"cha

###  Performance evaluation by classifying sample document using LDA Bag of Words model

We will check to see where our test document would be classified.

In [34]:
#Text of sample document 4310
print(processed_docs[4310])

#Check which topic our test document belongs to using the LDA Bag of Words model.
document_num = 4310

# Our test document is document number 4310
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

Score: 0.6282708048820496	 
Topic: 0.046*"plan" + 0.034*"council" + 0.019*"govt" + 0.019*"water" + 0.015*"group" + 0.014*"chang" + 0.014*"closer" + 0.013*"nation" + 0.010*"urg" + 0.009*"fund"

Score: 0.14638423919677734	 
Topic: 0.015*"south" + 0.015*"rain" + 0.014*"australia" + 0.012*"guilti" + 0.012*"storm" + 0.011*"flood" + 0.011*"england" + 0.011*"east" + 0.010*"cyclon" + 0.009*"world"

Score: 0.13780218362808228	 
Topic: 0.062*"polic" + 0.036*"crash" + 0.032*"death" + 0.026*"investig" + 0.020*"road" + 0.016*"probe" + 0.015*"year" + 0.015*"break" + 0.013*"driver" + 0.010*"fatal"

Score: 0.012507736682891846	 
Topic: 0.040*"govt" + 0.019*"hospit" + 0.019*"reject" + 0.018*"court" + 0.018*"face" + 0.017*"power" + 0.014*"industri" + 0.014*"fund" + 0.013*"accus" + 0.013*"claim"

Score: 0.01250767707824707	 
Topic: 0.024*"urg" + 0.021*"health" + 0.019*"servic" + 0.018*"worker" + 0.017*"govt" + 0.016*"help" + 0.015*"oppo

It has the highest probability (0.61) to be part of the topic that we assigned as Topic X, which is the accurate classification.

---
### Performance evaluation by classifying sample document using LDA TF-IDF model

In [35]:
#Check which topic our test document belongs to using the LDA TF-IDF model.
# Our test document is document number 4310
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.48170599341392517	 
Topic: 0.018*"water" + 0.012*"drought" + 0.010*"plan" + 0.010*"govt" + 0.009*"council" + 0.008*"rain" + 0.008*"farm" + 0.008*"farmer" + 0.007*"fund" + 0.006*"urg"

Score: 0.25961044430732727	 
Topic: 0.046*"closer" + 0.008*"fish" + 0.007*"scientist" + 0.007*"cancer" + 0.006*"lake" + 0.006*"central" + 0.006*"doubt" + 0.006*"illeg" + 0.005*"monitor" + 0.005*"confirm"

Score: 0.17114600539207458	 
Topic: 0.021*"charg" + 0.020*"court" + 0.016*"murder" + 0.013*"face" + 0.013*"jail" + 0.013*"blaze" + 0.012*"polic" + 0.011*"assault" + 0.011*"firefight" + 0.010*"accus"

Score: 0.012507629580795765	 
Topic: 0.011*"govt" + 0.010*"health" + 0.009*"servic" + 0.008*"opposit" + 0.008*"indigen" + 0.007*"urg" + 0.007*"plan" + 0.007*"hospit" + 0.006*"union" + 0.006*"fund"

Score: 0.012506485916674137	 
Topic: 0.010*"hick" + 0.009*"price" + 0.009*"liber" + 0.008*"govt" + 0.008*"drink" + 0.008*"sale" + 0.007*"drive" + 0.007*"say" + 0.007*"water" + 0.006*"iemma"

Score: 0.012

#### It has the highest probability (59%) to be part of the topic that we assigned as topic X.

---
### Testing model on unseen document

In [36]:
unseen_document = "My favorite sports activities are running and swimming."

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.4201062023639679	 Topic: 0.023*"open" + 0.019*"win" + 0.015*"time" + 0.015*"leav" + 0.012*"australian"
Score: 0.2205706089735031	 Topic: 0.040*"govt" + 0.019*"hospit" + 0.019*"reject" + 0.018*"court" + 0.018*"face"
Score: 0.21926896274089813	 Topic: 0.015*"south" + 0.015*"rain" + 0.014*"australia" + 0.012*"guilti" + 0.012*"storm"
Score: 0.02001144550740719	 Topic: 0.024*"urg" + 0.021*"health" + 0.019*"servic" + 0.018*"worker" + 0.017*"govt"
Score: 0.020009590312838554	 Topic: 0.046*"plan" + 0.034*"council" + 0.019*"govt" + 0.019*"water" + 0.015*"group"
Score: 0.020006636157631874	 Topic: 0.032*"report" + 0.026*"iraq" + 0.021*"forc" + 0.017*"kill" + 0.016*"test"
Score: 0.020006630569696426	 Topic: 0.036*"charg" + 0.031*"polic" + 0.022*"miss" + 0.019*"murder" + 0.017*"attack"
Score: 0.020006630569696426	 Topic: 0.062*"polic" + 0.036*"crash" + 0.032*"death" + 0.026*"investig" + 0.020*"road"
Score: 0.020006630569696426	 Topic: 0.040*"warn" + 0.018*"die" + 0.017*"deal" + 0.017*"coa

The model correctly classifies the unseen document with '42'% probability to the X category.