### Dataset

In [1]:
import pandas as pd

# Load the headlines CSV, skipping malformed rows instead of raising.
# `on_bad_lines='skip'` replaces the `error_bad_lines=False` flag, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')

# Keep only the headline column. The explicit copy makes `data_text` an
# independent frame, so adding a column below does not write into a slice of
# `data` (which would raise SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text

### Pre-Processing

In [2]:

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so that code drawing from it
# produces the same sequence on every fresh run of the notebook.
np.random.seed(2018)

In [None]:
import nltk
# Fetch the 'wordnet' resource through NLTK's downloader.
# NOTE(review): presumably required by the WordNetLemmatizer used in the
# preprocessing functions below -- confirm against the installed NLTK version.
nltk.download('wordnet')

In [3]:
# Stemming is not applied because it does not give valid results here.
# Instead, each token is lemmatized (as a verb).
# Stopwords and tokens of 3 characters or fewer are removed.
def lemmatize_stemming(text):
    """Return the WordNet lemma of ``text``, treating it as a verb.

    Despite the name, no stemming is performed: the word is only passed
    through ``WordNetLemmatizer.lemmatize`` with ``pos='v'``.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(text, pos='v')
    return lemma

def preprocess(text):
    """Tokenize ``text`` and return the list of its cleaned, lemmatized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that
    appear in gensim's ``STOPWORDS`` or that are 3 characters or shorter are
    dropped; every remaining token is passed through ``lemmatize_stemming``.

    The directly imported ``simple_preprocess`` and ``STOPWORDS`` names are
    used instead of the ``gensim.utils`` / ``gensim.parsing`` module paths, so
    the function does not depend on ``import gensim`` having been executed in
    a later cell.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 3
    ]

In [4]:
import gensim

# Run the preprocessing function over every headline; each entry of the
# resulting Series is the list of tokens returned by `preprocess`.
headlines = documents['headline_text']
processed_docs = headlines.map(preprocess)

# Preview the first ten processed headlines.
processed_docs.head(10)

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

In [5]:
# Build the token <-> integer id mapping from all processed headlines.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

In [6]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15
# documents or in more than 50% of documents, then keep at most the 100000
# most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Convert every tokenized headline into its bag-of-words representation,
# a list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [7]:
from pprint import pprint

from gensim import corpora, models

# Fit TF-IDF weights on the bag-of-words corpus, then wrap that corpus so it
# yields the re-weighted documents.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show only the first transformed document as a sanity check.
doc = next(iter(corpus_tfidf), None)
if doc is not None:
    pprint(doc)

[(0, 0.5918674193999763),
 (1, 0.3937180767686992),
 (2, 0.5009876624450964),
 (3, 0.49365007440105513)]


In [8]:
# training lda model
# Train the LDA model on the bag-of-words corpus.
# An explicit `random_state` makes the initialisation independent of whatever
# has already consumed the global NumPy RNG, so re-running this cell alone
# starts from the same state. Results can still vary slightly because of the
# scheduling of the worker processes.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

In [9]:
# Print every topic of the bag-of-words LDA model with its top weighted words.
bow_topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print(bow_topic_template.format(topic_id, topic_terms))

Topic: 0 
Words: 0.059*"australia" + 0.025*"australian" + 0.023*"china" + 0.020*"sydney" + 0.020*"world" + 0.020*"coronavirus" + 0.018*"open" + 0.015*"border" + 0.012*"win" + 0.012*"women"
Topic: 1 
Words: 0.023*"market" + 0.020*"record" + 0.015*"year" + 0.013*"lose" + 0.012*"care" + 0.012*"price" + 0.012*"years" + 0.011*"business" + 0.011*"australian" + 0.010*"age"
Topic: 2 
Words: 0.053*"coronavirus" + 0.030*"government" + 0.021*"covid" + 0.015*"rise" + 0.015*"restrictions" + 0.014*"water" + 0.013*"royal" + 0.013*"scott" + 0.012*"tasmanian" + 0.011*"concern"
Topic: 3 
Words: 0.027*"kill" + 0.022*"die" + 0.019*"coast" + 0.018*"shoot" + 0.017*"miss" + 0.017*"crash" + 0.015*"attack" + 0.015*"gold" + 0.014*"dead" + 0.013*"island"
Topic: 4 
Words: 0.040*"police" + 0.026*"charge" + 0.026*"case" + 0.025*"court" + 0.020*"death" + 0.020*"murder" + 0.017*"face" + 0.013*"jail" + 0.013*"people" + 0.012*"arrest"
Topic: 5 
Words: 0.054*"trump" + 0.026*"test" + 0.020*"tasmania" + 0.014*"morrison" +

In [15]:
# Train a second LDA model on the TF-IDF weighted corpus.
# An explicit `random_state` makes the initialisation independent of whatever
# has already consumed the global NumPy RNG (for example the model trained on
# the bag-of-words corpus), so re-running this cell alone starts from the same
# state. Results can still vary slightly because of the scheduling of the
# worker processes.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=2018,
)

In [11]:
# Print every topic of the TF-IDF LDA model with its top weighted words.
tfidf_topic_template = 'Topic: {} Word: {}'
for topic_id, topic_terms in lda_model_tfidf.print_topics(num_topics=-1):
    print(tfidf_topic_template.format(topic_id, topic_terms))

Topic: 0 Word: 0.031*"coronavirus" + 0.027*"covid" + 0.012*"market" + 0.009*"australia" + 0.009*"victoria" + 0.008*"scott" + 0.008*"case" + 0.008*"australian" + 0.007*"record" + 0.006*"share"
Topic: 1 Word: 0.009*"street" + 0.008*"violence" + 0.006*"domestic" + 0.006*"energy" + 0.006*"food" + 0.006*"outback" + 0.005*"rat" + 0.005*"rail" + 0.005*"andrews" + 0.005*"ash"
Topic: 2 Word: 0.016*"rural" + 0.014*"news" + 0.010*"health" + 0.010*"royal" + 0.009*"live" + 0.009*"commission" + 0.008*"national" + 0.007*"finance" + 0.007*"david" + 0.006*"mental"
Topic: 3 Word: 0.021*"trump" + 0.018*"police" + 0.015*"charge" + 0.012*"murder" + 0.012*"crash" + 0.011*"woman" + 0.010*"court" + 0.009*"shoot" + 0.009*"death" + 0.009*"kill"
Topic: 4 Word: 0.016*"country" + 0.011*"hour" + 0.007*"turnbull" + 0.007*"korea" + 0.006*"north" + 0.006*"prison" + 0.006*"action" + 0.005*"morning" + 0.005*"malcolm" + 0.005*"syria"
Topic: 5 Word: 0.019*"donald" + 0.009*"friday" + 0.009*"government" + 0.009*"climate" + 

### Testing both the models

In [None]:
# Bag Of Words
from gensim.models.coherencemodel import CoherenceModel

# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. A higher bound (closer to 0) is better; the perplexity is
# 2 ** (-bound), and for that value lower is better.
bow_per_word_bound = lda_model.log_perplexity(bow_corpus)
print('\nPer-word likelihood bound: ', bow_per_word_bound)
print('\nPerplexity: ', np.exp2(-bow_per_word_bound))  # lower is better

# Compute Coherence Score (c_v; higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# TFIDF
from gensim.models.coherencemodel import CoherenceModel

# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. A higher bound (closer to 0) is better; the perplexity is
# 2 ** (-bound), and for that value lower is better.
# NOTE(review): the bound is evaluated on `bow_corpus`, not `corpus_tfidf`.
# That keeps the number on the same corpus as the bag-of-words model's
# evaluation -- confirm this is the intended comparison.
tfidf_per_word_bound = lda_model_tfidf.log_perplexity(bow_corpus)
print('\nPer-word likelihood bound: ', tfidf_per_word_bound)
print('\nPerplexity: ', np.exp2(-tfidf_per_word_bound))  # lower is better

# Compute Coherence Score (c_v; higher is better)
# Dedicated variable names are used so the bag-of-words model's coherence
# results are not overwritten and both scores stay available for comparison.
coherence_model_lda_tfidf = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda_tfidf = coherence_model_lda_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_lda_tfidf)

In [14]:
# Since the TF-IDF model has a higher coherence score, it is more effective than the bag-of-words model.