To download the dataset used in this notebook, follow this link: [***Dataset***](https://drive.google.com/file/d/10MykpG-2TrCN-TQvz0JQ2HNd3Sw4e1xI/view?usp=drive_link)

# **Importing libraries & Some helper functions**  

In [None]:
import pandas as pd

In [None]:
import nltk
# Download the NLTK stopword corpus needed by the next line.
nltk.download('stopwords')
# English stopwords held in a set; remove_stopwords tests membership against it.
stop_words = set(nltk.corpus.stopwords.words('english'))
# Regex tokenizer that keeps runs of word characters (\w+), so punctuation is dropped.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

import spacy
from gensim import corpora
# Small English spaCy pipeline loaded with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def remove_stopwords(text : str):
    """Return `text` with English stopwords removed.

    The text is split with the module-level `tokenizer`. Tokens whose
    lowercase form appears in `stop_words` are discarded; the remaining
    tokens keep their original casing and are joined with single spaces.
    """
    kept_tokens = []
    for token in tokenizer.tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatize each text, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of strings, one per document.
        allowed_postags: Coarse POS tags (compared against `token.pos_`) to
            keep. Defaults to nouns and adjectives.

    Returns:
        A list with one entry per input text; each entry is the list of
        lemmas (`token.lemma_`) of the tokens whose POS tag is allowed.
    """
    # nlp.pipe processes the texts as a stream in batches, which avoids the
    # per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]

# **Load Data**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive/ so the CSV can be read from MyDrive.
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Load the articles CSV from the mounted Drive into a DataFrame.
data=pd.read_csv("/content/gdrive/MyDrive/articles1.csv")

In [None]:
# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [None]:
# Non-null value count per column.
data.count()

Unnamed: 0     50000
id             50000
title          50000
publication    50000
author         43694
date           50000
year           50000
month          50000
url                0
content        50000
dtype: int64

In [None]:
# Missing-value count per column.
data.isnull().sum()

Unnamed: 0         0
id                 0
title              0
publication        0
author          6306
date               0
year               0
month              0
url            50000
content            0
dtype: int64

# **Preprocessing**

**Preprocessing for the LDA algorithm**

In [None]:
# Unique, non-null article bodies, limited to the first 36000 entries.
data_=data["content"].drop_duplicates().dropna()[:36000]

In [None]:
# NOTE(review): rebinds data_ to all unique article bodies, replacing the
# dropna()/[:36000] result assigned in the preceding cell — confirm which
# subset is intended.
data_=data["content"].drop_duplicates()

In [None]:
# Strip stopwords (and punctuation, via the regex tokenizer) from every article.
data_=data_.apply(remove_stopwords)

# Show the first cleaned articles as a sanity check.
print(data_.head())

0    WASHINGTON Congressional Republicans new fear ...
1    bullet shells get counted blood dries votive c...
2    Walt Disney Bambi opened 1942 critics praised ...
3    Death may great equalizer necessarily evenhand...
4    SEOUL South Korea North Korea leader Kim said ...
Name: content, dtype: object


In [None]:
# One list of lemmas per article; lemmatization() defaults to keeping NOUN and ADJ tokens.
data_lemma = lemmatization(data_.tolist())

In [None]:
# Total number of lemmas kept across all articles.
print(sum(len(x) for x in data_lemma))

8148124


In [None]:

# Print the total number of kept tokens, humanized when `humanize` is installed.
from importlib.util import find_spec as isModule

# Count once and reuse the value in whichever branch runs.
total_tokens = sum(len(x) for x in data_lemma)
# find_spec returns None when the module cannot be found; compare by identity.
if isModule('humanize') is not None:
    from humanize import intword
    print(intword(total_tokens), " Tokenizations")
else:
    print(total_tokens, " Tokenizations")

# Show the lemma lists of the first two articles.
print(data_lemma[:2])

8.1 million  Tokenizations
[['new', 'fear', 'health', 'care', 'lawsuit', 'administration', 'incoming', 'administration', 'executive', 'branch', 'suit', 'administration', 'authority', 'billion', 'dollar', 'health', 'insurance', 'subsidy', 'big', 'victory', 'issue', 'sudden', 'loss', 'subsidy', 'health', 'care', 'program', 'implode', 'million', 'people', 'access', 'health', 'insurance', 'replacement', 'chaos', 'insurance', 'market', 'political', 'backlash', 'full', 'control', 'government', 'stave', 'outcome', 'awkward', 'position', 'huge', 'sum', 'health', 'care', 'law', 'conservative', 'voter', 'end', 'law', 'year', 'twist', 'administration', 'executive', 'branch', 'prerogative', 'republican', 'ally', 'central', 'question', 'ugly', 'political', 'pileup', 'transition', 'team', 'gaming', 'handle', 'lawsuit', 'election', 'limbo', 'late', 'ready', 'divulge', 'strategy', 'litigation', 'administration', 'inappropriate', 'comment', 'spokesman', 'transition', 'effort', 'office', 'administration

In [None]:
# Create a dictionary (token <-> integer id mapping) from the preprocessed data
dictionary = corpora.Dictionary(data_lemma)
# Bag of words: each document becomes a list of (token_id, count) pairs
corpus = [dictionary.doc2bow(doc) for doc in data_lemma]

In [None]:
# The bag-of-words list built above is bound to `corpus`; `doc_term_matrix`
# is not defined in the visible notebook and raises NameError on a fresh kernel.
print(corpus[:2])

[[(0, 1), (1, 13), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 5), (15, 1), (16, 7), (17, 4), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 5), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 11), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 5), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 4), (82, 2), (83, 3), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 2), (93, 1), (94, 2), (95, 2), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the bag-of-words documents for evaluation; the seed is fixed at 42.
train_data,test_data=train_test_split(corpus, test_size=0.3, random_state=42)

In [None]:
import gensim
Lda = gensim.models.LdaMulticore
# Train a 25-topic LDA model on the training split only.
# random_state is fixed (same seed as the train/test split) so that topics are
# as repeatable as possible between runs; NOTE(review): with multiple worker
# processes some run-to-run variation may remain — confirm against gensim docs.
ldamodel = Lda(corpus=train_data, id2word=dictionary, num_topics=25, passes=30, random_state=42)

In [None]:
# log_perplexity() returns the per-word likelihood bound on the held-out
# documents (higher is better), not the perplexity itself.
# Perplexity is 2 ** (-bound), and for that quantity lower is better.
per_word_bound = ldamodel.log_perplexity(test_data)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score (c_v) of the LDA topics against the lemmatized texts
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=data_lemma, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.174915078686011

Coherence Score:  0.5326707378242082


In [None]:
# Print the topics returned by show_topics(); ids are shifted by one so the
# printed numbering starts at 1 instead of 0.
for topic_id, topic_words in ldamodel.show_topics():
    print(f"Topic #{topic_id + 1}: {topic_words}\n")

Topic #14: 0.394*"woman" + 0.070*"abortion" + 0.033*"female" + 0.031*"baby" + 0.022*"birth" + 0.020*"death" + 0.018*"execution" + 0.013*"pregnancy" + 0.012*"life" + 0.011*"pregnant"

Topic #12: 0.040*"campaign" + 0.026*"republican" + 0.025*"vote" + 0.025*"election" + 0.025*"candidate" + 0.024*"presidential" + 0.023*"voter" + 0.018*"nominee" + 0.018*"state" + 0.016*"party"

Topic #25: 0.057*"police" + 0.035*"officer" + 0.022*"man" + 0.011*"video" + 0.010*"crime" + 0.010*"death" + 0.010*"victim" + 0.009*"authority" + 0.009*"prison" + 0.008*"murder"

Topic #3: 0.041*"company" + 0.016*"new" + 0.014*"year" + 0.010*"car" + 0.009*"product" + 0.009*"technology" + 0.007*"customer" + 0.007*"employee" + 0.007*"service" + 0.007*"business"

Topic #11: 0.016*"administration" + 0.016*"official" + 0.012*"country" + 0.011*"leader" + 0.011*"policy" + 0.010*"president" + 0.009*"russian" + 0.009*"nuclear" + 0.009*"foreign" + 0.009*"government"

Topic #15: 0.066*"student" + 0.053*"school" + 0.049*"twitter"

In [None]:
from gensim.models import HdpModel
# Fit an HDP topic model on the same training split used for LDA.
# The seed is fixed so the fit is intended to be repeatable across kernel restarts.
hdp_model = HdpModel(train_data, id2word=dictionary, random_state=42)

# The topics are printed in a separate cell with hdp_model.show_topics(formatted=True).




Topic 0: 0.009*people + 0.008*year + 0.006*time + 0.005*country + 0.004*many + 0.004*new + 0.004*last + 0.004*day + 0.004*government + 0.004*state + 0.003*woman + 0.003*man + 0.003*official + 0.003*way + 0.003*group + 0.003*first + 0.003*company + 0.003*week + 0.003*police + 0.003*good

Topic 1: 0.010*campaign + 0.009*people + 0.006*election + 0.006*state + 0.006*year + 0.006*percent + 0.006*voter + 0.006*time + 0.006*candidate + 0.005*vote + 0.005*presidential + 0.005*republican + 0.005*former + 0.005*last + 0.004*poll + 0.004*week + 0.004*country + 0.004*president + 0.004*email + 0.004*political

Topic 2: 0.008*people + 0.008*year + 0.006*time + 0.005*new + 0.004*last + 0.004*company + 0.004*country + 0.004*day + 0.004*missile + 0.003*first + 0.003*week + 0.003*good + 0.003*many + 0.003*man + 0.003*way + 0.003*campaign + 0.003*woman + 0.003*former + 0.003*world + 0.003*official

Topic 3: 0.006*people + 0.004*year + 0.004*wall + 0.004*time + 0.003*police + 0.003*student + 0.003*countr

AttributeError: ignored

In [None]:
# c_v coherence of the HDP topics, scored against the lemmatized texts.
coherence_model_HDP = CoherenceModel(model=hdp_model, texts=data_lemma, dictionary=dictionary , coherence='c_v')
coherence_score = coherence_model_HDP.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.42631258254390364


In [None]:
# Print each HDP topic returned by show_topics() as a formatted "weight*word + ..." string.
for topic_id, topic in hdp_model.show_topics(formatted=True):
    print(f"Topic {topic_id}: {topic}\n")



Topic 0: 0.009*people + 0.008*year + 0.006*time + 0.005*country + 0.004*many + 0.004*new + 0.004*last + 0.004*day + 0.004*government + 0.004*state + 0.003*woman + 0.003*man + 0.003*official + 0.003*way + 0.003*group + 0.003*first + 0.003*company + 0.003*week + 0.003*police + 0.003*good

Topic 1: 0.010*campaign + 0.009*people + 0.006*election + 0.006*state + 0.006*year + 0.006*percent + 0.006*voter + 0.006*time + 0.006*candidate + 0.005*vote + 0.005*presidential + 0.005*republican + 0.005*former + 0.005*last + 0.004*poll + 0.004*week + 0.004*country + 0.004*president + 0.004*email + 0.004*political

Topic 2: 0.008*people + 0.008*year + 0.006*time + 0.005*new + 0.004*last + 0.004*company + 0.004*country + 0.004*day + 0.004*missile + 0.003*first + 0.003*week + 0.003*good + 0.003*many + 0.003*man + 0.003*way + 0.003*campaign + 0.003*woman + 0.003*former + 0.003*world + 0.003*official

Topic 3: 0.006*people + 0.004*year + 0.004*wall + 0.004*time + 0.003*police + 0.003*student + 0.003*countr

In [None]:
new_document = "I love playing soccer and watching basketball."
# Preprocess exactly like the training articles (stopword removal, then
# NOUN/ADJ lemmatization). Plain .lower().split() leaves punctuation attached
# ("basketball.") and keeps stopwords, so tokens fail to match the dictionary.
new_doc_tokens = lemmatization([remove_stopwords(new_document)])[0]
new_doc_bow = dictionary.doc2bow(new_doc_tokens)
# Indexing the model with a bag-of-words yields (topic_id, weight) pairs.
new_doc_topic_dist = hdp_model[new_doc_bow]
if new_doc_topic_dist:
    most_likely_topic = max(new_doc_topic_dist, key=lambda x: x[1])[0]
    print(f"Most likely topic for the new document: {most_likely_topic}")
else:
    # max() would raise ValueError on an empty distribution.
    print("No topic could be assigned to the new document.")

Most likely topic for the new document: 0


In [None]:
# Keep the (topic_id, formatted_topic_string) pairs so a later cell can print them.
topics = ldamodel.print_topics()

In [None]:
# Install the system `locales` package quietly, then generate the en_US.UTF-8 locale.
!apt-get install -y -qq locales > /dev/null
!locale-gen en_US.UTF-8

NotImplementedError: ignored

In [None]:
import os
# Point both locale-related environment variables at en_US.UTF-8.
os.environ.update({
    'LC_ALL': 'en_US.UTF-8',
    'LANG': 'en_US.UTF-8',
})

In [None]:
# Install a pinned pyLDAvis release for the topic visualization.
# NOTE(review): a later cell imports `pyLDAvis.gensim_models` — confirm that
# module ships in the 2.1.2 release pinned here.
!pip install pyLDAvis==2.1.2

NotImplementedError: ignored

In [None]:
# NOTE(review): repeats the identical install command from the preceding cell; likely redundant.
!pip install pyLDAvis==2.1.2

NotImplementedError: ignored

In [None]:
# Print each (topic_id, formatted_topic_string) pair collected from ldamodel.print_topics().
for topic in topics:
    print(topic)

(11, '0.046*"black" + 0.032*"white" + 0.025*"group" + 0.022*"people" + 0.022*"protest" + 0.018*"protester" + 0.018*"speech" + 0.016*"community" + 0.016*"event" + 0.014*"right"')
(10, '0.016*"store" + 0.013*"year" + 0.012*"building" + 0.012*"new" + 0.011*"city" + 0.010*"home" + 0.008*"restaurant" + 0.008*"project" + 0.008*"work" + 0.008*"place"')
(27, '0.059*"drug" + 0.043*"book" + 0.025*"death" + 0.023*"human" + 0.015*"cuban" + 0.014*"history" + 0.014*"year" + 0.012*"execution" + 0.010*"story" + 0.009*"author"')
(9, '0.030*"film" + 0.029*"show" + 0.026*"movie" + 0.018*"year" + 0.017*"tv" + 0.015*"series" + 0.015*"actor" + 0.013*"character" + 0.012*"new" + 0.011*"star"')
(19, '0.040*"people" + 0.023*"thing" + 0.018*"way" + 0.015*"good" + 0.013*"time" + 0.012*"lot" + 0.010*"many" + 0.008*"bad" + 0.008*"question" + 0.007*"right"')
(24, '0.028*"country" + 0.022*"political" + 0.021*"government" + 0.017*"leader" + 0.014*"party" + 0.014*"power" + 0.013*"year" + 0.012*"world" + 0.011*"nation" 

In [None]:
# Pin pandas to 1.5.3.
# NOTE(review): presumably for pyLDAvis compatibility — confirm; pandas is
# already imported at the top of the notebook, so a kernel restart may be
# needed for the pin to take effect.
!pip install pandas==1.5.3

NotImplementedError: ignored

In [None]:
import os
import pickle

import pandas as pd
import pyLDAvis

# The gensim adapter module is `pyLDAvis.gensim_models` in newer pyLDAvis
# releases and `pyLDAvis.gensim` in older ones; accept either so the cell works
# with whichever release the install cell provided.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_data_filepath = os.path.join('/content/gdrive_'+str(30))

# Preparing the visualization is time consuming. Set this to False to reuse
# the pickle written by a previous run instead of recomputing it.
RECOMPUTE_LDAVIS = True

if RECOMPUTE_LDAVIS:
    # `corpus` is the bag-of-words list built with dictionary.doc2bow;
    # `doc_term_matrix` is not defined in the visible notebook.
    LDAvis_prepared = gensimvis.prepare(ldamodel, corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # Load the pre-prepared pyLDAvis data from disk.
    # pickle.load can execute arbitrary code: only load a file this notebook wrote.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, '/content/gdrive_'+ str(30) +'.html')

# Last expression of the cell: renders the interactive visualization inline.
LDAvis_prepared

  and should_run_async(code)


In [None]:
import gensim

# LdaMulticore has no `update_every` parameter and does not support
# alpha='auto', so the original call failed with TypeError. The single-process
# LdaModel accepts both, so it is used here with the same hyperparameters.
# The training input is `corpus` (the doc2bow list); `doc_term_matrix` is not
# defined in the visible notebook.
lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=30,
                                   random_state=100,
                                   update_every=1,
                                   chunksize=100,
                                   passes=30,
                                   alpha='auto',
                                   per_word_topics=True)

TypeError: ignored