In [1]:
import pickle

# Open the pickle file in binary mode
with open('negative_sent_articles.pkl', 'rb') as file:
    # Load the object from the pickle file
    df_n = pickle.load(file)

## Negative Articles LDA Topics

In [2]:
df = df_n

In [3]:
len(df)

70401

In [4]:
import nltk
from nltk.corpus import stopwords
import pandas as pd

# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')

# Get the column as a Series
column_data = df['clean_text']

# Combine all the words in the column into a single string
combined_text = ' '.join(column_data.astype(str))

# Split the string into individual words
words = combined_text.split()

# Remove stopwords
stopwords_set = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stopwords_set]

# Count the frequency of each word
word_counts = pd.Series(filtered_words).value_counts()

# Get the most common words
most_common_words = word_counts.head(10)  # Change the number to get more or fewer common words

print(most_common_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AI            412781
,             174787
data          163763
new           115785
also          111869
said          102470
market         97890
use            93274
.              92266
technology     89398
Name: count, dtype: int64


In [5]:
len(df)

70401

In [6]:
#!pip install gensim
#!pip install pyLDAvis

In [7]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# import pyLDAvis.gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [8]:
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [9]:
import warnings
warnings.simplefilter('ignore')

In [10]:
%%time

#Parallelly cleaning the text

from multiprocessing import Pool

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def parallel_clean(data):
    with Pool() as pool:
        results = pool.map(clean, data)
    return [result.split() for result in results]

news_list = df['clean_text'].tolist()
news_clean = parallel_clean(news_list)

title_list = df['title'].tolist()
title_clean = parallel_clean(title_list)

CPU times: user 10.3 s, sys: 7.83 s, total: 18.1 s
Wall time: 56.6 s


In [11]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(title_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in news_clean]

CPU times: user 22.2 s, sys: 1.16 s, total: 23.3 s
Wall time: 23.3 s


In [14]:
# Save the dictionary and corpus
with open('dictionary_neg.pkl', 'wb') as f:
    pickle.dump(dictionary, f)
with open('corpus_neg.pkl', 'wb') as f:
    pickle.dump(doc_term_matrix, f)

In [15]:
import multiprocessing
num_processors = multiprocessing.cpu_count()
num_processors

32

In [18]:
%%time

import gensim
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

# Build the LDA model
num_topics = 10
iterations = 100
passes = 20
workers = num_processors - 1
eval_every = None

ldamodel_neg = LdaMulticore(corpus=doc_term_matrix,
                        id2word=dictionary,
                        eta='auto',
                        num_topics=num_topics,
                        iterations=iterations,
                        passes=passes,
                        eval_every=eval_every,
                        workers=workers)

# Print the topics
print(*ldamodel_neg.print_topics(num_topics=num_topics, num_words=15), sep='\n')

# Save the model
ldamodel_neg.save('ldamodel_neg')

# Compute coherence score
coherence_model_lda = CoherenceModel(model=ldamodel_neg, texts=title_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

(0, '0.020*"news" + 0.020*"email" + 0.015*"ai" + 0.013*"machine" + 0.011*"learning" + 0.011*"account" + 0.009*"dollar" + 0.009*"password" + 0.008*"traded" + 0.008*"trade" + 0.008*"link" + 0.007*"facebook" + 0.007*"u" + 0.007*"btc" + 0.007*"said"')
(1, '0.009*"said" + 0.007*"u" + 0.007*"ai" + 0.006*"news" + 0.005*"intelligence" + 0.005*"artificial" + 0.005*"new" + 0.005*"technology" + 0.004*"also" + 0.003*"world" + 0.003*"year" + 0.003*"time" + 0.003*"comment" + 0.003*"people" + 0.003*"would"')
(2, '0.031*"image" + 0.008*"ai" + 0.006*"content" + 0.006*"technology" + 0.006*"said" + 0.006*"system" + 0.006*"user" + 0.006*"video" + 0.005*"model" + 0.005*"company" + 0.005*"openai" + 0.005*"deepfake" + 0.005*"medium" + 0.005*"it’s" + 0.005*"new"')
(3, '0.023*"ai" + 0.013*"data" + 0.010*"customer" + 0.009*"business" + 0.008*"technology" + 0.007*"company" + 0.006*"platform" + 0.006*"solution" + 0.006*"medium" + 0.005*"gray" + 0.005*"group" + 0.005*"help" + 0.005*"service" + 0.005*"new" + 0.004*

In [19]:
# Visualize the topics
lda_display = gensimvis.prepare(ldamodel_neg, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)