In [9]:
import pickle

# Open the pickle file in binary mode
with open('positive_sent_articles.pkl', 'rb') as file:
    # Load the object from the pickle file
    df_p = pickle.load(file)

## Positive Articles LDA Topics
Let us work on the positively classified articles first

In [11]:
df = df_p

In [12]:
len(df)

60888

In [13]:
import nltk
from nltk.corpus import stopwords
import pandas as pd

# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')

# Get the column as a Series
column_data = df['clean_text']

# Combine all the words in the column into a single string
combined_text = ' '.join(column_data.astype(str))

# Split the string into individual words
words = combined_text.split()

# Remove stopwords
stopwords_set = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stopwords_set]

# Count the frequency of each word
word_counts = pd.Series(filtered_words).value_counts()

# Get the most common words
most_common_words = word_counts.head(10)  # Change the number to get more or fewer common words

print(most_common_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AI              344650
market          180004
data            149812
,               126852
Artificial      115890
-               115299
Intelligence    110646
new             104689
Market          103273
also             90085
Name: count, dtype: int64


In [14]:
len(df)

60888

In [15]:
#!pip install gensim
#!pip install pyLDAvis

In [16]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# import pyLDAvis.gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [17]:
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [18]:
import warnings
warnings.simplefilter('ignore')

In [21]:
%%time

#Parallelly cleaning the text

from multiprocessing import Pool

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def parallel_clean(data):
    with Pool() as pool:
        results = pool.map(clean, data)
    return [result.split() for result in results]

news_list = df['clean_text'].tolist()
news_clean = parallel_clean(news_list)

title_list = df['title'].tolist()
title_clean = parallel_clean(title_list)

CPU times: user 7.63 s, sys: 8.91 s, total: 16.5 s
Wall time: 48.4 s


In [22]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(title_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in news_clean]

CPU times: user 16.6 s, sys: 1.08 s, total: 17.7 s
Wall time: 17.7 s


In [38]:
# Save the dictionary and corpus
with open('dictionary.pkl', 'wb') as f:
    pickle.dump(dictionary, f)
with open('corpus.pkl', 'wb') as f:
    pickle.dump(doc_term_matrix, f)

In [24]:
import multiprocessing
num_processors = multiprocessing.cpu_count()
num_processors

32

In [35]:
%%time

import gensim
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

# Build the LDA model
num_topics = 10
iterations = 100
passes = 20
workers = num_processors - 1
eval_every = None

ldamodel_pos = LdaMulticore(corpus=doc_term_matrix,
                        id2word=dictionary,
                        eta='auto',
                        num_topics=num_topics,
                        iterations=iterations,
                        passes=passes,
                        eval_every=eval_every,
                        workers=workers)

# Print the topics
print(*ldamodel_pos.print_topics(num_topics=num_topics, num_words=15), sep='\n')

# Save the model
ldamodel_pos.save('ldamodel_pos')

# Compute coherence score
coherence_model_lda = CoherenceModel(model=ldamodel_pos, texts=title_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

(0, '0.009*"ai" + 0.007*"data" + 0.007*"company" + 0.006*"insurance" + 0.006*"microsoft" + 0.006*"technology" + 0.005*"search" + 0.005*"new" + 0.004*"product" + 0.004*"information" + 0.004*"service" + 0.004*"website" + 0.004*"und" + 0.004*"market" + 0.004*"also"')
(1, '0.032*"ai" + 0.016*"data" + 0.009*"technology" + 0.008*"gray" + 0.007*"company" + 0.007*"group" + 0.007*"medium" + 0.007*"solution" + 0.006*"platform" + 0.005*"business" + 0.005*"new" + 0.005*"learning" + 0.005*"release" + 0.005*"inc" + 0.005*"model"')
(2, '0.015*"data" + 0.012*"learning" + 0.009*"machine" + 0.008*"ai" + 0.008*"model" + 0.006*"science" + 0.005*"new" + 0.005*"system" + 0.004*"one" + 0.004*"use" + 0.004*"also" + 0.003*"stock" + 0.003*"time" + 0.003*"using" + 0.003*"algorithm"')
(3, '0.014*"patient" + 0.013*"ai" + 0.010*"medical" + 0.009*"health" + 0.009*"clinical" + 0.008*"technology" + 0.008*"care" + 0.007*"healthcare" + 0.007*"gray" + 0.006*"group" + 0.006*"solution" + 0.006*"medium" + 0.006*"cancer" + 0

### Summaries

The topics can be summarized as follows:

Topic 0: AI, data, company, insurance, Microsoft, technology, search, new, product, information, service, website, und, market, also.

Topic 1: AI, data, technology, gray, company, group, medium, solution, platform, business, new, learning, release, inc, model.

Topic 2: Data, learning, machine, AI, model, science, new, system, one, use, also, stock, time, using, algorithm.

Topic 3: Patient, AI, medical, health, clinical, technology, care, healthcare, gray, group, solution, medium, cancer, disease, company.

Topic 4: AI, customer, technology, company, experience, medium, solution, new, gray, group, data, statement, platform, service, inc.

Topic 5: AI, new, chatgpt, u, like, one, image, also, work, technology, people, time, make, news, get.

Topic 6: AI, intelligence, artificial, u, data, also, technology, news, one, human, new, work, like, job, year.

Topic 7: Market, report, intelligence, artificial, analysis, global, research, growth, industry, key, data, AI, player, forecast, trend.

Topic 8: Market, AI, technology, intelligence, artificial, company, industry, u, global, growth, research, service, business, report, data.

Topic 9: AI, company, medium, technology, gray, content, group, platform, new, release, business, data, intelligence, press, said.

Please note that these summaries are based on the keywords with the highest weights in each topic and may not capture the complete meaning and context of the topics.

In [37]:
# Visualize the topics
lda_display = gensimvis.prepare(ldamodel_pos, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)