In [5]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [6]:
# Load the news articles from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="news_dataset.csv")
# Bare last expression so the notebook renders the loaded frame
data

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2022-08-02 13:48:37.251043
1,17,I recently posted an article asking what kind ...,7,rec.autos,2022-08-02 13:48:37.251043
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2022-08-02 13:48:37.251043
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2022-08-02 13:48:37.251043
4,64,: Ford and his automobile. I need information...,7,rec.autos,2022-08-02 13:48:37.251043
...,...,...,...,...,...
11309,11210,Secrecy in Clipper Chip\n\nThe serial number o...,11,sci.crypt,2022-08-02 13:48:37.251043
11310,11217,Hi !\n\nI am interested in the source of FEAL ...,11,sci.crypt,2022-08-02 13:48:37.251043
11311,11243,"The actual algorithm is classified, however, t...",11,sci.crypt,2022-08-02 13:48:37.251043
11312,11254,\n\tThis appears to be generic calling upon th...,11,sci.crypt,2022-08-02 13:48:37.251043


In [7]:
# Clean text
# Built once when this cell runs instead of once per article: the stop-word
# set, the lemmatizer and the compiled pattern are identical for every call.
# NOTE(review): this needs the NLTK stop-word, WordNet and tokenizer data to be
# installed already; no visible cell calls nltk.download - confirm the setup.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_NON_ALPHA = re.compile(r'[^a-zA-Z]')


def preprocess(text):
    """Turn one raw article into a list of cleaned, lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is tokenized with nltk.word_tokenize.
    Tokens that are English stop words or have two characters or fewer are
    dropped; the remaining tokens are lemmatized. Both filters are applied to
    the token *before* lemmatization.

    Args:
        text: The raw article. Non-string values are converted with str();
            a missing value (None or a float NaN) is treated as empty text.

    Returns:
        A list of token strings; empty when the text is missing or when no
        token survives the filters.
    """
    # A missing cell arrives as None/NaN. str() would turn it into the literal
    # word "none"/"nan", which is long enough to pass the length filter and
    # could end up as a spurious one-word document instead of an empty one.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return []

    # Whitespace is non-alphabetic as well, so this single substitution also
    # turns tabs/newlines into spaces; no separate whitespace pass is needed
    # because the tokenizer does not care how many spaces separate two words.
    text = _NON_ALPHA.sub(' ', str(text).lower())
    tokens = nltk.word_tokenize(text)
    return [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORDS and len(token) > 2
    ]

# Tokenize every article and store the result in a new 'tokens' column
data['tokens'] = data['text'].map(preprocess)

In [9]:
# Keep only the articles that still have at least one token
data = data[data['tokens'].map(len) > 0]

In [27]:
# Build the token <-> id vocabulary from the tokenized articles
dictionary = corpora.Dictionary(data['tokens'])

# Prune the vocabulary: drop tokens found in fewer than 15 documents or in
# more than 50% of all documents
dictionary.filter_extremes(no_below=15, no_above=0.5)

# Bag-of-words representation of every article, in the row order of `data`
corpus = list(map(dictionary.doc2bow, data['tokens']))

# Fit the LDA topic model (fixed random_state so the run is repeatable)
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=9,
    passes=10,
    random_state=42,
)

In [28]:
# Interpret Results
# Dominant topic (the one with the highest probability) for each document.
# `corpus` already holds the bag-of-words of every row of data['tokens'], in
# the same order, so it is reused here instead of calling doc2bow a second
# time for each document.
# NOTE(review): a document whose tokens were all pruned from the dictionary by
# filter_extremes has an empty bag-of-words and still receives a label here -
# confirm whether such documents should be excluded or flagged.
article_labels = [
    # each entry of get_document_topics is a (topic_id, probability) pair
    max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])[0]
    for bow in corpus
]

# Create DataFrame
# The labels are paired with the articles by position; the resulting frame
# keeps the (filtered, non-contiguous) index of data['text'].
df_result = pd.DataFrame({"Article": data['text'], "Topic": article_labels})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      0
1      I recently posted an article asking what kind ...      0
2      \nIt depends on your priorities.  A lot of peo...      0
3      an excellent automatic can be found in the sub...      0
4      : Ford and his automobile.  I need information...      0
...                                                  ...    ...
11309  Secrecy in Clipper Chip\n\nThe serial number o...      6
11310  Hi !\n\nI am interested in the source of FEAL ...      8
11311  The actual algorithm is classified, however, t...      6
11312  \n\tThis appears to be generic calling upon th...      0
11313  \nProbably keep quiet and take it, lest they g...      0

[11214 rows x 2 columns]



In [29]:
from gensim.models import CoherenceModel

# Score the fitted topics with the 'c_v' coherence measure, computed from the
# tokenized articles and the pruned dictionary
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data['tokens'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()

# Report the score rounded to four decimals
print(f"Coherence Score: {coherence_score:.4f}")

Coherence Score: 0.6201


# Interpretation

A coherence score of 0.6201 indicates that the topics produced by the LDA model are reasonably meaningful and semantically interpretable: the most prominent words within each topic tend to co-occur in the dataset and therefore form coherent topics. The score suggests a reasonable balance between topic diversity and specificity. It is not exceptionally high, but it reflects a model that performs well for exploratory topic analysis.