In [None]:
# Lab Assignment 3 - Topic Modeling
# Name: Muhammad Adam Khan bin Abu Ganim Khan, Muhammad Farhan bin Abdul Mutalib
# Student ID: IS01083956, IS01082992

# Step 1: Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# Step 2: Download NLTK resources (requires internet)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Step 3: Load the data
df = pd.read_csv("news_dataset.csv")
documents = df['text'].dropna().tolist()

# Step 4: Preprocess the data
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Step 5: Create the document-term matrix
dictionary = corpora.Dictionary(preprocessed_documents)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

# Step 6: Run LDA
lda_model = LdaModel(corpus=corpus, num_topics=4, id2word=dictionary, passes=15, random_state=42)

# Step 7: Print top terms for each topic
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

# Step 8: Assign dominant topic to each document
article_labels = []
for doc in preprocessed_documents:
    bow = dictionary.doc2bow(doc)
    topics = lda_model.get_document_topics(bow)
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    article_labels.append(dominant_topic)

# Step 9: Create and show result DataFrame
df_result = pd.DataFrame({"Document": documents, "Topic": article_labels})
print("Table with Document Topics:")
print(df_result.head())

# Step 10: Evaluate with Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"\nCoherence Score: {coherence_score:.4f}")

# Step 11: Interpretation
print("\n--- Interpretation ---")
print("The Coherence Score indicates how meaningful the topics are.")
print("Scores closer to 1.0 suggest better topic separation and clarity.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mfarh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mfarh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mfarh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top Terms for Each Topic:
Topic 0:
- "1" (weight: 0.061)
- "0" (weight: 0.045)
- "max" (weight: 0.043)
- "2" (weight: 0.042)
- "q" (weight: 0.041)
- "7" (weight: 0.028)
- "g" (weight: 0.028)
- "r" (weight: 0.027)
- "3" (weight: 0.026)
- "p" (weight: 0.023)

Topic 1:
- "would" (weight: 0.013)
- "one" (weight: 0.012)
- "know" (weight: 0.008)
- "people" (weight: 0.008)
- "like" (weight: 0.008)
- "think" (weight: 0.007)
- "say" (weight: 0.006)
- "get" (weight: 0.006)
- "god" (weight: 0.006)
- "thing" (weight: 0.006)

Topic 2:
- "people" (weight: 0.007)
- "government" (weight: 0.007)
- "year" (weight: 0.007)
- "would" (weight: 0.006)
- "state" (weight: 0.005)
- "right" (weight: 0.005)
- "president" (weight: 0.005)
- "one" (weight: 0.005)
- "said" (weight: 0.004)
- "game" (weight: 0.004)

Topic 3:
- "x" (weight: 0.014)
- "key" (weight: 0.012)
- "use" (weight: 0.009)
- "file" (weight: 0.009)
- "system" (weight: 0.009)
- "chip" (weight: 0.007)
- "program" (weight: 0.006)
- "db" (weight: 0.006)