In [3]:
# Name: Siti Nur Lyana binti Mohd Nazri, Mohammad Naufal Eiman bin Shahromi
# ID: IS01082645, IS01082514

import pandas as pd
import nltk
import re
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import CoherenceModel
import warnings
warnings.filterwarnings("ignore")


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


df = pd.read_csv("news_dataset.csv")
df = df[['text']]


df.dropna(inplace=True)


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    
    text = text.lower()
    
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    tokens = nltk.word_tokenize(text)

    tokens = [word for word in tokens if word not in stop_words and len(word) > 3]
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

df['tokens'] = df['text'].apply(preprocess)


dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]


lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=4,
                                   random_state=42,
                                   passes=10,
                                   per_word_topics=True)


for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")

coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokens'], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print("\nCoherence Score:", coherence_score)

print("\n--- Interpretation ---")
print("The coherence score indicates how consistent and semantically meaningful the topics are.")
print("Scores range from 0 to 1. A higher score (closer to 1) means better topic modeling.")
print("In this case, a score of", round(coherence_score, 3), "suggests the topics are reasonably interpretable.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Topic 1: 0.010*"file" + 0.007*"program" + 0.007*"system" + 0.006*"information" + 0.005*"available" + 0.005*"window" + 0.004*"data" + 0.004*"user" + 0.004*"message" + 0.004*"anonymous"
Topic 2: 0.008*"would" + 0.007*"like" + 0.006*"know" + 0.006*"dont" + 0.006*"game" + 0.006*"year" + 0.005*"good" + 0.005*"time" + 0.005*"team" + 0.005*"think"
Topic 3: 0.028*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.023*"chip" + 0.011*"clipper" + 0.008*"drive" + 0.007*"system" + 0.007*"device" + 0.007*"disk" + 0.006*"card" + 0.006*"escrow" + 0.005*"phone"
Topic 4: 0.009*"people" + 0.008*"would" + 0.005*"dont" + 0.005*"think" + 0.004*"government" + 0.004*"know" + 0.004*"right" + 0.004*"time" + 0.003*"like" + 0.003*"even"

Coherence Score: 0.5675296311458514

--- Interpretation ---
The coherence score indicates how consistent and semantically meaningful the topics are.
Scores range from 0 to 1. A higher score (closer to 1) means better topic modeling.
In this case, a score of 0.568 suggests the topics are reaso