#### LAB ASSIGNMENT 3
##### Mysara Qistina binti Mahadzir SW01083524
##### Addelina binti Mohd Zulkifli SW01082366

###### Import the necessary libraries

In [121]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

###### Read the data

In [122]:
documents = pd.read_csv("news_dataset.csv", usecols=['text'])

documents = documents.dropna(subset=['text'])

###### Perform text pre-processing

In [123]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    tokens = word_tokenize(text.lower())
    # Remove null values
    tokens = [token for token in tokens if token.isalnum()]
    # Stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

preprocessed_documents = [preprocess_text(doc) for doc in documents['text']]
preprocessed_documents

[['wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  'sport',
  'car',
  'looked',
  'late',
  'early',
  '70',
  'called',
  'bricklin',
  'door',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'spec',
  'year',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please'],
 ['recently',
  'posted',
  'article',
  'asking',
  'kind',
  'rate',
  'single',
  'male',
  'driver',
  '25',
  'yr',
  'old',
  'paying',
  'performance',
  'car',
  'summary',
  'reply',
  'received',
  '25',
  'anymore',
  '27',
  'close',
  'enough',
  '1992',
  'dodge',
  'stealth',
  'turbo',
  '300hp',
  'model',
  'ticket',
  'accident',
  'house',
  'taken',
  'defensive',
  'driving',
  '1',
  'airbag',
  'ab',
  'security',
  'alarm',
  'single',
  '500',
  'decut',
  'state',
  'farm',
  'insurance',
  '

###### Perform LDA using Gensim

In [124]:
dictionary = corpora.Dictionary(preprocessed_documents)

corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [125]:
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

In [126]:
article_labels = []

for i, doc in enumerate(preprocessed_documents):
    bow = dictionary.doc2bow(doc)
    topics = lda_model.get_document_topics(bow)
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    article_labels.append(dominant_topic)

In [127]:
df = pd.DataFrame({"Article": documents['text'], "Topic": article_labels})

print("Table with Articles and Topic:")
print(df)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      3
1      I recently posted an article asking what kind ...      3
2      \nIt depends on your priorities.  A lot of peo...      3
3      an excellent automatic can be found in the sub...      3
4      : Ford and his automobile.  I need information...      3
...                                                  ...    ...
11309  Secrecy in Clipper Chip\n\nThe serial number o...      0
11310  Hi !\n\nI am interested in the source of FEAL ...      0
11311  The actual algorithm is classified, however, t...      3
11312  \n\tThis appears to be generic calling upon th...      3
11313  \nProbably keep quiet and take it, lest they g...      3

[11096 rows x 2 columns]



###### Evaluate the LDA model using Coherence score

In [128]:
coherence_model_lda = CoherenceModel(model=lda_model,
                                      texts=preprocessed_documents,
                                      dictionary=dictionary,
                                      coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_score:.4f}')


Coherence Score: 0.6364


###### Interpret the result

In [129]:
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"-{word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
-"x" (weight: 0.016)
-"key" (weight: 0.009)
-"file" (weight: 0.007)
-"use" (weight: 0.007)
-"system" (weight: 0.006)
-"db" (weight: 0.005)
-"one" (weight: 0.005)
-"window" (weight: 0.005)
-"chip" (weight: 0.004)
-"program" (weight: 0.004)

Topic 1:
-"people" (weight: 0.009)
-"would" (weight: 0.008)
-"one" (weight: 0.008)
-"say" (weight: 0.005)
-"think" (weight: 0.004)
-"know" (weight: 0.004)
-"god" (weight: 0.004)
-"right" (weight: 0.004)
-"u" (weight: 0.004)
-"like" (weight: 0.004)

Topic 2:
-"1" (weight: 0.033)
-"max" (weight: 0.027)
-"0" (weight: 0.026)
-"2" (weight: 0.021)
-"g" (weight: 0.017)
-"r" (weight: 0.017)
-"7" (weight: 0.016)
-"q" (weight: 0.015)
-"p" (weight: 0.014)
-"3" (weight: 0.013)

Topic 3:
-"would" (weight: 0.006)
-"year" (weight: 0.005)
-"q" (weight: 0.005)
-"one" (weight: 0.005)
-"get" (weight: 0.004)
-"new" (weight: 0.004)
-"game" (weight: 0.004)
-"president" (weight: 0.004)
-"think" (weight: 0.003)
-"time" (weight: 0.003)



##### Topic 0: Technology

##### Topic 1: Religion

##### Topic 2: Numeri

##### Topic 3: Sports

#### Discussion
##### The coherence score for the document is 0.6364, which indicates that the topic words are highly related to each other and the model effectively groups similar words together. 