In [4]:
# Imports
import os
import re
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.datasets import fetch_20newsgroups
import ssl
# SECURITY NOTE(review): this replaces the ssl module's default HTTPS context
# factory with the *unverified* one, so code that obtains its context through
# that factory will no longer validate server certificates for the rest of the
# kernel session.
# Presumably a workaround for certificate errors while fetch_20newsgroups
# downloads the dataset -- TODO confirm, and prefer fixing the local CA bundle
# over keeping this override in a shared notebook.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Preprocessing Function
def preprocess(text):
    """Turn a raw document into a list of cleaned, lowercase word tokens.

    The text is lowercased and split into runs of ASCII letters that are
    delimited by word boundaries. Tokens shorter than three characters and
    tokens present in STOPWORDS are discarded.

    Args:
        text: The raw document string.

    Returns:
        A list of the surviving tokens, in their original order
        (duplicates are kept).
    """
    kept = []
    for word in re.findall(r'\b[a-zA-Z]+\b', text.lower()):
        # Skip very short tokens and stop words.
        if len(word) < 3 or word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [5]:
# Load Dataset
# Number of training posts to keep (presumably a small subset to keep LDA
# training quick -- raise or remove the slice for a full run).
N_DOCUMENTS = 1000

print("Loading dataset...")

# Fetch the training split with headers, footers and quoted replies removed,
# as requested through the `remove` argument.
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes')
)

documents = newsgroups.data[:N_DOCUMENTS]
print(f"Loaded {len(documents)} documents")

Loading dataset...
Loaded 1000 documents


In [6]:
# Preprocess Documents
# Run every raw document through preprocess(), yielding one token list per document.
processed_docs = list(map(preprocess, documents))

In [7]:
# Create Dictionary & Filter Words
# Build the token <-> id mapping from the tokenised documents.
dictionary = corpora.Dictionary(processed_docs)

# Prune the vocabulary in place: keep only tokens that appear in at least
# 5 documents and in no more than 50% of all documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

print(f"Vocabulary size: {len(dictionary)}")

Vocabulary size: 2540


In [8]:
# Create BoW Corpus
# Convert each token list to its bag-of-words form using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [9]:
# Train LDA Model
# Hyperparameters are gathered in one place so they are easy to tune:
# 10 topics, 15 passes over the corpus, alpha and eta both set to 'auto',
# and a fixed random_state for repeatable runs.
lda_settings = {
    "num_topics": 10,
    "passes": 15,
    "alpha": 'auto',
    "eta": 'auto',
    "random_state": 42,
}

lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

In [10]:
# Save Model & Dictionary
# Make sure the output directory exists; a pre-existing directory is fine.
os.makedirs("models", exist_ok=True)

# Persist the trained model first, then the dictionary it was trained with.
for _artifact, _target in (
    (lda_model, "models/lda_model"),
    (dictionary, "models/dictionary.dict"),
):
    _artifact.save(_target)

print("Model saved successfully!")

Model saved successfully!


In [11]:
# Print Discovered Topics
# Each entry is a (topic id, formatted term string) pair showing the 15
# highest-weighted words; a dashed rule separates consecutive topics.
for topic_id, topic_terms in lda_model.print_topics(num_words=15):
    print(f"\nTopic {topic_id}", topic_terms, "-" * 60, sep="\n")


Topic 0
0.009*"greek" + 0.008*"gun" + 0.007*"nasa" + 0.007*"data" + 0.007*"state" + 0.006*"space" + 0.006*"time" + 0.006*"think" + 0.005*"government" + 0.005*"use" + 0.005*"people" + 0.005*"going" + 0.004*"new" + 0.004*"guns" + 0.004*"way"
------------------------------------------------------------

Topic 1
0.014*"argument" + 0.012*"true" + 0.011*"god" + 0.011*"example" + 0.011*"know" + 0.009*"truth" + 0.008*"believe" + 0.007*"conclusion" + 0.007*"son" + 0.007*"people" + 0.007*"father" + 0.007*"spirit" + 0.006*"bible" + 0.006*"way" + 0.006*"like"
------------------------------------------------------------

Topic 2
0.025*"armenian" + 0.020*"turkish" + 0.017*"armenians" + 0.015*"genocide" + 0.013*"people" + 0.010*"soviet" + 0.009*"russian" + 0.009*"turks" + 0.008*"war" + 0.008*"government" + 0.008*"killed" + 0.007*"jews" + 0.007*"muslim" + 0.006*"population" + 0.006*"army"
------------------------------------------------------------

Topic 3
0.055*"max" + 0.018*"use" + 0.011*"health" 