In [4]:
import os
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import re

# 1. Load Dataset
# Request the training split with headers, footers and quotes removed.
data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
# Keep only the first 1000 documents.
documents = data.data[:1000]

# 2. Preprocessing 
def preprocess(text, min_length=3, stopwords=None):
    """Turn raw text into a list of lowercase tokens.

    Args:
        text: The raw document string.
        min_length: Minimum number of characters a token must have to be
            kept. Defaults to 3.
        stopwords: Collection of lowercase words to discard. When ``None``
            (the default), gensim's ``STOPWORDS`` is used.

    Returns:
        A list of lowercased tokens, in document order, that are at least
        ``min_length`` characters long and are not in ``stopwords``.
    """
    if stopwords is None:
        # Resolved at call time so the default stays gensim's stopword set.
        stopwords = STOPWORDS
    # Tokenize on runs of word characters, drop short tokens, lowercase the rest.
    tokens = [
        token.lower()
        for token in re.findall(r'\b\w+\b', text)
        if len(token) >= min_length
    ]
    # Stopwords are compared against the already-lowercased tokens.
    return [token for token in tokens if token not in stopwords]

# Tokenize every document with preprocess().
processed_docs = list(map(preprocess, documents))

# 3. Dictionary and Corpus
dictionary = corpora.Dictionary(processed_docs)
# Prune rare and common words using the no_below / no_above thresholds.
dictionary.filter_extremes(no_above=0.5, no_below=5)
# One doc2bow() result per tokenized document, in the same order.
corpus = list(map(dictionary.doc2bow, processed_docs))

# 4. Train LDA Model
print("Training LDA model...")
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=15,
    alpha='auto',
    eta='auto',
    # Fix the seed so repeated runs yield the same topics and the saved
    # model can be reproduced.
    random_state=42,
)

# 5. Saving
# Create the output directory if it does not exist yet.
os.makedirs("models", exist_ok=True)
# Persist the model first, then the dictionary.
for artifact, target in (
    (lda_model, "models/lda_model.model"),
    (dictionary, "models/lda_dict.dict"),
):
    artifact.save(target)

# 6. Print Topics
print("\nDiscovered Topics (Top 15 words):")
# print_topics() yields (topic id, formatted word-weight string) pairs.
for topic_id, description in lda_model.print_topics(num_words=15):
    print(f"Topic {topic_id}: {description}")

Training LDA model...

Discovered Topics (Top 15 words):
Topic 0: 0.108*"max" + 0.032*"145" + 0.013*"year" + 0.008*"insurance" + 0.007*"like" + 0.007*"state" + 0.006*"edu" + 0.005*"university" + 0.005*"think" + 0.004*"great" + 0.004*"look" + 0.004*"want" + 0.004*"100" + 0.004*"heard" + 0.004*"good"
Topic 1: 0.014*"space" + 0.011*"like" + 0.008*"use" + 0.007*"scsi" + 0.007*"shuttle" + 0.006*"nasa" + 0.006*"bit" + 0.006*"know" + 0.006*"image" + 0.005*"want" + 0.005*"files" + 0.005*"chip" + 0.005*"orbit" + 0.005*"mission" + 0.005*"power"
Topic 2: 0.016*"god" + 0.012*"know" + 0.010*"true" + 0.010*"argument" + 0.009*"believe" + 0.008*"truth" + 0.008*"example" + 0.008*"way" + 0.008*"think" + 0.007*"people" + 0.007*"bible" + 0.007*"spirit" + 0.007*"good" + 0.006*"son" + 0.006*"father"
Topic 3: 0.010*"said" + 0.009*"time" + 0.008*"like" + 0.008*"people" + 0.006*"gun" + 0.006*"think" + 0.005*"way" + 0.005*"right" + 0.005*"know" + 0.005*"problem" + 0.005*"got" + 0.005*"israel" + 0.004*"away" + 0