In [None]:
!pip -q install gensim scikit-learn nltk

import os
# Make sure the folders for the generated scripts and the saved artifacts exist.
for folder in ("src", "models"):
    os.makedirs(folder, exist_ok=True)

# -----------------------------
# src/train_lda.py
# -----------------------------
train_py = r'''
import os
from typing import List

from sklearn.datasets import fetch_20newsgroups

import nltk
from nltk.stem import WordNetLemmatizer

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

# Directory where main() saves the trained model and dictionary.
MODELS_DIR = "models"

# Number of posts taken from the start of the 20 Newsgroups training split.
N_DOCS = 1000
# Number of topics passed to LdaModel.
NUM_TOPICS = 10
# Number of training passes passed to LdaModel.
PASSES = 15

# Extra stopwords removed in preprocess() in addition to gensim's STOPWORDS.
# NOTE(review): "re" is shorter than the min_len=3 that preprocess() passes to
# simple_preprocess, so it never reaches the stopword filter.
CUSTOM_STOPWORDS = {
    # generic conversation words in 20NG
    "people","know","like","time","said","think","use","thanks","year","want","good",
    # email/meta tokens
    "edu","com","subject","organization","writes","article","lines","from","re",
}

def ensure_nltk() -> None:
    """Request the NLTK resources used by the lemmatizer, in quiet mode."""
    for resource in ("wordnet", "omw-1.4"):
        nltk.download(resource, quiet=True)

# Combined stopword set, built once instead of on every preprocess() call.
_STOP_TOKENS = frozenset(STOPWORDS) | CUSTOM_STOPWORDS

def preprocess(text: str, lemmatizer: WordNetLemmatizer) -> List[str]:
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps: tokenize and lowercase with gensim's simple_preprocess (accents
    removed, tokens of 3 to 20 characters kept), drop stopwords, lemmatize,
    then drop stopwords again.

    Args:
        text: Raw document text.
        lemmatizer: Lemmatizer applied to every surviving token.

    Returns:
        The remaining tokens, in document order.
    """
    # tokenize + lowercase (gensim)
    tokens = simple_preprocess(text, deacc=True, min_len=3, max_len=20)
    # remove stopwords before lemmatizing (also skips needless lemmatizer calls)
    tokens = [t for t in tokens if t not in _STOP_TOKENS]
    # lemmatize (reduces variations: games->game)
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    # Filter again: an inflected form such as "years" passes the first filter
    # and only becomes the stopword "year" after lemmatization.
    return [t for t in lemmas if t not in _STOP_TOKENS]

def main() -> None:
    """Train an LDA topic model on 20 Newsgroups posts and save its artifacts.

    Pipeline: download the NLTK data, load the first N_DOCS training posts
    (headers, footers and quotes removed), preprocess them, merge frequent
    word pairs into bigram tokens, build and prune the dictionary, train the
    model, then save the model, the dictionary and the bigram phraser under
    MODELS_DIR and print the discovered topics.
    """
    ensure_nltk()
    lemmatizer = WordNetLemmatizer()

    data = fetch_20newsgroups(subset="train", remove=("headers","footers","quotes"))
    docs_raw = data.data[:N_DOCS]

    docs = [preprocess(d, lemmatizer) for d in docs_raw]

    # bigrams (helps: "space_shuttle", "gun_control" type patterns)
    phrases = Phrases(docs, min_count=5, threshold=10.0)
    bigram = Phraser(phrases)
    docs = [bigram[d] for d in docs]

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(d) for d in docs]

    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        passes=PASSES,
        alpha="auto",
        eta="auto",
        random_state=42,
        eval_every=None,
    )

    os.makedirs(MODELS_DIR, exist_ok=True)
    lda.save(os.path.join(MODELS_DIR, "lda_model"))
    dictionary.save(os.path.join(MODELS_DIR, "lda_dictionary.dict"))
    # The dictionary contains merged bigram tokens, so the phraser that
    # produced them is saved too; without it, new text can never be mapped
    # onto those dictionary entries at inference time.
    bigram_path = os.path.join(MODELS_DIR, "bigram_phraser.pkl")
    bigram.save(bigram_path)

    print("\n=== Discovered Topics (Top 15 words) ===")
    for tid, tstr in lda.print_topics(num_topics=NUM_TOPICS, num_words=15):
        print(f"Topic {tid}: {tstr}")

    print(f"\nSaved model: {MODELS_DIR}/lda_model")
    print(f"Saved dictionary: {MODELS_DIR}/lda_dictionary.dict")
    print(f"Saved bigram phraser: {bigram_path}")

if __name__ == "__main__":
    main()
'''
# Write the training script to disk. The context manager closes (and flushes)
# the file explicitly instead of relying on garbage collection.
with open("src/train_lda.py", "w", encoding="utf-8") as f:
    f.write(train_py)

# -----------------------------
# src/label_topics.py
# -----------------------------
label_py = r'''
import os
import json
from typing import Dict

from gensim.models import LdaModel
from gensim.corpora import Dictionary

MODELS_DIR = "models"

def main() -> None:
    """Ask for a human-readable name for each trained topic and save the names.

    For every topic the top 20 words and their probabilities are printed and a
    name is read from stdin; an empty answer keeps the default "Topic <id>".
    If stdin has no more input (for example the script is run without an
    interactive terminal), the remaining topics keep their default names
    instead of the script crashing before anything is saved.

    The mapping is written to MODELS_DIR/topic_labels.json, keyed by the
    topic id as a string.
    """
    lda = LdaModel.load(os.path.join(MODELS_DIR, "lda_model"))
    # Not used below. NOTE(review): presumably kept as a check that the
    # dictionary artifact exists and loads -- confirm before removing.
    _dictionary = Dictionary.load(os.path.join(MODELS_DIR, "lda_dictionary.dict"))

    labels: Dict[str, str] = {}
    can_prompt = True  # switched off after the first EOF on stdin

    print("\n=== Topics to Label (Top 20 words with probabilities) ===")
    for topic_id in range(lda.num_topics):
        terms = lda.show_topic(topic_id, topn=20)
        pretty = ", ".join(f"{w} ({p:.3f})" for w, p in terms)
        print(f"\nTopic {topic_id}:\n  {pretty}")

        name = ""
        if can_prompt:
            try:
                name = input("Enter a meaningful topic name (Enter to keep default): ").strip()
            except EOFError:
                # No interactive stdin: stop prompting and fall back to defaults.
                can_prompt = False
                print("\n(stdin is not interactive; keeping default topic names)")
        labels[str(topic_id)] = name if name else f"Topic {topic_id}"

    labels_path = os.path.join(MODELS_DIR, "topic_labels.json")
    with open(labels_path, "w", encoding="utf-8") as f:
        json.dump(labels, f, ensure_ascii=False, indent=2)

    print("\n=== Final Topic Label Summary ===")
    # Keys are numeric strings; sort them numerically so "10" follows "9".
    for k in sorted(labels, key=int):
        print(f"{k}: {labels[k]}")
    print(f"\nSaved labels to: {labels_path}")

if __name__ == "__main__":
    main()
'''
# Write the labeling script to disk. The context manager closes (and flushes)
# the file explicitly instead of relying on garbage collection.
with open("src/label_topics.py", "w", encoding="utf-8") as f:
    f.write(label_py)

# -----------------------------
# src/infer_topics.py
# -----------------------------
infer_py = r'''
import os
import json
from typing import List, Tuple, Dict

import nltk
from nltk.stem import WordNetLemmatizer

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

MODELS_DIR = "models"

# Extra stopwords removed in preprocess() in addition to gensim's STOPWORDS.
# This is a copy of the set defined in src/train_lda.py; keep the two in sync
# so that text is cleaned the same way for inference as for training.
CUSTOM_STOPWORDS = {
    "people","know","like","time","said","think","use","thanks","year","want","good",
    "edu","com","subject","organization","writes","article","lines","from","re",
}

def ensure_nltk() -> None:
    """Request the NLTK resources used by the lemmatizer, in quiet mode."""
    for resource in ("wordnet", "omw-1.4"):
        nltk.download(resource, quiet=True)

# Combined stopword set, built once instead of on every preprocess() call.
_STOP_TOKENS = frozenset(STOPWORDS) | CUSTOM_STOPWORDS

def preprocess(text: str, lemmatizer: WordNetLemmatizer) -> List[str]:
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps: tokenize and lowercase with gensim's simple_preprocess (accents
    removed, tokens of 3 to 20 characters kept), drop stopwords, lemmatize,
    then drop stopwords again.

    Args:
        text: Raw document text.
        lemmatizer: Lemmatizer applied to every surviving token.

    Returns:
        The remaining tokens, in document order.
    """
    tokens = simple_preprocess(text, deacc=True, min_len=3, max_len=20)
    # Remove stopwords before lemmatizing (also skips needless lemmatizer calls).
    tokens = [t for t in tokens if t not in _STOP_TOKENS]
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    # Filter again: an inflected form such as "years" passes the first filter
    # and only becomes the stopword "year" after lemmatization.
    return [t for t in lemmas if t not in _STOP_TOKENS]

def load_labels(num_topics: int) -> Dict[int, str]:
    """Load topic names from MODELS_DIR/topic_labels.json, filling in defaults.

    Args:
        num_topics: Number of topics that must have a label in the result.

    Returns:
        A mapping from topic id to label. Entries found in the JSON file are
        kept (keys converted to int, values to str); every id in
        range(num_topics) missing from the file gets the default "Topic <id>".
        If the file does not exist, all labels are defaults.
    """
    path = os.path.join(MODELS_DIR, "topic_labels.json")
    labels: Dict[int, str] = {}
    if os.path.exists(path):
        # Use a context manager so the file handle is closed deterministically.
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        labels = {int(k): str(v) for k, v in raw.items()}
    for i in range(num_topics):
        labels.setdefault(i, f"Topic {i}")
    return labels

def top_words(lda: LdaModel, topic_id: int, topn: int = 5) -> str:
    """Join the words returned by lda.show_topic(topic_id, topn) with ", "."""
    words = (word for word, _weight in lda.show_topic(topic_id, topn=topn))
    return ", ".join(words)

def classify(text: str, lda: LdaModel, dictionary: Dictionary, lemmatizer: WordNetLemmatizer, bigram=None) -> List[Tuple[int, float]]:
    """Return the three most probable topics for a piece of text.

    Args:
        text: Raw document text.
        lda: Trained LDA model.
        dictionary: Dictionary used to convert tokens to a bag of words.
        lemmatizer: Lemmatizer passed on to preprocess().
        bigram: Optional phrase model applied to the token list before the
            bag-of-words conversion. When None (the default) no phrase
            merging is done, which matches the previous behavior.

    Returns:
        Up to three (topic_id, probability) pairs, highest probability first.
    """
    tokens = preprocess(text, lemmatizer)
    if bigram is not None:
        # Merge word pairs the same way the training corpus was merged, so
        # bigram entries in the dictionary can actually be matched.
        tokens = bigram[tokens]
    bow = dictionary.doc2bow(tokens)
    # minimum_probability=0.0 asks for every topic, not only the likely ones.
    dist = lda.get_document_topics(bow, minimum_probability=0.0)
    return sorted(dist, key=lambda x: x[1], reverse=True)[:3]

def main() -> None:
    """Load the saved model and print the top three topics for five sample texts."""
    ensure_nltk()
    lemmatizer = WordNetLemmatizer()

    lda = LdaModel.load(os.path.join(MODELS_DIR, "lda_model"))
    dictionary = Dictionary.load(os.path.join(MODELS_DIR, "lda_dictionary.dict"))
    labels = load_labels(lda.num_topics)

    # The phrase model is optional: if no saved phraser is present, fall back
    # to classifying on single-word tokens only.
    bigram = None
    bigram_path = os.path.join(MODELS_DIR, "bigram_phraser.pkl")
    if os.path.exists(bigram_path):
        from gensim.models.phrases import Phraser
        bigram = Phraser.load(bigram_path)

    print("\n=== Loaded Topics Summary (Top 5 words) ===")
    for tid in range(lda.num_topics):
        print(f"- {tid}: {labels[tid]} | {top_words(lda, tid, topn=5)}")

    samples = [
        "The new graphics card delivers amazing performance for gaming. The GPU can handle 4K resolution easily with ray tracing enabled. Gamers will love the improved frame rates.",
        "Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. The research team published their findings in Nature journal. This discovery could provide insights into planetary formation.",
        "The basketball team won the championship after an incredible final game. The players celebrated with fans in the stadium. It was the team's first title in twenty years.",
        "Congress passed a new bill regarding healthcare reform. The president is expected to sign the legislation next week. The policy will affect millions of citizens across the country.",
        "I love cooking Italian food at home. Pasta carbonara and margherita pizza are my favorite dishes to make. Fresh ingredients make all the difference in authentic recipes.",
    ]

    print("\n\n=== Running 5 sample classifications ===")
    for s in samples:
        preview = (s[:200] + "...") if len(s) > 200 else s
        print("\n" + "=" * 80)
        print("Document preview:")
        print(preview)

        top3 = classify(s, lda, dictionary, lemmatizer, bigram=bigram)
        print("\nTop 3 topics:")
        for tid, prob in top3:
            print(f"- {labels[tid]} (Topic {tid}) | P={prob:.4f} | top words: {top_words(lda, tid, topn=5)}")

if __name__ == "__main__":
    main()
'''
# Write the inference script to disk. The context manager closes (and flushes)
# the file explicitly instead of relying on garbage collection.
with open("src/infer_topics.py", "w", encoding="utf-8") as f:
    f.write(infer_py)

print(" Created: src/train_lda.py, src/label_topics.py, src/infer_topics.py")

# Run in required order
!python src/train_lda.py
!python src/label_topics.py
!python src/infer_topics.py


✅ Created: src/train_lda.py, src/label_topics.py, src/infer_topics.py

=== Discovered Topics (Top 15 words) ===
Topic 0: 0.019*"armenian" + 0.008*"government" + 0.007*"greek" + 0.006*"state" + 0.006*"case" + 0.006*"turkish" + 0.005*"killed" + 0.005*"health" + 0.005*"source" + 0.005*"person" + 0.004*"genocide" + 0.004*"word" + 0.004*"year" + 0.004*"russian" + 0.004*"day"
Topic 1: 0.013*"space" + 0.011*"mission" + 0.009*"shuttle" + 0.007*"orbit" + 0.007*"nasa" + 0.007*"launch" + 0.006*"pitcher" + 0.006*"flight" + 0.006*"bike" + 0.006*"satellite" + 0.006*"league" + 0.005*"cost" + 0.005*"earth" + 0.005*"new" + 0.005*"better"
Topic 2: 0.017*"jesus" + 0.013*"god" + 0.009*"argument" + 0.008*"thing" + 0.007*"way" + 0.007*"christian" + 0.007*"matthew" + 0.006*"true" + 0.006*"come" + 0.006*"father" + 0.006*"example" + 0.006*"believe" + 0.005*"course" + 0.005*"belief" + 0.005*"man"
Topic 3: 0.008*"nasa" + 0.007*"application" + 0.007*"program" + 0.007*"information" + 0.006*"member" + 0.006*"musl