In [2]:
import json
import re
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.parsing.preprocessing import STOPWORDS

# Load the inference artifacts: the LDA model, its token dictionary, and the
# JSON mapping from stringified topic id to a human-readable label.
lda_model = LdaModel.load("models/lda_model.model")
dictionary = corpora.Dictionary.load("models/lda_dict.dict")
# Read the JSON explicitly as UTF-8 so non-ASCII labels decode the same way on
# every platform instead of depending on the locale's default encoding.
with open("models/topic_labels.json", "r", encoding="utf-8") as f:
    labels = json.load(f)

def preprocess_new(text):
    """Tokenize *text* into the word list fed to the dictionary.

    Word tokens are extracted with a regex, tokens shorter than three
    characters are dropped, the remainder is lowercased, and anything found
    in gensim's ``STOPWORDS`` is removed.

    NOTE(review): the original comment says this matches the preprocessing
    used at training time; that cannot be confirmed from this cell.
    """
    kept = []
    for word in re.findall(r'\b\w+\b', text):
        # The length filter applies to the raw token, before lowercasing.
        if len(word) < 3:
            continue
        lowered = word.lower()
        if lowered in STOPWORDS:
            continue
        kept.append(lowered)
    return kept

def classify_text(text, top_n=3):
    """Return the most probable LDA topics for *text*.

    The text is tokenized with ``preprocess_new``, converted to a
    bag-of-words with the loaded ``dictionary``, and scored by ``lda_model``.

    Args:
        text: Raw document string to classify.
        top_n: Maximum number of topics to return (default 3, the value that
            was previously hard-coded). Must be non-negative.

    Returns:
        A list of ``(topic_id, probability)`` pairs sorted by descending
        probability. The list may be shorter than ``top_n`` when the model
        reports fewer topics for the document.

    Raises:
        ValueError: If ``top_n`` is negative (a negative slice bound would
            silently drop topics from the end instead of limiting the count).
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    bow = dictionary.doc2bow(preprocess_new(text))
    topic_dist = lda_model.get_document_topics(bow)
    # Rank by the probability component of each (topic_id, probability) pair.
    ranked = sorted(topic_dist, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Sample documents used to exercise classify_text below. Each is a short,
# three-sentence text on a distinct subject: gaming hardware, astronomy,
# basketball, healthcare legislation, and Italian cooking.
samples = [
    "The new graphics card delivers amazing performance for gaming. The GPU can handle 4K resolution easily with ray tracing enabled. Gamers will love the improved frame rates.",
    "Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. The research team published their findings in Nature journal. This discovery could provide insights into planetary formation.",
    "The basketball team won the championship after an incredible final game. The players celebrated with fans in the stadium. It was the team's first title in twenty years.",
    "Congress passed a new bill regarding healthcare reform. The president is expected to sign the legislation next week. The policy will affect millions of citizens across the country.",
    "I love cooking Italian food at home. Pasta carbonara and margherita pizza are my favorite dishes to make. Fresh ingredients make all the difference in authentic recipes."
]

# Report, for every sample, its top-ranked topics with label, probability,
# and the topic's five highest-weighted keywords.
print("--- Inference Results ---")
for i, sample in enumerate(samples, 1):
    # Append the ellipsis only when the 75-character preview really truncates
    # the text, so short inputs are not shown as if they were cut off.
    preview = sample[:75] + ("..." if len(sample) > 75 else "")
    print(f"\nSample {i} Preview: {preview}")
    top_topics = classify_text(sample)

    for topic_id, prob in top_topics:
        # JSON object keys are strings, hence str(topic_id). Fall back to a
        # generic name instead of raising KeyError when the label file has no
        # entry for this topic.
        name = labels.get(str(topic_id), f"Topic {topic_id}")
        print(f"- {name}: {prob:.2%} probability")
        # Show top 5 words for the topic; the per-word weights are not printed.
        top_words = [word for word, _ in lda_model.show_topic(topic_id, topn=5)]
        print(f"  Keywords: {', '.join(top_words)}")

--- Inference Results ---

Sample 1 Preview: The new graphics card delivers amazing performance for gaming. The GPU can ...
- Technology: 63.52% probability
  Keywords: windows, good, thanks, use, mail
- Cars: 21.30% probability
  Keywords: time, know, better, years, new
- Game Statistics: 11.93% probability
  Keywords: max, 145, year, insurance, like

Sample 2 Preview: Scientists discovered a new exoplanet orbiting a distant star in the habita...
- Space: 61.23% probability
  Keywords: space, like, use, scsi, shuttle
- History: 33.83% probability
  Keywords: people, armenian, armenians, turkish, genocide

Sample 3 Preview: The basketball team won the championship after an incredible final game. Th...
- Cars: 74.16% probability
  Keywords: time, know, better, years, new
- Education: 21.50% probability
  Keywords: edu, government, think, church, new

Sample 4 Preview: Congress passed a new bill regarding healthcare reform. The president is ex...
- Education: 50.10% probability
  Keyword