# Script part 1

Imports

In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import os
import json
import logging
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS

Logging

In [4]:
# Configure the root logger: INFO level, records prefixed with timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s")

Load Dataset

In [5]:
# Number of training documents to keep. Raise this value to train on a larger sample.
NUM_DOCS = 1000

# Fetch the 20 Newsgroups training split, asking scikit-learn to strip
# headers, footers and quoted replies from every post.
newsgroups = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers", "quotes"),
    random_state=42
)
# Keep only the first NUM_DOCS posts of the returned list.
documents = newsgroups.data[:NUM_DOCS]
print(f"Loaded {len(documents)} documents.")

Loaded 1000 documents.


Preprocessing

In [6]:
def preprocess(text: str) -> list[str]:
    """Turn raw text into a list of cleaned tokens.

    The text is lower-cased and split on whitespace. Every non-alphabetic
    character is then removed from each piece (so "don't" becomes "dont").
    A piece is kept only if it still has at least three characters and is
    not listed in STOPWORDS.
    """
    stripped = ("".join(filter(str.isalpha, piece)) for piece in text.lower().split())
    return [word for word in stripped if len(word) >= 3 and word not in STOPWORDS]


# Tokenise every raw document with preprocess(); one token list per document.
processed_docs = list(map(preprocess, documents))
print("Preprocessed documents")

Preprocessed documents


Build Dictionary & Corpus

In [7]:
# Build the token <-> integer id mapping from the tokenised documents.
dictionary = corpora.Dictionary(processed_docs)

# Prune the vocabulary: no_below=5 and no_above=0.50 discard tokens that are
# too rare or too common across documents; compactify() then renumbers the ids.
dictionary.filter_extremes(no_above=0.50, no_below=5)
dictionary.compactify()
print(f"Dictionary size after filtering: {len(dictionary)} tokens")

# Convert each token list to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, processed_docs))
print("Converted to Bag-of-Words corpus")

Dictionary size after filtering: 2485 tokens
Converted to Bag-of-Words corpus


Train LDA Model

In [None]:
# Training configuration: topic count, number of passes over the corpus, and RNG seed.
NUM_TOPICS = 10
PASSES = 15
RANDOM_STATE = 42

print(f"Training LDA model (topics={NUM_TOPICS}, passes={PASSES})")
# Fit the model on the bag-of-words corpus. alpha and eta are both set to
# "auto", and the fixed random_state keeps repeated runs comparable.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    random_state=RANDOM_STATE,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    alpha="auto",
    eta="auto",
    per_word_topics=True,
)
print("Training complete.")


Training LDA model  (topics=10, passes=15) …
Training complete.


Save Outputs

In [None]:
# Directory that receives the trained artefacts; created on first run.
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Persist the model and its dictionary side by side under MODELS_DIR.
for artefact, filename in ((lda_model, "lda_model"), (dictionary, "dictionary.gensim")):
    artefact.save(os.path.join(MODELS_DIR, filename))
print(f"Model and dictionary saved to '{MODELS_DIR}/'.")

Model and dictionary saved to 'models/'.


Display Discovered Topics

In [None]:
print("DISCOVERED TOPICS (top 15 words each)")
print("=" * 40)

# Collect the 15 highest-weighted words of every topic, printing them as we go.
topic_labels = {}
for topic_id in range(NUM_TOPICS):
    words = [word for word, _ in lda_model.show_topic(topic_id, topn=15)]
    topic_labels[topic_id] = words
    print(f"\nTopic {topic_id:02d}: {', '.join(words)}")

# Store the word lists as JSON, keyed by the topic id rendered as a string.
labels_path = os.path.join(MODELS_DIR, "topic_labels.json")
serialisable_labels = {str(topic_id): words for topic_id, words in topic_labels.items()}
with open(labels_path, "w") as fh:
    json.dump(serialisable_labels, fh, indent=2)
print(f"\n\nTopic labels saved to '{labels_path}'.")

DISCOVERED TOPICS (top 15 words each)

Topic 00: data, use, program, space, nasa, mission, dont, clipper, chip, applications, want, images, key, read, way

Topic 01: jesus, people, god, know, argument, true, think, dont, believe, example, truth, said, matthew, point, things

Topic 02: son, father, spirit, time, people, holy, years, right, space, john, earth, way, god, new, read

Topic 03: armenian, people, government, genocide, xsoviet, turkish, turks, like, think, new, armenia, russian, right, muslim, got

Topic 04: like, dont, files, file, use, want, people, image, know, dos, window, windows, time, bios, control

Topic 05: think, windows, dont, know, want, new, speed, car, like, case, people, battery, cars, thing, going

Topic 06: good, game, team, excellent, like, missing, games, year, runs, shuttle, know, dont, cover, season, fair

Topic 07: people, armenians, armenian, turkish, said, killed, government, genocide, greek, gun, israel, state, came, going, left

Topic 08: problem, ive

# Script part 2

In [None]:
import os
import json
from gensim import corpora
from gensim.models import LdaModel

Paths

In [None]:
# Locations of the artefacts used by this part, all under MODELS_DIR.
MODELS_DIR = "models"
MODEL_PATH = os.path.join(MODELS_DIR, "lda_model")          # trained LDA model
DICT_PATH = os.path.join(MODELS_DIR, "dictionary.gensim")   # gensim dictionary
LABELS_PATH = os.path.join(MODELS_DIR, "topic_names.json")  # user-assigned topic names (written by this part)
TOP_N_WORDS = 20  # default number of words shown per topic by display_topic()

Load model & dictionary

In [None]:
# Restore the model and dictionary from MODEL_PATH / DICT_PATH.
# NOTE(review): gensim's load() relies on pickle, so only load files you trust.
lda_model = LdaModel.load(MODEL_PATH)
dictionary = corpora.Dictionary.load(DICT_PATH)
# Topic count comes from the loaded model rather than being hard-coded here.
num_topics = lda_model.num_topics
print(f"  Loaded model with {num_topics} topics.\n")

  Loaded model with 10 topics.



Display all topics

In [None]:
def display_topic(topic_id: int, topn: int = TOP_N_WORDS) -> None:
    """Print a two-column table (word, probability) for one topic.

    Parameters
    ----------
    topic_id : index of the topic in the loaded ``lda_model``.
    topn     : how many of the topic's top words to list.
    """
    rule = "─" * 55
    print(f"\n{rule}")
    print(f"  Topic {topic_id:02d}")
    print(rule)
    # Column titles followed by their underline, laid out with the same widths as the rows.
    for left, right in (("Word", "Probability"), ("----", "-----------")):
        print(f"  {left:<20} {right:>12}")
    for word, prob in lda_model.show_topic(topic_id, topn=topn):
        print(f"  {word:<20} {prob:>12.6f}")


# Banner for the listing. The word count is taken from TOP_N_WORDS (the
# default used by display_topic) so the title cannot drift from what is shown.
banner = "=" * 55
print(banner)
print(f"       ALL DISCOVERED TOPICS  (top {TOP_N_WORDS} words each)")
print(banner)

# One table per topic of the loaded model.
for tid in range(num_topics):
    display_topic(tid)

print(f"\n{banner}\n")

       ALL DISCOVERED TOPICS  (top 20 words each)

───────────────────────────────────────────────────────
  Topic 00
───────────────────────────────────────────────────────
  Word                  Probability
  ----                  -----------
  data                     0.010870
  use                      0.008161
  program                  0.008092
  space                    0.007984
  nasa                     0.007913
  mission                  0.006374
  dont                     0.006235
  clipper                  0.006172
  chip                     0.006042
  applications             0.005543
  want                     0.005452
  images                   0.005187
  key                      0.005144
  read                     0.004995
  way                      0.004939
  like                     0.004938
  brian                    0.004656
  application              0.004624
  shuttle                  0.004382
  look                     0.004267

─────────────────────────────────

Interactive naming

In [None]:
# Maps the topic id (as a string) to the name chosen by the user.
topic_names: dict[str, str] = {}

print("Assigning a meaningful name to each topic.")

for tid in range(num_topics):
    # Show the topic's five top words as a hint inside the prompt.
    hint_words = (w for w, _ in lda_model.show_topic(tid, topn=5))
    reply = input(f"  Topic {tid:02d}  [{', '.join(hint_words)}]  → name: ").strip()
    # An empty reply keeps the placeholder name Topic_XX.
    topic_names[str(tid)] = reply or f"Topic_{tid:02d}"
    print(f"    ✓ Topic {tid:02d} labelled as  '{topic_names[str(tid)]}'\n")

Now assign a meaningful name to each topic.
Press ENTER to keep the default name  (Topic_XX).

  Topic 00  [data, use, program, space, nasa]  → name: Nasa/Mision
    ✓ Topic 00 labelled as  'Nasa/Mision'

  Topic 01  [jesus, people, god, know, argument]  → name: Religion
    ✓ Topic 01 labelled as  'Religion'

  Topic 02  [son, father, spirit, time, people]  → name: Time/People
    ✓ Topic 02 labelled as  'Time/People'

  Topic 03  [armenian, people, government, genocide, xsoviet]  → name: Armenian/Genocid
    ✓ Topic 03 labelled as  'Armenian/Genocid'

  Topic 04  [like, dont, files, file, use]  → name: Social Media/File System
    ✓ Topic 04 labelled as  'Social Media/File System'

  Topic 05  [think, windows, dont, know, want]  → name: Prompting wording
    ✓ Topic 05 labelled as  'Prompting wording'

  Topic 06  [good, game, team, excellent, like]  → name: Competition
    ✓ Topic 06 labelled as  'Competition'

  Topic 07  [people, armenians, armenian, turkish, said]  → name: War
  

Save labels

In [None]:
os.makedirs(MODELS_DIR, exist_ok=True)
# ensure_ascii=False writes non-ASCII characters of the names verbatim, so the
# file encoding is fixed to UTF-8 instead of depending on the platform default
# (which may be unable to encode them).
with open(LABELS_PATH, "w", encoding="utf-8") as fh:
    json.dump(topic_names, fh, indent=2, ensure_ascii=False)

print(f"\nTopic names saved to  '{LABELS_PATH}'.")


Topic names saved to  'models/topic_names.json'.


Final summary

In [None]:
# Table header and underline, formatted with the same column widths as the rows.
summary_header = (("ID", "Assigned Name", "Top-5 Words"), ("--", "-------------", "----------"))
for id_cell, name_cell, words_cell in summary_header:
    print(f"  {id_cell:<6}  {name_cell:<25}  {words_cell}")

# One row per topic: id, the name entered above, and the topic's five top words.
for tid in range(num_topics):
    assigned = topic_names[str(tid)]
    leading_words = [w for w, _ in lda_model.show_topic(tid, topn=5)]
    print(f"  {tid:<6}  {assigned:<25}  {', '.join(leading_words)}")


  ID      Assigned Name              Top-5 Words
  --      -------------              ----------
  0       Nasa/Mision                data, use, program, space, nasa
  1       Religion                   jesus, people, god, know, argument
  2       Time/People                son, father, spirit, time, people
  3       Armenian/Genocid           armenian, people, government, genocide, xsoviet
  4       Social Media/File System   like, dont, files, file, use
  5       Prompting wording          think, windows, dont, know, want
  6       Competition                good, game, team, excellent, like
  7       War                        people, armenians, armenian, turkish, said
  8       Lawyer                     problem, ive, know, use, like
  9       Health/Medical             health, use, period, power, play


# Script part 3

Imports

In [None]:
import os
import json
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS

Paths

In [None]:
# Artefact locations (all under MODELS_DIR) and display settings for this part.
MODELS_DIR = "models"
MODEL_PATH = os.path.join(MODELS_DIR, "lda_model")
DICT_PATH = os.path.join(MODELS_DIR, "dictionary.gensim")
NAMES_PATH = os.path.join(MODELS_DIR, "topic_names.json")   # from Part 2
TOP_TOPICS = 3    # how many topics to return per document
TOP_WORDS = 5    # top words to display per topic

Preprocessing

In [None]:
def preprocess(text: str) -> list[str]:
    """Lower-case and whitespace-split ``text``, then clean each piece.

    Non-alphabetic characters are dropped from every piece; the result is
    kept when it has three or more characters and is not in STOPWORDS.
    """
    def _letters_only(raw: str) -> str:
        # Remove digits, punctuation and any other non-letter character.
        return "".join(ch for ch in raw if ch.isalpha())

    candidates = map(_letters_only, text.lower().split())
    return [cand for cand in candidates if len(cand) >= 3 and cand not in STOPWORDS]

Load model, dictionary & topic names

In [None]:
# Restore the trained model and its dictionary from disk.
lda_model  = LdaModel.load(MODEL_PATH)
dictionary = corpora.Dictionary.load(DICT_PATH)
num_topics = lda_model.num_topics

# Load user-defined names (fall back to default if file missing).
# The JSON file is read as UTF-8 explicitly rather than with the platform
# default encoding, so non-ASCII topic names decode the same way everywhere.
if os.path.exists(NAMES_PATH):
    with open(NAMES_PATH, encoding="utf-8") as fh:
        topic_names: dict[str, str] = json.load(fh)
    print(f"Topic names loaded from '{NAMES_PATH}'.")
else:
    topic_names = {str(i): f"Topic_{i:02d}" for i in range(num_topics)}
    print(f"  '{NAMES_PATH}' not found – using default names.")

print(f"Model has {num_topics} topics.\n")

Topic names loaded from 'models/topic_names.json'.
Model has 10 topics.



Startup: Summary of all topics

In [None]:
def print_topic_summary() -> None:
    """Print a table with one row per topic: id, name and five top words.

    Names come from ``topic_names``; a topic without an entry is shown
    under the placeholder ``Topic_XX``.
    """
    # Header row and its underline share the column widths of the data rows.
    for id_cell, name_cell, words_cell in (("ID", "Name", "Top-5 Words"), ("--", "----", "----------")):
        print(f"  {id_cell:<5}  {name_cell:<25}  {words_cell}")
    for tid in range(num_topics):
        label = topic_names.get(str(tid), f"Topic_{tid:02d}")
        top_terms = [term for term, _ in lda_model.show_topic(tid, topn=5)]
        print(f"  {tid:<5}  {label:<25}  {', '.join(top_terms)}")

# Print the id / name / top-word table for every topic of the loaded model.
print_topic_summary()

  ID     Name                       Top-5 Words
  --     ----                       ----------
  0      Nasa/Mision                data, use, program, space, nasa
  1      Religion                   jesus, people, god, know, argument
  2      Time/People                son, father, spirit, time, people
  3      Armenian/Genocid           armenian, people, government, genocide, xsoviet
  4      Social Media/File System   like, dont, files, file, use
  5      Prompting wording          think, windows, dont, know, want
  6      Competition                good, game, team, excellent, like
  7      War                        people, armenians, armenian, turkish, said
  8      Lawyer                     problem, ive, know, use, like
  9      Health/Medical             health, use, period, power, play


Classification function

In [None]:
def classify_document(text: str, top_n: int = TOP_TOPICS) -> list[dict]:
    """
    Rank the model's topics for one text document.

    Parameters
    ----------
    text  : raw document text; it is tokenised with ``preprocess`` and
            converted to bag-of-words with the loaded ``dictionary``.
    top_n : number of highest-probability topics to return.

    Returns
    -------
    List of dicts (sorted by probability, descending), each containing:
        topic_id    : int
        name        : str  (from ``topic_names``, else ``Topic_XX``)
        probability : float
        top_words   : list[str]  (``TOP_WORDS`` words of the topic)
    """
    bow = dictionary.doc2bow(preprocess(text))
    # minimum_probability=0.0 asks for low-probability topics as well. The
    # pairs are ranked explicitly here instead of relying on the order in
    # which get_document_topics returns them.
    ranked = sorted(
        lda_model.get_document_topics(bow, minimum_probability=0.0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        {
            "topic_id"   : tid,
            "name"       : topic_names.get(str(tid), f"Topic_{tid:02d}"),
            "probability": float(prob),
            "top_words"  : [w for w, _ in lda_model.show_topic(tid, topn=TOP_WORDS)],
        }
        for tid, prob in ranked[:top_n]
    ]

Display helper

In [None]:
def display_classification(doc_num: int, text: str) -> None:
    """Print a preview of ``text`` followed by its ranked topic table.

    Parameters
    ----------
    doc_num : number shown in the "Document N" heading.
    text    : document to classify via ``classify_document``.
    """
    PREVIEW_LEN = 200
    rule = "─" * 62

    # Single-line preview: newlines become spaces, and text longer than
    # PREVIEW_LEN is cut and marked with an ellipsis.
    flattened = text.strip().replace("\n", " ")
    if len(flattened) > PREVIEW_LEN:
        preview = flattened[:PREVIEW_LEN] + "…"
    else:
        preview = flattened

    print(rule)
    print(f"  Document {doc_num}")
    print(rule)
    print(f"  Preview : {preview}\n")

    ranked_topics = classify_document(text)

    print(f"  {'Rank':<5}  {'Topic Name':<25}  {'Probability':>11}  Top-5 Words")
    print(f"  {'----':<5}  {'----------':<25}  {'-----------':>11}  ----------")
    for position, entry in enumerate(ranked_topics, start=1):
        joined_words = ", ".join(entry["top_words"])
        print(f"  {position:<5}  {entry['name']:<25}  {entry['probability']:>11.4f}  {joined_words}")
    print()

Sample documents

In [None]:
# Hand-written example documents used to exercise the classifier.
# Each entry is a (label, text) pair: the label is the human-expected subject
# and is only printed next to the result; the text is what gets classified.
SAMPLE_DOCS = [
    # 1 – Gaming / Technology
    (
        "Gaming Technology",
        "The new graphics card delivers amazing performance for gaming. The GPU can "
        "handle 4K resolution easily with ray tracing enabled. Gamers will love the improved "
        "frame rates."
    ),

    # 2 – Science / Space
    (
        "Space Science",
        "Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. "
        "The research team published their findings in Nature journal. This discovery could "
        "provide insights into planetary formation."
    ),

    # 3 – Sports
    (
        "Sports",
        "The basketball team won the championship after an incredible final game. The "
        "players celebrated with fans in the stadium. It was the team's first title in twenty "
        "years."
    ),

    # 4 – Politics / Government
    (
        "Politics",
        "Congress passed a new bill regarding healthcare reform. The president is expected "
        "to sign the legislation next week. The policy will affect millions of citizens across the "
        "country."
    ),

    # 5 – Food / Cooking
    (
        "Food & Cooking",
        "I love cooking Italian food at home. Pasta carbonara and margherita pizza are my "
        "favorite dishes to make. Fresh ingredients make all the difference in authentic "
        "recipes."
    ),
]

Run example classifications

In [None]:
# Classify every sample, printing its expected label above the model's ranking.
for idx, sample in enumerate(SAMPLE_DOCS, start=1):
    label, doc = sample
    print(f"  Sample Topic: {label}")
    display_classification(idx, doc)

print("Classification complete.")

  Sample Topic: Gaming Technology
──────────────────────────────────────────────────────────────
  Document 1
──────────────────────────────────────────────────────────────
  Preview : The new graphics card delivers amazing performance for gaming. The GPU can handle 4K resolution easily with ray tracing enabled. Gamers will love the improved frame rates.

  Rank   Topic Name                 Probability  Top-5 Words
  ----   ----------                 -----------  ----------
  1      Lawyer                          0.6343  problem, ive, know, use, like
  2      Social Media/File System        0.3284  like, dont, files, file, use
  3      Religion                        0.0059  jesus, people, god, know, argument

  Sample Topic: Space Science
──────────────────────────────────────────────────────────────
  Document 2
──────────────────────────────────────────────────────────────
  Preview : Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. The research 

In [None]:
# I think 1000 documents was too few; increasing the sample size would improve the results.