In [None]:
# Fetch the course repository and make it the working directory for the cells below.
# NOTE(review): the recorded output path ends in
# nlp_course_mag_2026/nlp_course_mag_2026, so this cell appears to have been run
# from inside an existing clone; re-running it nests another copy. Confirm the
# working directory before relying on the relative "models/" paths used later.
!git clone https://github.com/karennik98/nlp_course_mag_2026.git
%cd nlp_course_mag_2026

Cloning into 'nlp_course_mag_2026'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/content/nlp_course_mag_2026/nlp_course_mag_2026


# Installing the required packages


In [None]:
!pip install gensim
!pip install -U scikit-learn


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Succes

# Train the Model


In [6]:
import os
from collections import defaultdict

from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.datasets import fetch_20newsgroups

def preprocess(text):
    """Lowercase ``text``, split it on whitespace and drop gensim stopwords.

    Splitting is whitespace-only, so punctuation stays attached to the
    neighbouring word (for example "it." is kept as a single token).

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in text.lower().split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Training configuration. Every tunable value is named once here so that the
# model construction and the topic print-out below cannot drift apart.
N_DOCUMENTS = 1000         # how many posts from the training split are used
NUM_TOPICS = 10            # number of LDA topics to fit
NUM_PASSES = 15            # passes over the corpus during training
NO_BELOW = 5               # filter_extremes: minimum document count per token
NO_ABOVE = 0.5             # filter_extremes: maximum fraction of documents per token
TOP_WORDS_PER_TOPIC = 15   # words shown per topic in the summary
SEED = 42                  # fixed seed so repeated runs give the same topics
MODEL_DIR = "models"

# Load the 20 Newsgroups training split without headers, footers and quotes.
dataset = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers", "quotes")
)

documents = dataset.data[:N_DOCUMENTS]

processed_docs = [preprocess(doc) for doc in documents]

# Build the vocabulary, then prune it using the thresholds configured above.
dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(
    no_below=NO_BELOW,
    no_above=NO_ABOVE,
)

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=NUM_PASSES,
    alpha="auto",
    eta="auto",
    random_state=SEED
)

# Persist the model and dictionary; the labelling and inference cells reload
# them from these paths.
os.makedirs(MODEL_DIR, exist_ok=True)

lda_model.save(f"{MODEL_DIR}/lda_model.model")
dictionary.save(f"{MODEL_DIR}/dictionary.dict")

# Print every topic with its highest-weighted words.
for idx, topic in lda_model.print_topics(num_topics=NUM_TOPICS, num_words=TOP_WORDS_PER_TOPIC):
    print(f"Topic {idx}: {topic}")


Topic 0: 0.131*"1" + 0.069*"0" + 0.046*"2" + 0.026*"---" + 0.020*"3" + 0.018*"4" + 0.014*"period" + 0.012*"power" + 0.010*"5" + 0.008*"*" + 0.007*"8" + 0.007*"7" + 0.007*"second" + 0.007*"-" + 0.006*"1,"
Topic 1: 0.095*"-" + 0.015*"armenian" + 0.012*"armenians" + 0.010*"good" + 0.008*"people" + 0.007*"turkish" + 0.006*"genocide" + 0.006*"excellent" + 0.006*"russian" + 0.005*"missing" + 0.004*"x-soviet" + 0.004*"like" + 0.004*"came" + 0.004*"said" + 0.004*"new"
Topic 2: 0.010*"don't" + 0.009*"like" + 0.008*"know" + 0.007*"right" + 0.007*"think" + 0.006*"it's" + 0.006*"car" + 0.006*"drive" + 0.006*"lot" + 0.005*"want" + 0.005*"run" + 0.004*"look" + 0.004*"didn't" + 0.004*"program" + 0.004*"it."
Topic 3: 0.012*"space" + 0.009*"nasa" + 0.008*"shuttle" + 0.008*"government" + 0.007*"turkish" + 0.006*"people" + 0.006*"mission" + 0.005*"-" + 0.005*"military" + 0.004*"muslim" + 0.004*"killed" + 0.004*"medical" + 0.004*"turks" + 0.004*"runs" + 0.004*"better"
Topic 4: 0.009*"people" + 0.007*"use"

# Topic Labeling


In [7]:
import json
from gensim.models import LdaModel
from gensim import corpora

# Reload the trained model and its dictionary from disk.
lda_model = LdaModel.load("models/lda_model.model")
dictionary = corpora.Dictionary.load("models/dictionary.dict")

topic_labels = {}

# Show the 20 highest-weighted terms of each topic and ask for a readable name.
for topic_id in range(lda_model.num_topics):
    print(f"\nTopic {topic_id}")

    for term, weight in lda_model.show_topic(topic_id, topn=20):
        print(f"{term:<15} {weight:.4f}")

    typed_name = input("\nEnter a meaningful name (or press Enter to skip): ").strip()

    # A blank or whitespace-only answer keeps the generic placeholder name.
    topic_labels[topic_id] = typed_name if typed_name else f"Topic {topic_id}"

# Persist the labels; json.dump writes the integer topic ids as string keys.
with open("models/topic_labels.json", "w") as labels_file:
    json.dump(topic_labels, labels_file, indent=4)

print("\nFinal Topic Summary:\n")

for topic_id, label in topic_labels.items():
    print(f"{topic_id}: {label}")




Topic 0
1               0.1313
0               0.0692
2               0.0464
---             0.0261
3               0.0199
4               0.0184
period          0.0145
power           0.0122
5               0.0101
*               0.0080
8               0.0073
7               0.0070
second          0.0066
-               0.0066
1,              0.0065
6               0.0058
card            0.0056
2,              0.0054
3,              0.0050
20              0.0048

Enter a meaningful name (or press Enter to skip): 

Topic 1
-               0.0952
armenian        0.0146
armenians       0.0115
good            0.0101
people          0.0079
turkish         0.0066
genocide        0.0062
excellent       0.0062
russian         0.0058
missing         0.0049
x-soviet        0.0044
like            0.0044
came            0.0043
said            0.0040
new             0.0040
o               0.0038
right           0.0036
fair            0.0034
left            0.0031
war             0.0030

Enter a m

# Inference Script

In [8]:
import json

from gensim.models import LdaModel
from gensim import corpora
from gensim.parsing.preprocessing import STOPWORDS

def preprocess(text):
    """Lowercase ``text``, split it on whitespace and drop gensim stopwords.

    Splitting is whitespace-only, so punctuation stays attached to the
    neighbouring word (for example "it." is kept as a single token).

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in text.lower().split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Reload the trained model and its dictionary from disk.
lda_model = LdaModel.load("models/lda_model.model")
dictionary = corpora.Dictionary.load("models/dictionary.dict")

# Topic labels are optional. Fall back to generic names only when the file is
# missing, unreadable or not valid JSON; any other error (including a keyboard
# interrupt) is allowed to propagate instead of being silently swallowed.
try:
    with open("models/topic_labels.json", "r") as f:
        topic_labels = json.load(f)
except (OSError, ValueError):  # json.JSONDecodeError is a ValueError subclass
    # Keys are strings so that they match both the JSON-loaded form and the
    # str(topic_id) lookups used below.
    topic_labels = {str(i): f"Topic {i}" for i in range(lda_model.num_topics)}

# Print a one-line summary (label plus top five words) for every topic.
for topic_id in range(lda_model.num_topics):
    label = topic_labels.get(str(topic_id), f"Topic {topic_id}")
    words = lda_model.show_topic(topic_id, topn=5)
    word_list = ", ".join(w for w, _ in words)

    print(f"{topic_id}: {label}")
    print(f"   Top words: {word_list}\n")

def classify_document(text, top_n=3, num_words=5):
    """Print the most probable LDA topics for a piece of text.

    The text is tokenised with ``preprocess``, converted to a bag-of-words
    with the module-level ``dictionary`` and scored by the module-level
    ``lda_model``. Results are printed; nothing is returned.

    Args:
        text: Raw document text to classify.
        top_n: Maximum number of topics to report (default 3).
        num_words: Number of top words shown for each reported topic
            (default 5).

    Note:
        Fewer than ``top_n`` topics may be printed: ``get_document_topics``
        is called with its default settings and can return fewer entries
        than requested (the recorded output shows two topics for a sample).
    """
    print("\n" + "="*60)
    print("Document Preview:")
    # The ellipsis is printed unconditionally, even for texts under 200 chars.
    print(text[:200], "...\n")

    bow = dictionary.doc2bow(preprocess(text))

    # (topic_id, probability) pairs, most probable first, truncated to top_n.
    ranked_topics = sorted(
        lda_model.get_document_topics(bow),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    print(f"Top {top_n} Topics:\n")

    for topic_id, prob in ranked_topics:
        # Labels loaded from JSON are keyed by strings, hence the str() lookup.
        label = topic_labels.get(str(topic_id), f"Topic {topic_id}")
        print(f"{label} (Probability: {prob:.4f})")

        top_words = lda_model.show_topic(topic_id, topn=num_words)
        word_list = ", ".join(word for word, _ in top_words)
        print(f"   Top words: {word_list}\n")

# Hand-written example documents, one per subject area, used to try out the
# classifier. Each entry is a single string built by implicit concatenation.
samples = [
    # Computer hardware / gaming
    (
        "The new graphics card delivers amazing performance for gaming. "
        "The GPU can handle 4K resolution easily with ray tracing enabled. "
        "Gamers will love the improved frame rates."
    ),
    # Astronomy / science
    (
        "Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. "
        "The research team published their findings in Nature journal. "
        "This discovery could provide insights into planetary formation."
    ),
    # Sports
    (
        "The basketball team won the championship after an incredible final game. "
        "The players celebrated with fans in the stadium. "
        "It was the team's first title in twenty years."
    ),
    # Politics / healthcare policy
    (
        "Congress passed a new bill regarding healthcare reform. "
        "The president is expected to sign the legislation next week. "
        "The policy will affect millions of citizens across the country."
    ),
    # Cooking
    (
        "I love cooking Italian food at home. "
        "Pasta carbonara and margherita pizza are my favorite dishes to make. "
        "Fresh ingredients make all the difference in authentic recipes."
    ),
]

# Classify every sample in turn; classify_document prints its own report.
for sample_text in samples:
    classify_document(sample_text)


0: Topic 0
   Top words: 1, 0, 2, ---, 3

1: Topic 1
   Top words: -, armenian, armenians, good, people

2: Topic 2
   Top words: don't, like, know, right, think

3: Topic 3
   Top words: space, nasa, shuttle, government, turkish

4: Topic 4
   Top words: people, use, health, --, jesus

5: Topic 5
   Top words: ., |, x, don't, line

6: Topic 6
   Top words: jesus, know, like, people, matthew

7: Topic 7
   Top words: =, |, *, use, }

8: Topic 8
   Top words: :, don't, like, it's, people

9: Topic 9
   Top words: windows, #, use, want, know


Document Preview:
The new graphics card delivers amazing performance for gaming. The GPU can handle 4K resolution easily with ray tracing enabled. Gamers will love the improved frame rates. ...

Top 3 Topics:

Topic 0 (Probability: 0.6085)
   Top words: 1, 0, 2, ---, 3

Topic 4 (Probability: 0.3418)
   Top words: people, use, health, --, jesus


Document Preview:
Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. T