In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from bertopic import BERTopic
# Pick Apple's Metal backend when torch reports it as available, otherwise the CPU.
# NOTE(review): no later cell visible in this notebook reads `device` — confirm
# whether it should be passed to the embedding model or removed.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the movie table and keep only the two text columns used below.
data = pd.read_csv("data/IMDbMovies-Clean.csv")

# Drop every row that is missing a title or a summary. `dropna()` is applied
# last so `data_filtered` is a fresh frame (its index keeps the original row
# labels and may therefore contain gaps).
data_filtered = data.loc[:, ["Title", "Summary"]].dropna(how="any")
data_filtered.head()


Unnamed: 0,Title,Summary
0,Napoleon,An epic that details the checkered rise and fa...
1,The Hunger Games: The Ballad of Songbirds & Sn...,Coriolanus Snow mentors and develops feelings ...
2,The Killer,"After a fateful near-miss, an assassin battles..."
3,Leo,A 74-year-old lizard named Leo and his turtle ...
4,Thanksgiving,"After a Black Friday riot ends in tragedy, a m..."


In [3]:
# Column views used downstream: titles label the points in the document plot,
# summaries are the documents the topic model is fitted on.
titles = data_filtered.loc[:, "Title"]
descriptions = data_filtered.loc[:, "Summary"]

To run BERTopic we need three components:
- an embedding model
- a dimensionality reduction model
- a clustering model

In [4]:
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Baseline model: explicit embedding, UMAP and HDBSCAN components, with no
# custom vectorizer or representation model.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=101,  # fixed seed for the projection
)
hdbscan_model = HDBSCAN(
    min_cluster_size=35,
    min_samples=1,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

topic_model = BERTopic(
    language="English",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# One topic id (and one probability) per summary, in the order of `descriptions`.
topics, probs = topic_model.fit_transform(descriptions)



2026-01-12 20:32:26,814 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 284/284 [00:38<00:00,  7.37it/s]
2026-01-12 20:33:05,390 - BERTopic - Embedding - Completed ✓
2026-01-12 20:33:05,390 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-12 20:33:21,771 - BERTopic - Dimensionality - Completed ✓
2026-01-12 20:33:21,772 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-12 20:33:21,931 - BERTopic - Cluster - Completed ✓
2026-01-12 20:33:21,933 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-12 20:33:22,012 - BERTopic - Representation - Completed ✓


In [5]:
# Per-topic summary table of the baseline model (one row per topic).
topics_df = topic_model.get_topic_info()
topics_df.shape

# 52 rows = 51 topics + 1 outlier topic (Topic == -1)

(52, 5)

In [6]:
# Peek at the largest topics of the baseline model.
topics_df.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4124,-1_the_of_to_and,"[the, of, to, and, in, his, her, is, an, with]",[A 16-year-old girl returns home from camp and...
1,0,335,0_police_detective_cop_drug,"[police, detective, cop, drug, murder, the, is...",[A quadriplegic ex-homicide detective and his ...
2,1,321,1_king_the_of_and,"[king, the, of, and, to, princess, his, witch,...","[At the end of the ice age, an evil queen and ..."
3,2,277,2_their_couple_woman_her,"[their, couple, woman, her, love, two, wedding...",[The relationships of two couples become compl...
4,3,265,3_agent_cia_the_terrorist,"[agent, cia, the, terrorist, spy, an, operativ...",[A young pickpocket and an unruly CIA agent te...


Apply the reranker (representation models):

In [7]:
import stopwordsiso as stopwords
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance

# English stop words from stopwords-iso, materialised as a list for CountVectorizer.
en_stop_list = list(stopwords.stopwords("en"))

# Count unigrams and bigrams, ignoring stop words and any term that appears
# in fewer than 5 documents.
vectorizer_model = CountVectorizer(
    stop_words=en_stop_list,
    ngram_range=(1, 2),
    min_df=5,
)


# Extra topic representations; each key becomes its own column in
# `get_topic_info()` next to the default `Representation`.
representation_model = dict(
    KeyBERT=KeyBERTInspired(top_n_words=5),
    MMR=MaximalMarginalRelevance(diversity=0.2),
)



# Second model: same embedding / UMAP / HDBSCAN settings as the baseline cell,
# plus the stop-word-aware vectorizer and the KeyBERT / MMR representations.
topic_model = BERTopic(
    language="English",
    embedding_model=SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
    umap_model=UMAP(
        n_neighbors=30,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=101,
    ),
    hdbscan_model=HDBSCAN(
        min_cluster_size=35,
        min_samples=1,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    ),
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True,
)

# Train the model; this rebinds `topics` and `probs` from the baseline run.
topics, probs = topic_model.fit_transform(descriptions)

# Thanks to the representation models we applied, the cluster names now describe
# the clusters much better. For example, cluster 1 was originally named
# king_the_of_and and is now labelled king_prince_warrior_quest.
topic_df = topic_model.get_topic_info()
topic_df.head()

2026-01-12 20:33:25,025 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 284/284 [00:37<00:00,  7.58it/s]
2026-01-12 20:34:02,559 - BERTopic - Embedding - Completed ✓
2026-01-12 20:34:02,559 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-12 20:34:13,529 - BERTopic - Dimensionality - Completed ✓
2026-01-12 20:34:13,530 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-12 20:34:13,687 - BERTopic - Cluster - Completed ✓
2026-01-12 20:34:13,688 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-12 20:34:18,183 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,4124,-1_life_woman_family_girl,"[life, woman, family, girl, love, mother, frie...","[film, story, relationship, murder, couple]","[life, woman, family, story, town, murder, sch...","[While doing a thesis about violence, Ángela f..."
1,0,335,0_police_detective_cop_drug,"[police, detective, cop, drug, murder, cops, s...","[detective, policeman, investigate murder, cop...","[police, detective, cops, serial killer, polic...",[A quadriplegic ex-homicide detective and his ...
2,1,321,1_king_prince_warrior_quest,"[king, prince, warrior, quest, evil, magic, th...","[throne, arthur, adventures, adventure, quest]","[king, warrior, quest, throne, roman, arthur, ...","[Alex, a 12-year-old boy, and his friends figh..."
3,2,277,2_couple_wedding_woman_love,"[couple, wedding, woman, love, relationship, m...","[married couple, wedding, couples, marriage, a...","[wedding, relationship, marriage, couples, aff...",[A middle-aged couple's career and marriage ar...
4,3,265,3_agent_cia_terrorist_spy,"[agent, cia, terrorist, spy, russian, presiden...","[cia, espionage, undercover, agent, spy]","[agent, cia, terrorist, spy, soviet, undercove...",[When a crooked federal agent is involved in a...


**GRAPHS**

In [8]:
# Re-embed the summaries and project them to 2-D purely for plotting.
#
# Pass plain lists to both libraries: `encode` and `visualize_documents` are
# documented to take a list of strings, and integer indexing into a Series whose
# index has gaps (possible after `dropna()`) is label-based rather than
# positional, which can raise KeyError or attach the wrong title to a point.
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = sentence_model.encode(descriptions.tolist(), show_progress_bar=False)
reduced_embeddings = UMAP(
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=101,
).fit_transform(embeddings)

# Points are positioned by `reduced_embeddings` and labelled with the movie title.
fig = topic_model.visualize_documents(
    titles.tolist(),
    reduced_embeddings=reduced_embeddings,
    width=1200,
    hide_annotations=True,
)

# Start with every trace hidden; a topic is shown by clicking it in the legend.
fig.update_traces(visible="legendonly")

fig.update_layout(
    font=dict(size=16),
    showlegend=True,
    legend_title_text="Topics",
)


In [9]:

# Topic map restricted to topics 0, 3 and 9.
topic_model.visualize_topics([0,3,9])



In [10]:

# Topic similarity heatmap for the top 25 topics, ordered into 10 clusters.
topic_model.visualize_heatmap(n_clusters=10,top_n_topics=25)

In [11]:

# Hierarchical view of the top 10 topics.
topic_model.visualize_hierarchy(  top_n_topics=10)

In [12]:
# Attach the fitted topic id to each movie. `topics` comes from fitting on
# `data_filtered["Summary"]`, so it is in the same row order as the frame.
data_filtered = data_filtered.assign(topic_generated=topics)


In [13]:
# Topic id -> readable name lookup, without the -1 outlier topic.
named_topics = topic_df.loc[topic_df["Topic"] != -1, ["Name", "Topic"]]

# Inner join: movies assigned to the outlier topic have no match in
# `named_topics` and are therefore dropped from the master table.
master_table_merged = data_filtered.merge(
    named_topics,
    left_on="topic_generated",
    right_on="Topic",
    how="inner",
)
master_table_merged = master_table_merged.loc[:, ["Title", "Summary", "Name"]]

**SAVE THE ARTIFACTS SO WE CAN USE THEM IN THE STREAMLIT APP**

In [15]:
# Make sure the output folder exists so this cell and the following save cells
# also work on a fresh checkout where `artifacts/` has not been created yet.
Path("artifacts").mkdir(parents=True, exist_ok=True)

# Persist the fitted topic model.
topic_model.save("artifacts/bertopic_model")



In [16]:
# Persist the 2-D UMAP coordinates that were used for the document plot.
np.save("artifacts/reduced_embeddings.npy", reduced_embeddings)

In [17]:
# Persist the movie table (Title, Summary, topic Name) without outlier movies.
master_table_merged.to_parquet("artifacts/movies.parquet")

In [18]:
import re
def parse_topic_id(s: str) -> int:
    """Extract the integer topic id from a label of the form ``<int>_<keywords>``.

    Leading whitespace is ignored. A leading minus sign is accepted so that an
    outlier label such as ``-1_life_woman_family_girl`` parses to ``-1``
    instead of being rejected.

    Args:
        s: Topic label, e.g. ``"13_king_prince_warrior_quest"``.

    Returns:
        The topic id encoded at the start of the label.

    Raises:
        ValueError: If the label does not start with an integer followed by ``_``.
    """
    # Optional sign + digits, terminated by the underscore that separates the id
    # from the keyword part of the label.
    m = re.match(r"^\s*(-?\d+)_", s)
    if not m:
        raise ValueError(
            f"Topic label '{s}' does not start with '<int>_'. "
            "Store a numeric 'topic' column or ensure labels start like '13_...'."
        )
    return int(m.group(1))

In [19]:
# Derive the numeric topic id from the "<id>_<keywords>" label stored in `Name`.
# NOTE(review): artifacts/movies.parquet is written in an earlier cell, so the
# saved file does not contain this column — confirm whether that is intended.
master_table_merged["topic_id"] = master_table_merged["Name"].map(parse_topic_id)

In [20]:
# Persist the movie titles; the Series index is written as the first CSV column.
titles.to_csv("artifacts/titles.csv")

In [21]:
# Export the interactive document plot as an HTML file.
fig.write_html("artifacts/topic_map.html")
