In [2]:
import pandas as pd
import numpy as np
import torch
from bertopic import BERTopic
# Pick the torch device string: Apple's Metal backend ("mps") when available, otherwise CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the cleaned IMDb export.
data = pd.read_csv("data/IMDbMovies-Clean.csv")

# Keep only the two text columns and drop rows missing either one, then renumber
# the index from 0 so that row labels equal row positions. Later cells pass these
# columns to libraries that look documents up by position and save positional
# arrays alongside them; index gaps left by dropna() would break that alignment.
data_filtered = data[["Title", "Summary"]].dropna().reset_index(drop=True)
data_filtered.head()


Unnamed: 0,Title,Summary
0,Napoleon,An epic that details the checkered rise and fa...
1,The Hunger Games: The Ballad of Songbirds & Sn...,Coriolanus Snow mentors and develops feelings ...
2,The Killer,"After a fateful near-miss, an assassin battles..."
3,Leo,A 74-year-old lizard named Leo and his turtle ...
4,Thanksgiving,"After a Black Friday riot ends in tragedy, a m..."


In [4]:
# Split the filtered frame into the documents to model (summaries) and their labels (titles).
descriptions, titles = data_filtered["Summary"], data_filtered["Title"]

To run BERTopic we need three components:
- an embedding model
- a dimensionality reduction model
- a clustering model

In [4]:
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Baseline BERTopic pipeline: sentence embeddings -> UMAP (5 dims) -> HDBSCAN clusters.
# No vectorizer or representation model is supplied, so topic words use BERTopic's defaults.
# NOTE(review): BERTopic documents `language` as unused when `embedding_model` is given — confirm.
topic_model = BERTopic(
    language="English",
    embedding_model=SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
    umap_model=UMAP(
        n_neighbors=30,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=101,  # fixed seed so the reduced space is repeatable
    ),
    hdbscan_model=HDBSCAN(
        min_cluster_size=35,
        min_samples=1,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    ),
    verbose=True,
)

# Fit on the movie summaries; returns one topic id (and probability) per summary.
topics, probs = topic_model.fit_transform(descriptions)



2026-01-11 23:33:46,375 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 284/284 [00:40<00:00,  7.08it/s]
2026-01-11 23:34:26,562 - BERTopic - Embedding - Completed ✓
2026-01-11 23:34:26,562 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-11 23:34:43,160 - BERTopic - Dimensionality - Completed ✓
2026-01-11 23:34:43,161 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-11 23:34:43,322 - BERTopic - Cluster - Completed ✓
2026-01-11 23:34:43,324 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-11 23:34:43,405 - BERTopic - Representation - Completed ✓


In [5]:
# Summarise the fitted topics (one row per topic, including the -1 outlier topic).
topics_df = topic_model.get_topic_info()

# Print the shape, then end the cell with the frame itself so the notebook renders
# it as a table instead of the wrapped plain-text dump that print() produces.
print(topics_df.shape)
topics_df.head()

   Topic  Count                            Name  \
0     -1   3895                -1_the_to_of_and   
1      0    318               0_king_the_of_and   
2      1    304              1_war_ii_during_of   
3      2    263  2_school_high_teacher_students   
4      3    229       3_couple_their_woman_love   

                                      Representation  \
0     [the, to, of, and, in, his, her, is, an, with]   
1  [king, the, of, and, to, princess, witch, his,...   
2  [war, ii, during, of, german, the, world, sold...   
3  [school, high, teacher, students, at, college,...   
4  [couple, their, woman, love, two, her, they, m...   

                                 Representative_Docs  
0  [A 30-something woman navigating through love ...  
1  [Hal, wayward prince and heir to the English t...  
2  [The British military recruits a small group o...  
3  [A group of high school dropouts is forced to ...  
4  [A young woman engaged to be married finds her...   (54, 5)


Refit the model with a stop-word-aware vectorizer and re-ranking representation models (KeyBERT-inspired and MMR):

In [33]:
import stopwordsiso as stopwords
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance

# Vocabulary used for the topic words: English stop words removed, unigrams and
# bigrams, and only terms that occur in at least 5 documents.
en_stop_list = list(stopwords.stopwords("en"))
vectorizer_model = CountVectorizer(
    stop_words=en_stop_list,
    ngram_range=(1, 2),
    min_df=5,
)

# Two extra keyword representations per topic; they appear as the "KeyBERT" and
# "MMR" columns of get_topic_info().
representation_model = dict(
    KeyBERT=KeyBERTInspired(top_n_words=5),
    MMR=MaximalMarginalRelevance(diversity=0.2),
)

# Same embedding / UMAP setup as the baseline model, with a smaller minimum
# cluster size (30) and the custom vectorizer and representations plugged in.
topic_model = BERTopic(
    language="English",
    embedding_model=SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
    umap_model=UMAP(
        n_neighbors=30,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=101,
    ),
    hdbscan_model=HDBSCAN(
        min_cluster_size=30,
        min_samples=1,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    ),
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True,
)

# Train model
topics, probs = topic_model.fit_transform(descriptions)

# Get initial results
topic_df = topic_model.get_topic_info()
topic_df

2026-01-12 00:27:59,845 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 284/284 [00:38<00:00,  7.34it/s]
2026-01-12 00:28:38,580 - BERTopic - Embedding - Completed ✓
2026-01-12 00:28:38,580 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-12 00:28:49,594 - BERTopic - Dimensionality - Completed ✓
2026-01-12 00:28:49,595 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-12 00:28:49,753 - BERTopic - Cluster - Completed ✓
2026-01-12 00:28:49,754 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-12 00:28:54,913 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,3816,-1_life_family_woman_love,"[life, family, woman, love, girl, father, town...","[story, crime, relationship, daughter, kidnapped]","[life, family, town, story, american, island, ...","[Karl Childers, a simple man hospitalized sinc..."
1,0,304,0_war_german_soldiers_nazi,"[war, german, soldiers, nazi, army, jewish, so...","[wwii, soldiers, army, vietnam war, special fo...","[war, soldiers, nazi, army, vietnam, wwii, mil...",[In the Nazi-occupied Netherlands during World...
2,1,263,1_school_teacher_students_girls,"[school, teacher, students, girls, student, co...","[teacher, school teacher, teachers, school stu...","[school, college, school teacher, school stude...",[An inner-city junior high school teacher with...
3,2,229,2_couple_woman_love_married,"[couple, woman, love, married, relationship, m...","[married couple, affair, couples, marriage, un...","[relationship, marriage, couples, affair, meet...",[The relationships of two couples become compl...
4,3,221,3_singer_music_band_rock,"[singer, music, band, rock, musical, musician,...","[life story, musical, musician, rock roll, sin...","[singer, music, band, musical, musician, radio...",[The story of the famous and influential 1960s...
...,...,...,...,...,...,...,...
58,57,32,57_cave_treasure_trapped_lost,"[cave, treasure, trapped, lost, ice, miners, u...","[cave, adventure, rescue mission, epic journey...","[cave, treasure, trapped, underground, rescue,...","[A chronicle of the enthralling, against-all-o..."
59,58,31,58_time_future_travel_time machine,"[time, future, travel, time machine, machine, ...","[time machine, time, century, future, 20th cen...","[time, future, time machine, machine, inventor...",[A young eccentric inventor and businessman wi...
60,59,30,59_peter_land_magical_factory,"[peter, land, magical, factory, queen, anna, c...","[peter, adventure, kidnaps, orphan, children]","[peter, land, anna, children, enchanted, stori...","[Follow the adventures of Peter Pan, a boy who..."
61,60,30,60_awakens_psychic_body_wakes,"[awakens, psychic, body, wakes, memory, dies, ...","[awakens, wakes, morgue, coma, visions]","[awakens, psychic, body, wakes, mysterious dea...","[A man wakes in a hospital with no memory, and..."


In [39]:
# Build a 2-D map of all summaries, coloured by topic and labelled with movie titles.
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Both `encode` and `visualize_documents` are documented for lists of strings.
# Passing plain lists (rather than pandas Series) keeps every lookup positional,
# so the result does not depend on the Series' index labels.
embeddings = sentence_model.encode(descriptions.tolist(), show_progress_bar=False)

# Separate 2-component UMAP used only for plotting (the model itself clusters in 5 dims).
reduced_embeddings = UMAP(
    n_components=2, min_dist=0.0, metric="cosine", random_state=101
).fit_transform(embeddings)

fig = topic_model.visualize_documents(
    titles.tolist(),
    reduced_embeddings=reduced_embeddings,
    width=1200,
    hide_annotations=True,
)

# Start with every topic trace hidden; topics are toggled on from the legend.
fig.update_traces(visible="legendonly")

fig.update_layout(
    font=dict(size=16),
    showlegend=True,
    legend_title_text="Topics",
)


In [11]:
# Store the fitted topic id next to each movie. `topics` comes from fitting on
# `descriptions`, which is the "Summary" column of this same frame.
data_filtered["topic_generated"] = topics


In [12]:
# Attach the human-readable topic label to every movie. The outlier topic (-1) is
# removed from the lookup table first, so the inner join also drops outlier movies.
master_table_merged = data_filtered.merge(
    topic_df.loc[topic_df["Topic"] != -1, ["Name", "Topic"]],
    left_on="topic_generated",
    right_on="Topic",
    how="inner",
)[["Title", "Summary", "Name"]]

In [13]:
# Eyeball a few random assignments; the fixed seed keeps the displayed rows the
# same on every re-run of the notebook.
master_table_merged.sample(5, random_state=101)

Unnamed: 0,Title,Summary,Name
4200,The Divide,Survivors of a nuclear attack are grouped toge...,35_shelter_apocalyptic_storm_destruction
2583,The New Guy,A high school senior branded uncool in the nin...,1_school_teacher_students_girls
676,Role Play,Young married couple whose past life's secrets...,2_couple_woman_love_married
4678,Onibaba,Two women kill samurai and sell their belongin...,28_tokyo_korean_warlord_japan
4473,A Dog's Way Home,A female dog travels four hundred miles in sea...,30_dog_dogs_owners_beloved


In [14]:
# Spot-check a single topic: show the first 10 movies assigned to this label.
master_table_merged[master_table_merged["Name"]=="59_peter_land_magical_factory"].head(10)

Unnamed: 0,Title,Summary,Name
15,Wonka,Based on the extraordinary character at the ce...,59_peter_land_magical_factory
55,The Wizard of Oz,Young Dorothy Gale and her dog Toto are swept ...,59_peter_land_magical_factory
220,Frozen II,"Anna, Elsa, Kristoff, Olaf and Sven leave Aren...",59_peter_land_magical_factory
597,A Series of Unfortunate Events,"When a massive fire kills their parents, three...",59_peter_land_magical_factory
845,Peter Pan,The Darling family children receive a visit fr...,59_peter_land_magical_factory
923,The Chronicles of Narnia: Prince Caspian,"The Pevensie siblings return to Narnia, where ...",59_peter_land_magical_factory
931,Peter Pan & Wendy,"Follow the adventures of Peter Pan, a boy who ...",59_peter_land_magical_factory
1061,Trolls,"After the Bergens invade Troll Village, Poppy,...",59_peter_land_magical_factory
1144,Snow White and the Huntsman,"In a twist to the fairy tale, the Huntsman ord...",59_peter_land_magical_factory
1233,Return to Oz,"Dorothy, saved from a psychiatric experiment b...",59_peter_land_magical_factory


In [57]:
from pathlib import Path

# Make sure the output directory exists before the first artifact is written;
# nothing earlier in the notebook creates it.
Path("artifacts").mkdir(parents=True, exist_ok=True)

# Persist the fitted topic model.
# NOTE(review): no `serialization` argument is given, so BERTopic's default format
# is used — confirm that is the format the consumer of this artifact expects.
topic_model.save("artifacts/bertopic_model")



In [41]:
# Persist the 2-D UMAP coordinates computed for the document map.
np.save("artifacts/reduced_embeddings.npy", reduced_embeddings)

In [59]:
# Persist the movie/topic table as Parquet.
# NOTE(review): in notebook order this cell comes before the one that adds
# `topic_id`; confirm whether the saved file is expected to carry that column.
master_table_merged.to_parquet("artifacts/movies.parquet")

In [15]:
import re
def parse_topic_id(s: str) -> int:
    """Extract the integer topic id that prefixes a BERTopic-style label.

    Labels look like ``"13_school_teacher"``. BERTopic's outlier topic is
    labelled ``"-1_..."``, so an optional leading minus sign is accepted.

    Args:
        s: Topic label; leading whitespace is ignored.

    Returns:
        The integer that precedes the first underscore.

    Raises:
        ValueError: If the label does not start with ``<int>_``.
    """
    m = re.match(r"^\s*(-?\d+)_", s)
    if not m:
        raise ValueError(
            f"Topic label '{s}' does not start with '<int>_'. "
            "Store a numeric 'topic' column or ensure labels start like '13_...'."
        )
    return int(m.group(1))

In [16]:
# Derive a numeric topic id for each row from the prefix of its topic label.
master_table_merged["topic_id"] = master_table_merged["Name"].apply(parse_topic_id)

In [17]:
# Display the merged table to check the newly added `topic_id` column.
master_table_merged

Unnamed: 0,Title,Summary,Name,topic_id
0,The Killer,"After a fateful near-miss, an assassin battles...",46_assassin_assassins_kill_hitman,46
1,Thanksgiving,"After a Black Friday riot ends in tragedy, a m...",8_christmas_eve_holiday_holidays,8
2,Saltburn,A student at Oxford University finds himself d...,1_school_teacher_students_girls,1
3,The Marvels,Carol Danvers gets her powers entangled with t...,11_super_powers_mutant_universe,11
4,The Creator,Against the backdrop of a war between humans a...,26_robot_human_humanoid_advanced,26
...,...,...,...,...
5262,Ip Man 2,Centering on Ip Man's migration to Hong Kong i...,27_martial_martial arts_arts_master,27
5263,The Marine,A group of diamond thieves on the run kidnap t...,15_heist_bank_thief_driver,15
5264,Space Babes from Outer Space,Three space women land on Earth in search of s...,7_woman_murder_husband_insurance,7
5265,Deep Water,A group of international passengers en route f...,36_diving_fish_infested_lake,36


In [21]:
# Write the movie titles to CSV (the Series index is written as the first column).
titles.to_csv("artifacts/titles.csv")

In [None]:
# Read a descriptions CSV back, using its first column as the index.
# NOTE(review): no visible cell writes artifacts/descriptions.csv, so on a fresh
# run this depends on the file already existing — confirm where it comes from.
a = pd.read_csv("artifacts/descriptions.csv",index_col=0)


0       An epic that details the checkered rise and fa...
1       Coriolanus Snow mentors and develops feelings ...
2       After a fateful near-miss, an assassin battles...
3       A 74-year-old lizard named Leo and his turtle ...
4       After a Black Friday riot ends in tragedy, a m...
                              ...                        
9078    In this animated musical, a girl builds a rock...
9079    A young drifter working on a river barge disru...
9080    The dramatic lives of trapeze artists, a clown...
9081    A group of international passengers en route f...
9082    A group of men set out in search of a dead bod...
Name: Summary, Length: 9083, dtype: object

In [42]:
# Export the interactive document map figure to an HTML file.
fig.write_html("artifacts/topic_map.html")
