# Topic Modeling and Unsupervised Learning with BERTopic

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn as sk
import nltk
import gensim
import spacy
import ast

In [4]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cufft_cu12-11.2.1.

In [6]:
# Read the phrase-annotated corpus, then keep everything from the third column onward
# (the first two CSV columns are discarded).
docs = pd.read_csv('/kaggle/input/docs-and-phrases/phraseadded.csv')
docs = docs.iloc[:, 2:]

# These two columns hold Python literals serialized as text; parse every cell back into
# its Python object (presumably a list of strings — verify against the CSV).
docs["concepts_found"] = docs["concepts_found"].map(ast.literal_eval)
docs["keyphrases"] = docs["keyphrases"].map(ast.literal_eval)

In [7]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# One whitespace-joined string per document, built from its parsed concept tokens;
# this plain list of strings is what gets passed to the topic model.
docs_text = [" ".join(concept_tokens) for concept_tokens in docs["concepts_found"]]

2025-05-07 07:08:09.963493: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746601690.155454      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746601690.211072      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
import os

# huggingface/tokenizers prints a "process just got forked" warning on every fork unless
# this variable is set explicitly. Set it before the embedding model is created;
# setdefault leaves any value already exported in the environment untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Build the topic model: sentence-transformer embeddings plus a custom CountVectorizer
# that controls which terms can appear in the topic representations.
# NOTE(review): no random seed is configured here, so topic ids and sizes may differ
# between runs — confirm whether reproducibility is required.
topic_model = BERTopic(
    language="english",
    embedding_model="all-MiniLM-L6-v2",            # HF sentence-transformer
    vectorizer_model=CountVectorizer(              # tweak n-grams, stop-words, etc.
        ngram_range=(1,2),                         # unigrams and bigrams
        stop_words="english",
        # NOTE(review): BERTopic appears to fit this vectorizer on per-topic merged
        # documents, so max_df/min_df may count topics rather than input rows — confirm.
        max_df=0.85,
        min_df=5
    )
)

# Fit on the joined concept strings. Returns one topic assignment per input document
# plus the associated probabilities (consumed by the next cell).
topics, probs = topic_model.fit_transform(docs_text)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
# Attach the fitted topic assignment to each document row.
docs["topic_id"] = topics

# Guard the whole `probs` object, not just its elements: if the model returned no
# probabilities at all, iterating over None would raise TypeError before any
# per-element check could run.
if probs is None:
    docs["topic_prob"] = None
else:
    # Each entry is reduced with .max(): a scalar stays as-is, a per-topic vector
    # collapses to its largest probability. Missing entries stay None.
    docs["topic_prob"] = [p.max() if p is not None else None for p in probs]

print(topic_model.get_topic_info().head())   # list of topic IDs + sizes + representation
print(topic_model.get_topic(0))              # top 10 words for topic 0

# Persist the documents together with their topic id / probability columns.
docs.to_csv("docs_with_bert_topics.csv", index=False)

   Topic  Count                                               Name  \
0     -1  30712                -1_3d_sparse_reinforcement_language   
1      0    581  0_causal_causality_causal causality_observational   
2      1    484         1_action_actions action_activities_actions   
3      2    466              2_bert_bert nlp_nlp bert_bert encoder   
4      3    451  3_interpretability_explanations_explainability...   

                                      Representation  \
0  [3d, sparse, reinforcement, language, logic, d...   
1  [causal, causality, causal causality, observat...   
2  [action, actions action, activities, actions, ...   
3  [bert, bert nlp, nlp bert, bert encoder, bert ...   
4  [interpretability, explanations, explainabilit...   

                                 Representative_Docs  
0  [gans editing gan face flow, clustering regula...  
1  [causality causal observational knowledge rela...  
2  [activities activity actions action task, acti...  
3  [bert nlp predictio