# Topic Modeling and Unsupervised Learning with BERTopic

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn as sk
import nltk
import gensim
import spacy
import ast

In [3]:
docs = pd.read_csv("/Users/narenprax/Documents/GitHub/RRR-datacreation/phrase_added.csv").iloc[:, 2:]
docs["concepts_found"] = docs["concepts_found"].apply(ast.literal_eval)
docs["keyphrases"] = docs["keyphrases"].apply(ast.literal_eval)

In [4]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

docs_text = docs["concepts_found"].apply(lambda toks: " ".join(toks)).tolist()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from umap import UMAP

umap_model = UMAP(random_state=42)

In [6]:
topic_model = BERTopic(
    umap_model=umap_model,
    language="english",
    embedding_model="all-MiniLM-L6-v2",            # HF sentence-transformer
    vectorizer_model=CountVectorizer(              # tweak n-grams, stop-words, etc.
        ngram_range=(1,2),
        stop_words="english",
        max_df=0.85,
        min_df=5,
    ),
    nr_topics= 35,            
)

topics, probs = topic_model.fit_transform(docs_text)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [7]:
docs["topic_id"]   = topics
docs["topic_prob"] = [p.max() if p is not None else None for p in probs]

print(topic_model.get_topic_info().head())   # list of topic IDs + sizes + representation
print(topic_model.get_topic(0))              # top 10 words for topic 0

docs.to_csv("docs_with_bert_topics.csv", index=False)

   Topic  Count                                              Name  \
0     -1  33160                   -1_neural_learning_attention_3d   
1      0  15711                0_adversarial_cnn_convolutional_3d   
2      1  14420               1_dialogue_corpus_semantic_language   
3      2   7951       2_classification_bayesian_clustering_sparse   
4      3   5218  3_optimization_reinforcement_algorithms_planning   

                                      Representation  \
0  [neural, learning, attention, 3d, classificati...   
1  [adversarial, cnn, convolutional, 3d, segmenta...   
2  [dialogue, corpus, semantic, language, languag...   
3  [classification, bayesian, clustering, sparse,...   
4  [optimization, reinforcement, algorithms, plan...   

                                 Representative_Docs  
0  [camera attention 3d view depth, loras supervi...  
1  [3d cnn pose planes plane, cnn 3d pose adversa...  
2  [conversational conversation dialogue dialogue...  
3  [sparse optimization regu

In [8]:
# 1. Produce & save the global topic info with BERTopic prefix
topic_info = topic_model.get_topic_info()
topic_info.to_csv("bertopic_topic_info.csv", index=False)
print("Saved bertopic_topic_info.csv:")
print(topic_info.head(), "\n")

# 2. Build & save the Topic → Top Words table with BERTopic prefix
rows = []
for tid in topic_info['Topic']:
    top_words = [word for word, _ in topic_model.get_topic(tid)]
    rows.append({
        'Topic': tid,
        'Top Words': ", ".join(top_words)
    })

topic_top_words = pd.DataFrame(rows)
topic_top_words.to_csv("bertopic_topic_top_words.csv", index=False)
print("Saved bertopic_topic_top_words.csv:")
print(topic_top_words.head())

Saved bertopic_topic_info.csv:
   Topic  Count                                              Name  \
0     -1  33160                   -1_neural_learning_attention_3d   
1      0  15711                0_adversarial_cnn_convolutional_3d   
2      1  14420               1_dialogue_corpus_semantic_language   
3      2   7951       2_classification_bayesian_clustering_sparse   
4      3   5218  3_optimization_reinforcement_algorithms_planning   

                                      Representation  \
0  [neural, learning, attention, 3d, classificati...   
1  [adversarial, cnn, convolutional, 3d, segmenta...   
2  [dialogue, corpus, semantic, language, languag...   
3  [classification, bayesian, clustering, sparse,...   
4  [optimization, reinforcement, algorithms, plan...   

                                 Representative_Docs  
0  [camera attention 3d view depth, loras supervi...  
1  [3d cnn pose planes plane, cnn 3d pose adversa...  
2  [conversational conversation dialogue dialogue...

In [9]:
topic_model.visualize_topics()

In [10]:
topic_model.visualize_hierarchy()

Exception ignored in: <function ResourceTracker.__del__ at 0x104e0d120>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102b99120>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/

In [12]:
topic_model.visualize_barchart()

In [13]:
topic_model.visualize_heatmap()