# Topic Modeling and Unsupervised Learning with BERTopic

In [24]:
import ast
import os

import gensim
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn as sk
import spacy

In [25]:
# Load the phrase-annotated corpus.
# The CSV location can be overridden with the RRR_PHRASES_CSV environment
# variable so the notebook is not tied to one machine; the previously
# hard-coded path is kept as the default, so existing runs behave the same.
PHRASES_CSV = os.environ.get(
    "RRR_PHRASES_CSV",
    "/Users/narenprax/Documents/GitHub/RRR-datacreation/phrase_added.csv",
)

# Keep everything from the third column onwards.
# NOTE(review): the two dropped leading columns are presumably saved index
# columns — confirm against the CSV.
docs = pd.read_csv(PHRASES_CSV).iloc[:, 2:]

# These columns hold Python literals serialized as text in the CSV
# (presumably lists of strings — TODO confirm); parse them back into objects.
for literal_col in ("concepts_found", "keyphrases"):
    docs[literal_col] = docs[literal_col].apply(ast.literal_eval)

In [26]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# One whitespace-joined string per row of `concepts_found`, as a plain list
# (this is what gets passed to the topic model's fit_transform).
docs_text = [" ".join(concept_tokens) for concept_tokens in docs["concepts_found"]]

In [54]:
# Term extraction used for the topic representations: unigrams and bigrams,
# English stop words removed, with document-frequency cut-offs min_df=5 and
# max_df=0.85.
# NOTE(review): BERTopic is understood to fit this vectorizer on per-topic
# aggregated text, so the cut-offs may count topics rather than the original
# documents — confirm before tuning them.
topic_vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.85,
)

# NOTE(review): no random seed is configured anywhere in this cell, so topic
# assignments may differ between runs — consider passing a seeded
# dimensionality-reduction model if reproducibility matters.
topic_model = BERTopic(
    language="english",
    embedding_model="all-MiniLM-L6-v2",  # sentence-transformer model name
    vectorizer_model=topic_vectorizer,
    nr_topics=35,  # requested number of topics
)

# Fit on the joined concept strings; returns the per-document topic ids and
# the accompanying probabilities.
topics, probs = topic_model.fit_transform(docs_text)

Exception ignored in: <function ResourceTracker.__del__ at 0x10b00d120>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10e445120>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/

In [55]:
# Attach the per-document topic assignment produced by fit_transform.
docs["topic_id"] = topics

# Attach one confidence value per document.
# `probs` does not always have the same shape: it may be None (some clustering
# back-ends return no probabilities), a 1-D array holding the probability of
# the assigned topic, or a 2-D documents-by-topics matrix. The earlier
# per-element `p is not None` check could not catch `probs` itself being None,
# because iterating over None raises a TypeError before the check runs.
if probs is None:
    docs["topic_prob"] = None
else:
    prob_values = np.asarray(probs)
    if prob_values.ndim == 2:
        # Full distribution per document: keep the highest topic probability.
        docs["topic_prob"] = prob_values.max(axis=1)
    else:
        # Already a single value per document.
        docs["topic_prob"] = prob_values

print(topic_model.get_topic_info().head())   # topic IDs, sizes and representations
print(topic_model.get_topic(0))              # (word, score) pairs for topic 0

# Persist the annotated corpus next to the notebook.
docs.to_csv("docs_with_bert_topics.csv", index=False)

   Topic  Count                                              Name  \
0     -1  30651                    -1_neural_learning_3d_language   
1      0  19583                   0_attention_dialogue_corpus_nlp   
2      1  17445       1_recognition_convolutional_adversarial_cnn   
3      2   2729                2_planning_reinforcement_bandit_ai   
4      3   2436  3_optimization_algorithms_evolutionary_algorithm   

                                      Representation  \
0  [neural, learning, 3d, language, attention, re...   
1  [attention, dialogue, corpus, nlp, embeddings,...   
2  [recognition, convolutional, adversarial, cnn,...   
3  [planning, reinforcement, bandit, ai, agent, r...   
4  [optimization, algorithms, evolutionary, algor...   

                                 Representative_Docs  
0  [poses pose 3d supervised depth, depth 3d supe...  
1  [multilingual corpus utterances hindi language...  
2  [recognition contourlet pose classifiers featu...  
3  [planning planner planner

In [56]:
# --- Global topic overview -------------------------------------------------
# Fetch the summary table from the fitted model and write it to a
# "bertopic_"-prefixed CSV, then preview the first rows.
topic_info = topic_model.get_topic_info()
topic_info.to_csv("bertopic_topic_info.csv", index=False)
print("Saved bertopic_topic_info.csv:")
print(topic_info.head(), "\n")

# --- Topic -> top words ----------------------------------------------------
# One record per topic id listed in the summary table; the words come from the
# (word, score) pairs of get_topic, with the scores discarded.
rows = [
    {
        "Topic": topic_id,
        "Top Words": ", ".join(term for term, _score in topic_model.get_topic(topic_id)),
    }
    for topic_id in topic_info["Topic"]
]

topic_top_words = pd.DataFrame(rows)
topic_top_words.to_csv("bertopic_topic_top_words.csv", index=False)
print("Saved bertopic_topic_top_words.csv:")
print(topic_top_words.head())

Saved bertopic_topic_info.csv:
   Topic  Count                                              Name  \
0     -1  30651                    -1_neural_learning_3d_language   
1      0  19583                   0_attention_dialogue_corpus_nlp   
2      1  17445       1_recognition_convolutional_adversarial_cnn   
3      2   2729                2_planning_reinforcement_bandit_ai   
4      3   2436  3_optimization_algorithms_evolutionary_algorithm   

                                      Representation  \
0  [neural, learning, 3d, language, attention, re...   
1  [attention, dialogue, corpus, nlp, embeddings,...   
2  [recognition, convolutional, adversarial, cnn,...   
3  [planning, reinforcement, bandit, ai, agent, r...   
4  [optimization, algorithms, evolutionary, algor...   

                                 Representative_Docs  
0  [poses pose 3d supervised depth, depth 3d supe...  
1  [multilingual corpus utterances hindi language...  
2  [recognition contourlet pose classifiers featu...

In [57]:
# Build the topic visualization for the fitted model; as the last expression
# in the cell, the returned object is rendered as the cell's output.
topic_model.visualize_topics()