# Topic Modeling and Unsupervised Learning with BERTopic

In [12]:
import ast
import os

import gensim
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn as sk
import spacy

In [13]:
# Location of the phrase-annotated corpus. Set the PHRASE_ADDED_CSV environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used so existing runs behave exactly as before.
PHRASE_ADDED_CSV = os.environ.get(
    "PHRASE_ADDED_CSV",
    "/Users/narenprax/Documents/GitHub/RRR-datacreation/phrase_added.csv",
)

# Load the corpus and drop the first two columns of the file.
docs = pd.read_csv(PHRASE_ADDED_CSV).iloc[:, 2:]

# These columns are stored in the CSV as Python-literal text; parse each cell
# back into the object it represents.
for literal_column in ("concepts_found", "keyphrases"):
    docs[literal_column] = docs[literal_column].apply(ast.literal_eval)

In [59]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN

# Clustering model that is handed to BERTopic further down.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# One space-joined string per document, built from that document's entries in
# the "concepts_found" column.
docs_text = [" ".join(concepts) for concepts in docs["concepts_found"]]

In [60]:
from umap import UMAP

# Fixed seed for the UMAP step so repeated runs start from the same random state.
UMAP_RANDOM_STATE = 42
umap_model = UMAP(random_state=UMAP_RANDOM_STATE)

In [75]:
# Vectorizer used for the topic representations (tweak n-grams, stop-words, etc.).
topic_vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.85,
)

# Assemble the BERTopic pipeline from the components configured in earlier cells.
topic_model = BERTopic(
    language="english",
    embedding_model="all-MiniLM-L6-v2",  # HF sentence-transformer
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=topic_vectorizer,
    nr_topics=35,
    calculate_probabilities=True,
)

# Fit on the joined concept strings; `probs` holds the per-document topic
# probabilities requested via calculate_probabilities=True.
topics, probs = topic_model.fit_transform(docs_text)

# Reassign outlier documents using those probabilities. Only the local `topics`
# list is replaced here; the fitted topic_model is not updated by this call, so
# its own topic info still reports the original outlier topic.
topics = topic_model.reduce_outliers(
    docs_text,
    topics,
    strategy="probabilities",
    probabilities=probs,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [108]:
# Attach the (outlier-reduced) topic assignment to every document.
docs["topic_id"] = topics

# For each document keep the largest value in its probability row; a missing
# row is recorded as None.
peak_probabilities = []
for doc_probs in probs:
    peak_probabilities.append(None if doc_probs is None else doc_probs.max())
docs["topic_prob"] = peak_probabilities

# Quick look at the fitted model: topic IDs + sizes + representation, then the
# top words of topic 0.
print(topic_model.get_topic_info().head())
print(topic_model.get_topic(0))

# Persist the annotated documents next to the notebook.
docs.to_csv("docs_with_bert_topics.csv", index=False)

   Topic  Count                                             Name  \
0     -1  31827          -1_learning_3d_classification_attention   
1      0  16445     0_adversarial_convolutional_cnn_segmentation   
2      1  12471           1_dialogue_translation_corpus_semantic   
3      2   6805  2_sparse_classification_gaussian_regularization   
4      3   2726           3_reinforcement_bandit_planning_reward   

                               CustomName  \
0               3D Convolutional Networks   
1          Dialogue and Semantic Analysis   
2                  Bayesian Deep Learning   
3       Reinforcement Learning Algorithms   
4  Optimization Techniques and Algorithms   

                                      Representation  \
0  [learning, 3d, classification, attention, lang...   
1  [adversarial, convolutional, cnn, segmentation...   
2  [dialogue, translation, corpus, semantic, lang...   
3  [sparse, classification, gaussian, regularizat...   
4  [reinforcement, bandit, planning, rew

In [109]:
# Step 1: export BERTopic's global topic table and preview it.
topic_info = topic_model.get_topic_info()
topic_info.to_csv("bertopic_topic_info.csv", index=False)
print("Saved bertopic_topic_info.csv:")
print(topic_info.head(), "\n")

# Step 2: build a Topic -> Top Words table. get_topic yields (word, score)
# pairs; only the words are kept, joined into one comma-separated string.
rows = [
    {
        'Topic': topic_id,
        'Top Words': ", ".join(
            [term for term, _score in topic_model.get_topic(topic_id)]
        ),
    }
    for topic_id in topic_info['Topic']
]

topic_top_words = pd.DataFrame(rows)
topic_top_words.to_csv("bertopic_topic_top_words.csv", index=False)
print("Saved bertopic_topic_top_words.csv:")
print(topic_top_words.head())

Saved bertopic_topic_info.csv:
   Topic  Count                                             Name  \
0     -1  31827          -1_learning_3d_classification_attention   
1      0  16445     0_adversarial_convolutional_cnn_segmentation   
2      1  12471           1_dialogue_translation_corpus_semantic   
3      2   6805  2_sparse_classification_gaussian_regularization   
4      3   2726           3_reinforcement_bandit_planning_reward   

                               CustomName  \
0               3D Convolutional Networks   
1          Dialogue and Semantic Analysis   
2                  Bayesian Deep Learning   
3       Reinforcement Learning Algorithms   
4  Optimization Techniques and Algorithms   

                                      Representation  \
0  [learning, 3d, classification, attention, lang...   
1  [adversarial, convolutional, cnn, segmentation...   
2  [dialogue, translation, corpus, semantic, lang...   
3  [sparse, classification, gaussian, regularizat...   
4  [reinf

In [117]:
# Curated, human-readable names for the fitted topics. Position i in this list
# is the label for topic id i (topic 0 first); the outlier topic -1 has no
# entry here.
custom_labels = [
    "Adversarial Robustness in Convolutional Networks",
    "Dialogue Systems and Machine Translation",
    "Sparse Modeling and Gaussian Classification",
    "Reinforcement Learning and Bandit Algorithms",
    "Optimization Algorithms in Machine Learning",
    "Causal Inference and Treatment Effects",
    "Bayesian Inference and Probabilistic Models",
    "Graph Neural Networks",
    "Facial Recognition and Biometrics",
    "Pedestrian Tracking and Trajectory Estimation",
    "Emotion Recognition in Social Media",
    "Computational Neuroscience",
    "Handwritten Character Recognition",
    "Image Enhancement and Deblurring",
    "Sketch-Based Image Retrieval",
    "Clinical and Biomedical AI",
    "Privacy-Preserving Machine Learning",
    "Conceptual Analogy and Reasoning",
    "Pose Estimation and Fashion Analysis",
    "Social Attribute Inference",
    "Legal Document Analysis",
    "Stylometry and Music Generation",
    "Deep Learning for Image Segmentation",
    "Auction Theory and Online Advertising",
    "Quantile Regression and Copula Models",
    "Language Model Hallucination and Control",
    "Hardware-Aware Model Optimization",
    "Computer Vision Applications",
    "Pandemic Modeling and Epidemiology",
    "Stochastic Processes in AI",
    "Security and Adversarial Attacks",
    "Simulation-Based Learning Environments",
    "Fuzzy Logic and Set Theory",
    "Tomography and Medical Imaging"
]

# Apply the curated labels. The previous call passed topic_model.custom_labels_
# back into set_topic_labels, so the list above was never used (and the call
# breaks on a fresh kernel, where custom_labels_ is still None).
#
# A dict keyed by topic id is used instead of the bare list: the list form must
# contain one entry per topic *including* -1, so a list without an outlier
# entry would either be rejected or end up shifted by one topic.
label_by_topic = dict(enumerate(custom_labels))

# Pin the outlier topic to BERTopic's generated name so that a custom label
# left over from an earlier labelling attempt is not carried forward.
if -1 in topic_model.topic_labels_:
    label_by_topic[-1] = topic_model.topic_labels_[-1]

topic_model.set_topic_labels(label_by_topic)

In [118]:
# Show the model's generated labels, keyed by topic id (including -1). These
# are the "<id>_<word>_<word>_..." names, not the curated custom labels.
topic_model.topic_labels_

{-1: '-1_learning_3d_classification_attention',
 0: '0_adversarial_convolutional_cnn_segmentation',
 1: '1_dialogue_translation_corpus_semantic',
 2: '2_sparse_classification_gaussian_regularization',
 3: '3_reinforcement_bandit_planning_reward',
 4: '4_optimization_algorithms_evolutionary_algorithm',
 5: '5_causal_bias_fairness_explanations',
 6: '6_ontology_ontologies_logic_probabilistic',
 7: '7_clustering_clusters_cluster_graphs',
 8: '8_face_facial_faces_recognition',
 9: '9_tracking_pedestrian_tracker_tracking tracker',
 10: '10_emotion_tweets_twitter_emotions',
 11: '11_neurons_neuron_neurons neural_eeg',
 12: '12_handwritten_ocr_recognition_text',
 13: '13_illumination_blur_reflectance_shading',
 14: '14_sketch_sketches_artistic_creativity',
 15: '15_clinical_medical_healthcare_hospital',
 16: '16_privacy_private_federated_data',
 17: '17_analogy_conceptual_concepts_affordances',
 18: '18_fashion_pose_poses_body',
 19: '19_gender_personality_traits_bias',
 20: '20_law_judgment_

In [125]:
# Build BERTopic's topic overview figure and display it as the cell output.
topics_figure = topic_model.visualize_topics()
topics_figure

In [126]:
# `custom_labels` is a flag on this method, not the label list itself: True
# tells BERTopic to use the labels stored on the model via set_topic_labels.
# Passing the list only worked because a non-empty list is truthy.
topic_model.visualize_hierarchy(custom_labels=True)

In [128]:
# Build BERTopic's per-topic bar chart figure and display it as the cell output.
barchart_figure = topic_model.visualize_barchart()
barchart_figure

In [127]:
# `custom_labels` is a flag on this method, not the label list itself: True
# tells BERTopic to use the labels stored on the model via set_topic_labels.
# Passing the list only worked because a non-empty list is truthy.
topic_model.visualize_heatmap(custom_labels=True)