In [7]:
!pip install datasets umap-learn bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hdbscan, bertopic
Successfully installed bertopic-0.16.4 hdbscan-0.8.40


In [2]:
# Hugging Face `datasets` provides the loader for the corpus
from datasets import load_dataset

# Fetch only the "train" split of the maartengr/arxiv_nlp dataset
dataset = load_dataset("maartengr/arxiv_nlp", split="train")

# Pull out the two text columns as plain Python lists. Wrapping in list() keeps
# list semantics for later cells (e.g. `titles.index(...)`) whatever container
# type column access returns in the installed `datasets` version.
abstracts = list(dataset["Abstracts"])
titles = list(dataset["Titles"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/617 [00:00<?, ?B/s]

data.csv:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Let's load the embedding model and convert the documents into embeddings

In [12]:
# sentence-transformers supplies the text-embedding model wrapper
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'thenlper/gte-small' checkpoint as our embedding model
embedding_model = SentenceTransformer('thenlper/gte-small')

# Turn every abstract into an embedding; the progress bar reports
# batch-by-batch progress while encoding runs
document_embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)


Batches:   0%|          | 0/1405 [00:00<?, ?it/s]

# Let's load the Dimensionality Reduction Model

In [5]:
# UMAP (from umap-learn) is the dimensionality-reduction step
from umap import UMAP

# Configure the reducer; it is only constructed here, not fitted.
umap_model_object = UMAP(
    n_components=5,      # project the embeddings down to 5 dimensions
    min_dist=0.0,        # minimum distance between points in the reduced space
    metric='cosine',     # compare embeddings with the cosine metric
    random_state=42,     # fixed seed for reproducibility
)


# Let's load the Clustering Model

In [10]:
# HDBSCAN is the clustering step of the pipeline
from hdbscan import HDBSCAN

# Configure the clusterer. Nothing is fitted in this cell: the model object is
# only created here and handed to BERTopic later.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,              # minimum size for a group to count as a cluster
    metric='euclidean',               # measure distances between points with the Euclidean metric
    cluster_selection_method='eom',   # pick clusters with the Excess of Mass method
)

# Here comes BERTopic

In [13]:
# BERTopic ties the embedding, reduction and clustering components together
from bertopic import BERTopic

# Assemble the topic model from the components defined in the earlier cells
topic_model = BERTopic(
    embedding_model=embedding_model,  # model used to turn text into embeddings
    umap_model=umap_model_object,     # dimensionality-reduction component
    hdbscan_model=hdbscan_model,      # clustering component
    verbose=True,                     # log progress for each training stage
)

# Train on the abstracts, passing the precomputed embeddings alongside them.
# The object returned by fit() is kept under the same name.
topic_model = topic_model.fit(abstracts, document_embeddings)


2024-12-22 18:55:50,754 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-22 18:57:07,494 - BERTopic - Dimensionality - Completed ✓
2024-12-22 18:57:07,501 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-22 18:57:13,459 - BERTopic - Cluster - Completed ✓
2024-12-22 18:57:13,476 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-22 18:57:17,952 - BERTopic - Representation - Completed ✓


# Let's inspect Topics

In [14]:
# Summarise every discovered topic: one row per topic with its id, document count,
# generated name, representative terms and representative documents.
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,14462,-1_the_of_and_to,"[the, of, and, to, in, we, for, that, language...",[ Cross-lingual text classification aims at t...
1,0,2241,0_question_questions_qa_answer,"[question, questions, qa, answer, answering, a...",[ Question generation (QG) attempts to solve ...
2,1,2098,1_speech_asr_recognition_end,"[speech, asr, recognition, end, acoustic, audi...",[ End-to-end models have achieved impressive ...
3,2,903,2_image_visual_multimodal_images,"[image, visual, multimodal, images, vision, mo...",[ In this paper we propose a model to learn m...
4,3,887,3_summarization_summaries_summary_abstractive,"[summarization, summaries, summary, abstractiv...",[ We present a novel divide-and-conquer metho...
...,...,...,...,...,...
148,147,54,147_counseling_mental_therapy_health,"[counseling, mental, therapy, health, psychoth...",[ Mental health care poses an increasingly se...
149,148,53,148_chatgpt_its_openai_has,"[chatgpt, its, openai, has, it, tasks, capabil...","[ Over the last few years, large language mod..."
150,149,52,149_mixed_code_sentiment_mixing,"[mixed, code, sentiment, mixing, english, anal...",[ In today's interconnected and multilingual ...
151,150,51,150_diffusion_generation_autoregressive_text,"[diffusion, generation, autoregressive, text, ...",[ Diffusion models have achieved great succes...


The first topic, labelled -1, contains all the documents that could not be assigned to any topic; these are treated as outliers.


In [15]:
# Inspect topic 0: the terms that represent it, each paired with a numeric score
# (presumably its c-TF-IDF weight — confirm against the BERTopic docs).
topic_model.get_topic(0)


[('question', 0.021262463291547223),
 ('questions', 0.015866039067984204),
 ('qa', 0.015830640927795868),
 ('answer', 0.015787698152510205),
 ('answering', 0.014859992848422435),
 ('answers', 0.00992918704536005),
 ('retrieval', 0.009497931820914705),
 ('comprehension', 0.007719047154229789),
 ('reading', 0.007175282051339653),
 ('knowledge', 0.0063049421989358)]

In [16]:
# Search the fitted topics for the query "topic modeling". The result is a pair:
# candidate topic ids and a matching list of scores (presumably query-to-topic
# similarities, best match first — confirm against the BERTopic docs).
topic_model.find_topics("topic modeling")


([22, -1, 50, 38, 84],
 [0.95448655, 0.91218555, 0.9067658, 0.9051957, 0.9026561])

In [17]:
# Topic 22 was the first id returned by the "topic modeling" search in the
# recorded run; list the terms that represent it.
topic_model.get_topic(22)


[('topic', 0.06782148231481569),
 ('topics', 0.03509097163093816),
 ('lda', 0.0162350543969945),
 ('latent', 0.013482620892138605),
 ('document', 0.01258276852968132),
 ('documents', 0.012463820004375148),
 ('modeling', 0.011571581804609226),
 ('dirichlet', 0.009901318233964887),
 ('word', 0.00852094200971816),
 ('allocation', 0.007792539607690728)]

In [18]:
# Locate the BERTopic paper by its exact title, then read off the topic id the
# fitted model assigned to the document at that position.
bertopic_paper_idx = titles.index('BERTopic: Neural topic modeling with a class-based TF-IDF procedure')
topic_model.topics_[bertopic_paper_idx]


22