In [None]:
!pip install datasets umap-learn bertopic

In [None]:
from datasets import load_dataset

# Fetch the "maartengr/arxiv_nlp" dataset from the Hugging Face Hub and keep
# only its "train" split.
dataset = load_dataset("maartengr/arxiv_nlp")["train"]

# Materialize the two text columns as plain Python lists. Newer `datasets`
# releases return a lazy column object from `dataset[...]` instead of a list;
# copying into a list guarantees list semantics (e.g. `titles.index(...)`)
# and cheap element access for the embedding and topic-modeling cells below.
abstracts = list(dataset["Abstracts"])
titles = list(dataset["Titles"])

# Let's load the embedding model and convert the documents into embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding step: load the pre-trained 'thenlper/gte-small' sentence encoder.
embedding_model = SentenceTransformer('thenlper/gte-small')

# Turn every abstract into a vector; the progress bar gives feedback while
# the whole collection is being encoded.
document_embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)


# Let's load the Dimensionality Reduction Model

In [None]:
from umap import UMAP

# Dimensionality-reduction step: a UMAP instance that is only configured here;
# it is handed to BERTopic in a later cell.
umap_model_object = UMAP(
    n_components=5,     # number of dimensions in the reduced space
    min_dist=0.0,       # minimum distance between points in the reduced space
    metric='cosine',    # distance metric applied to the input embeddings
    random_state=42,    # fixed seed so repeated runs are reproducible
)


# Let's load the Clustering Model

In [None]:
from hdbscan import HDBSCAN

# Clustering step: an HDBSCAN instance that is only configured here (nothing
# is fitted in this cell); it is handed to BERTopic in a later cell.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,              # smallest group that counts as a cluster
    metric='euclidean',               # distance measure between points
    cluster_selection_method='eom',   # "Excess of Mass" cluster selection
)

# Here comes BERTopic

In [None]:
from bertopic import BERTopic

# Assemble the topic model from the components configured in the cells above.
topic_model = BERTopic(
    embedding_model=embedding_model,   # sentence encoder
    umap_model=umap_model_object,      # dimensionality reduction
    hdbscan_model=hdbscan_model,       # clustering
    verbose=True,                      # report progress while training
)

# Train on the abstracts, reusing the precomputed embeddings. The result is
# assigned (not left as a bare expression) so the cell produces no extra output.
topic_model = topic_model.fit(abstracts, document_embeddings)


# Let's inspect the topics

In [None]:
# Display BERTopic's summary of the fitted topics. Left as a bare last
# expression so the notebook renders the returned object.
topic_model.get_topic_info()


The first topic, labelled -1, contains all the documents that could not be assigned to any topic; these are treated as outliers.


In [None]:
# Inspect topic 0. Topic -1 is the outlier bucket, so 0 is the first real topic.
# NOTE(review): presumably this shows the topic's top keywords with their
# weights; confirm against the BERTopic docs.
topic_model.get_topic(0)


In [None]:
# Search the fitted topics for those related to the query "topic modeling".
# NOTE(review): presumably returns the matching topic ids together with their
# similarity scores; confirm against the BERTopic docs.
topic_model.find_topics("topic modeling")


In [None]:
# Inspect the topic that best matches the query "topic modeling".
# The id is looked up instead of hard-coded: topic numbering depends on the
# clustering run, so a literal id (previously 22) can point at a different
# topic, or at none, after a re-fit or a library upgrade.
# `find_topics` returns (topic_ids, similarities), best match first.
matching_topic_ids, _similarities = topic_model.find_topics("topic modeling")
topic_model.get_topic(matching_topic_ids[0])


In [None]:
# Find which topic the BERTopic paper itself was assigned to: locate the
# paper's position in `titles`, then read the topic id stored at that same
# position in `topic_model.topics_`.
bertopic_paper_title = 'BERTopic: Neural topic modeling with a class-based TF-IDF procedure'
bertopic_paper_position = titles.index(bertopic_paper_title)
topic_model.topics_[bertopic_paper_position]
