# Install Libraries

In [None]:
!pip install datasets umap-learn hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40


# About Dataset

In this tutorial, we'll be working with the ArXiv articles dataset. ArXiv is a free platform that hosts scholarly papers, mostly in fields like computer science, math, and physics. We'll focus on articles from the "Computation and Language" section, which includes nearly 45,000 abstracts!

In [None]:
from datasets import load_dataset

# Fetch the "maartengr/arxiv_nlp" dataset from the Hugging Face Hub and keep
# only its "train" split, which is the split used for the rest of the notebook.
dataset = load_dataset("maartengr/arxiv_nlp")["train"]

# Pull out the two text columns used below: the paper abstracts and their titles.
abstracts, titles = dataset["Abstracts"], dataset["Titles"]

# Let's Implement the Text Clustering Pipeline Step by Step

# Step 1: Convert Texts to Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'thenlper/gte-small' sentence-embedding model.
embedding_model = SentenceTransformer('thenlper/gte-small')

# Turn every abstract into an embedding vector. Encoding runs in batches,
# so a progress bar is shown to track how far along it is.
document_embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)


Batches:   0%|          | 0/1405 [00:00<?, ?it/s]

In [None]:
# Peek at the embedding matrix produced above (NumPy abbreviates large arrays with "...")
document_embeddings

array([[-8.38182941e-02,  4.16542739e-02,  1.30672995e-02, ...,
         1.21650947e-02, -4.80868062e-03, -4.08202223e-03],
       [-8.71458948e-02,  4.05699238e-02,  1.86350923e-02, ...,
         1.45083414e-02, -4.59755320e-05,  3.18071828e-03],
       [-8.35833475e-02, -1.35176983e-02,  5.27515486e-02, ...,
         6.29818738e-02,  5.36538735e-02, -1.19384965e-02],
       ...,
       [-6.52083009e-02, -4.26613912e-03,  2.71811057e-02, ...,
         3.73920761e-02, -3.73670156e-03, -3.14223091e-03],
       [-5.74976951e-02, -3.33516486e-02, -1.80733029e-03, ...,
        -1.48869278e-02,  1.60427690e-02,  1.20141041e-02],
       [-6.99503496e-02, -7.10727938e-04,  2.52838675e-02, ...,
         4.33816463e-02, -1.36642193e-03,  6.26952271e-04]], dtype=float32)

In [None]:
# Check the matrix dimensions: one row per encoded abstract, one column per embedding dimension
document_embeddings.shape

(44949, 384)

# Step 2: Reduce Dimensionality

In [None]:
from umap import UMAP

# Configure UMAP to compress the embeddings before clustering.
umap_model_object = UMAP(
    n_components=5,    # target dimensionality of the reduced space
    min_dist=0.0,      # minimum distance between points in the low-dimensional space
    metric='cosine',   # measure distances between the input embeddings with cosine distance
    random_state=42,   # fixed seed so repeated runs give the same projection
)

# Fit UMAP on the document embeddings and return their 5-dimensional coordinates.
new_compressed_embeddings = umap_model_object.fit_transform(document_embeddings)


  warn(


# Step 3: Apply Clustering

In the previous step we used UMAP rather than PCA because it does a better job at capturing complex, nonlinear relationships and structures. We now cluster the reduced embeddings with HDBSCAN.

In [None]:
from hdbscan import HDBSCAN

# Build the HDBSCAN clusterer and fit it on the UMAP-reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,             # smallest group of points treated as a cluster
    metric='euclidean',              # distance measure used in the reduced space
    cluster_selection_method='eom',  # pick clusters with the Excess of Mass method
).fit(new_compressed_embeddings)

# One label per document, in the same order as the input rows.
clusters = hdbscan_model.labels_

# Number of distinct labels. Note that HDBSCAN labels outliers as -1, so when
# any points are flagged as noise that label is included in this count.
len(set(clusters))




150

# Let's Inspect the Clusters

In [None]:
# Import the numpy library for numerical operations
import numpy as np

# Set the target cluster to 0
cluster = 0

# Loop through the first three document indices in cluster 0
# np.where(clusters == cluster)[0] returns the indices of documents in the specified cluster
# [:3] selects the first three documents
for index in np.where(clusters == cluster)[0][:3]:
    # Print the first 100 characters of each abstract, followed by ellipsis and a newline
    print(abstracts[index][:100] + "... \n")


  This works aims to design a statistical machine translation from English text
to American Sign Lan... 

  Researches on signed languages still strongly dissociate lin- guistic issues
related on phonologic... 

  Modern computational linguistic software cannot produce important aspects of
sign language transla... 



In [None]:
import numpy as np

# Label of the cluster to look at
cluster = 2

# np.flatnonzero gives the positions of the documents whose label equals
# `cluster`; only the first three of them are previewed.
for index in np.flatnonzero(clusters == cluster)[:3]:
    # Show the first 100 characters of the abstract, then an ellipsis and a blank line
    print(f"{abstracts[index][:100]}... \n")


  A computer model of "a sense of humour" suggested previously
[arXiv:0711.2058,0711.2061], relating... 

  Computer model of a "sense of humour" suggested previously [arXiv:0711.2058,
0711.2061, 0711.2270]... 

  The New Yorker publishes a weekly captionless cartoon. More than 5,000
readers submit captions for... 

