# Clustering trials

## Setup

### Imports

In [None]:
import datetime, os, json, random, sys
from pathlib import Path
from time import time
from typing import Any, Dict, List, Literal, Optional, Union, Tuple
from tqdm import tqdm

import numpy as np
import numpy.typing as npt
import pandas as pd
import plotly.express as px
import torch
import umap
import yaml
from langchain.embeddings import HuggingFaceEmbeddings
from loguru import logger
from rich import print
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)
from transformers import AutoTokenizer, BertModel, BertTokenizer, BertTokenizerFast

In [None]:
# make the repository root importable, then load the project helpers
sys.path.append("../")
from src.ai_news_digest.utils import check_gpu_availability, create_run_folder
from src.ai_news_digest.steps.benchmark import entropy

### Set random seeds

In [None]:
# Seed NumPy, the stdlib `random` module and PyTorch with one shared value
# so that repeated runs of the notebook start from the same RNG state.
SEED = 123
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

### Check GPU availability

In [None]:
# Resolve the compute device once through the project helper; the value is
# later handed to the embedding model via its `model_kwargs`.
# NOTE(review): presumably a torch-style device identifier ("cuda"/"cpu") — confirm in utils.
device = check_gpu_availability()

### Config

In [None]:
# NOTE(review): MODEL_CONFIG_PATH is not referenced in the visible part of this
# notebook — confirm whether it is still needed.
MODEL_CONFIG_PATH = "../conf/base/cluster_bench_models.yml"
# JSON file opened in the "Load data" section; its "results" entry is read.
DATA_PATH = "../data/03_primary/arxiv_dict_2023-11-06_00-22-42.json"

# Keyword arguments for the embedding model (device resolved above).
MODEL_KWARGS = {"device": device}
# Keyword arguments forwarded to HuggingFaceEmbeddings as `encode_kwargs`.
ENCODE_KWARGS = {
    "normalize_embeddings": True,
    "batch_size": 16,
    "output_value": "sentence_embedding",
    "convert_to_numpy": True,
    "show_progress_bar": True,
}

In [None]:
# NOTE(review): PATH_INFO_DICT is not read anywhere in the visible part of this
# notebook; it names the same file as DATA_PATH but without the leading "../" —
# confirm whether it can be dropped or should reuse DATA_PATH.
PATH_INFO_DICT = "data/03_primary/arxiv_dict_2023-11-06_00-22-42.json"
# Model identifier passed to HuggingFaceEmbeddings as `model_name`.
MODEL_ID = "BAAI/bge-small-en"

## Load data

In [None]:
# load data
# JSON is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    info_dict = json.load(f)["results"]
logger.info(f"Successfully loaded prepared data from : {DATA_PATH}")

# retrieve abstracts, titles, dates & paper IDs:
# after the transpose, each top-level key of "results" becomes one row
df_data = pd.DataFrame(info_dict).transpose()

# display
df_data.head()

## Embed Data

In [None]:
# build the embedding model from the settings declared in the Config section
# (MODEL_KWARGS holds the same {"device": device} mapping)
hf = HuggingFaceEmbeddings(
    model_name=MODEL_ID,
    model_kwargs=MODEL_KWARGS,
    encode_kwargs=ENCODE_KWARGS,
)

In [None]:
# embed every abstract and stack the resulting vectors into a 2-D array
embeddings = np.asarray(hf.embed_documents(df_data["abstract"]))

# one "embed_<i>" column per embedding dimension, rows keyed like df_data
embed_columns = [f"embed_{i}" for i in range(embeddings.shape[1])]
df_embed = pd.DataFrame(embeddings, index=df_data.index, columns=embed_columns)

# display
df_embed.head()

## Dimensionality reduction

In [None]:
# UMAP hyper-parameters for the 2-D projection
umap_kwargs = dict(
    n_neighbors=5,
    min_dist=0.001,
    n_components=2,
    metric="cosine",
)

# instantiate the UMAP projector with a fixed seed
reducer = umap.UMAP(random_state=123, **umap_kwargs)

# project the embeddings
umap_proj = reducer.fit_transform(df_embed)

# min-max scale every UMAP axis to the [0, 1] range
axis_min = umap_proj.min(axis=0)
axis_span = umap_proj.max(axis=0) - axis_min
umap_proj = (umap_proj - axis_min) / axis_span

# wrap the coordinates in a dataframe, then append the metadata columns
# (the former index of df_data is exposed as an "ID" column)
umap_columns = [f"umap_{i}" for i in range(umap_proj.shape[1])]
df_umap = pd.DataFrame(data=umap_proj, columns=umap_columns)
df_umap = pd.concat((df_umap, df_data.reset_index(names=["ID"])), axis=1)

# display
df_umap.head()

## Clustering

In [None]:
# cluster on the projected coordinates only (columns whose name contains "umap")
umap_cols = [name for name in df_umap.columns if "umap" in name]
X_cluster = df_umap[umap_cols]

# HDBSCAN settings for this first trial
hdbscan_params = {
    "min_cluster_size": 10,
    "min_samples": 3,
    "max_cluster_size": None,
    "cluster_selection_epsilon": 0.05,
}
clustering = HDBSCAN(**hdbscan_params)
clustering.fit(X_cluster)

In [None]:
# viz: colour points by cluster label and give noise points (label -1) their own marker
labels = clustering.labels_
df_umap["cluster"] = list(map(str, labels))
df_umap["noise"] = [int(label == -1) for label in labels]

# legend entries follow the numerically sorted labels
cluster_order = list(np.sort(pd.unique(labels)).astype(str))

fig = px.scatter(
    data_frame=df_umap,
    x="umap_0",
    y="umap_1",
    color="cluster",
    symbol="noise",
    hover_data=["title", "ID"],
    category_orders={"cluster": cluster_order},
)
fig.show()

In [None]:
# Entropy ↓ — computed on the projected coordinates as a plain array
entropy(X_cluster.to_numpy())

In [None]:
# Silhouette ↑ — scored on the projected coordinates against the cluster labels
# NOTE(review): points labelled "-1" (noise) are scored as if they formed a
# regular cluster — confirm this is intended.
silhouette_score(X=X_cluster, labels=df_umap["cluster"])

## Refactor projection & clustering pipeline

In [None]:
def clustering_pipeline(
    df_embed: pd.DataFrame,
    umap_kwargs: dict,
    clustering_kwargs: Optional[dict]=None,
    random_state: int=123,
    df_data: Optional[pd.DataFrame]=None,
) -> Tuple[pd.DataFrame, Any]:
    """Project embeddings with UMAP and optionally cluster the projection with HDBSCAN.

    Args:
        df_embed: Embedding matrix, one row per sample.
        umap_kwargs: Keyword arguments forwarded to ``umap.UMAP``.
        clustering_kwargs: Keyword arguments forwarded to ``HDBSCAN``. When
            ``None`` the clustering step is skipped.
        random_state: Seed passed to ``umap.UMAP``.
        df_data: Optional metadata with one row per sample, in the same order
            as ``df_embed``. Its index is exposed as an ``"ID"`` column.

    Returns:
        A tuple ``(df_umap, clustering)``. ``df_umap`` holds the min-max scaled
        ``umap_<i>`` coordinates, the metadata columns when ``df_data`` is
        given, and ``"cluster"`` (label as string) / ``"noise"`` (1 when the
        label is -1, else 0) columns when clustering ran. ``clustering`` is the
        fitted ``HDBSCAN`` estimator, or ``None`` when clustering was skipped.

    Raises:
        ValueError: If ``df_data`` does not have the same number of rows as
            the projection, since the two are joined positionally.
    """

    #--- dimensionality reduction ---

    # instantiate umap projector
    reducer = umap.UMAP(random_state=random_state, **umap_kwargs)

    # project data
    umap_proj = reducer.fit_transform(df_embed)

    # normalize umap coords to [0, 1]; an axis with zero extent would divide by
    # zero and turn the whole column into NaN, so its span is forced to 1
    # (all values on that axis then map to 0)
    axis_min = umap_proj.min(axis=0)
    axis_span = umap_proj.max(axis=0) - axis_min
    axis_span[axis_span == 0] = 1
    umap_proj = (umap_proj - axis_min) / axis_span

    # store the coordinates in a dataframe
    umap_cols = [f"umap_{i}" for i in range(umap_proj.shape[1])]
    df_coords = pd.DataFrame(columns=umap_cols, data=umap_proj)

    # add metadata if available
    if df_data is not None:
        # the join is positional: a length mismatch would silently pad with NaN
        if len(df_data) != len(df_coords):
            raise ValueError(
                f"df_data has {len(df_data)} rows but the projection has {len(df_coords)}"
            )
        df_umap = pd.concat((df_coords, df_data.reset_index(names=["ID"])), axis=1)
    else:
        df_umap = df_coords

    #--- clustering ---
    if clustering_kwargs is not None:
        # cluster on the coordinate frame itself rather than on columns picked
        # by name, so a metadata column containing "umap" cannot leak in
        clustering = HDBSCAN(**clustering_kwargs)
        clustering.fit(df_coords)
        df_umap["cluster"] = [str(elt) for elt in clustering.labels_]
        df_umap["noise"] = [int(elt==-1) for elt in clustering.labels_]

    else:
        clustering = None

    #--- result ---
    return df_umap, clustering

In [None]:
# HDBSCAN parameters for this run
clustering_kwargs = dict(
    min_cluster_size=10,
    min_samples=3,
    max_cluster_size=None,
    cluster_selection_epsilon=0.05,
)

# UMAP parameters for this run
umap_kwargs = dict(
    n_neighbors=5,
    min_dist=0.1,
    n_components=2,
    metric="cosine",
)

# projection + clustering in a single call
df_umap, clustering = clustering_pipeline(
    df_embed=df_embed,
    umap_kwargs=umap_kwargs,
    clustering_kwargs=clustering_kwargs,
    random_state=123,
    df_data=df_data,
)

# keep only the projected coordinates for scoring
umap_cols = [c for c in df_umap.columns if "umap_" in c]
X_cluster = df_umap[umap_cols]

# Silhouette ↑
silhouette = silhouette_score(X_cluster, df_umap["cluster"])
print(f" Silhouette ↑ = {silhouette}")

# Entropy ↓
projection_entropy = entropy(X_cluster.values)
print(f" Entropy ↓ = {projection_entropy}")

# viz: colour by cluster label, separate marker for noise points (label -1)
labels = clustering.labels_
df_umap["cluster"] = list(map(str, labels))
df_umap["noise"] = [int(label == -1) for label in labels]
cluster_order = list(np.sort(pd.unique(labels)).astype(str))

fig = px.scatter(
    data_frame=df_umap,
    x="umap_0",
    y="umap_1",
    color="cluster",
    symbol="noise",
    hover_data=["title", "ID"],
    category_orders={"cluster": cluster_order},
)
fig.show()

## Grid search over projection and clustering parameters

In [None]:
# tqdm's variant of itertools.product (shows a progress bar while iterating)
from tqdm.contrib.itertools import product

# HDBSCAN search space
clustering_grid = {
    "min_cluster_size": [3, 5, 10, 15],
    "min_samples": [3, 5, 10],
    "max_cluster_size": [None, 15, 25, 40],
    # epsilon candidates 0.0, 0.001 and 0.01 are currently disabled
    "cluster_selection_epsilon": [0.05, 0.1, 0.5],
}

# UMAP search space
umap_grid = {
    "n_neighbors": [2, 4, 8, 10],
    "min_dist": [0.001, 0.01, 0.1, 0.5],
    "n_components": [2],
    "metric": ["cosine"],
    "init": ["random"],
}


In [None]:
# Grid search over the UMAP parameters: keep the combination whose projection
# has the lowest entropy.
best_entropy = np.inf
best_umap_kwargs = None

# Rows are collected as dicts and turned into a dataframe once, after the loop.
# Building each row through np.array would coerce the mixed tuple (ints, floats,
# strings) to text, so the entropy column would end up holding strings.
umap_search_columns = list(umap_grid.keys()) + ["Entropy ↓"]
umap_search_rows = []

for umap_vals in product(*umap_grid.values()):

    # retrieve current umap kwargs
    umap_kwargs = dict(zip(umap_grid.keys(), umap_vals))

    # best effort: a failing combination is reported and skipped
    try:
        df_umap, clustering = clustering_pipeline(
            df_embed,
            umap_kwargs,
            clustering_kwargs=None,
            random_state=123,
            df_data=df_data,
        )
        X_cluster = df_umap[[c for c in df_umap.columns if "umap_" in c]]
        curr_entropy = entropy(X_cluster.values)
        umap_search_rows.append({**umap_kwargs, "Entropy ↓": curr_entropy})

        if curr_entropy < best_entropy:
            best_entropy = curr_entropy
            best_umap_kwargs = umap_kwargs

    except Exception as e:
        print(f" {umap_kwargs} failed with following error: {e}; will proceed to next iteration")

# one row per successful combination, columns in grid order
df_umap_search = pd.DataFrame(umap_search_rows, columns=umap_search_columns)

print(f"Best entropy: {best_entropy}")
print(f"Best umap_kwargs: {best_umap_kwargs}")

df_umap_search

In [None]:
# Grid search over the HDBSCAN parameters on top of the best UMAP projection:
# keep the combination with the highest silhouette score.
# NOTE(review): every iteration calls clustering_pipeline with the same
# best_umap_kwargs, so the UMAP projection is refitted each time — consider
# projecting once if this cell becomes too slow.
best_silhouette = -np.inf
best_clustering_kwargs = None

# Rows are collected as dicts and turned into a dataframe once, after the loop,
# so every column keeps its own dtype (np.array on a tuple containing None
# yields object-typed values, including the score).
clustering_search_columns = list(clustering_grid.keys()) + ["Silhouette ↑"]
clustering_search_rows = []

for clustering_vals in product(*clustering_grid.values()):

    # retrieve current clustering kwargs
    clustering_kwargs = dict(zip(clustering_grid.keys(), clustering_vals))

    # best effort: a failing combination is reported and skipped
    try:
        df_umap, clustering = clustering_pipeline(
            df_embed,
            best_umap_kwargs,
            clustering_kwargs=clustering_kwargs,
            random_state=123,
            df_data=df_data,
        )
        X_cluster = df_umap[[c for c in df_umap.columns if "umap_" in c]]
        curr_silhouette = silhouette_score(X_cluster.values, df_umap["cluster"])
        clustering_search_rows.append({**clustering_kwargs, "Silhouette ↑": curr_silhouette})

        if curr_silhouette > best_silhouette:
            best_silhouette = curr_silhouette
            best_clustering_kwargs = clustering_kwargs

    except Exception as e:
        # report the clustering parameters that failed (not the UMAP ones)
        print(f" {clustering_kwargs} failed with following error: {e}; will proceed to next iteration")

# one row per successful combination, columns in grid order
df_clustering_search = pd.DataFrame(clustering_search_rows, columns=clustering_search_columns)

print(f"Best silhouette: {best_silhouette}")
print(f"Best clustering_kwargs: {best_clustering_kwargs}")

df_clustering_search

In [None]:
# silhouette scores of all tested combinations, sorted in ascending order
sorted_silhouettes = df_clustering_search["Silhouette ↑"].sort_values().reset_index(drop=True)
sorted_silhouettes.plot()

In [None]:
# rerun the pipeline with the best parameters found by the two grid searches
df_umap, clustering = clustering_pipeline(
    df_embed=df_embed,
    umap_kwargs=best_umap_kwargs,
    clustering_kwargs=best_clustering_kwargs,
    random_state=123,
    df_data=df_data,
)

# keep only the projected coordinates for scoring
umap_cols = [c for c in df_umap.columns if "umap_" in c]
X_cluster = df_umap[umap_cols]

# Silhouette ↑
silhouette = silhouette_score(X_cluster, df_umap["cluster"])
print(f" Silhouette ↑ = {silhouette}")

# Entropy ↓
projection_entropy = entropy(X_cluster.values)
print(f" Entropy ↓ = {projection_entropy}")

# viz: colour by cluster label, separate marker for noise points (label -1)
labels = clustering.labels_
df_umap["cluster"] = list(map(str, labels))
df_umap["noise"] = [int(label == -1) for label in labels]
cluster_order = list(np.sort(pd.unique(labels)).astype(str))

fig = px.scatter(
    data_frame=df_umap,
    x="umap_0",
    y="umap_1",
    color="cluster",
    symbol="noise",
    hover_data=["title", "ID"],
    category_orders={"cluster": cluster_order},
)
fig.show()