HDBSCAN Clustering

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer  # using HuggingFace model
import umap.umap_ as umap
import hdbscan
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
# Sentence-tokenizer data. Recent NLTK releases load the "punkt_tab" resource
# while older ones load "punkt"; request both so sent_tokenize works on either.
# (nltk.download reports an unavailable resource instead of raising.)
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# Sentence-embedding model, loaded once and shared by every request.
model = SentenceTransformer("all-MiniLM-L6-v2")

def cluster_and_query(text, query=None):
    """Cluster the sentences of ``text`` with HDBSCAN and optionally rank them by ``query``.

    Pipeline: split into sentences, embed with the module-level ``model``,
    project to 2-D with UMAP, cluster the 2-D points with HDBSCAN, render a
    scatter plot, and (if a query is given) list the five sentences whose
    embeddings are most similar to the query embedding.

    Args:
        text: Document to analyse. ``None`` is treated as an empty string.
        query: Optional search phrase; blank or whitespace-only means "no query".

    Returns:
        A ``(summary, plot_path, query_result)`` tuple:
        ``summary`` lists the sentences under each cluster heading,
        ``plot_path`` is the path of a PNG scatter plot, and ``query_result``
        is a numbered list of the best matches ("" when no query was given).
        With fewer than two sentences the tuple is
        ``("Please enter at least two sentences.", None, None)``.
    """
    import tempfile  # local import: only this function needs it

    sentences = sent_tokenize(text or "")
    if len(sentences) < 2:
        return "Please enter at least two sentences.", None, None

    # Unit-length embeddings, so the dot product used below is a cosine similarity.
    embeddings = model.encode(sentences, normalize_embeddings=True)

    # Reduce to 2D for visualization (and clustering).
    # NOTE(review): behaviour of UMAP on very short inputs (2-3 sentences) is
    # not guarded here -- confirm it is acceptable for the expected inputs.
    reducer = umap.UMAP(n_neighbors=10, min_dist=0.3, metric='cosine', random_state=42)
    reduced = reducer.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean')
    labels = clusterer.fit_predict(reduced)

    # Draw through explicit figure/axes handles and write to a unique file:
    # the implicit pyplot "current figure" and a fixed output filename are
    # both shared state that concurrent requests would trample.
    # The temp file is intentionally left on disk for the UI to read.
    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='tab10', s=60)
        ax.set_title("Sentence Clusters")
        ax.set_xlabel("UMAP-1")
        ax.set_ylabel("UMAP-2")
        # Tag every point with its sentence index.
        for i, (x, y) in enumerate(reduced):
            ax.annotate(str(i), (x, y))
        fig.tight_layout()
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    # Group sentences by cluster label, keeping first-appearance order.
    grouped = {}
    for sent, lbl in zip(sentences, labels):
        grouped.setdefault(int(lbl), []).append(sent)

    def heading(cluster_id):
        # HDBSCAN assigns label -1 to points it treats as noise.
        return "Noise (unclustered)" if cluster_id == -1 else f"Cluster {cluster_id}"

    summary = "\n\n".join(
        f"{heading(c)}:\n" + "\n".join(members) for c, members in grouped.items()
    )

    # Query functionality
    query_result = ""
    query = (query or "").strip()
    if query:
        query_emb = model.encode([query], normalize_embeddings=True)
        sims = np.dot(embeddings, query_emb.T).flatten()  # cosine similarity
        top_idx = np.argsort(-sims)[:5]  # up to 5 best matches
        query_result = "\n".join(
            f"{rank}. {sentences[idx]}" for rank, idx in enumerate(top_idx, start=1)
        )

    return summary, plot_path, query_result

# Gradio UI for the HDBSCAN demo: a document box and an optional query box in,
# cluster listing + cluster plot + query matches out.
demo = gr.Interface(
    title="Contextual Sentence Clustering with Query",
    description="Enter an article or chat log. The model embeds each sentence, clusters by meaning, shows a 2D cluster map, and optionally finds sentences matching your query.",
    fn=cluster_and_query,
    inputs=[
        # Positional order matches cluster_and_query(text, query).
        gr.Textbox(placeholder="Paste article or chat transcript here...", lines=10),
        gr.Textbox(placeholder="Enter query to search relevant sentences (optional)", lines=1),
    ],
    outputs=[
        # Positional order matches the (summary, plot path, query result) return value.
        gr.Textbox(label="Clustered Sentences"),
        gr.Image(label="Cluster Graph"),
        gr.Textbox(label="Top Sentences for Query"),
    ],
)

# Start the web app only when executed as a script / notebook cell.
if __name__ == "__main__":
    demo.launch()


KMeans Clustering

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import umap.umap_ as umap
import numpy as np
import matplotlib.pyplot as plt
import nltk
import os

# --- Safe NLTK setup (top-level, before threads) ---
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the directory before any lookup so nltk.data.find can see it.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Recent NLTK releases load "punkt_tab" for sentence splitting while older
# ones load "punkt"; download whichever of the two is missing so
# sent_tokenize works on either.
for _resource in ("punkt_tab", "punkt"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, download_dir=nltk_data_dir)

# Import tokenizer AFTER setup
from nltk.tokenize import sent_tokenize

# --- Load embedding model ---
# Loaded once at import time and shared by every request.
model = SentenceTransformer("all-MiniLM-L6-v2")

def cluster_article(text, query=None, n_clusters=5):
    """Cluster the sentences of ``text`` with KMeans and optionally rank them by ``query``.

    Pipeline: split into sentences, embed with the module-level ``model``,
    project to 2-D with UMAP, run KMeans on the 2-D points, render a scatter
    plot, and (if a query is given) list the five sentences most similar to it.

    Args:
        text: Document to analyse. ``None`` is treated as an empty string.
        query: Optional search phrase; blank or whitespace-only means "no query".
        n_clusters: Requested number of clusters. Coerced to an int and
            clamped to the range ``1..len(sentences)``.

    Returns:
        A ``(summary, plot_path, similar_summary)`` tuple: the per-cluster
        sentence listing, the path of a PNG scatter plot, and the matching
        sentences one per line ("" when no query was given). With fewer than
        two sentences the tuple is
        ``("Please enter at least two sentences.", None, None)``.
    """
    import tempfile  # local import: only this function needs it

    sentences = sent_tokenize(text or "")
    if len(sentences) < 2:
        return "Please enter at least two sentences.", None, None

    # Unit-length sentence embeddings.
    embeddings = model.encode(sentences, normalize_embeddings=True)

    # Reduce dimensions for visualization (and clustering).
    # NOTE(review): behaviour of UMAP on very short inputs (2-3 sentences) is
    # not guarded here -- confirm it is acceptable for the expected inputs.
    reducer = umap.UMAP(n_neighbors=10, min_dist=0.3, metric='cosine', random_state=42)
    reduced = reducer.fit_transform(embeddings)

    # KMeans cannot form more clusters than there are points, nor fewer than one.
    cluster_count = max(1, min(int(n_clusters), len(sentences)))
    # n_init is pinned because scikit-learn's default differs between
    # versions; an explicit value keeps results comparable across installs.
    kmeans = KMeans(n_clusters=cluster_count, n_init=10, random_state=42)
    labels = kmeans.fit_predict(reduced)

    # Draw through explicit figure/axes handles and write to a unique file:
    # the implicit pyplot "current figure" and a fixed output filename are
    # both shared state that concurrent requests would trample.
    # The temp file is intentionally left on disk for the UI to read.
    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='tab10', s=60)
        ax.set_title("Sentence Clusters")
        ax.set_xlabel("UMAP-1")
        ax.set_ylabel("UMAP-2")
        # Tag every point with its sentence index.
        for i, (x, y) in enumerate(reduced):
            ax.annotate(str(i), (x, y))
        fig.tight_layout()
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    # Group sentences by cluster label, keeping first-appearance order.
    grouped = {}
    for sent, lbl in zip(sentences, labels):
        grouped.setdefault(int(lbl), []).append(sent)
    summary = "\n\n".join(
        f"Cluster {c}:\n" + "\n".join(members) for c, members in grouped.items()
    )

    # Find most similar sentences if a non-blank query is provided.
    similar_summary = ""
    query = (query or "").strip()
    if query:
        query_emb = model.encode([query], normalize_embeddings=True)
        sims = cosine_similarity(query_emb, embeddings)[0]
        top_idx = np.argsort(sims)[::-1][:5]  # up to 5 best matches
        similar_summary = "\n".join(sentences[i] for i in top_idx)

    return summary, plot_path, similar_summary

# --- Gradio Interface ---
# KMeans demo: a document box and an optional query box in, cluster listing +
# cluster plot + similar sentences out. n_clusters keeps its default because
# no widget is bound to it.
demo = gr.Interface(
    title="Contextual Sentence Clustering",
    description="Enter an article or chat log. The model embeds each sentence, clusters by meaning, shows a 2D cluster map, and optionally finds sentences most similar to your query.",
    fn=cluster_article,
    inputs=[
        # Positional order matches cluster_article(text, query).
        gr.Textbox(placeholder="Paste article or chat transcript here...", lines=10),
        gr.Textbox(placeholder="Optional: Enter search query here...", lines=1),
    ],
    outputs=[
        # Positional order matches the (summary, plot path, similar sentences) return value.
        gr.Textbox(label="Clustered Sentences"),
        gr.Image(label="Cluster Graph"),
        gr.Textbox(label="Most Similar Sentences (Optional)"),
    ],
)

# Start the web app only when executed as a script / notebook cell.
if __name__ == "__main__":
    demo.launch()

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import matplotlib.pyplot as plt
import numpy as np
from pyclustering.cluster.clarans import clarans
from sklearn.metrics.pairwise import euclidean_distances
import nltk
from nltk.tokenize import sent_tokenize
# Sentence-tokenizer data. Recent NLTK releases load the "punkt_tab" resource
# while older ones load "punkt"; request both so sent_tokenize works on either.
# (nltk.download reports an unavailable resource instead of raising.)
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# Sentence-embedding model, loaded once and shared by every request.
model = SentenceTransformer("all-MiniLM-L6-v2")

def cluster_with_clarans(text, query=None, k=3, numlocal=2, maxneighbor=10):
    """Cluster the sentences of ``text`` with CLARANS and optionally rank them by ``query``.

    Pipeline: split into sentences, embed with the module-level ``model``,
    project to 2-D with UMAP, run CLARANS (a k-medoids variant) on the 2-D
    points, render a scatter plot, and (if a query is given) list the five
    sentences whose embeddings are most similar to the query embedding.

    Args:
        text: Document to analyse. ``None`` is treated as an empty string.
        query: Optional search phrase; blank or whitespace-only means "no query".
        k: Requested number of clusters. Coerced to an int and clamped to
            ``1..len(sentences) - 1``.
        numlocal: Number of local minima CLARANS searches for (restarts).
        maxneighbor: Maximum number of neighbour swaps examined per restart.

    Returns:
        A ``(summary, plot_path, query_result)`` tuple: the per-cluster
        sentence listing, the path of a PNG scatter plot, and a numbered list
        of the best query matches ("" when no query was given). With fewer
        than two sentences the tuple is
        ``("Please enter at least two sentences.", None, None)``.
    """
    import tempfile  # local import: only this function needs it

    sentences = sent_tokenize(text or "")
    if len(sentences) < 2:
        return "Please enter at least two sentences.", None, None

    # Unit-length embeddings, so the dot product used below is a cosine similarity.
    embeddings = model.encode(sentences, normalize_embeddings=True)

    # Reduce to 2D for visualization (and clustering).
    # NOTE(review): behaviour of UMAP on very short inputs (2-3 sentences) is
    # not guarded here -- confirm it is acceptable for the expected inputs.
    reducer = umap.UMAP(n_neighbors=10, min_dist=0.3, metric='cosine', random_state=42)
    reduced = reducer.fit_transform(embeddings)

    # CLARANS improves a solution by swapping a medoid with a non-medoid
    # point, so at least one point has to stay a non-medoid: keep k strictly
    # below the number of sentences (and at least 1). int() guards against a
    # UI slider handing over a float.
    k = max(1, min(int(k), len(sentences) - 1))

    # pyclustering's clarans takes the data points themselves (a list of
    # coordinate lists) and computes distances internally. Feeding it a
    # precomputed distance matrix would make it cluster the matrix rows as
    # N-dimensional points, so pass the 2-D UMAP coordinates directly.
    points = reduced.tolist()
    clarans_instance = clarans(points, number_clusters=k, numlocal=numlocal, maxneighbor=maxneighbor)
    clarans_instance.process()
    clusters = clarans_instance.get_clusters()

    # Map sentences to cluster labels; -1 marks a point no cluster claimed.
    labels = np.full(len(sentences), -1)
    for cluster_id, member_indices in enumerate(clusters):
        labels[member_indices] = cluster_id

    # Draw through explicit figure/axes handles and write to a unique file:
    # the implicit pyplot "current figure" and a fixed output filename are
    # both shared state that concurrent requests would trample.
    # The temp file is intentionally left on disk for the UI to read.
    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='tab10', s=60)
        ax.set_title(f"Sentence Clusters (CLARANS k={k})")
        ax.set_xlabel("UMAP-1")
        ax.set_ylabel("UMAP-2")
        # Tag every point with its sentence index.
        for i, (x, y) in enumerate(reduced):
            ax.annotate(str(i), (x, y))
        fig.tight_layout()
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    # Group sentences by cluster label, keeping first-appearance order.
    grouped = {}
    for sent, lbl in zip(sentences, labels):
        grouped.setdefault(int(lbl), []).append(sent)

    summary = "\n\n".join(
        f"Cluster {c}:\n" + "\n".join(members) for c, members in grouped.items()
    )

    # Query functionality
    query_result = ""
    query = (query or "").strip()
    if query:
        query_emb = model.encode([query], normalize_embeddings=True)
        sims = np.dot(embeddings, query_emb.T).flatten()  # cosine similarity
        top_idx = np.argsort(-sims)[:5]  # up to 5 best matches
        query_result = "\n".join(
            f"{rank}. {sentences[idx]}" for rank, idx in enumerate(top_idx, start=1)
        )

    return summary, plot_path, query_result

# Gradio UI for the CLARANS demo: document, optional query and a cluster-count
# slider in; cluster listing + cluster plot + query matches out.
demo = gr.Interface(
    title="CLARANS Sentence Clustering with Query",
    description="Clusters sentences using CLARANS (k-medoids variant), shows 2D map, and returns top sentences matching query. Note: CLARANS is slower than other algorithms.",
    fn=cluster_with_clarans,
    inputs=[
        # Positional order matches cluster_with_clarans(text, query, k).
        gr.Textbox(placeholder="Paste article or chat transcript here...", lines=10),
        gr.Textbox(placeholder="Enter query to search relevant sentences (optional)", lines=1),
        gr.Slider(label="Number of Clusters (k)", minimum=2, maximum=10, step=1, value=3),
    ],
    outputs=[
        # Positional order matches the (summary, plot path, query result) return value.
        gr.Textbox(label="Clustered Sentences"),
        gr.Image(label="Cluster Graph"),
        gr.Textbox(label="Top Sentences for Query"),
    ],
)

# Start the web app only when executed as a script / notebook cell.
if __name__ == "__main__":
    demo.launch()

CLARANS Attempted Speed Improvements

Use reduced 2D data instead of full embeddings - Calculates distances on the 2D UMAP projection instead of the full 384-dimensional embeddings, dramatically reducing computation
Reduced numlocal from 5 to 2 - Fewer random restarts
Reduced maxneighbor from 20 to 10 - Less exhaustive neighborhood search
Use sklearn's euclidean_distances instead of pyclustering's calculate_distance_matrix - More optimized implementation

Combined Comparison: All Three Clustering Algorithms

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import hdbscan
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from pyclustering.cluster.clarans import clarans
from sklearn.metrics.pairwise import euclidean_distances
import nltk
from nltk.tokenize import sent_tokenize
import os
import time

# Ensure NLTK data is available. Recent NLTK releases load "punkt_tab" for
# sentence splitting while older ones load "punkt"; download whichever of the
# two is missing so sent_tokenize works on either.
for _resource in ("punkt_tab", "punkt"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# Sentence-embedding model, loaded once and shared by every request.
model = SentenceTransformer("all-MiniLM-L6-v2")

def _save_cluster_plot(reduced, labels, title):
    """Scatter-plot the 2-D points coloured by ``labels`` and return the PNG path."""
    import tempfile  # local import: only the plotting helper needs it

    # Explicit figure/axes handles and a unique output file: the implicit
    # pyplot "current figure" and fixed filenames are shared state that
    # concurrent requests would trample. The temp file is intentionally left
    # on disk for the UI to read.
    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='tab10', s=60)
        ax.set_title(title)
        ax.set_xlabel("UMAP-1")
        ax.set_ylabel("UMAP-2")
        # Tag every point with its sentence index.
        for i, (x, y) in enumerate(reduced):
            ax.annotate(str(i), (x, y))
        fig.tight_layout()
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
        fig.savefig(plot_path)
    finally:
        plt.close(fig)
    return plot_path


def _summarize_clusters(sentences, labels, elapsed):
    """Return the timing line followed by the sentences listed under each cluster."""
    grouped = {}
    for sent, lbl in zip(sentences, labels):
        grouped.setdefault(int(lbl), []).append(sent)

    def heading(cluster_id):
        # -1 is HDBSCAN's noise label and the "never assigned" marker used
        # for CLARANS below; neither is a real cluster.
        return "Noise (unclustered)" if cluster_id == -1 else f"Cluster {cluster_id}"

    listing = "\n\n".join(
        f"{heading(c)}:\n" + "\n".join(members) for c, members in grouped.items()
    )
    return f"⏱️ Time: {elapsed:.2f}s\n\n" + listing


def _tag_matches(match_lines, top_idx, labels, algorithm):
    """Append each matched sentence's cluster label for ``algorithm`` to its result line."""
    return "Top 5 sentences matching query:\n\n" + "\n\n".join(
        f"{line} [{algorithm} Cluster: {labels[idx]}]"
        for line, idx in zip(match_lines, top_idx)
    )


def run_all_clustering(text, query=None):
    """Run HDBSCAN, KMeans and CLARANS on the sentences of ``text`` for side-by-side comparison.

    The sentences are embedded once with the module-level ``model`` and
    projected to 2-D with UMAP once; all three algorithms then cluster the
    same 2-D points. If a query is given, the five most similar sentences are
    listed together with the cluster each algorithm placed them in.

    Args:
        text: Document to analyse. ``None`` is treated as an empty string.
        query: Optional search phrase; blank or whitespace-only means "no query".

    Returns:
        A 9-tuple of three consecutive ``(summary, plot_path, query_text)``
        triples, in the order HDBSCAN, KMeans, CLARANS. ``summary`` starts
        with the algorithm's wall-clock time. With fewer than two sentences
        every summary slot carries the message
        "Please enter at least two sentences.", every plot slot is ``None``
        and every query slot is "".
    """
    sentences = sent_tokenize(text or "")
    if len(sentences) < 2:
        # Keep the (text, image, text) x 3 layout so every value reaches a
        # component of the matching kind.
        notice = "Please enter at least two sentences."
        return (notice, None, "",
                notice, None, "",
                notice, None, "")

    # Unit-length embeddings, so the dot product used below is a cosine similarity.
    embeddings = model.encode(sentences, normalize_embeddings=True)

    # Reduce to 2D for visualization (and clustering).
    # NOTE(review): behaviour of UMAP on very short inputs (2-3 sentences) is
    # not guarded here -- confirm it is acceptable for the expected inputs.
    reducer = umap.UMAP(n_neighbors=10, min_dist=0.3, metric='cosine', random_state=42)
    reduced = reducer.fit_transform(embeddings)

    # --- HDBSCAN Clustering ---
    start_time = time.time()
    hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean')
    hdbscan_labels = hdbscan_clusterer.fit_predict(reduced)
    hdbscan_time = time.time() - start_time

    hdbscan_plot = _save_cluster_plot(reduced, hdbscan_labels, "HDBSCAN Clustering")
    hdbscan_summary = _summarize_clusters(sentences, hdbscan_labels, hdbscan_time)

    # --- KMeans Clustering ---
    start_time = time.time()
    k = min(5, len(sentences))
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_labels = kmeans.fit_predict(reduced)
    kmeans_time = time.time() - start_time

    kmeans_plot = _save_cluster_plot(reduced, kmeans_labels, f"KMeans Clustering (k={k})")
    kmeans_summary = _summarize_clusters(sentences, kmeans_labels, kmeans_time)

    # --- CLARANS Clustering ---
    start_time = time.time()
    # CLARANS improves a solution by swapping a medoid with a non-medoid
    # point, so at least one point has to stay a non-medoid: keep k strictly
    # below the number of sentences.
    k_clarans = max(1, min(3, len(sentences) - 1))
    # pyclustering's clarans takes the data points themselves (a list of
    # coordinate lists) and computes distances internally, so pass the 2-D
    # UMAP coordinates rather than a precomputed distance matrix.
    clarans_instance = clarans(reduced.tolist(), number_clusters=k_clarans, numlocal=2, maxneighbor=10)
    clarans_instance.process()
    clusters = clarans_instance.get_clusters()
    clarans_time = time.time() - start_time

    # -1 marks a point no cluster claimed.
    clarans_labels = np.full(len(sentences), -1)
    for cluster_id, member_indices in enumerate(clusters):
        clarans_labels[member_indices] = cluster_id

    clarans_plot = _save_cluster_plot(reduced, clarans_labels, f"CLARANS Clustering (k={k_clarans})")
    clarans_summary = _summarize_clusters(sentences, clarans_labels, clarans_time)

    # --- Query Results (same matches, tagged with each algorithm's cluster) ---
    hdbscan_query = kmeans_query = clarans_query = ""
    query = (query or "").strip()
    if query:
        query_emb = model.encode([query], normalize_embeddings=True)
        sims = np.dot(embeddings, query_emb.T).flatten()
        top_idx = np.argsort(-sims)[:5]  # up to 5 best matches

        match_lines = [
            f"{rank}. {sentences[idx]} (similarity: {sims[idx]:.3f})"
            for rank, idx in enumerate(top_idx, start=1)
        ]

        hdbscan_query = _tag_matches(match_lines, top_idx, hdbscan_labels, "HDBSCAN")
        kmeans_query = _tag_matches(match_lines, top_idx, kmeans_labels, "KMeans")
        clarans_query = _tag_matches(match_lines, top_idx, clarans_labels, "CLARANS")

    return (hdbscan_summary, hdbscan_plot, hdbscan_query,
            kmeans_summary, kmeans_plot, kmeans_query,
            clarans_summary, clarans_plot, clarans_query)

# Gradio UI comparing all three algorithms: one document box and one optional
# query box in; for each algorithm a (summary, plot, query results) triple out.
demo = gr.Interface(
    title="Clustering Comparison: HDBSCAN vs KMeans vs CLARANS",
    description="Compare three clustering algorithms side-by-side. Enter text to cluster sentences and optionally query for relevant sentences. Timing and cluster assignments are shown for each algorithm.",
    fn=run_all_clustering,
    inputs=[
        # Positional order matches run_all_clustering(text, query).
        gr.Textbox(label="Text Input", placeholder="Paste article or chat transcript here...", lines=10),
        gr.Textbox(label="Query (Optional)", placeholder="Enter query to search relevant sentences (optional)", lines=1),
    ],
    outputs=[
        # HDBSCAN triple
        gr.Textbox(label="HDBSCAN: Clustered Sentences (with timing)"),
        gr.Image(label="HDBSCAN: Cluster Graph"),
        gr.Textbox(label="HDBSCAN: Query Results"),
        # KMeans triple
        gr.Textbox(label="KMeans: Clustered Sentences (with timing)"),
        gr.Image(label="KMeans: Cluster Graph"),
        gr.Textbox(label="KMeans: Query Results"),
        # CLARANS triple
        gr.Textbox(label="CLARANS: Clustered Sentences (with timing)"),
        gr.Image(label="CLARANS: Cluster Graph"),
        gr.Textbox(label="CLARANS: Query Results"),
    ],
)

# Start the web app only when executed as a script / notebook cell.
if __name__ == "__main__":
    demo.launch()

Example article: 
https://en.wikipedia.org/wiki/Tokyo

Example Queries
Big Island
Tasty Food
Economic Growth