In [None]:
!pip install hdbscan umap-learn scikit-learn numpy sentence_transformers

In [None]:
import os
import json
import numpy as np

# For embedding: Sentence Transformers (pip install sentence-transformers)
from sentence_transformers import SentenceTransformer

def embed_description(text, model):
    """
    Encode a description with the supplied SentenceTransformer model.

    A falsy ``text`` (empty string, None) short-circuits to an empty
    numpy array without calling the model; otherwise the result of
    ``model.encode(text)`` is returned unchanged.
    """
    return model.encode(text) if text else np.array([])

def embed_schema_folder(folder_path, model_name='all-MiniLM-L6-v2'):
    """
    Embed the descriptions found in every .json schema file of a folder.

    For each file in ``folder_path`` whose name ends with '.json':
      - the top-level 'description' (if non-empty) is embedded and saved as
        ``<file_name>_main_desc_embedding.npy``;
      - the 'description' of each entry in 'properties' (if non-empty) is
        embedded and saved as ``<file_name>_<prop_name>_embedding.npy``.
    All .npy files are written into ``folder_path`` itself.

    Files that cannot be decoded/parsed, or whose top-level JSON value is not
    an object, are reported and skipped. A 'properties' value or a property
    entry that is not an object is ignored instead of crashing the run.

    Args:
        folder_path: Folder that holds the .json files and receives the output.
        model_name: Name passed to ``SentenceTransformer``.
    """
    # Load the SentenceTransformer model once and reuse it for every file.
    model = SentenceTransformer(model_name)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                schema = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Invalid JSON or bytes that are not valid UTF-8.
                print(f"Could not parse JSON from {file_name}. Skipping...")
                continue

        # Valid JSON may be a list/string/number; only objects have .get().
        if not isinstance(schema, dict):
            print(f"Top-level JSON in {file_name} is not an object. Skipping...")
            continue

        # 1) Embed the main schema 'description' (if it exists).
        main_embedding = embed_description(schema.get('description', ''), model)
        if main_embedding.size > 0:
            np.save(
                os.path.join(folder_path, f"{file_name}_main_desc_embedding.npy"),
                main_embedding
            )

        # 2) Embed each property's description (if properties exist).
        props = schema.get('properties', {})
        if not isinstance(props, dict):
            props = {}
        for prop_name, prop_info in props.items():
            # Property entries such as boolean schemas carry no description.
            if not isinstance(prop_info, dict):
                continue
            prop_embedding = embed_description(prop_info.get('description', ''), model)
            if prop_embedding.size > 0:
                # A path separator in the property name would point the output
                # path at a (usually missing) sub-directory; flatten it.
                safe_name = prop_name.replace(os.sep, '_')
                if os.altsep:
                    safe_name = safe_name.replace(os.altsep, '_')
                np.save(
                    os.path.join(folder_path, f"{file_name}_{safe_name}_embedding.npy"),
                    prop_embedding
                )

        print(f"Processed schema file: {file_name}")

# Example usage:
# folder_path = "/path/to/your/schema/folder"
# embed_schema_folder(folder_path)


In [None]:
# IMPORTS & LOADING
import os
import numpy as np

# scikit-learn
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

# HDBSCAN & UMAP
import hdbscan
import umap

def load_embeddings_from_folder(folder_path):
    """
    Load every .npy file in ``folder_path`` into a single 2-D numpy array.

    Files are read in sorted file-name order so the row order of the result
    is reproducible (``os.listdir`` returns entries in arbitrary order).
    The arrays are combined with ``np.vstack``: a file of shape (D,)
    contributes one row, a file of shape (N, D) contributes N rows, so all
    files must share the same embedding dimension D.

    Args:
        folder_path: Folder to scan (not recursive).

    Returns:
        Array with one row per embedding.

    Raises:
        ValueError: If the folder contains no .npy files.
    """
    npy_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.npy')
    )
    if not npy_names:
        # np.vstack([]) would fail with an opaque "need at least one array".
        raise ValueError(f"No .npy embedding files found in {folder_path!r}")

    return np.vstack(
        [np.load(os.path.join(folder_path, name)) for name in npy_names]
    )

# Set your folder path here (the folder that holds the .npy embedding files)
folder_path = ""
if not folder_path:
    # Fail with an actionable message instead of the FileNotFoundError that
    # os.listdir("") would raise further down.
    raise ValueError("folder_path is empty: set it to the folder containing the .npy embeddings.")
X = load_embeddings_from_folder(folder_path)
print("Embeddings shape:", X.shape)


In [None]:
#  K-MEANS
def kmeans_clustering(X, n_clusters=5):
    """
    Cluster X with K-Means.

    The estimator is built with a fixed configuration (10 initialisations,
    at most 300 iterations, random_state 42) and fitted on X.

    Returns a tuple (labels, fitted KMeans estimator).
    """
    settings = dict(
        n_clusters=n_clusters,
        n_init=10,
        max_iter=300,
        random_state=42,
    )
    estimator = KMeans(**settings)
    assignments = estimator.fit_predict(X)
    return assignments, estimator

# Cluster the loaded embeddings X into five K-Means clusters and print the
# distinct label values that were assigned.
labels_kmeans, model_kmeans = kmeans_clustering(X, n_clusters=5)
print("K-Means labels:", np.unique(labels_kmeans))


In [None]:
# AGGLOMERATIVE (HIERARCHICAL) CLUSTERING
def hierarchical_clustering(X, n_clusters=5, linkage='ward'):
    """
    Performs Agglomerative (Hierarchical) Clustering with euclidean distance.

    Args:
        X: Data to cluster, one sample per row.
        n_clusters: Number of clusters to produce.
        linkage: Linkage criterion passed to AgglomerativeClustering.

    Returns:
        Tuple (labels, fitted AgglomerativeClustering estimator).
    """
    # The distance keyword is intentionally omitted: euclidean is the
    # estimator's default, and the keyword was renamed from `affinity` to
    # `metric` (affinity deprecated in scikit-learn 1.2, removed in 1.4), so
    # passing either name breaks on some installed version.
    agg = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage=linkage
    )
    labels = agg.fit_predict(X)
    return labels, agg

# Cluster X into five groups with ward linkage and print the distinct label
# values that were assigned.
labels_hier, model_hier = hierarchical_clustering(X, n_clusters=5, linkage='ward')
print("Hierarchical labels:", np.unique(labels_hier))


In [None]:
# DBSCAN
def dbscan_clustering(X, eps=0.5, min_samples=5):
    """
    Cluster X with DBSCAN using euclidean distance.

    ``eps`` and ``min_samples`` are forwarded to the DBSCAN estimator.

    Returns a tuple (labels, fitted DBSCAN estimator).
    """
    clusterer = DBSCAN(metric='euclidean', min_samples=min_samples, eps=eps)
    assignments = clusterer.fit_predict(X)
    return assignments, clusterer

# Run DBSCAN on X with eps=0.5 / min_samples=5 and print the distinct label
# values. NOTE(review): eps=0.5 is not tuned to the embedding scale here —
# confirm it yields more than noise for your data.
labels_dbscan, model_dbscan = dbscan_clustering(X, eps=0.5, min_samples=5)
print("DBSCAN labels:", np.unique(labels_dbscan))


In [None]:
# HDBSCAN
def hdbscan_clustering(X, min_cluster_size=5, min_samples=5):
    """
    Cluster X with HDBSCAN (euclidean metric, 'eom' cluster selection).

    ``min_cluster_size`` and ``min_samples`` are forwarded to the estimator.

    Returns a tuple (labels, fitted HDBSCAN estimator).
    """
    options = {
        'min_cluster_size': min_cluster_size,
        'min_samples': min_samples,
        'metric': 'euclidean',
        'cluster_selection_method': 'eom',
    }
    clusterer = hdbscan.HDBSCAN(**options)
    assignments = clusterer.fit_predict(X)
    return assignments, clusterer

# Run HDBSCAN directly on the full-dimensional embeddings X and print the
# distinct label values that were assigned.
labels_hdbscan, model_hdbscan = hdbscan_clustering(X, min_cluster_size=5, min_samples=5)
print("HDBSCAN labels:", np.unique(labels_hdbscan))


In [None]:
#  UMAP + HDBSCAN
def umap_hdbscan_clustering(X, n_neighbors=15, n_components=5, min_cluster_size=5, min_samples=5):
    """
    Two-stage clustering: UMAP dimensionality reduction followed by HDBSCAN.

    X is first projected to ``n_components`` dimensions by a UMAP reducer
    (random_state fixed to 42); HDBSCAN (euclidean metric, 'eom' selection)
    is then fitted on the projected points.

    Returns a tuple (labels, (umap_reducer, hdbscan_model)).
    """
    # Stage 1: reduce the embeddings.
    projector = umap.UMAP(
        random_state=42,
        n_components=n_components,
        n_neighbors=n_neighbors,
    )
    projected = projector.fit_transform(X)

    # Stage 2: density-based clustering in the reduced space.
    clusterer = hdbscan.HDBSCAN(
        cluster_selection_method='eom',
        metric='euclidean',
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
    )
    assignments = clusterer.fit_predict(projected)

    fitted_models = (projector, clusterer)
    return assignments, fitted_models

# Run the UMAP -> HDBSCAN pipeline with its default parameters, keeping both
# fitted models, and print the distinct label values that were assigned.
labels_umap_hdbscan, (umap_model, hdb_model) = umap_hdbscan_clustering(X)
print("UMAP+HDBSCAN labels:", np.unique(labels_umap_hdbscan))


In [None]:
# EVALUATION (SILHOUETTE SCORE)
def evaluate_clustering(X, labels, cluster_name="Cluster"):
    """
    Evaluates a clustering with the Silhouette Score and prints the result.

    Points labelled -1 (the noise label used by DBSCAN/HDBSCAN) are removed
    from both X and labels before scoring, so noise is not treated as a
    cluster of its own. The score is only computed when the remaining points
    form at least 2 clusters and there are more points than clusters, which
    is the range silhouette_score accepts.

    Args:
        X: Data that was clustered, one sample per row.
        labels: Cluster label for each row of X.
        cluster_name: Name used in the printed message.

    Returns:
        The silhouette score as a float, or None when it cannot be computed.
    """
    labels = np.asarray(labels)
    # Boolean mask of the points that belong to a real cluster.
    is_clustered = labels != -1
    n_clusters = np.unique(labels[is_clustered]).size
    n_points = int(is_clustered.sum())

    # silhouette_score requires 2 <= n_clusters <= n_points - 1.
    if n_clusters < 2 or n_clusters >= n_points:
        print(f"{cluster_name} has too few clusters or mostly noise. Cannot compute Silhouette Score.")
        return None

    score = silhouette_score(np.asarray(X)[is_clustered], labels[is_clustered])
    print(f"{cluster_name} Silhouette Score: {score:.4f}")
    return score

# Evaluate each clustering result
evaluate_clustering(X, labels_kmeans, "K-Means")
evaluate_clustering(X, labels_hier, "Hierarchical")
evaluate_clustering(X, labels_dbscan, "DBSCAN")
evaluate_clustering(X, labels_hdbscan, "HDBSCAN")

# UMAP+HDBSCAN was clustered in the reduced space, so score it there. Use the
# embedding stored on the fitted reducer (the exact points HDBSCAN saw)
# instead of re-projecting X with transform().
X_reduced = umap_model.embedding_
evaluate_clustering(X_reduced, labels_umap_hdbscan, "UMAP+HDBSCAN")
