In [12]:
from code_data_science import data_table as dt
from code_data_science import palette
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import numpy as np
import umap
from infinity_emb import EngineArgs, AsyncEmbeddingEngine
import asyncio
import time

import logging

# Raise the 'infinity_emb' logger threshold to ERROR so that records below
# that level (DEBUG, INFO, WARNING) from this logger are filtered out.
logging.getLogger("infinity_emb").setLevel(logging.ERROR)

# Load the sample CSV and keep only the first row for each distinct "method"
# value (the original row index is retained, not reset).
df = dt.read_csv("../samples/find_methods.csv").drop_duplicates(subset="method")

# Build one module-level embedding engine so that get_embeddings() can reuse
# it instead of constructing a new engine per call.
engine_args = EngineArgs(
    model_name_or_path="michaelfeil/bge-small-en-v1.5",
    engine="optimum",  # backend selection; the model itself is run on CPU
    device="cpu",
    batch_size=2,
    compile=True,
)
engine = AsyncEmbeddingEngine.from_args(engine_args)

# Single entry point for turning sentences into embeddings
async def get_embeddings(sentences: list[str]) -> list:
    """Embed ``sentences`` with the module-level ``engine``.

    The engine is entered as an async context manager for the duration of
    the call, so it is started before embedding and stopped afterwards.

    Args:
        sentences: The texts to embed.

    Returns:
        The first element of the pair returned by ``engine.embed``; the
        second element of that pair is discarded.
    """
    async with engine:
        vectors, _unused = await engine.embed(sentences=sentences)
        return vectors

# Track time for embeddings
start_time_embeddings = time.time_ns()
loop = asyncio.get_running_loop()
embeddings = await loop.create_task((get_embeddings(df["method"].to_list())))
end_time_embeddings = time.time_ns()
embedding_time_seconds = (end_time_embeddings - start_time_embeddings) / 1e9

# Time the UMAP projection of the embeddings (construction + fit_transform)
start_time_umap = time.time_ns()
reducer = umap.UMAP(random_state=42, n_neighbors=100, min_dist=0.7)
indices = reducer.fit_transform(embeddings)
end_time_umap = time.time_ns()

# Nanoseconds -> seconds
umap_time_seconds = (end_time_umap - start_time_umap) / 1e9

# The first two projected columns are used as the scatter-plot coordinates
x, y = indices[:, 0], indices[:, 1]

# Find best k: cluster the embeddings with KMeans for every k in [2, kmax]
# and keep the labelling whose silhouette score is the highest.
best_silhouette_score = float("-inf")
kmax = 6
best_k = -1

# Track time for clustering
start_time_cluster = time.time_ns()
for k in range(2, kmax + 1):
    # fit_predict returns the per-row cluster labels (not the estimator).
    # `n_jobs` is intentionally not passed: KMeans deprecated it in
    # scikit-learn 0.23 and removed it in 1.0, where it raises TypeError.
    labels = kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)
    score = silhouette_score(embeddings, labels, metric="euclidean")
    if score > best_silhouette_score:
        # Record the new best score; without this every k would beat the
        # initial sentinel and the last k tried would always win.
        best_silhouette_score = score
        best_kmeans = kmeans
        best_k = k
end_time_cluster = time.time_ns()
cluster_time_seconds = (end_time_cluster - start_time_cluster) / 1e9

# Attach the plot coordinates and the winning cluster labels, order the rows
# by cluster, then expose the label column under its display name.
# Labels are cast to strings so the colors and legend are discrete.
df = (
    df.assign(x=x, y=y, kmeans=best_kmeans.astype("str"))
    .sort_values(by="kmeans")
    .rename(columns={"kmeans": "cluster id"})
)

# Palette: the "main" shade of five hues followed by the 700 shade of two
color_map = palette.__moderne_color_map
custom_palette = [
    *(color_map[hue]["main"] for hue in ("red", "yellow", "green", "blue", "indigo")),
    *(color_map[hue][700] for hue in ("red", "yellow")),
]

# One color per cluster that was actually selected
colors = custom_palette[:best_k]

# Title summarising how long each pipeline stage took
timing_title = (
    f"Embedding Time: {embedding_time_seconds:.2f} sec | "
    f"UMAP Time: {umap_time_seconds:.2f} sec | "
    f"Clustering Time: {cluster_time_seconds:.2f} sec"
)

# Scatter plot of the projected points, colored by cluster id, with the
# method text shown on hover
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="cluster id",
    color_discrete_sequence=colors,
    hover_name="method",
    log_x=False,
)
fig.update_layout(title=timing_title, showlegend=False)

fig.show()

The ONNX file model_quantized_optimized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.

n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



