In [None]:
from code_data_science import data_table as dt
from code_data_science import palette
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import numpy as np
import umap


def string_to_float_array(str, delimiter=";"):
    """Parse a delimiter-separated string of numbers into a list of floats.

    Args:
        str: Text such as ``"0.1;0.2;0.3"``. The name shadows the builtin
            but is kept because callers may pass it by keyword.
        delimiter: Separator between the numeric fields.

    Returns:
        A list with one ``float`` per field, in input order.

    Raises:
        ValueError: If any field is not a valid float literal (this
            includes the empty field left by a trailing delimiter).
    """
    fields = str.split(delimiter)
    return list(map(float, fields))


# get data
df = dt.read_csv("../samples/embeddings.csv")
# Remove rows whose serialized embedding string duplicates another row, so
# identical vectors are not clustered or plotted more than once.
df.drop_duplicates("embedding", inplace=True)

# Parse each ";"-separated embedding string with the shared helper rather
# than repeating its logic inline.
df["embedding"] = [
    string_to_float_array(embd_str) for embd_str in df["embedding"]
]
# NOTE(review): assumes every embedding has the same number of components so
# the rows stack into a 2-D array -- confirm against the CSV.
embds = np.array(list(df["embedding"]))


# Despite its name, `indices` holds the UMAP projection of the embeddings;
# its first two columns are used as plot coordinates below.
# NOTE(review): no random_state is passed to UMAP, so the layout presumably
# varies between runs -- confirm whether reproducibility matters here.
indices = umap.UMAP(n_neighbors=100, min_dist=0.7).fit_transform(embds)


x = indices[:, 0]
y = indices[:, 1]

# find best k
# Try every cluster count in [2, kmax] and keep the labelling with the
# highest silhouette score.
best_silhouette_score = -100  # sentinel below any real silhouette score
kmax = 6
best_k = -1
best_kmeans = None  # cluster labels of the best k found so far

for k in range(2, kmax + 1):
    # Clustering runs on the full embeddings, not on the 2-D projection
    # (np.column_stack([x, y]) would be the projected alternative).
    # `kmeans` holds the predicted labels, not the fitted estimator.
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit_predict(
        embds
    )
    labels = kmeans
    score = silhouette_score(embds, labels, metric="euclidean")
    if score > best_silhouette_score:
        # Record the new best score; without this every k beats the
        # sentinel and the last k always wins.
        best_silhouette_score = score
        best_kmeans = kmeans
        best_k = k

# Attach the projected coordinates to the table.
df["x"], df["y"] = x, y

# Store the winning cluster labels as strings so plotly treats them as
# discrete categories (separate legend entries and colors) rather than as a
# continuous scale.
df["kmeans"] = best_kmeans.astype("str")
# Order rows by cluster label; presumably this keeps the legend in ascending
# cluster order.
df.sort_values(by="kmeans", inplace=True)

# One color per cluster.
colors = palette.generate_colors(best_k)

fig = px.scatter(
    df,
    x="x",
    y="y",
    color="kmeans",
    color_discrete_sequence=colors,
    hover_name="name",
    log_x=False,
)

fig.show()