# KMeans Clustering with LLM-Augmented Labels

Herein, we leverage `scikit-learn`'s `KMeans` clustering algorithm to cluster embeddings
generated from a neural network into `n` clusters. We then use the cluster assignments as
bundles of similar text. Each of these bundles is passed to an LLM to generate a
descriptive label for the cluster.

In [1]:
import dotenv
import os
import sys
from sklearn.cluster import KMeans

# Put the parent directory on the import path; later cells import from the
# `fns` package (presumably located there -- TODO confirm).
sys.path.append("../")

# Load variables from a .env file into the process environment
# (presumably the API credentials used by the `fns` helpers -- verify).
dotenv.load_dotenv()

# Toy corpus to cluster: short sentences on a handful of loosely overlapping
# topics (apples/oranges, eating, people with a baby, horse riding, animals).
corpus = [
    "I'd like an apple",
    "An apple a day keeps the doctor away",
    "Never compare an apple to an orange",
    "I prefer scikit-learn to orange",
    "The scikit-learn docs are orange",
    "A man is eating food",
    "Eating a piece of bread",
    "Horse is eating hay",
    "Woman has some Biryani",
    "The girl is carrying a baby",
    "The baby is carried by the woman",
    "A man is riding a horse",
    "A man is riding a white horse on an enclosed ground",
    "A monkey is playing with a ball",
    "Someone in a gorilla costume is playing a drum",
    "A cheetah is running behind its prey",
    "A cheetah chases its prey across a field",
    "The cheetah is chasing a man who is riding the horse",
    "The man and woman witht the baby are watching the cheetah in the zoo",
]

In [None]:
from fns.openai_fns import get_embeddings

# Fetch embeddings for the whole corpus in one call. The next cell reads the
# result as embeddings["data"][i]["embedding"]; the clustering cells assume
# entry i corresponds to corpus[i] -- TODO confirm get_embeddings keeps order.
embeddings = get_embeddings(corpus)

In [4]:
import numpy as np

# Collect the vector stored under "embedding" in each entry of the response
# into a single 2-D array (one row per entry).
raw_vectors = np.array([item["embedding"] for item in embeddings["data"]])

# Scale every row to unit L2 norm; keepdims=True keeps the norms as a column
# so the division broadcasts across each row.
row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
corpus_embeddings = raw_vectors / row_norms

corpus_embeddings

array([[-0.0080403 , -0.00057823,  0.00185002, ..., -0.0178255 ,
         0.00857381, -0.00607573],
       [ 0.01228517,  0.00509541,  0.02712444, ..., -0.00713868,
        -0.0004737 , -0.02057319],
       [ 0.00797196, -0.00975728,  0.02111392, ..., -0.00997425,
        -0.00153736, -0.03270611],
       ...,
       [-0.02104704,  0.01460099,  0.00169947, ..., -0.0168879 ,
         0.00548794, -0.01135912],
       [-0.02075954,  0.01224726,  0.01262572, ..., -0.00979657,
         0.00271437, -0.00654552],
       [-0.00708078,  0.0107069 ,  0.00436278, ..., -0.00542648,
        -0.01525384, -0.02822151]])

In [5]:
# Partition the normalized embeddings into three clusters. KMeans starts from
# randomly seeded centroids, so without a fixed random_state the labels (and
# every cell that depends on them) can change from one run to the next.
clustering_model = KMeans(n_clusters=3, random_state=42)
clustering_model.fit(corpus_embeddings)

# labels_[i] is the cluster index assigned to row i of corpus_embeddings.
cluster_assignment = clustering_model.labels_
print(cluster_assignment)

[0 0 2 2 2 0 0 0 0 0 1 1 1 1 1 1 1 1 1]


In [6]:
# Group the corpus sentences by the cluster label each one was assigned.
# Keys are the raw labels from cluster_assignment, in first-seen order.
clustered_sentences = {}
for position, label in enumerate(cluster_assignment):
    clustered_sentences.setdefault(label, []).append(corpus[position])

# Show each group; the heading uses label + 1, so it is 1-based while the
# dictionary keys stay 0-based.
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print(members)
    print("")

Cluster  1
["I'd like an apple", 'An apple a day keeps the doctor away', 'A man is eating food', 'Eating a piece of bread', 'Horse is eating hay', 'Woman has some Biryani', 'The girl is carrying a baby']

Cluster  3
['Never compare an apple to an orange', 'I prefer scikit-learn to orange', 'The scikit-learn docs are orange']

Cluster  2
['The baby is carried by the woman', 'A man is riding a horse', 'A man is riding a white horse on an enclosed ground', 'A monkey is playing with a ball', 'Someone in a gorilla costume is playing a drum', 'A cheetah is running behind its prey', 'A cheetah chases its prey across a field', 'The cheetah is chasing a man who is riding the horse', 'The man and woman witht the baby are watching the cheetah in the zoo']



In [22]:
# Display the raw mapping. Keys are the labels taken from cluster_assignment,
# i.e. one less than the "Cluster N" headings printed by the previous cell.
clustered_sentences

{0: ["I'd like an apple",
  'An apple a day keeps the doctor away',
  'A man is eating food',
  'Eating a piece of bread',
  'Horse is eating hay',
  'Woman has some Biryani',
  'The girl is carrying a baby'],
 2: ['Never compare an apple to an orange',
  'I prefer scikit-learn to orange',
  'The scikit-learn docs are orange'],
 1: ['The baby is carried by the woman',
  'A man is riding a horse',
  'A man is riding a white horse on an enclosed ground',
  'A monkey is playing with a ball',
  'Someone in a gorilla costume is playing a drum',
  'A cheetah is running behind its prey',
  'A cheetah chases its prey across a field',
  'The cheetah is chasing a man who is riding the horse',
  'The man and woman witht the baby are watching the cheetah in the zoo']}

In [18]:
from fns.openai_fns import messages_prompt
import json

def generate_cluster_names(clustered_sentences, *, prompt_fn=None):
    """
    Ask the LLM for a short descriptive name for each cluster of sentences.

    Args:
        clustered_sentences: Mapping of cluster id -> list of sentences. Only
            the sentence lists are sent to the model; the ids are not used.
        prompt_fn: Optional callable that receives the chat ``messages`` list
            and returns the model's reply. Defaults to ``messages_prompt``.
            The reply is used as a dict key, so it must be hashable
            (presumably a string -- confirm against ``messages_prompt``).

    Returns:
        A dict mapping each generated name to that cluster's sentence list,
        in the iteration order of ``clustered_sentences``. If the model
        returns a name that is already taken, " (2)", " (3)", ... is appended
        so that no cluster is silently overwritten.
    """
    if prompt_fn is None:
        prompt_fn = messages_prompt

    system_prompt = "You are a helpful assistant. You will be provided with a list of sentences. Please name the cluster based on the sentences. Please return just the name of the cluster. No additional information is needed."

    cluster_names = {}
    for cluster in clustered_sentences.values():
        # Build a fresh message list per cluster so calls never share state.
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "Name this cluster:\n" + json.dumps(cluster)},
        ]
        name = prompt_fn(messages)

        # Two clusters can receive the same label from the model; keep both
        # by disambiguating the later one instead of replacing the earlier.
        unique_name = name
        suffix = 2
        while unique_name in cluster_names:
            unique_name = f"{name} ({suffix})"
            suffix += 1

        cluster_names[unique_name] = cluster
    return cluster_names

In [19]:
# Request one name per cluster; the result is keyed by the returned name.
named_clusters = generate_cluster_names(clustered_sentences)

# Pretty-print the name -> sentences mapping as indented JSON.
print(json.dumps(named_clusters, indent=2))

{
  "Food and Eating": [
    "I'd like an apple",
    "An apple a day keeps the doctor away",
    "A man is eating food",
    "Eating a piece of bread",
    "Horse is eating hay",
    "Woman has some Biryani",
    "The girl is carrying a baby"
  ],
  "Fruit Comparison": [
    "Never compare an apple to an orange",
    "I prefer scikit-learn to orange",
    "The scikit-learn docs are orange"
  ],
  "Animal Actions": [
    "The baby is carried by the woman",
    "A man is riding a horse",
    "A man is riding a white horse on an enclosed ground",
    "A monkey is playing with a ball",
    "Someone in a gorilla costume is playing a drum",
    "A cheetah is running behind its prey",
    "A cheetah chases its prey across a field",
    "The cheetah is chasing a man who is riding the horse",
    "The man and woman witht the baby are watching the cheetah in the zoo"
  ]
}
