In [6]:

from sklearn.cluster import KMeans

from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus with example sentences
corpus = [
  "ai",
  "engineering",
  "reading",
  "research",
  "reasoning",
  "model",
  "reinforcement",
  "curation",
  "chatgpt",
  "api",
  "realtime",
  "mathematics",
  "language",
  "deeplearning",
  "experts",
  "mixtral",
  "pixtral",
  "optimization",
  "performance",
  "programming",
  "intelligence",
  "coding",
  "tools",
  "deepseek",
  "llm",
  "scaling",
  "longtermism",
  "mistral",
  "llama",
  "chat",
  "fine",
  "tuning",
  "gpt",
  "report",
  "models",
  "segment",
  "images",
  "videos",
  "speech",
  "recognition",
  "supervision",
  "learning",
  "transferable",
  "visual",
  "deep",
  "large",
  "scale",
  "multimodal",
  "shortcomings",
  "exploration",
  "object",
  "detection",
  "unified",
  "code",
  "generation",
  "prompt",
  "segmentation",
  "sonnet",
  "agents",
  "computer",
  "stack",
  "source",
  "licensing",
  "memory",
  "papers",
  "systems",
  "implementation",
  "architecture",
  "huggingface",
  "benchmark",
  "github",
  "issues",
  "benchmarks",
  "system",
  "hallucinations",
  "facts",
  "building",
  "retrieval",
  "augmented",
  "chatbots",
  "evaluation",
  "embedding",
  "datasets",
  "capabilities",
  "advancements",
  "mteb",
  "knowledge",
  "nlp",
  "tasks",
  "embeddings",
  "context",
  "power",
  "parameter",
  "efficiency",
  "problem",
  "solving",
  "prompting",
  "training",
  "proof",
  "questions",
  "testing",
  "custom",
  "metrics",
  "mathematical",
  "accuracy",
  "augmentation",
  "structure",
  "queries",
  "multitask",
  "understanding",
  "problems",
  "solutions"
]

corpus_embeddings = embedder.encode(corpus)

# Perform kmean clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i + 1)
    print(cluster)
    print("")

Cluster  1
['longtermism', 'report', 'shortcomings']

Cluster  2
['reasoning', 'llama', 'fine', 'large', 'code', 'prompt', 'stack', 'memory', 'issues', 'hallucinations', 'facts', 'context', 'power', 'parameter', 'problem', 'solving', 'prompting', 'proof', 'questions', 'testing', 'custom', 'queries', 'understanding', 'problems', 'solutions']

Cluster  3
['model', 'realtime', 'optimization', 'models', 'segment', 'images', 'videos', 'recognition', 'visual', 'object', 'detection', 'segmentation', 'systems', 'implementation', 'benchmark', 'benchmarks', 'system', 'retrieval', 'evaluation', 'embedding', 'datasets', 'embeddings', 'metrics', 'accuracy', 'structure']

Cluster  4
['engineering', 'research', 'mathematics', 'performance', 'tools', 'scaling', 'scale', 'unified', 'generation', 'papers', 'architecture', 'building', 'augmented', 'capabilities', 'advancements', 'efficiency', 'mathematical', 'augmentation']

Cluster  5
['ai', 'reading', 'reinforcement', 'curation', 'chatgpt', 'api', 'lan

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np


# Some models don't automatically normalize the embeddings, in which case you should normalize the embeddings:
corpus_embeddings_2 = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model_2 = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model_2.fit(corpus_embeddings_2)
cluster_assignment_2 = clustering_model_2.labels_

print(cluster_assignment_2)

clustered_sentences_2 = {}
for sentence_id, cluster_id in enumerate(cluster_assignment_2):
    if cluster_id not in clustered_sentences_2:
        clustered_sentences_2[cluster_id] = []

    clustered_sentences_2[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences_2.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")

[ 4  2 16 16  4  6  4  5 14  1  0 18  5 17  4  8  8  3  2  1  4  1  2 17
 11  3  2  8 11 14  5  3  1 15  6  0  0  0  5  0  4  4  5  0 17  5  3  0
  2  4  5  0  6  1  5 12  0 11  4  5 11 15  1  5 16  7  6  6 11 10  1  9
 10  7  8 15  6 16  2 14  2 13 10  2  2 11  4  5  7 13  5  5  3  2  9  9
 12  4 15 16 16  5 10 18 10  2  6 16  7  4  9  9]
{np.int64(4): ['ai', 'reasoning', 'reinforcement', 'experts', 'intelligence', 'supervision', 'learning', 'exploration', 'agents', 'knowledge', 'training', 'understanding'], np.int64(2): ['engineering', 'performance', 'tools', 'longtermism', 'shortcomings', 'augmented', 'evaluation', 'capabilities', 'advancements', 'efficiency', 'augmentation'], np.int64(16): ['reading', 'research', 'papers', 'retrieval', 'questions', 'testing', 'queries'], np.int64(6): ['model', 'models', 'unified', 'implementation', 'architecture', 'building', 'structure'], np.int64(5): ['curation', 'language', 'fine', 'speech', 'transferable', 'large', 'object', 'generation', 'comp