In [1]:
import numpy as np
from typing import List
from pydantic import BaseModel
from sklearn.cluster import DBSCAN
from prisma import Prisma
from sklearn.metrics.pairwise import cosine_distances

In [2]:
class ClusterableObject(BaseModel):
    id: int
    embedding: List[float]

In [23]:
db = Prisma()
db.connect()

In [24]:
contexts = [
    e.contextName
    for e in db.edge.find_many(where={ "generationId": 47 })
]
contexts = sorted(list(set(contexts)))

In [7]:
import openai

client = openai.OpenAI()

embeds_response = client.embeddings.create(
    model="text-embedding-3-large", input=contexts, dimensions=1536
)

In [14]:
embeds = [e.embedding for e in embeds_response.data]

In [22]:
len(embeddings), len(contexts)

(286, 3)

In [50]:
embeddings = np.array(embeds)
distance_matrix = cosine_distances(embeddings)
# Run DBSCAN
clustering = DBSCAN(eps=0.5, min_samples=5, metric="precomputed").fit(
    distance_matrix
)

# Print the contexts for each cluster
clusters = {}

for i, cluster in enumerate(clustering.labels_):
    if cluster not in clusters:
        clusters[cluster] = []
    clusters[cluster].append(contexts[i])


for cluster, ctxts in clusters.items():
    print(f"Cluster {cluster}")
    print(f"Length: {len(ctxts)}")
    print(ctxts)
    print()

Cluster -1
Length: 107
['adaptive mindsets', 'ai capabilities', 'ai-assisted governance models', 'altered states', 'alternative economic models', 'ancient legends', 'ancient mysteries', 'archaeological sites', 'atmospheric events', 'beauty standards', 'biohacking experiments', 'biomechanical designs', 'bold adventures', 'business ventures', 'career paths', 'cities to live in', 'city design concepts', 'consent practices', 'conservation projects', 'conservation success stories', 'coral reef systems', 'cultural values and ambassador programs', 'deep sea adaptations', 'desertification reversal methods', 'digital art provenance and value preservation methods', 'diplomatic relationships', 'diverse beauty standards', 'dream interpretations', 'earth shaping forces', 'earth-shaping events', 'eco-community principles', 'ecological urban design elements', 'ecosystem services', 'escape plans', 'evolutionary laboratories', 'existential questions', 'experimental designs', 'explanations of quantum ph

# Try cluster contexts

In [95]:
cards = db.valuescard.find_many(where={"generationId": 34})
contexts = [c.choiceContext.lower() for c in cards]

In [96]:
contexts

['emotional connections',
 'effective communication strategies',
 'ethical considerations',
 'community initiatives',
 'ethical considerations',
 'ethical considerations',
 'ethical considerations',
 'ways to earn money online',
 'sources of fulfillment',
 'work-life balance',
 'moral principles',
 'decision-making process',
 'team well-being',
 'respect for diversity',
 'respect for diversity',
 'instances of effective conflict resolution',
 'moral choices',
 'work-life balance',
 'personal growth opportunities',
 'emotional regulation techniques',
 'team dynamics',
 'effective team communication',
 'time management skills',
 'sources of personal fulfillment',
 'supportive actions',
 'balance between providing support and maintaining self-care',
 'balanced supportive actions',
 'ethical considerations',
 'relationships',
 'effective communication',
 'respect for diversity',
 'inclusive environment',
 'emotional connection',
 'resolution strategies',
 'shared goals',
 'educational oppo

In [97]:
import openai


embeddings_resp = openai.embeddings.create(model="text-embedding-3-large", input=contexts)

In [98]:
len(embeddings_resp.data), len(contexts)

(198, 198)

In [129]:
embeddings = np.array([e.embedding for e in embeddings_resp.data])
distance_matrix = cosine_distances(embeddings)
# Run DBSCAN
clustering = DBSCAN(eps=0.25, min_samples=1, metric="precomputed").fit(distance_matrix)

print(len(set(clustering.labels_)))

119


In [130]:
# for each cluster with more than one element, print the contexts
for cluster_id in np.unique(clustering.labels_):
    if cluster_id == -1:
        continue
    cluster_contexts = [c for i, c in enumerate(contexts) if clustering.labels_[i] == cluster_id]
    print(f"Cluster {cluster_id}:")
    print(cluster_contexts)

print(len(np.unique(clustering.labels_)))

Cluster 0:
['emotional connections', 'emotional connection', 'emotional connections', 'emotional connections', 'emotional connections', 'emotional connection and mutual understanding', 'emotional connections', 'emotional connections']
Cluster 1:
['effective communication strategies', 'effective communication', 'communication strategies', 'communication strategies', 'effective communication strategies', 'communication strategies', 'communication techniques']
Cluster 2:
['ethical considerations', 'ethical considerations', 'ethical considerations', 'ethical considerations', 'ethical considerations', 'ethical considerations', 'ethical considerations', 'ethical considerations', 'ethical guidelines', 'ethical considerations', 'ethical considerations', 'ethical considerations']
Cluster 3:
['community initiatives']
Cluster 4:
['ways to earn money online']
Cluster 5:
['sources of fulfillment', 'sources of personal fulfillment', 'source of personal fulfillment', 'sources of personal fulfillment'

In [109]:
len(clustering.labels_)

198