In [1]:
import os
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from pinecone import Pinecone
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.spatial import Voronoi
import hashlib

In [2]:
# Read settings (e.g. the PINECONE_* variables used below) from a .env file
# into the process environment.
load_dotenv()

# Build the Pinecone client from those settings and open the index that the
# rest of the notebook reads from.
pc = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT'),
)
index = pc.Index('music-embeddings')

In [3]:
# Fetch all vectors
vector_count = index.describe_index_stats()['total_vector_count']
print(f"Total vectors in index: {vector_count}")

# Query to get the top k vectors.
# A zero query vector with top_k == vector_count is used purely to enumerate
# ids; the similarity scores are ignored.
# NOTE(review): 768 is assumed to be the index dimension — confirm.
index_arr = index.query(vector=[0 for _ in range(768)], top_k=vector_count, include_metadata=True)

# Fetch the vectors using the retrieved IDs in batches
vector_ids = [v['id'] for v in index_arr['matches']]
batch_size = 100  # Adjust batch size as needed
embeddings = []
metadata = []

for i in range(0, len(vector_ids), batch_size):
    batch_ids = vector_ids[i:i + batch_size]
    response = index.fetch(ids=batch_ids)
    fetched_vectors = response['vectors']
    # The fetch result is a mapping keyed by vector id. Look every id up by
    # key instead of pairing ids with `.values()` positionally, so that a
    # reordered or incomplete response can never attach the wrong id to a
    # metadata record or misalign `embeddings` with `metadata`.
    for id_ in batch_ids:
        vector = fetched_vectors.get(id_)
        if vector is None:
            # Id came back from the query but not from the fetch; skip it so
            # both output lists stay the same length.
            continue
        meta = vector['metadata']
        meta['id'] = id_
        embeddings.append(vector['values'])
        metadata.append(meta)

# Convert to numpy array
embeddings = np.array(embeddings)

Total vectors in index: 651


In [19]:
# Credit to gmacro on stackoverflow for the foundation for this function
def voronoi_finite_polygons_2d(vor, radius=None):
    """Rebuild the regions of a 2-D Voronoi diagram so that all are finite.

    Parameters
    ----------
    vor : object exposing ``points``, ``vertices``, ``regions``,
        ``point_region``, ``ridge_points`` and ``ridge_vertices``
        (e.g. ``scipy.spatial.Voronoi``).
    radius : float, optional
        Distance at which open ridges are cut off. Defaults to twice the
        overall value range of ``vor.points``.

    Returns
    -------
    (regions, vertices)
        ``regions`` holds one list of vertex indices per input point, in the
        order of ``vor.point_region``. ``vertices`` is an array made of the
        original Voronoi vertices followed by any far points that were added
        to close open regions.

    Raises
    ------
    ValueError
        If the input points are not two-dimensional.
    """
    # Check if the input is 2D
    if vor.points.shape[1] != 2:
        raise ValueError("Requires 2D input")

    # Calculate the center of the points
    center = vor.points.mean(axis=0)
    if radius is None:
        # Use the np.ptp() function: the ndarray.ptp() method no longer
        # exists in NumPy 2.0. The resulting value is unchanged.
        radius = np.ptp(vor.points) * 2

    new_regions = []
    # Far points are appended to this list while regions are being closed.
    new_vertices = vor.vertices.tolist()

    # Construct a map containing all ridges for a given point
    all_ridges = construct_ridge_map(vor)

    # Reconstruct regions
    for p1, region in enumerate(vor.point_region):
        vertices = vor.regions[region]

        if all(v >= 0 for v in vertices):
            # Finite region
            new_regions.append(vertices)
        else:
            # Reconstruct a non-finite region (index -1 marks a vertex at infinity)
            new_region = reconstruct_infinite_region(p1, vertices, all_ridges, vor, center, radius, new_vertices)
            new_regions.append(new_region)

    return new_regions, np.asarray(new_vertices)

def construct_ridge_map(vor):
    """Index every ridge of ``vor`` by each of the two points it separates.

    Returns a dict mapping a point index to a list of
    ``(other_point, v1, v2)`` tuples, where ``v1`` and ``v2`` are the ridge's
    vertex indices exactly as stored in ``vor.ridge_vertices``.
    """
    ridge_map = {}
    for point_pair, vertex_pair in zip(vor.ridge_points, vor.ridge_vertices):
        first, second = point_pair
        v1, v2 = vertex_pair
        # Register the ridge under both of its points, each seeing the other
        # one as its neighbour.
        for owner, neighbour in ((first, second), (second, first)):
            ridge_map.setdefault(owner, []).append((neighbour, v1, v2))
    return ridge_map

def reconstruct_infinite_region(p1, vertices, all_ridges, vor, center, radius, new_vertices):
    """Close the open Voronoi region that belongs to point ``p1``.

    Keeps the region's existing (non-negative) vertex indices and, for every
    ridge of ``p1`` that has a missing endpoint, appends a far point to
    ``new_vertices`` (mutated in place) and adds its index to the region.
    Returns the region's vertex indices sorted counterclockwise.
    """
    closed_region = [idx for idx in vertices if idx >= 0]

    for neighbour, end_a, end_b in all_ridges[p1]:
        # Normalise the pair so that a missing endpoint, if any, is in end_a.
        if end_b < 0:
            end_a, end_b = end_b, end_a
        if end_a < 0:
            # Open ridge: synthesise the missing endpoint far away from end_b.
            far_point = compute_far_point(p1, neighbour, end_b, vor, center, radius)
            closed_region.append(len(new_vertices))
            new_vertices.append(far_point.tolist())
        # Otherwise both endpoints exist and are already part of the region.

    # Sort region counterclockwise
    return sort_region_counterclockwise(closed_region, new_vertices)

def compute_far_point(p1, p2, v2, vor, center, radius):
    """Return the synthetic far endpoint of the open ridge between two points.

    Starting at Voronoi vertex ``v2``, the result lies ``radius`` away along
    the unit normal of the segment ``p1 -> p2``, on the side of that normal
    on which the segment's midpoint lies relative to ``center``.
    """
    # Unit vector pointing from p1 to p2 (a fresh array, so vor.points is untouched)
    unit_tangent = vor.points[p2] - vor.points[p1]
    unit_tangent /= np.linalg.norm(unit_tangent)

    # Tangent rotated by 90 degrees
    unit_normal = np.array([-unit_tangent[1], unit_tangent[0]])

    # Orient the normal by where the midpoint sits relative to the center
    ridge_midpoint = np.mean(vor.points[[p1, p2]], axis=0)
    side = np.sign(np.dot(ridge_midpoint - center, unit_normal))

    return vor.vertices[v2] + (side * unit_normal) * radius

def sort_region_counterclockwise(region, vertices):
    """Order a region's vertex indices by angle around the region's centroid.

    ``region`` is a sequence of indices into ``vertices``. The indices are
    returned as a plain list, sorted by increasing ``arctan2`` angle of each
    vertex measured from the mean of the region's vertices.
    """
    corners = np.asarray([vertices[idx] for idx in region])
    offsets = corners - corners.mean(axis=0)
    polar_angles = np.arctan2(offsets[:, 1], offsets[:, 0])
    ordering = np.argsort(polar_angles)
    return np.take(region, ordering).tolist()

In [17]:
# Perform PCA
# Seeded so the 2-D projection is reproducible: scikit-learn may choose the
# randomized SVD solver for an input of this shape, which would otherwise
# give a slightly different projection on every run.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# Perform K-means clustering
# Aim for roughly 15 points per cluster, counted from the embeddings that were
# actually fetched rather than from the index statistics. Never go below 4
# clusters: fewer centroids are not enough for scipy's Voronoi in 2-D.
n_clusters = max(4, len(embeddings) // 15)  # Adjust this number as needed
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init=4, random_state=42)
cluster_labels = kmeans.fit_predict(reduced_embeddings)

# Prepare data for Plotly
# One entry per embedding; `metadata` is expected to be aligned with the rows
# of `embeddings`.
data = {
    'x': reduced_embeddings[:, 0],
    'y': reduced_embeddings[:, 1],
    'name': [m['name'] for m in metadata],
    'artist': [m['artist'] for m in metadata],
    'genre': [m['genre'] for m in metadata],
    'cluster': cluster_labels,
    'id': [m['id'] for m in metadata]
}

# Create a DataFrame for easier manipulation
df = pd.DataFrame(data)

# Voronoi cells of the cluster centroids, with the open outer cells closed
# off so that each one can be drawn as a filled polygon.
vor = Voronoi(kmeans.cluster_centers_)
regions, vertices = voronoi_finite_polygons_2d(vor)

In [18]:
# Generate a unique color for each region
def generate_color(name, factor=0.25):
    """Return a '#rrggbb' color derived from ``name`` and lightened toward white.

    The first three bytes of the MD5 digest of ``name`` are used as the base
    red, green and blue levels. Each level is then blended with white:
    ``factor`` is the share of white, so 0 keeps the base color and 1 yields
    pure white.
    """
    # The leading three digest bytes are the base R, G and B levels
    base_levels = hashlib.md5(name.encode()).digest()[:3]

    # Mix the color with white
    blended = ((1 - factor) * level + factor * 255 for level in base_levels)

    return '#' + ''.join(format(int(level), '02x') for level in blended)

# Create Plotly figure
fig = go.Figure()

# Add Voronoi regions, each with a different color.
# `regions` and the centroids are paired by position; the fill color is derived
# from the centroid's string form.
for region, center in zip(regions, kmeans.cluster_centers_):
    polygon = vertices[region]
    color = generate_color(str(center))
    fig.add_scatter(
        x=polygon[:, 0],
        y=polygon[:, 1],
        mode='lines',
        line={'color': 'rgba(0,0,0,0)'},  # No border color
        fill="toself",
        fillcolor=color,
    )

# Add scatter plot of points, showing "<name> by <artist>" on hover
hover_labels = df.apply(lambda row: f"{row['name']} by {row['artist']}", axis=1)
fig.add_scatter(
    x=df['x'],
    y=df['y'],
    mode='markers',
    marker={'color': 'black', 'size': 5},
    text=hover_labels,
    hoverinfo='text',
)

# Add centroids
centroids = kmeans.cluster_centers_
fig.add_scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers',
    marker={'size': 10, 'color': 'white', 'symbol': 'x'},
    name='Centroids',
)

# Update layout: square canvas, hidden axes, transparent plot background
fig.update_layout(
    title="K-means clustering on Spotify embeddings (PCA-reduced)",
    xaxis={'visible': False},
    yaxis={'visible': False},
    showlegend=False,
    width=800,
    height=800,
    plot_bgcolor='rgba(0,0,0,0)',
)

# Limit the allowed axis range to the data extent plus a margin of 1, so the
# far-away corners of the closed outer Voronoi cells stay out of view.
x_low, x_high = df['x'].min() - 1, df['x'].max() + 1
y_low, y_high = df['y'].min() - 1, df['y'].max() + 1
fig.update_xaxes(minallowed=x_low, maxallowed=x_high)
fig.update_yaxes(minallowed=y_low, maxallowed=y_high)


# Show the plot
fig.show()