In [2]:
import os
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pinecone import Pinecone
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.spatial import Voronoi
import hashlib

# Populate the process environment (python-dotenv reads a .env file if one is found)
load_dotenv()

# Shared Pinecone client used by every index lookup in this notebook.
# Both settings are read from the environment; a missing variable yields None.
pc = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT'),
)

# Function to compute finite Voronoi regions
def voronoi_finite_polygons_2d(vor, radius=None):
    """Rebuild the regions of a 2D Voronoi diagram so each one is a vertex list without -1 entries.

    Parameters
    ----------
    vor : scipy.spatial.Voronoi
        Diagram built from two-dimensional input points.
    radius : float, optional
        Length scale handed to ``reconstruct_infinite_region`` for closing
        open regions. Defaults to twice the peak-to-peak spread of all input
        coordinates.

    Returns
    -------
    regions : list of list of int
        One list of vertex indices per input point, in input-point order.
    vertices : numpy.ndarray
        The diagram's own vertices followed by any vertices appended while
        closing open regions.

    Raises
    ------
    ValueError
        If the diagram's input points are not two-dimensional.
    """
    if vor.points.shape[1] != 2:
        raise ValueError("Requires 2D input")

    center = vor.points.mean(axis=0)
    if radius is None:
        # Use the np.ptp() function: the ndarray.ptp() method was removed in
        # NumPy 2.0 and raises AttributeError there. The value is unchanged
        # (spread over all coordinates, times two).
        radius = np.ptp(vor.points) * 2

    new_regions = []
    # Start from the existing vertices; closing an open region appends to this list.
    new_vertices = vor.vertices.tolist()
    all_ridges = construct_ridge_map(vor)

    for p1, region in enumerate(vor.point_region):
        vertices = vor.regions[region]

        if all(v >= 0 for v in vertices):
            # Already a closed polygon: reuse it as is.
            new_regions.append(vertices)
        else:
            # A -1 entry marks an open region that has to be closed off.
            new_region = reconstruct_infinite_region(p1, vertices, all_ridges, vor, center, radius, new_vertices)
            new_regions.append(new_region)

    return new_regions, np.asarray(new_vertices)

def construct_ridge_map(vor):
    """Index every Voronoi ridge by the two input points it separates.

    Parameters
    ----------
    vor : scipy.spatial.Voronoi
        Diagram providing ``ridge_points`` and ``ridge_vertices``.

    Returns
    -------
    dict
        Maps a point index to a list of ``(neighbour, v1, v2)`` tuples, one
        per ridge touching that point. ``v1``/``v2`` are the ridge's vertex
        indices exactly as stored in ``vor.ridge_vertices``.
    """
    all_ridges = {}
    for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices):
        # Ridges with a -1 vertex (scipy's marker for a ridge that runs off to
        # infinity) must be kept: they are the only ridges
        # reconstruct_infinite_region acts on. Filtering them out meant no far
        # point was ever added and open regions stayed truncated.
        all_ridges.setdefault(p1, []).append((p2, v1, v2))
        all_ridges.setdefault(p2, []).append((p1, v1, v2))
    return all_ridges

def reconstruct_infinite_region(p1, vertices, all_ridges, vor, center, radius, new_vertices):
    """Build the vertex list for the region of input point ``p1``, adding a far point per open ridge.

    Keeps the non-negative entries of ``vertices``; then, for every ridge of
    ``p1`` recorded in ``all_ridges`` that has a negative vertex index,
    appends a point from ``compute_far_point`` to ``new_vertices`` (mutated in
    place) and adds its index to the region.

    Returns the region's vertex indices as ordered by
    ``sort_region_counterclockwise``.
    """
    closed = [idx for idx in vertices if idx >= 0]

    # A point with no recorded ridges simply contributes no extra vertices.
    for neighbour, va, vb in all_ridges.get(p1, []):
        # Arrange the pair so that a negative index, if any, lands in ``missing``.
        missing, anchor = (vb, va) if vb < 0 else (va, vb)
        if missing < 0:
            tip = compute_far_point(p1, neighbour, anchor, vor, center, radius)
            closed.append(len(new_vertices))
            new_vertices.append(tip.tolist())

    return sort_region_counterclockwise(closed, new_vertices)

def compute_far_point(p1, p2, v2, vor, center, radius):
    """Return the point ``radius`` away from vertex ``v2`` along the ridge between ``p1`` and ``p2``.

    The direction is the unit normal of the segment ``p1 -> p2``, flipped if
    needed so that it points from ``center`` towards the midpoint of the two
    input points.
    """
    site_a, site_b = vor.points[p1], vor.points[p2]

    # Unit vector from p1 to p2 (the subtraction yields a fresh array,
    # so normalising in place leaves vor.points untouched).
    along = site_b - site_a
    along /= np.linalg.norm(along)

    # Rotate by 90 degrees to get the ridge direction.
    across = np.array([-along[1], along[0]])

    mid = vor.points[[p1, p2]].mean(axis=0)
    side = np.sign(np.dot(mid - center, across))
    return vor.vertices[v2] + (side * across) * radius

def sort_region_counterclockwise(region, vertices):
    """Order a region's vertex indices by angle around the mean of its vertices.

    ``region`` holds indices into ``vertices``; the result is a plain list of
    the same indices sorted by ascending ``arctan2`` angle.
    """
    coords = np.asarray([vertices[idx] for idx in region])
    offsets = coords - coords.mean(axis=0)
    order = np.argsort(np.arctan2(offsets[:, 1], offsets[:, 0]))
    return np.array(region)[order].tolist()

# Derive a deterministic pastel color from a region's name (distinct names can still collide)
def generate_color(name, factor=0.25):
    """Map a name to a ``#rrggbb`` color string derived from its MD5 digest.

    The first three digest bytes are used as red, green and blue, and each
    channel is blended towards 255 by ``factor`` (0 keeps the raw channel,
    1 gives white). The same name always yields the same color.
    """
    raw_channels = hashlib.md5(name.encode()).digest()[:3]
    blended = [int((1 - factor) * c + factor * 255) for c in raw_channels]
    return '#' + ''.join(format(c, '02x') for c in blended)

# Function to fetch embeddings from a Pinecone index
def fetch_embeddings(index_name, vector_count, dimensions):
    """Download the vectors and metadata of a Pinecone index.

    Vector IDs are discovered with a single query using an all-zero vector,
    then the records are fetched in batches.

    Parameters
    ----------
    index_name : str
        Name of the Pinecone index to read.
    vector_count : int
        Ignored: it is overwritten with the index's reported
        ``total_vector_count``. Kept so existing call sites keep working.
    dimensions : int
        Length of the all-zero query vector used to list IDs.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per fetched vector; shape ``(0, dimensions)`` for an empty index.
    metadata : list of dict
        Metadata of each vector, row-aligned with ``embeddings``, with the
        vector's ID added under the ``'id'`` key.
    """
    index = pc.Index(index_name)
    vector_count = index.describe_index_stats()['total_vector_count']
    if not vector_count:
        # Nothing stored: skip the query instead of sending top_k=0.
        return np.empty((0, dimensions)), []

    # Only the IDs of the matches are used, so do not request metadata here.
    # NOTE(review): Pinecone caps top_k, so an index larger than the cap cannot
    # be listed this way — confirm the current limit for this deployment.
    index_arr = index.query(
        vector=[0 for _ in range(dimensions)],
        top_k=vector_count,
        include_metadata=False,
    )

    vector_ids = [v['id'] for v in index_arr['matches']]
    batch_size = 100  # Adjust batch size as needed
    embeddings = []
    metadata = []

    for i in range(0, len(vector_ids), batch_size):
        batch_ids = vector_ids[i:i + batch_size]
        fetched = index.fetch(ids=batch_ids)['vectors']
        # Look each record up by ID. The response is keyed by ID and its order
        # (or completeness) is not tied to the request, so zipping it against
        # batch_ids could stamp metadata with the wrong ID.
        for id_ in batch_ids:
            if id_ not in fetched:
                continue  # not returned by fetch; skip so rows stay aligned
            record = fetched[id_]
            meta = dict(record['metadata'] or {})  # copy; tolerate missing metadata
            meta['id'] = id_
            embeddings.append(record['values'])
            metadata.append(meta)

    return np.array(embeddings), metadata

# Fetch song embeddings.
# dimensions=768 sizes the zero query vector; the vector_count value is
# overridden inside fetch_embeddings by the index's own statistics.
song_embeddings, song_metadata = fetch_embeddings(
    index_name='song-embeddings', vector_count=768, dimensions=768
)

# Fetch genre embeddings (same arguments, different index)
genre_embeddings, genre_metadata = fetch_embeddings(
    index_name='genre-embeddings', vector_count=768, dimensions=768
)



In [42]:
# Filter genre embeddings by popularity
POPULARITY_THRESHOLD = 80  # genres at or below this score are left out
filtered_genre_embeddings = []
filtered_genre_metadata = []
for emb, meta in zip(genre_embeddings, genre_metadata):
    if meta.get('popularity', 0) > POPULARITY_THRESHOLD:
        filtered_genre_embeddings.append(emb)
        filtered_genre_metadata.append(meta)

# Fail early with a readable message: with too few genres the KMeans / PCA /
# Voronoi calls below fail with far less helpful errors.
if len(filtered_genre_metadata) < 3:
    raise ValueError(
        f"Need at least 3 genres with popularity > {POPULARITY_THRESHOLD} "
        f"to build the Voronoi diagram; found {len(filtered_genre_metadata)}"
    )

filtered_genre_embeddings = np.array(filtered_genre_embeddings)

# Use filtered genre embeddings as initial centroids for K-means clustering
kmeans = KMeans(n_clusters=len(filtered_genre_embeddings), init=filtered_genre_embeddings, n_init=1, max_iter=100)
cluster_labels = kmeans.fit_predict(song_embeddings)

# Fit PCA on the genre centroids and project both genres and songs with it,
# so both live in the same 2D space
pca = PCA(n_components=2)
reduced_genre_embeddings = pca.fit_transform(filtered_genre_embeddings)
song_embeddings_pca = pca.transform(song_embeddings)

# Standardize the song projection to ensure good distribution
scaler_song = StandardScaler()
song_embeddings_standardized = scaler_song.fit_transform(song_embeddings_pca)

# Increase the scaling factor to make the points sparser
scaling_factor_song = 1.0  # Adjust this value to increase sparsity
song_embeddings_sparse = song_embeddings_standardized * scaling_factor_song

# Apply the SAME scaler and factor to the genre centroids. Fitting a second,
# independent StandardScaler on the genres shifts and stretches them
# differently from the songs, so song dots and Voronoi cells would no longer
# share a coordinate frame and a song's position relative to a genre cell
# would be meaningless.
scaler_genre = scaler_song  # alias kept in case later cells reference this name
reduced_genre_standardized = scaler_genre.transform(reduced_genre_embeddings)
scaling_factor_genre = scaling_factor_song  # must match the song factor to stay aligned
reduced_genre_sparse = reduced_genre_standardized * scaling_factor_genre

# Compute Voronoi diagram over the genre centroids and close its open regions
vor = Voronoi(reduced_genre_sparse)
regions, vertices = voronoi_finite_polygons_2d(vor)

In [43]:
# Prepare data for Plotly: one entry per song with its 2D position,
# hover fields, cluster label and vector id
data = {
    'x': song_embeddings_sparse[:, 0],
    'y': song_embeddings_sparse[:, 1],
}
for field in ('name', 'artist', 'genre'):
    data[field] = [m[field] for m in song_metadata]
data['cluster'] = cluster_labels
data['id'] = [m['id'] for m in song_metadata]

# Create a DataFrame for easier manipulation
df = pd.DataFrame(data)

# Create Plotly figure; traces are added back-to-front:
# genre cells, then genre centroids, then the songs
fig = go.Figure()

# One filled, outline-free polygon per genre cell, colored from the genre name
for cell, genre_meta in zip(regions, filtered_genre_metadata):
    outline = vertices[cell]
    cell_trace = go.Scatter(
        x=outline[:, 0],
        y=outline[:, 1],
        fill="toself",
        fillcolor=generate_color(genre_meta['name']),
        mode='lines',
        line={'color': 'rgba(0,0,0,0)'},
    )
    fig.add_trace(cell_trace)

# Genre centroids, showing the genre name on hover
centroid_texts = [m['name'] for m in filtered_genre_metadata]
centroid_trace = go.Scatter(
    x=reduced_genre_sparse[:, 0],
    y=reduced_genre_sparse[:, 1],
    mode='markers',
    marker={'size': 10, 'color': 'white', 'symbol': 'x'},
    text=centroid_texts,
    hoverinfo='text',
    name='Centroids',
)
fig.add_trace(centroid_trace)

# Songs, showing "<name> by <artist>" on hover
song_hover = [f"{name} by {artist}" for name, artist in zip(df['name'], df['artist'])]
song_trace = go.Scatter(
    x=df['x'],
    y=df['y'],
    mode='markers',
    marker={'color': 'black', 'size': 5},
    text=song_hover,
    hoverinfo='text',
)
fig.add_trace(song_trace)

# Layout: hidden axes, no legend, square canvas; the view is framed on the
# songs with one unit of padding on every side
x_range = [df['x'].min()-1, df['x'].max()+1]
y_range = [df['y'].min()-1, df['y'].max()+1]
fig.update_layout(
    title="K-means clustering on Spotify embeddings (PCA-reduced)",
    xaxis={'visible': False, 'range': x_range},
    yaxis={'visible': False, 'range': y_range},
    showlegend=False,
    width=800,
    height=800,
    plot_bgcolor='rgba(0,0,0,0)',
)

# Show the plot
fig.show()