## Phase 1 - Project Setup ##

In [1]:
# imports
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline

In [2]:
# Load the dataset into a DataFrame; the file is decoded as latin-1.
# NOTE(review): the traceback recorded under this cell mentions
# 'spotify-2023.csv' rather than "songs.csv" - the saved output looks stale;
# confirm the intended file name and re-run.
df = pd.read_csv("songs.csv", encoding="latin-1") # Loads csv file
df.head() # Displays first few rows in csv

FileNotFoundError: [Errno 2] No such file or directory: 'spotify-2023.csv'

In [None]:
df.describe() # Generates descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns

In [None]:
df.isnull() # Boolean mask shaped like df: True wherever a value is missing

In [None]:
df.columns # Lists the column labels of the DataFrame

In [None]:
df.dtypes # Gets the data type of each column

In [None]:
df.isnull().sum() # Counts the missing values in each column

In [None]:
# Column names of the audio characteristics; these columns form the feature
# matrix that is scaled and clustered later in the notebook.
audio_features = [
    "bpm",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%"
]

We cluster songs based on their similarity to each other. Clustering on audio characteristics creates usable clusters for:

* mood based recommendations
* playlist generation
* discovering similar sounding artists
* personalization 


In [None]:
# Select the identifying columns plus the audio features, then discard every
# row that has a missing value in any of them
keep_cols = ["track_name", "artist(s)_name", *audio_features]
spotify_df = df.loc[:, keep_cols].dropna().copy()

# Feature matrix (NumPy array) that the clustering pipeline is fitted on
X = spotify_df.loc[:, audio_features].to_numpy()

spotify_df.head()

## Phase 2 - Find K Value ##

In [None]:
k_values = range(2, 11)  # Candidate K values from 2 to 10
inertias = []            # K-means inertia for each k (used for the elbow plot)
sil_scores = []          # Silhouette score for each k

for k in k_values:
    pipe = Pipeline([
        ('scale', StandardScaler()),
        ('kmeancl', KMeans(n_clusters=k, random_state=0))
    ])
    labels_k = pipe.fit_predict(X)
    kmeans_k = pipe.named_steps['kmeancl']

    # Elbow inertia (computed by K-means on the standardized features)
    inertias.append(kmeans_k.inertia_)

    # Silhouette score: evaluate in the same standardized space K-means
    # clustered in. Scoring against the raw X would measure distances on a
    # different scale than the one the labels were derived from, letting
    # features with larger numeric ranges dominate the result.
    X_scaled_k = pipe.named_steps['scale'].transform(X)
    score_k = silhouette_score(X_scaled_k, labels_k)
    sil_scores.append(score_k)

    print(f"k = {k}, inertia = {kmeans_k.inertia_:.2f}, silhouette = {score_k:.4f}")


The elbow occurs at k = 3 because it is the first point after which increasing k yields noticeably smaller reductions in inertia.

k = 2 gives the highest silhouette score, but it's not useful because it only splits the dataset into 2 giant clusters. k = 3 gives the next best score. The silhouette score steadily declines after k = 3, meaning the clusters become noisier and less separated.

In [None]:
# Elbow plot: inertia as a function of the number of clusters
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(list(k_values), inertias, marker='o')
ax.set_xlabel("Number of clusters k")
ax.set_ylabel("Inertia")
ax.set_title("Elbow plot for K-means on Spotify audio features")
ax.grid(True)
plt.show()

In [None]:
# Silhouette score as a function of the number of clusters
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(list(k_values), sil_scores, marker='o')
ax.set_xlabel("Number of clusters k")
ax.set_ylabel("Silhouette score")
ax.set_title("Silhouette score vs k for K-means on Spotify audio features")
ax.grid(True)
plt.show()

K = 3 because:

* It is the clearest elbow with the first major shape change
* Gives the best balance of cluster quality and interpretability
* Produces clusters that are distinct, easy to interpret, and useful for recommendation, and is validated by both the elbow and silhouette plots

## Phase 3 - Fit K-Means model with scaling (pipeline) ##

In [None]:
BEST_K = 3  # Number of clusters selected in Phase 2

# Standardize the features, then run K-means with BEST_K clusters
pipeline_steps = [
    ('scale', StandardScaler()),
    ('kmeancl', KMeans(n_clusters=BEST_K, random_state=0)),
]
cl_pipe = Pipeline(pipeline_steps)

# Fit the pipeline on X and store each song's cluster assignment
cluster_labels = cl_pipe.fit_predict(X)
spotify_df["cluster_label"] = cluster_labels

print("Cluster label counts:")
label_counts = spotify_df["cluster_label"].value_counts().sort_index()
print(label_counts)

## Phase 4 - Visualize clusters ##

In [None]:
# seaborn requires `markers` to contain exactly one entry per hue level, so
# derive the list from the clusters actually present instead of hard-coding
# three markers (which only works when there are exactly 3 clusters).
# The cycle holds 10 markers, enough for the largest k searched above.
marker_cycle = ["X", "o", "^", "s", "D", "P", "v", "*", "<", ">"]
n_clusters = spotify_df["cluster_label"].nunique()

sns.pairplot(
    vars=["bpm", "energy_%", "danceability_%", "valence_%"],
    hue="cluster_label",
    data=spotify_df,
    diag_kind="kde",
    markers=marker_cycle[:n_clusters]
)
plt.suptitle(f"Spotify songs clustered by K-means (k={BEST_K})", y=1.02)
plt.show()

We choose the four features bpm, energy, danceability, and valence because they are the most interpretable musically:
* bpm: shows tempo, strongly separates fast vs. slow songs
* energy: high energy vs chill songs
* danceability: groups dance pop songs separately from acoustic songs
* valence: measures "happy vs. sad"

These features provide the most interpretable axes for understanding musical similarity: they clearly separate songs by tempo, intensity, rhythm, and mood.

In [None]:
# Number of songs per cluster, ordered by cluster label
cluster_counts = spotify_df["cluster_label"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 4))
cluster_counts.plot(kind="bar", ax=ax, rot=0)  # rot=0 keeps tick labels horizontal
ax.set_xlabel("Cluster label")
ax.set_ylabel("Number of songs")
ax.set_title(f"Number of songs in each cluster (k={BEST_K})")
plt.show()

## Phase 5 - Inspect cluster results ##

In [None]:
# Mean of every audio feature within each cluster (one row per cluster label)
cluster_summary = spotify_df.groupby("cluster_label")[audio_features].mean()
cluster_summary.head()

**Cluster 0 - Acoustic/Low-Energy/Emotional & Chill** 
* High acousticness
* Low energy
* Low valence (more sad and reflective)
* Moderate danceability

**Cluster 1 - High Energy Dance Pop/Happy/Mainstream**
* Very high danceability
* High energy
* Highest valence (happy and upbeat)
* Low acousticness
* Moderate speechiness (pop and rap blends)

**Cluster 2 - High BPM Energetic/Edgy/Electronic**
* Highest BPM
* High energy
* Low valence
* Low acousticness
* High liveness

## Phase 6 - Testing (finding similar songs in the same cluster) ##

Recommend songs from the same cluster as the seed song, ranked by Euclidean distance in scaled feature space.

In [None]:
def recommend_similar_songs(spotify_df, audio_features, cl_pipe,
                            seed_track, seed_artist=None, top_n=5):
    """Recommend the songs closest to a seed song within the seed's cluster.

    Args:
        spotify_df: DataFrame with "track_name", "artist(s)_name",
            "cluster_label" and every column listed in ``audio_features``.
        audio_features: List of feature column names to compare songs on.
        cl_pipe: Fitted pipeline whose ``named_steps["scale"]`` step exposes
            ``transform``; it is used to scale the features.
        seed_track: Track name of the seed song (case-insensitive exact match).
        seed_artist: Optional artist name; when given, the seed's artist field
            must contain it as a literal, case-insensitive substring.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame of up to ``top_n`` songs from the seed's cluster, ordered by
        increasing Euclidean distance to the seed in scaled feature space.
        Columns: track name, artist, cluster label, then the audio features.

    Raises:
        ValueError: If no song matches the seed, or the seed's cluster
            contains no other song.
    """
    # Find seed song
    mask = spotify_df["track_name"].str.lower() == seed_track.lower()
    if seed_artist is not None:
        # regex=False: artist names are plain text; characters such as
        # "+", "(" or "." must be matched literally, not as a pattern.
        mask &= spotify_df["artist(s)_name"].str.lower().str.contains(
            seed_artist.lower(), regex=False, na=False
        )

    match_positions = np.flatnonzero(mask.to_numpy(dtype=bool, na_value=False))
    if match_positions.size == 0:
        raise ValueError("Seed song not found. Check track name / artist.")

    # Work with row positions rather than index labels so the lookup stays
    # correct even if the DataFrame index contains duplicate labels.
    seed_pos = int(match_positions[0])

    # Positions of the other songs that share the seed's cluster
    labels = spotify_df["cluster_label"].to_numpy()
    in_cluster = labels == labels[seed_pos]
    in_cluster[seed_pos] = False
    candidate_pos = np.flatnonzero(in_cluster)
    if candidate_pos.size == 0:
        raise ValueError("No other songs found in the same cluster.")

    # Scale features with the pipeline's scaler. A plain array is passed
    # because the pipeline in this notebook is fitted on a NumPy array;
    # passing a DataFrame would trigger a feature-name mismatch warning.
    scaler = cl_pipe.named_steps["scale"]
    X_scaled = scaler.transform(spotify_df[audio_features].to_numpy())

    # Euclidean distance from the seed to every candidate, in one shot
    distances = np.linalg.norm(X_scaled[candidate_pos] - X_scaled[seed_pos], axis=1)

    # Stable sort keeps the original row order among equally distant songs
    nearest = candidate_pos[np.argsort(distances, kind="stable")[:top_n]]

    columns = ["track_name", "artist(s)_name", "cluster_label"] + audio_features
    return spotify_df.iloc[nearest][columns]

**Test Case: deja vu by Olivia Rodrigo**

In [None]:
seed_track, seed_artist = "deja vu", "Olivia Rodrigo"

# Fetch the five nearest songs from the seed's cluster
recs = recommend_similar_songs(
    spotify_df, audio_features, cl_pipe,
    seed_track, seed_artist=seed_artist, top_n=5,
)

print("Seed song:", seed_track, "-", seed_artist)
print("\nRecommended similar songs from the same cluster:")
display(recs)

**Test Case: Labyrinth by Taylor Swift**

In [None]:
seed_track, seed_artist = "Labyrinth", "Taylor Swift"

# Fetch the five nearest songs from the seed's cluster
recs = recommend_similar_songs(
    spotify_df, audio_features, cl_pipe,
    seed_track, seed_artist=seed_artist, top_n=5,
)

print("Seed song:", seed_track, "-", seed_artist)
print("\nRecommended similar songs from the same cluster:")
display(recs)

**Test Case: Butter by BTS**

In [None]:
seed_track, seed_artist = "Butter", "BTS"

# Fetch the five nearest songs from the seed's cluster
recs = recommend_similar_songs(
    spotify_df, audio_features, cl_pipe,
    seed_track, seed_artist=seed_artist, top_n=5,
)

print("Seed song:", seed_track, "-", seed_artist)
print("\nRecommended similar songs from the same cluster:")
display(recs)