In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [4]:
# Load the Spotify dataset
df = pd.read_csv("spotify-2023.csv", encoding="latin1")

df.shape, df.head()

((953, 24),
                             track_name    artist(s)_name  artist_count  \
 0  Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook             2   
 1                                 LALA       Myke Towers             1   
 2                              vampire    Olivia Rodrigo             1   
 3                         Cruel Summer      Taylor Swift             1   
 4                       WHERE SHE GOES         Bad Bunny             1   
 
    released_year  released_month  released_day  in_spotify_playlists  \
 0           2023               7            14                   553   
 1           2023               3            23                  1474   
 2           2023               6            30                  1397   
 3           2019               8            23                  7858   
 4           2023               5            18                  3133   
 
    in_spotify_charts    streams  in_apple_playlists  ...  bpm key   mode  \
 0                1

In [8]:
   FEATURES = [
    "artist_count",
    "released_year",
    "released_month",
    "released_day",
    "in_spotify_playlists",
    "in_spotify_charts",
    "streams",
    "in_apple_playlists",
    "in_apple_charts",
    "in_deezer_playlists",
    "in_deezer_charts",
    "in_shazam_charts",
    "bpm",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%"
]

# sanity check: all features must exist in the dataframe
missing = [c for c in FEATURES if c not in df.columns]
missing

[]

In [9]:
# Convert selected columns to numeric (coerce bad values to NaN)
df[FEATURES] = df[FEATURES].apply(pd.to_numeric, errors="coerce")

# Drop rows with missing values in any of the feature columns
df_clean = df.dropna(subset=FEATURES).reset_index(drop=True)

df_clean.shape

(829, 24)

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_clean[FEATURES])

X_scaled.shape

(829, 20)

In [12]:
K = 10  # number of neighbors to retrieve for recommendations

knn_model = NearestNeighbors(n_neighbors=K, metric="euclidean")
knn_model.fit(X_scaled)

In [16]:
def recommend(song_name: str, k: int = 10):
    # case-insensitive match on track_name
    matches = df_clean[df_clean["track_name"].str.lower() == song_name.lower()]
    
    if matches.empty:
        print(f"No song named '{song_name}' found in the dataset.")
        return None
    
    # use the first match if multiple rows share the same title
    song = matches.iloc[0]
    
    print(f"Query: {song['track_name']} — {song['artist(s)_name']}")
    
    # build and scale feature vector for the query song
    song_vec = pd.DataFrame([song[FEATURES]], columns=FEATURES)
    song_vec_scaled = scaler.transform(song_vec)

    # find nearest neighbors (include the song itself, then skip it)
    distances, indices = knn_model.kneighbors(song_vec_scaled, n_neighbors=k + 1)
    
    rec_indices = indices[0][1:]        # skip index 0 (the song itself)
    rec_distances = distances[0][1:]
    
    recs = df_clean.iloc[rec_indices].copy()
    recs["distance"] = rec_distances
    
    return recs[["track_name", "artist(s)_name", "distance"]].reset_index(drop=True)


In [18]:
recommend("vampire")

Query: vampire — Olivia Rodrigo


Unnamed: 0,track_name,artist(s)_name,distance
0,fukumean,Gunna,3.560654
1,Tattoo,Loreen,4.601381
2,Cruel Summer,Taylor Swift,5.068201
3,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",5.33683
4,Sprinter,"Dave, Central Cee",5.460017
5,Baby Don't Hurt Me,"David Guetta, Anne-Marie, Coi Leray",5.935335
6,Miracle (with Ellie Goulding),"Calvin Harris, Ellie Goulding",6.544017
7,Eyes Closed,Ed Sheeran,6.929642
8,Moonlight,Kali Uchis,7.019079
9,LALA,Myke Towers,7.06199
