In [18]:
import pandas as pd

# Read the Spotify track table from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="spotify.csv")

# Show the first five rows as a quick sanity check of the columns
df.head(n=5)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [None]:
# We remove unnecessary metadata (e.g., key, mode) and keep only the relevant
# musical/audio traits + song title and artist.
df = (
    df[[
        'track_name', 'artist_name', 'acousticness', 'danceability',
        'duration_ms', 'energy', 'instrumentalness', 'liveness',
        'loudness', 'speechiness', 'tempo', 'valence', 'popularity'
    ]]
    # Drop duplicates: keep the first row seen for each track title.
    # NOTE(review): de-duplicating on the title alone also drops different
    # songs by other artists that happen to share a title - confirm intended.
    .drop_duplicates(subset='track_name')
    # Chained (non-inplace) calls plus an explicit copy give `df` its own
    # data, instead of mutating a column-subset of the loaded frame in place
    # (the pattern pandas can flag with SettingWithCopyWarning).
    .copy()
)

In [20]:
# Handling missing values
# Count the nulls per column *before* dropping anything, so the report shown
# below describes what is about to be removed.
missing_counts = df.isnull().sum()

# Drop incomplete rows, then renumber the index 0..n-1 so that row labels
# match row positions in any array derived from `df`.
df = df.dropna().reset_index(drop=True)

# Last expression of the cell: the notebook renders the per-column null counts
# (previously this value was computed mid-cell and silently discarded).
missing_counts

In [None]:
# Normalization (standardization) rescales every feature to mean 0 and standard deviation 1, so that no single feature dominates the similarity score.
from sklearn.preprocessing import StandardScaler
# Numeric columns fed to the scaler and the similarity search.
# Scaling them means each one contributes equally to the similarity calculation.
features = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'popularity',
]

In [None]:
from sklearn.neighbors import NearestNeighbors

# Standardise the selected feature columns; the fitted scaler is kept in
# `scaler` and the transformed matrix in `scaled_features`.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.loc[:, features])

# Build the nearest-neighbour index over the scaled rows using cosine distance.
# Neighbours are looked up per query, so no full pairwise similarity matrix
# has to be held in memory.
# n_neighbors=6 is only the model default; recommend() passes its own count.
nn = NearestNeighbors(metric='cosine', n_neighbors=6).fit(scaled_features)

In [None]:
def recommend(song_name, n=5):
    """Recommend up to ``n`` tracks most similar to ``song_name``.

    Similarity is the cosine similarity between rows of the module-level
    ``scaled_features`` matrix, queried through the fitted ``nn`` model.

    Parameters
    ----------
    song_name : str
        Track title to look up; compared case-insensitively against
        ``df['track_name']``. The first matching row is used as the query.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``track_name``, ``artist_name`` and
        ``similarity_score`` (1 - cosine distance, rounded to 2 decimals),
        ordered from most to least similar. If the title is not present, the
        message string ``"❌ Song not found in dataset."`` is returned instead.

    Raises
    ------
    RuntimeError
        If ``df`` and ``scaled_features`` have different row counts, i.e. the
        notebook cells were run out of order and the matrix is stale.
    """
    song_name = song_name.lower()

    # Work with row *positions*, not index labels: scaled_features is a plain
    # array, so a label is only a valid offset while df has a fresh RangeIndex.
    is_match = (df['track_name'].str.lower() == song_name).to_numpy()
    match_positions = is_match.nonzero()[0]

    if match_positions.size == 0:
        return "❌ Song not found in dataset."

    n_rows = scaled_features.shape[0]
    if n_rows != len(df):
        # Guard against hidden notebook state: positions in df would not line
        # up with rows of the matrix, giving silently wrong recommendations.
        raise RuntimeError(
            "df and scaled_features are out of sync; "
            "re-run the scaling and NearestNeighbors cells."
        )

    query_pos = int(match_positions[0])
    song_vector = scaled_features[query_pos].reshape(1, -1)

    # Ask for one extra neighbour to leave room for the query song itself,
    # but never more than the number of rows available.
    k = min(n + 1, n_rows)
    distances, indices = nn.kneighbors(song_vector, n_neighbors=k)

    # Drop the query song by position rather than assuming it is neighbour 0:
    # rows with identical feature vectors tie at distance 0 in arbitrary order.
    keep = indices[0] != query_pos
    similar_indices = indices[0][keep][:n]
    similarity = 1 - distances[0][keep][:n]  # convert distance to similarity

    # Create result table
    result = df.iloc[similar_indices][['track_name', 'artist_name']].copy()
    result["similarity_score"] = similarity.round(2)

    return result.reset_index(drop=True)

In [24]:
# Example query: the five tracks closest to "Shape of You"
recommend(song_name="Shape of You", n=5)

Unnamed: 0,track_name,artist_name,similarity_score
0,New Moon,Schranzen Danzen,0.97
1,Angel Smile,Ando Rio,0.95
2,La Comparsa,Diego Pinera,0.95
3,Soul & Coffee,Cafe Music BGM channel,0.95
4,Mission Creep,Bambi Cat,0.95
