Columns:

,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature

In [26]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder

In [27]:
# Load the track table and discard the columns this notebook does not use.
unused_columns = ['key', 'mode', 'time_signature', 'duration_ms']
song_data = pd.read_csv('data/song_data.csv').drop(columns=unused_columns)

In [28]:
# Report how many distinct genre labels the dataset contains
# (this prints the count, not the labels themselves).
genre_count = len(song_data['genre'].unique())
print(genre_count)

82


In [29]:
# Optional switch: when enabled, every later cell works on a single artist's
# tracks only. The comparison is an exact, case-sensitive string match.
only_artist = False

if only_artist:
    artist = 'BABYMETAL'
    is_selected_artist = song_data['artist_name'] == artist
    song_data = song_data[is_selected_artist]

Implement KNN

In [30]:
# Standardise the audio features with StandardScaler. The column order defined
# here is the order the fitted scaler expects whenever it is reused later.
numerical_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]
scaler = StandardScaler()
X = song_data.loc[:, numerical_features]
X_scaled = scaler.fit_transform(X)

In [31]:
# One-hot encode the genre label: one indicator column per distinct genre.
# The encoder returns a sparse matrix, which is densified so it can be
# stacked next to the scaled numeric features.
encoder = OneHotEncoder()
genres = song_data[['genre']]
genres_sparse = encoder.fit_transform(genres)
genres_encoded = genres_sparse.toarray()

In [32]:
# Build the search space and fit the nearest-neighbour index.
# Each row is the scaled audio features followed by the one-hot genre columns.
# The genre columns are multiplied by a fixed weight -- presumably so that a
# genre mismatch outweighs differences in the audio features (TODO confirm).
# Any query vector must be built with the same multiplier.
GENRE_WEIGHT = 20
weighted_genres = genres_encoded * GENRE_WEIGHT
X_combined = np.hstack((X_scaled, weighted_genres))

knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
knn.fit(X_combined)

Use KNN to find the closest songs

In [33]:
# Find the songs closest to a given song.
# Requires `scaler`, `encoder` and `knn` to be fitted on the current `song_data`.
song_name = 'Nightcall'
artist_name = 'Kavinsky'
n_recommendations = 5
# Must equal the multiplier applied to the one-hot genre columns when the
# matrix passed to knn.fit() was built.
genre_weight = 20

# Case-insensitive lookup of the query track in the dataset
is_query = (
    (song_data['track_name'].str.lower() == song_name.lower())
    & (song_data['artist_name'].str.lower() == artist_name.lower())
)
song_query = song_data[is_query]

if not song_query.empty:
    # Row positions (not index labels) of every copy of the query track.
    # kneighbors() returns row positions too, so these can be excluded
    # explicitly instead of assuming the query is always neighbour 0 --
    # with duplicate rows or zero-distance ties that assumption fails.
    own_positions = set(np.flatnonzero(is_query.to_numpy()).tolist())

    # If several rows match, the first one is used as the query point
    query_row = song_query.iloc[[0]]

    # Build the query vector exactly like the rows the index was fitted on:
    # scaled numeric features followed by the weighted one-hot genre.
    song_numerical_features = scaler.transform(query_row[numerical_features])
    song_genre_encoded = encoder.transform(query_row[['genre']]).toarray()
    song_combined_features = np.hstack(
        (song_numerical_features, song_genre_encoded * genre_weight)
    )

    # Ask for enough neighbours to still have n_recommendations left after
    # dropping the query's own rows, but never more than the index holds
    # (kneighbors raises if n_neighbors exceeds the number of fitted samples).
    n_requested = min(n_recommendations + len(own_positions), knn.n_samples_fit_)
    distances, indices = knn.kneighbors(song_combined_features, n_neighbors=n_requested)

    neighbour_positions = [
        int(position) for position in indices[0] if int(position) not in own_positions
    ][:n_recommendations]

    print('Closest songs to |', song_name, '-', artist_name, '| are....')
    for index in neighbour_positions:
        print('----------')
        print(song_data.iloc[index][['track_name', 'artist_name']])
else:
    print("No song found. Please check the song and artist names.")

Closest songs to | Nightcall - Kavinsky | are....
----------
track_name     Walking in My Shoes - 2006 Remaster
artist_name                           Depeche Mode
Name: 890168, dtype: object
----------
track_name     Bulletproof Picasso
artist_name                  Train
Name: 120025, dtype: object
----------
track_name        Kingdom
artist_name    Dave Gahan
Name: 936050, dtype: object
----------
track_name     Ultimate - From "Freaky Friday"/Soundtrack Ver...
artist_name                                        Lindsay Lohan
Name: 759151, dtype: object
----------
track_name     Underneath This Smile
artist_name              Hilary Duff
Name: 801990, dtype: object
