Columns:

,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature

In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Load the track table and discard key, mode, time_signature, and duration_ms
# in a single expression.
song_data = pd.read_csv('data/song_data.csv').drop(
    columns=['key', 'mode', 'time_signature', 'duration_ms']
)

## Implement KNN

#### Feature scaling

In [13]:
# Numeric audio features that feed the distance computation
numerical_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]
X = song_data[numerical_features]

# Fit once on the full dataset and keep the fitted scaler around: the query
# cells further down call scaler.transform() on individual songs.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [14]:
# One-hot encode the genre column
genres = song_data[['genre']]

# The fitted encoder is kept so the query cells can encode songs with the
# same set of genre columns.
encoder = OneHotEncoder().fit(genres)
genres_encoded = encoder.transform(genres).toarray()

#### Combine genres and numerical features and fit

In [6]:
# Multiplier applied to the one-hot genre columns: the higher the weight,
# the more important the genre is in the recommendation.
genre_weight = 20

# Feature matrix = scaled numeric columns followed by the weighted genre columns
X_combined = np.hstack((X_scaled, genre_weight * genres_encoded))

knn = NearestNeighbors(metric='euclidean', n_neighbors=10)
knn.fit(X_combined)

## Song Query

In [7]:
# Title and artist of the track we want to find similar songs for
song_name, artist_name = 'What Once Was', "Her's"

In [8]:
# Find the song in the dataset (case-insensitive match on both title and artist)
song_query = song_data[(song_data['track_name'].str.lower() == song_name.lower()) & (song_data['artist_name'].str.lower() == artist_name.lower())]

if not song_query.empty:
    # Scale numerical features of the query with the scaler fitted on the full dataset
    song_numerical_features = scaler.transform(song_query[numerical_features])

    # Encode genre of the query with the encoder fitted on the full dataset
    song_genre_encoded = encoder.transform(song_query[['genre']]).toarray()

    # Combine scaled numerical features and encoded genre for the query.
    # The genre columns must be multiplied by the same genre_weight that was
    # used to build X_combined; a separate literal here would silently put the
    # query in a different feature space as soon as genre_weight is tuned.
    song_combined_features = np.hstack((song_numerical_features, song_genre_encoded * genre_weight))

    # Use KNN to find the nearest neighbors (one result row per matching query row)
    distances, indices = knn.kneighbors(song_combined_features)

    print('Closest songs to |', song_name, '-', artist_name, '| are...')
    # Show the five closest rows for the first matching query row.
    # NOTE(review): knn was fitted on every row of song_data, so the query
    # track's own row may be among the neighbours returned here.
    for index in indices[0][:5]:
        print('----------')
        print(song_data.iloc[index][['track_name', 'artist_name']])

else:
    print("No song found. Check the song and artist names.")

Closest songs to | What Once Was - Her's | are....
----------
track_name         Rosary
artist_name    Inner Wave
Name: 131030, dtype: object
----------
track_name              Lupa
artist_name    King Stingray
Name: 553167, dtype: object
----------
track_name     Stupid Decisions
artist_name              FIDLAR
Name: 180455, dtype: object
----------
track_name         2:00 AM
artist_name    Los Shadows
Name: 389334, dtype: object
----------
track_name     Follow The Sun
artist_name    Royal Republic
Name: 236101, dtype: object


## Playlist testing

In [25]:
# Test playlists, each entry a (track_name, artist_name) pair
playlist_test = [
    ('What Once Was', "Her's"),
    ('Lauren', 'Men I Trust'),
    ('Pink + White', 'Frank Ocean'),
    ('After The Earthquake', 'Alvvays'),
    ('North', 'Clairo'),
    ('Sunflower', 'Rex Orange County'),
    ('Wet Dream', 'Wet Leg'),
    ('Freaking Out the Neighborhood', 'Mac DeMarco'),
    ('Lovers Rock', 'TV Girl'),
    ('Right Side of My Neck', 'Faye Webster'),
    ('Summertime Magic', 'Childish Gambino'),
    ('Be Sweet', 'Japanese Breakfast'),
    ("Where U Goin' Tonight?", 'Mac Ayres'),
    ('Just A Stranger (feat. Steve Lacy)', 'Kali Uchis'),
    ('Useless', 'Omar Apollo'),
]


playlist_test2 = [
    ('Easy Lover', 'Philip Bailey'),
    ('505', 'Arctic Monkeys'),
    ('Electric Feel', 'MGMT'),
    ('Everywhere - 2018 Remaster', 'Fleetwood Mac'),
    ("Blue Monday '88 - 2015 Remaster", 'New Order'),
]

In [27]:

# Per-song feature blocks collected for every playlist entry found in song_data
playlist_numerical_features = []
playlist_genre_encoded = []

# The loop targets are deliberately NOT named song_name / artist_name: reusing
# those names would overwrite the single-song query configured earlier in the
# notebook, so re-running that cell afterwards would look up the last playlist
# track instead.
for track_title, track_artist in playlist_test:
    song_query = song_data[(song_data['track_name'].str.lower() == track_title.lower()) & (song_data['artist_name'].str.lower() == track_artist.lower())]

    if not song_query.empty:
        # NOTE(review): if a (title, artist) pair matches several rows, every
        # matching row is appended and therefore weighs into the average below.

        # Scale numerical features of the query
        song_numerical_features = scaler.transform(song_query[numerical_features])
        playlist_numerical_features.append(song_numerical_features)

        # Encode genre of the query
        song_genre_encoded = encoder.transform(song_query[['genre']]).toarray()
        playlist_genre_encoded.append(song_genre_encoded)
    else:
        print(f'No song found for {track_title} by {track_artist}. Check the song and artist names.')

# Aggregate features of the playlist
if playlist_numerical_features and playlist_genre_encoded:
    # Column-wise mean over all collected rows gives one vector for the playlist
    average_numerical_features = np.mean(np.vstack(playlist_numerical_features), axis=0)
    average_genre_encoded = np.mean(np.vstack(playlist_genre_encoded), axis=0)

    # Combine scaled numerical features and encoded genre for the playlist query.
    # Use the same genre_weight that built X_combined so the playlist vector
    # stays in the feature space the index was fitted on.
    playlist_combined_features = np.hstack((average_numerical_features, average_genre_encoded * genre_weight))

    # Use KNN to find the nearest neighbors to the playlist
    # (wrapped in a list because kneighbors expects a 2-D batch of queries)
    distances, indices = knn.kneighbors([playlist_combined_features])

    print('Closest songs to the playlist are...')
    for index in indices[0][:3]:
        print('----------')
        print(song_data.iloc[index][['track_name', 'artist_name']])
else:
    print('The playlist is empty or no songs were found.')


Closest songs to the playlist are...
----------
track_name     If You Didn't See Me (Then You Weren't on the ...
artist_name                                                JR JR
Name: 87707, dtype: object
----------
track_name     I Want It
artist_name     Two Feet
Name: 342922, dtype: object
----------
track_name     I'll Call You Mine
artist_name           girl in red
Name: 508586, dtype: object


## Reinforcement learning