In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Path: notebook.ipynb
RANDOM_SEED = 123

In [2]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,id,name,artist_id,artist,playlist_id,playlist,genres,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key,mode,time_signature
0,2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,711MCceyCBcFnzjGY4Q7Un,AC/DC,37i9dQZF1DWXRqgorJj26U,Rock Classics,"australian rock,hard rock,rock",0.574,0.913,-4.793,0.133,0.061,0.00158,0.156,0.423,115.728,208400,6,0,4
1,1UBQ5GK8JaQjm5VbkBZY66,Sharp Dressed Man - 2008 Remaster,2AM4ilv6UzW0uMRuqKtDgN,ZZ Top,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album rock,blues rock,classic rock,hard rock,rock",0.601,0.859,-5.263,0.0276,0.000359,0.00124,0.0871,0.446,125.145,258027,5,1,4
2,57JVGBtBLCfHw2muk5416J,Another One Bites The Dust - Remastered 2011,1dfeR4HaWDbWqFHLkxsg1d,Queen,37i9dQZF1DWXRqgorJj26U,Rock Classics,"classic rock,glam rock,rock",0.933,0.528,-6.472,0.162,0.112,0.329,0.163,0.756,109.975,214653,5,0,4
3,6pPWRBubXOBAHnjl5ZIujB,The Best,1zuJe6b1roixEKMOtyrEak,Tina Turner,37i9dQZF1DWXRqgorJj26U,Rock Classics,soft rock,0.662,0.737,-9.54,0.0296,0.108,0.0191,0.119,0.744,103.867,329800,0,1,4
4,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,0ECwFtbIWEVNwjlrfc6xoL,Eagles,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album rock,classic rock,heartland rock,mellow ...",0.579,0.508,-9.484,0.027,0.00574,0.000494,0.0575,0.609,147.125,391376,2,1,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2921 entries, 0 to 2920
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2921 non-null   object 
 1   name              2921 non-null   object 
 2   artist_id         2921 non-null   object 
 3   artist            2921 non-null   object 
 4   playlist_id       2921 non-null   object 
 5   playlist          2921 non-null   object 
 6   genres            2470 non-null   object 
 7   danceability      2921 non-null   float64
 8   energy            2921 non-null   float64
 9   loudness          2921 non-null   float64
 10  speechiness       2921 non-null   float64
 11  acousticness      2921 non-null   float64
 12  instrumentalness  2921 non-null   float64
 13  liveness          2921 non-null   float64
 14  valence           2921 non-null   float64
 15  tempo             2921 non-null   float64
 16  duration_ms       2921 non-null   int64  


<hr/>

## Data Preprocessing

In [4]:
numerical_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                  'duration_ms', 'time_signature']
categorical_cols = ['artist', 'name', 'id', 'release_date', 'playlist', 'genres']

### Duplicate Data

In [5]:
# see if duplicates exist

df[df['id'].duplicated()][['id', 'name', 'artist']].head(10)


Unnamed: 0,id,name,artist
739,05NYcsjJwOYq4jIiKPVj9p,Hard To Handle,The Black Crowes
742,0upLyFR8Rr52ZpMp5esQoq,You Really Got Me - 2015 Remaster,Van Halen
820,1Lo0QY9cvc8sUB2vnIOxDT,Fast Car,Luke Combs
854,7cA3PnUQDlUyBBwpRozQeo,One Man Band,Old Dominion
1000,3gdewACMIVMEWVbyb8O9sY,"Rocket Man (I Think It's Going To Be A Long, L...",Elton John
1002,6eN1f9KNmiWEhpE2RhQqB5,Paradise City,Guns N' Roses
1003,7e89621JPkKaeDSTQ3avtg,Sweet Home Alabama,Lynyrd Skynyrd
1006,0dOg1ySSI7NkpAe89Zo0b9,Born in the U.S.A.,Bruce Springsteen
1021,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,Eagles
1041,37ZJ0p5Jm13JPevGcx4SkF,Livin' On A Prayer,Bon Jovi


In [6]:
# drop duplicates
def drop_duplicates(df, column='id'):
    df = df.drop_duplicates(subset=[column])
    df = df.reset_index(drop=True)
    return df

In [7]:
df_dropped = drop_duplicates(df, 'id')
df_dropped[df_dropped['id'].duplicated()]

Unnamed: 0,id,name,artist_id,artist,playlist_id,playlist,genres,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key,mode,time_signature


### Genres preprocessing
* Converting genres string to list of genres
* Filling out missing values with its playlist's mode genres

In [8]:
# Preprocess genres from column seperated string to list (pd.Series)
df_null_ids = df[df['genres'].isnull()]['id']
df[df['genres'].isnull()][['id', 'name', 'artist', 'playlist', 'genres']].head()

Unnamed: 0,id,name,artist,playlist,genres
246,47kA4ykK0Rlwjf8oV2HMo7,amour,Jean-Michel Blais,Best Classical Music,
303,6a3LTwG4gwiC62EUNff5AD,Woman´s Prayer,G. I. Gurdjieff,Best Classical Music,
304,0scC8HqVaRe04wA0EeAZLp,Silver Light,Sebastian Rochford,Best Classical Music,
342,7AEf1Jc3htMKTWmFZOOGtR,Viola's Theme - Music For Theatre,Jacob Mühlrad,Best Classical Music,
362,3bagw7W1Q4nsecXdDV5CAx,LOST,Tony Ann,Best Classical Music,


In [9]:
def preprocess_genres(df):
    # converts flat string of genres to list of genres, and fills null values with mode of genres in the same playlist

    # each row is like: 'classic rock,rock,soft rock' seperated by comma, but each item could have whitespace
    # replace whitespace between words with underscore # classic rock -> classic_rock #ISSUE not generalizable
    # ~convert to list of strings~
    df['genres'] = df['genres'].str.replace(' ', '_')
    # commented out, no need to convert to list, TFIDF will need flat string
    # df['genres'] = df['genres'].str.split(',')

    # handle null genres by filling with mode of genres in the same playlist
    df['genres'] = df.groupby('playlist_id')['genres'].transform(lambda x: x.fillna(x.mode()[0]))

    return df

In [10]:
df_pp_genres = preprocess_genres(df)
df_pp_genres = df_pp_genres[['id', 'name', 'artist', 'playlist', 'genres']]

# print out previous null ids to see if they are filled
df_pp_genres[df_pp_genres['id'].isin(df_null_ids)]

Unnamed: 0,id,name,artist,playlist,genres
246,47kA4ykK0Rlwjf8oV2HMo7,amour,Jean-Michel Blais,Best Classical Music,"baroque,classical,early_music,german_baroque"
303,6a3LTwG4gwiC62EUNff5AD,Woman´s Prayer,G. I. Gurdjieff,Best Classical Music,"baroque,classical,early_music,german_baroque"
304,0scC8HqVaRe04wA0EeAZLp,Silver Light,Sebastian Rochford,Best Classical Music,"baroque,classical,early_music,german_baroque"
342,7AEf1Jc3htMKTWmFZOOGtR,Viola's Theme - Music For Theatre,Jacob Mühlrad,Best Classical Music,"baroque,classical,early_music,german_baroque"
362,3bagw7W1Q4nsecXdDV5CAx,LOST,Tony Ann,Best Classical Music,"baroque,classical,early_music,german_baroque"
...,...,...,...,...,...
2891,2SCM41M2pe6PF6eASyVWyy,City,Lo'fi Chield,phonk,drift_phonk
2896,29K9XCDx9IDpyGpKmoM5VJ,ERROR,ROMANTICA,phonk,drift_phonk
2897,6CN5O0edKbnXUJCNDIjZJy,ECHO OF TERROR,requi3m,phonk,drift_phonk
2901,7ANKWDStVvxzYfxLrtmKYw,Hurt,Maikubi,phonk,drift_phonk


### Normalization of numerical features

In [11]:
# Normalize numerical columns
def normalize(df, columns):
    # Normalize numerical columns to range [0, 1] using min-max normalization
    df_norm = df.copy()
    # pandas one-liner
    for column in columns:
        df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())
    return df_norm


normalize(df.head(), numerical_cols).head()

Unnamed: 0,id,name,artist_id,artist,playlist_id,playlist,genres,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key,mode,time_signature
0,2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,711MCceyCBcFnzjGY4Q7Un,AC/DC,37i9dQZF1DWXRqgorJj26U,Rock Classics,"australian_rock,hard_rock,rock",0.0,1.0,1.0,0.785185,0.543179,0.003306,0.933649,0.0,0.274192,0.0,1.0,0.0,
1,1UBQ5GK8JaQjm5VbkBZY66,Sharp Dressed Man - 2008 Remaster,2AM4ilv6UzW0uMRuqKtDgN,ZZ Top,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,blues_rock,classic_rock,hard_rock,rock",0.075209,0.866667,0.90099,0.004444,0.0,0.002271,0.280569,0.069069,0.491886,0.271221,0.833333,1.0,
2,57JVGBtBLCfHw2muk5416J,Another One Bites The Dust - Remastered 2011,1dfeR4HaWDbWqFHLkxsg1d,Queen,37i9dQZF1DWXRqgorJj26U,Rock Classics,"classic_rock,glam_rock,rock",1.0,0.049383,0.646303,1.0,1.0,1.0,1.0,1.0,0.141199,0.034174,0.833333,0.0,
3,6pPWRBubXOBAHnjl5ZIujB,The Best,1zuJe6b1roixEKMOtyrEak,Tina Turner,37i9dQZF1DWXRqgorJj26U,Rock Classics,soft_rock,0.245125,0.565432,0.0,0.019259,0.964171,0.056638,0.582938,0.963964,0.0,0.663475,0.0,1.0,
4,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,0ECwFtbIWEVNwjlrfc6xoL,Eagles,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,heartland_rock,mellow_...",0.013928,0.0,0.011797,0.0,0.048199,0.0,0.0,0.558559,1.0,1.0,0.333333,1.0,


### Apply Changes

In [12]:
# preprocessing pipeline
def preprocess(df):
    df = drop_duplicates(df, 'id')
    df = preprocess_genres(df)

    num_cols = df.select_dtypes(include=np.number).columns
    df = normalize(df, num_cols)

    return df

In [13]:
df = preprocess(df)
df.head()

Unnamed: 0,id,name,artist_id,artist,playlist_id,playlist,genres,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key,mode,time_signature
0,2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,711MCceyCBcFnzjGY4Q7Un,AC/DC,37i9dQZF1DWXRqgorJj26U,Rock Classics,"australian_rock,hard_rock,rock",0.569555,0.914769,0.900818,0.124377,0.061243,0.001598,0.137859,0.415679,0.431239,0.216632,0.545455,0.0,0.75
1,1UBQ5GK8JaQjm5VbkBZY66,Sharp Dressed Man - 2008 Remaster,2AM4ilv6UzW0uMRuqKtDgN,ZZ Top,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,blues_rock,classic_rock,hard_rock,rock",0.599821,0.860623,0.889924,0.004984,0.000358,0.001254,0.065682,0.439592,0.481695,0.279155,0.454545,1.0,0.75
2,57JVGBtBLCfHw2muk5416J,Another One Bites The Dust - Remastered 2011,1dfeR4HaWDbWqFHLkxsg1d,Queen,37i9dQZF1DWXRqgorJj26U,Rock Classics,"classic_rock,glam_rock,rock",0.971976,0.528724,0.861901,0.157227,0.112448,0.332659,0.145192,0.761905,0.400415,0.22451,0.454545,0.0,0.75
3,6pPWRBubXOBAHnjl5ZIujB,The Best,1zuJe6b1roixEKMOtyrEak,Tina Turner,37i9dQZF1DWXRqgorJj26U,Rock Classics,soft_rock,0.668199,0.738292,0.790789,0.00725,0.108432,0.019312,0.099099,0.749428,0.367688,0.36958,0.0,1.0,0.75
4,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,0ECwFtbIWEVNwjlrfc6xoL,Eagles,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,heartland_rock,mellow_...",0.57516,0.50867,0.792087,0.004304,0.005761,0.000499,0.034674,0.609066,0.599462,0.447158,0.181818,1.0,0.75


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2880 non-null   object 
 1   name              2880 non-null   object 
 2   artist_id         2880 non-null   object 
 3   artist            2880 non-null   object 
 4   playlist_id       2880 non-null   object 
 5   playlist          2880 non-null   object 
 6   genres            2880 non-null   object 
 7   danceability      2880 non-null   float64
 8   energy            2880 non-null   float64
 9   loudness          2880 non-null   float64
 10  speechiness       2880 non-null   float64
 11  acousticness      2880 non-null   float64
 12  instrumentalness  2880 non-null   float64
 13  liveness          2880 non-null   float64
 14  valence           2880 non-null   float64
 15  tempo             2880 non-null   float64
 16  duration_ms       2880 non-null   float64


<hr/>

## Feature Engineering

In [15]:
# one-hot encode
def onehotencode(df, column, prefix='', sep='_'):
    if prefix == '':
        prefix = column
    
    df_ohe = pd.get_dummies(df[column])
    df_ohe.columns = [prefix + sep + str(col) for col in df_ohe.columns]
    df_ohe.reset_index(drop = True, inplace = True) 
    return df_ohe
    
onehotencode(df.head(), 'artist').head()

Unnamed: 0,artist_AC/DC,artist_Eagles,artist_Queen,artist_Tina Turner,artist_ZZ Top
0,1,0,0,0,0
1,0,0,0,0,1
2,0,0,1,0,0
3,0,0,0,1,0
4,0,1,0,0,0


### TD-IDF (Term Frequency - Inverse Document Frequency) 
is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. The tf-idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general.

In [16]:
# # TF-IDF vectorizer from sklearn
# from sklearn.feature_extraction.text import TfidfVectorizer

# # tfidf vectorizer
# def tfidf(df, column, prefix='', sep='_'):
#     if prefix == '':
#         prefix = column
    
#     tfidf = TfidfVectorizer()
#     tfidf_matrix =  tfidf.fit_transform(df[column]) # reverse of preprocess_genres()
#     df_tfidf = pd.DataFrame(tfidf_matrix.toarray())
#     df_tfidf.columns = [prefix + sep + i for i in tfidf.get_feature_names_out()]
#     df_tfidf.reset_index(drop = True, inplace=True)
    
#     return df_tfidf


# # tfidf(df.head(), 'genres').head()
# df_test = pd.DataFrame({'genres': ['rock,pop', 'rock,metal', 'lofi,hiphop,chill', 'pop,chill']})
# tfidf(df_test, 'genres')

In [17]:
# implement TF-IDF from scratch
def tfidf(df, column, prefix='', sep='_'):
    if prefix == '':
        prefix = column

    # get all unique genres 
    genres = df[column].str.split(',').explode().unique()
    
    # get term frequencies = term count / len of each document
    # (since each genre is repeated max once per document, term counts = one hot encoded (get_dummies))
    doc_len = df[column].str.split(',').str.len().values[:,None] # get length of each document
    tf = df[column].str.get_dummies(sep=',') / doc_len

    # get counts of each genre in all documents
    counts = df[column].str.get_dummies(sep=',').sum(axis=0)

    # get inverse document frequency
    # log( total num of documents / num of documents with term t in it(=counts)))
    idf = np.log(len(df) / counts+1) # add 1 to denominator to avoid division by 0 
    idf = np.expand_dims(idf, axis=0)

    # get tf-idf
    tfidf = tf * idf

    # convert to dataframe
    df_tfidf = pd.DataFrame(tfidf)
    df_tfidf.columns = [prefix + sep + i for i in genres]
    df_tfidf.reset_index(drop = True, inplace=True)

    return df_tfidf

# df_test = pd.DataFrame({'genres': ['rock,pop', 'rock,metal', 'lofi,hiphop,chill', 'pop,chill']})
# tfidf(df_test, 'genres')
tfidf(df.head(), 'genres')


Unnamed: 0,genres_australian_rock,genres_hard_rock,genres_rock,genres_album_rock,genres_blues_rock,genres_classic_rock,genres_glam_rock,genres_soft_rock,genres_heartland_rock,genres_mellow_gold,genres_yacht_rock
0,0.0,0.597253,0.0,0.0,0.0,0.417588,0.0,0.0,0.27031,0.0,0.0
1,0.250553,0.0,0.358352,0.196166,0.0,0.250553,0.0,0.0,0.162186,0.0,0.0
2,0.0,0.0,0.0,0.326943,0.597253,0.0,0.0,0.0,0.27031,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.252763,0.0
4,0.178966,0.0,0.0,0.140118,0.0,0.0,0.255966,0.255966,0.115847,0.178966,0.255966


### Apply Changes

In [18]:
# Feature Extraction pipeline
def extract_features(df):
    df = pd.concat([df, onehotencode(df, 'artist')], axis=1)
    df = pd.concat([df, onehotencode(df, 'playlist')], axis=1)
    df = pd.concat([df, tfidf(df, 'genres')], axis=1)
    # get numerical columns, it also normalizes ohe and genres_tfidf columns

    return df


In [19]:
df = extract_features(df)
df.head()

Unnamed: 0,id,name,artist_id,artist,playlist_id,playlist,genres,danceability,energy,loudness,...,genres_aggressive_phonk,genres_chill_phonk,genres_memphis_phonk,genres_traprun,genres_aesthetic_rap,genres_funk_mtg,genres_sigilkore,genres_new_school_turkce_rap,genres_turkce_trap_metal,genres_filthstep
0,2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,711MCceyCBcFnzjGY4Q7Un,AC/DC,37i9dQZF1DWXRqgorJj26U,Rock Classics,"australian_rock,hard_rock,rock",0.569555,0.914769,0.900818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1UBQ5GK8JaQjm5VbkBZY66,Sharp Dressed Man - 2008 Remaster,2AM4ilv6UzW0uMRuqKtDgN,ZZ Top,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,blues_rock,classic_rock,hard_rock,rock",0.599821,0.860623,0.889924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57JVGBtBLCfHw2muk5416J,Another One Bites The Dust - Remastered 2011,1dfeR4HaWDbWqFHLkxsg1d,Queen,37i9dQZF1DWXRqgorJj26U,Rock Classics,"classic_rock,glam_rock,rock",0.971976,0.528724,0.861901,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6pPWRBubXOBAHnjl5ZIujB,The Best,1zuJe6b1roixEKMOtyrEak,Tina Turner,37i9dQZF1DWXRqgorJj26U,Rock Classics,soft_rock,0.668199,0.738292,0.790789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,0ECwFtbIWEVNwjlrfc6xoL,Eagles,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,heartland_rock,mellow_...",0.57516,0.50867,0.792087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.795923,0.0


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Columns: 2400 entries, id to genres_filthstep
dtypes: float64(709), object(7), uint8(1684)
memory usage: 20.4+ MB


In [21]:
df.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,genres_aggressive_phonk,genres_chill_phonk,genres_memphis_phonk,genres_traprun,genres_aesthetic_rap,genres_funk_mtg,genres_sigilkore,genres_new_school_turkce_rap,genres_turkce_trap_metal,genres_filthstep
count,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,...,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0,2880.0
mean,0.611432,0.531206,0.765121,0.060373,0.371035,0.430555,0.146069,0.425657,0.431519,0.235515,...,0.002766,0.000922,0.000722,0.043523,0.000722,0.002766,0.000461,0.000691,0.003458,0.000307
std,0.194208,0.258589,0.133763,0.083383,0.351631,0.418309,0.140045,0.284543,0.158091,0.129119,...,0.148436,0.049479,0.027376,0.308964,0.027376,0.148436,0.024739,0.037109,0.05729,0.016493
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.49333,0.345228,0.720748,0.017331,0.03047,8.6e-05,0.075005,0.175244,0.288139,0.139767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.648022,0.53073,0.79043,0.030245,0.258532,0.327604,0.091766,0.403722,0.454214,0.201415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.756754,0.7413,0.849211,0.06165,0.698794,0.883974,0.155667,0.650655,0.508407,0.293274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,7.965893,2.655298,1.039013,3.563478,1.039013,7.965893,1.327649,1.991473,1.114292,0.885099


### Full workflow

In [22]:
# def get_featureset_from_df(df):
#     pass
# def get_featureset_from_spotify_id():
#     pass

<hr/>

## Model Training


In [25]:
# Prepare dataset for Cossine Similarity
def split_metadata_features(df):
    # get numerical columns
    num_cols = df.select_dtypes(include=np.number).columns

    # get featureset
    featureset = pd.concat([df['id'], df[num_cols]], axis=1) # id could be useful
    metadata = df.drop(num_cols, axis=1) # metadata is everything else
    return metadata, featureset

df_metadata, df_featureset = split_metadata_features(df)
print(f"Metadata shape: {df_metadata.shape}")
print(f"Featureset shape: {df_featureset.shape}")

Metadata shape: (2880, 7)
Featureset shape: (2880, 2394)


In [84]:
# Cossine similarity
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_track_ids(df, track_id, n=10):
    # get metadata and featureset
    metadata, df = split_metadata_features(df)
    # get track features
    track_features = df[df['id'] == track_id].drop('id', axis=1)
    # get similarity scores

    ''' example
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    '''

    

    
    metadata['sim_score'] = cosine_similarity(track_features, df.drop('id', axis=1)).T


    # get top n similar tracks
    top_n = metadata.sort_values('sim_score', ascending=False).head(n+1)

    return top_n

# print(f"{cosine_similarity(df.iloc[0:1], df.iloc[1:2])}")

get_similar_track_ids(df, '1xsYj84j7hUDDnTTerGWlH')

Unnamed: 0,id,name,artist_id,artist,playlist_id,playlist,genres,sim_score
23,1xsYj84j7hUDDnTTerGWlH,Dream On,7Ey4PD4MYsKc5I2dolUwbH,Aerosmith,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",1.0
117,73TxYZd0lBCVRrHawrAglA,Sweet Emotion,7Ey4PD4MYsKc5I2dolUwbH,Aerosmith,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",0.941517
167,5FqYA8KfiwsQvyBI4IamnY,Jump - 2015 Remaster,2cnMpRsOVqtPMfq7YiFE6K,Van Halen,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",0.826846
15,2eDdFHgqNJltzlvlZFVDWd,Over the Hills and Far Away - Remaster,36QJpDe2go2KgaRleHCDTp,Led Zeppelin,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",0.82567
26,05RgAMGypEvqhNs5hPCbMS,Panama - 2015 Remaster,2cnMpRsOVqtPMfq7YiFE6K,Van Halen,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",0.808755
17,0hCB0YR03f6AmQaHbwWDe8,Whole Lotta Love - 1990 Remaster,36QJpDe2go2KgaRleHCDTp,Led Zeppelin,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",0.804321
69,78lgmZwycJ3nzsdgmPPGNx,Immigrant Song - Remaster,36QJpDe2go2KgaRleHCDTp,Led Zeppelin,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,classic_rock,hard_rock,rock",0.790975
1,1UBQ5GK8JaQjm5VbkBZY66,Sharp Dressed Man - 2008 Remaster,2AM4ilv6UzW0uMRuqKtDgN,ZZ Top,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,blues_rock,classic_rock,hard_rock,rock",0.775944
40,3qiyyUfYe7CRYLucrPmulD,Baba O'Riley,67ea9eGLXYMsO2eYQRui3w,The Who,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,british_invasion,classic_rock,hard_...",0.770514
97,0cJPLFrlV7TTCyPLupHzcH,Won't Get Fooled Again - Original Album Version,67ea9eGLXYMsO2eYQRui3w,The Who,37i9dQZF1DWXRqgorJj26U,Rock Classics,"album_rock,british_invasion,classic_rock,hard_...",0.764343


In [None]:
# pretty print track info
    # track_info = df[df['id'] == track_id][['id', 'name', 'artist', 'playlist', 'genres']]
    # print(f"Track info: {track_info}")
    # print(f"Top {n} similar tracks:")
    # for i in top_n:
    #     print(df.iloc[i][['id', 'name', 'artist', 'playlist', 'genres']])
    #     print(f"Similarity score: {similarity_scores[i]}")
    #     print()