In this first notebook, I use Spotify track data to find songs that are similar to a given song. Two songs are considered similar when their vector embeddings are close to each other. Each embedding is built from standardized (zero-mean, unit-variance) versions of audio attributes such as "acousticness", ..., "valence".

In [1]:
import numpy as np
import pandas as pd
import glob
import tqdm
from collections import OrderedDict
import langid # I want english songs

In [3]:
# Load the Spotify tracks table. The path contains no wildcard, so it is passed
# to read_csv directly: a missing file then raises FileNotFoundError instead of
# the confusing "missing argument" TypeError caused by unpacking an empty glob.
csv = pd.read_csv("public/data/tracks.csv")

In [4]:
def scale_0_1(df, col):
    """Standardize one column of `df` in place (z-score).

    Despite the name, this does not rescale to the [0, 1] interval: the column
    is shifted by its mean and divided by its standard deviation, so the
    result has mean 0 and standard deviation 1.

    Args:
        df: DataFrame that is modified in place.
        col: Label of the column to standardize.

    Returns:
        None.
    """
    column = df[col]
    centered = column - column.mean()
    df.loc[:, col] = centered / column.std()

In [5]:
# Audio features that make up each song's embedding vector.
attrs = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]
# Standardize every feature column in place (zero mean, unit standard deviation).
for attr in attrs:
    scale_0_1(csv, attr)
# Lower-case the titles, then drop rows whose feature vectors are duplicates.
csv.loc[:, "name"] = csv.loc[:, "name"].str.lower()
csv = csv.drop_duplicates(subset=attrs)

In [6]:
# langid.classify is a fast way to check the language of the song name
def check_en(name):
    """Return True when langid classifies `name` as English ("en").

    Args:
        name: Song title to classify.

    Returns:
        bool: True if the first element of langid.classify(name) is "en".
        False otherwise, including when classification raises an ordinary
        exception (for example on a missing / non-text title).
    """
    try:
        # The first element of classify()'s result is the language code.
        return langid.classify(name)[0] == "en"
    except Exception:
        # Best-effort filter: an unclassifiable title is dropped rather than
        # aborting the run. KeyboardInterrupt and SystemExit are deliberately
        # not caught, so a long-running apply can still be interrupted.
        return False

# Register progress_apply on pandas objects so the slow language check shows
# a progress bar.
tqdm.tqdm.pandas()
# Keep only the rows whose title is classified as English.
is_english = csv["name"].progress_apply(check_en)
csv = csv[is_english]

100%|██████████| 566113/566113 [07:06<00:00, 1325.91it/s]


In [7]:
# Find every track whose (lower-cased) title contains the query text.
song_name = "i'll do anything"
# regex=False matches the query literally, so titles containing characters such
# as "(" or "?" can be searched; na=False treats missing titles as non-matches
# instead of producing a NaN mask that cannot be used for indexing.
name_matches = csv["name"].str.contains(song_name, regex=False, na=False)
rows = csv.loc[name_matches, ["name", "id", "artists"] + attrs]
# Position (0-based) of the match used as the query song; 2 is the third row
# of the table displayed below.
song_position = 2
vec = rows[attrs].iloc[song_position].to_numpy()
rows

Unnamed: 0,name,id,artists,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
56455,i'll do anything for you,3CuKcfH8BJ5The5G3llaKD,['Denroy Morgan'],-0.730608,1.013868,1.091461,-0.425109,-0.713601,0.714253,1.006913,-0.110027,0.976856
458469,i'll do anything,2mmqUU64P1BhJNvH1jtofT,['D-Train'],-1.164622,1.519579,0.075277,-0.425099,-0.817765,0.149738,-0.155446,0.047277,0.775049
555315,i'll do anything,0hK8yn7I0oqrqXljVVGPla,['Jason Mraz'],-0.948475,0.038568,0.670697,-0.42512,0.749028,1.188775,-0.193246,1.500604,0.375314


In [61]:
# Feature columns used for the t-SNE projection (loudness and tempo are left out).
tsne_columns = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "valence",
    "liveness",
    "speechiness",
]
csv2 = csv[tsne_columns]

In [62]:
from sklearn.manifold import TSNE

In [83]:
# (rows, columns) of the t-SNE input; read straight from the frame so no
# intermediate array has to be built.
csv2.shape

(215035, 7)

In [85]:
# Project the selected attributes to 2-D with t-SNE. t-SNE is stochastic, so a
# fixed random_state keeps the layout (and anything derived from it, such as
# clusters) reproducible across kernel restarts.
X_embedded = TSNE(n_components=2, verbose=1, n_jobs=8, random_state=42).fit_transform(csv2.to_numpy())

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 215035 samples in 0.378s...
[t-SNE] Computed neighbors for 215035 samples in 41.898s...
[t-SNE] Computed conditional probabilities for sample 1000 / 215035
[t-SNE] Computed conditional probabilities for sample 2000 / 215035
[t-SNE] Computed conditional probabilities for sample 3000 / 215035
[t-SNE] Computed conditional probabilities for sample 4000 / 215035
[t-SNE] Computed conditional probabilities for sample 5000 / 215035
[t-SNE] Computed conditional probabilities for sample 6000 / 215035
[t-SNE] Computed conditional probabilities for sample 7000 / 215035
[t-SNE] Computed conditional probabilities for sample 8000 / 215035
[t-SNE] Computed conditional probabilities for sample 9000 / 215035
[t-SNE] Computed conditional probabilities for sample 10000 / 215035
[t-SNE] Computed conditional probabilities for sample 11000 / 215035
[t-SNE] Computed conditional probabilities for sample 12000 / 215035
[t-SNE] Computed conditional probab

[t-SNE] Computed conditional probabilities for sample 121000 / 215035
[t-SNE] Computed conditional probabilities for sample 122000 / 215035
[t-SNE] Computed conditional probabilities for sample 123000 / 215035
[t-SNE] Computed conditional probabilities for sample 124000 / 215035
[t-SNE] Computed conditional probabilities for sample 125000 / 215035
[t-SNE] Computed conditional probabilities for sample 126000 / 215035
[t-SNE] Computed conditional probabilities for sample 127000 / 215035
[t-SNE] Computed conditional probabilities for sample 128000 / 215035
[t-SNE] Computed conditional probabilities for sample 129000 / 215035
[t-SNE] Computed conditional probabilities for sample 130000 / 215035
[t-SNE] Computed conditional probabilities for sample 131000 / 215035
[t-SNE] Computed conditional probabilities for sample 132000 / 215035
[t-SNE] Computed conditional probabilities for sample 133000 / 215035
[t-SNE] Computed conditional probabilities for sample 134000 / 215035
[t-SNE] Computed con

In [86]:
# Sanity check: the embedding should have one row per input row and two columns.
X_embedded.shape

(215035, 2)

In [96]:
# Work on an independent copy: plain assignment would only alias `csv`, so the
# coordinate columns added to csv3 would also be written into the filtered
# `csv` frame (and trigger pandas' SettingWithCopyWarning).
csv3 = csv.copy()

In [88]:
# Store the two t-SNE coordinates as plain-float columns 'x' and 'y'.
for axis, label in enumerate(('x', 'y')):
    csv3[label] = X_embedded[:, axis].tolist()

In [108]:
# Keep only tracks with popularity above 60. .copy() makes csv4 an independent
# frame, so adding columns to it later neither raises SettingWithCopyWarning
# nor depends on pandas' view-versus-copy behaviour.
csv4 = csv3[csv3['popularity'] > 60].copy()

In [109]:
# Display the popular-track subset.
csv4

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,x,y
8095,1LGqJ3nvxpVXDWpEzq4DJD,all of me,65,181440,0,['Billie Holiday'],['1YzCsTRb22dQkh9lghPIrp'],1933,-0.358777,-1.895961,...,0,-0.356121,1.496796,-0.425103,-0.216655,-0.579392,-0.385392,5,29.339767,-24.890440
18549,0qB213IfGN0JXXm9aRjldF,again (with the mellomen),61,167907,0,"['Doris Day', 'The Mellomen']","['3ESG6pj6a0LvUKklENalT6', '548L4DXlt7N14Mhbfd...",1945,-0.412960,-2.068633,...,1,-0.389474,1.522596,-0.425115,-0.752120,-1.580669,-1.877323,4,25.575068,-28.437536
21549,6yhLR2sVn1IfsScVrR4ocr,"whatever will be, will be (que sera, sera) (wi...",62,123360,0,"['Doris Day', 'Frank DeVol & His Orchestra']","['3ESG6pj6a0LvUKklENalT6', '2N0Yx2ISFxGV0yt7xf...",1948,-0.888570,-0.798006,...,1,-0.365571,1.187195,-0.425120,2.767196,0.868191,1.974329,4,-19.093706,-28.802057
28498,6crfO56bDm0RjpctUuGs5X,i'm in the mood for love,65,149827,0,['Julie London'],['3qUMmh5biaB5hqpF4LqS3m'],1955-12-01,0.098771,-1.935258,...,0,-0.319988,1.470996,-0.424127,-0.636021,-1.456480,0.163826,4,24.642628,-25.393911
28499,1uRKT2LRANv4baowBWHfDS,(we're gonna) rock around the clock,64,129893,0,['Bill Haley & His Comets'],['3MFp4cYuYtTZe3d3xkLLbr'],1955-12-19,1.489478,1.258178,...,1,0.350967,-0.701941,-0.425106,-0.747780,0.899238,-0.933301,4,20.308483,26.988455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586657,1dKxf4Ht2SsKLyXfSDJAgy,the cutest puppy,67,82500,0,['Laureen Conrad'],['7vgGpuiXdNlCmc994PlMlz'],2020-10-30,0.273362,-2.083320,...,1,0.073024,1.565596,3.220879,-0.574717,1.310616,-1.674697,4,-39.589260,11.519838
586661,27kcZEJvhkb1rzZS9gCpdA,remember the mornings,67,202355,0,['Clinton Kane'],['7okSU80WTrn4LXlyXYbX3P'],2020-11-27,0.158975,-0.393120,...,1,-0.181572,0.481994,-0.425120,-0.563866,-0.816128,1.040923,4,-32.162075,3.090466
586665,0SjsIzJkZfDU7wlcdklEFR,john brown's song,66,185250,0,['Gregory Oberle'],['4MxqhahGRT4BPz1PilXGeu'],2020-03-20,-0.009595,-2.020205,...,1,-0.010359,1.565596,3.175913,-0.558441,-0.645368,-1.840097,3,-9.670913,42.119564
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,1.212540,0.480163,...,1,-0.220484,-0.885408,-0.424007,-0.659350,0.518908,1.062560,4,25.317987,13.329927


In [89]:
# Range of the popularity score, computed from its distinct values.
popularity_values = csv3['popularity'].unique()
min(popularity_values), max(popularity_values)

(0, 99)

In [110]:
import scipy.cluster.hierarchy as hcluster

In [116]:
# Cluster the 2-D t-SNE points hierarchically; flat clusters are formed with
# the "distance" criterion at a threshold of 0.5.
points_2d = csv4[['x', 'y']].to_numpy()
clusters = hcluster.fclusterdata(points_2d, t=0.5, criterion="distance")

In [117]:
# List the distinct cluster labels that were assigned.
np.unique(clusters)

array([   1,    2,    3, ..., 2199, 2200, 2201], dtype=int32)

In [118]:
# assign() returns a new frame carrying the cluster label column 'c'. Unlike
# item assignment on a frame that was produced by boolean filtering, it cannot
# trigger pandas' SettingWithCopyWarning.
csv4 = csv4.assign(c=clusters.tolist())

In [119]:
# Write the clustered popular-track table to disk.
output_path = "public/tracks13k.csv"
csv4.to_csv(output_path)

In [105]:
# Inspect the column labels of the popular-track frame.
csv4.columns

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'x', 'y'],
      dtype='object')