In this first notebook, I use Spotify data to find songs that are similar to a given song. Similar songs are simply those whose vector embeddings are close to the embedding of the given song. I construct each embedding from (standardized versions of) audio attributes such as "acousticness", ..., "valence".

In [1]:
import numpy as np
import pandas as pd
import glob
import tqdm
from collections import OrderedDict
import langid # I want english songs

In [3]:
# Load the Spotify tracks table. The path contains no wildcard, so it is read
# directly: unpacking glob results would call read_csv() with no arguments when
# the file is missing (an opaque TypeError) instead of raising FileNotFoundError.
csv = pd.read_csv("public/data/tracks.csv")

In [4]:
def scale_0_1(df, col):
    """Standardize ``df[col]`` in place to zero mean and unit standard deviation.

    Despite the name, the result is a z-score (mean 0, std 1), not a value
    confined to the interval [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    col : str
        Label of the numeric column to standardize.

    Returns
    -------
    None
    """
    centered = df[col] - df[col].mean()
    spread = df[col].std()
    # A constant or single-row column has a sample std of 0 or NaN; dividing by
    # it would turn the whole column into NaN. In that case keep the centered
    # values, which are all zero.
    if spread > 0:
        centered = centered / spread
    # Plain column assignment replaces the column, so integer inputs become
    # float without the dtype-upcast warning that recent pandas versions emit
    # for ``df.loc[:, col] = ...``.
    df[col] = centered

In [5]:
# Audio features that make up each song's embedding vector.
attrs = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]

# Rescale every feature to be N(0, 1).
for attr in attrs:
    scale_0_1(csv, attr)

# Normalize titles to lower case (the name search further down uses a
# lower-case query), then keep one row per distinct feature vector.
csv["name"] = csv["name"].str.lower()
csv = csv.drop_duplicates(subset=attrs)

In [6]:
# langid.classify is a fast way to check the language of the song name
def check_en(name):
    """Return True when ``langid`` classifies ``name`` as English.

    Parameters
    ----------
    name : str
        Song title to classify. Non-text values (e.g. NaN or None for a
        missing title) are reported as not English.

    Returns
    -------
    bool
        True if the top language returned by ``langid.classify`` is "en",
        False otherwise, including when classification fails.
    """
    # Missing titles cannot be classified; answer without calling langid.
    if not isinstance(name, (str, bytes)):
        return False
    try:
        return langid.classify(name)[0] == "en"
    except Exception:
        # Best effort: a title langid cannot process counts as non-English.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt stop the long-running apply over all titles.
        return False

# Register `progress_apply` on pandas objects so the slow per-title language
# check reports its progress.
tqdm.tqdm.pandas()
# Keep only the rows whose title is classified as English.
csv = csv.loc[csv["name"].progress_apply(check_en)]

100%|██████████| 566113/566113 [07:06<00:00, 1325.91it/s]


In [7]:
song_name = "i'll do anything"
# Match the title as a literal substring: with the default regex=True, a query
# containing characters such as "(" or "?" would be parsed as a pattern.
# na=False makes rows with a missing title count as non-matching.
name_matches = csv["name"].str.contains(song_name, regex=False, na=False)
# Select rows and columns in a single .loc call instead of chained indexing.
rows = csv.loc[name_matches, ["name", "id", "artists"] + attrs]
# Position 2 picks the third matching row of `rows`; change it to use a
# different version of the song as the query vector.
vec = rows[attrs].iloc[2].to_numpy()
rows

Unnamed: 0,name,id,artists,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
56455,i'll do anything for you,3CuKcfH8BJ5The5G3llaKD,['Denroy Morgan'],-0.730608,1.013868,1.091461,-0.425109,-0.713601,0.714253,1.006913,-0.110027,0.976856
458469,i'll do anything,2mmqUU64P1BhJNvH1jtofT,['D-Train'],-1.164622,1.519579,0.075277,-0.425099,-0.817765,0.149738,-0.155446,0.047277,0.775049
555315,i'll do anything,0hK8yn7I0oqrqXljVVGPla,['Jason Mraz'],-0.948475,0.038568,0.670697,-0.42512,0.749028,1.188775,-0.193246,1.500604,0.375314


In [17]:
# Feature subset passed to t-SNE below ("loudness" and "tempo" are not included).
csv2 = csv.loc[
    :,
    [
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "valence",
        "liveness",
        "speechiness",
    ],
]

In [18]:
from sklearn.manifold import TSNE

In [19]:
# (rows, columns) of the feature matrix.
csv2.shape

(215035, 7)

In [20]:
# Project the first 10,000 songs into 3-D with t-SNE. t-SNE is stochastic, so a
# fixed random_state keeps the embedding identical across kernel restarts.
X_embedded = TSNE(n_components=3, verbose=1, n_jobs=8, random_state=0).fit_transform(
    csv2.iloc[:10000].to_numpy()
)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.013s...
[t-SNE] Computed neighbors for 10000 samples in 0.375s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.303204
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.139069
[t-SNE] KL divergence after 1000 iterations: 1.429928


In [21]:
# Sanity check: one row per embedded song, one column per t-SNE component.
X_embedded.shape

(10000, 3)

In [22]:
# Take an independent copy of the first 10,000 rows (the same rows that were
# embedded), so that adding columns to it later does not write into a slice of
# `csv` and trigger SettingWithCopyWarning.
csv3 = csv.iloc[:10000].copy()

In [26]:
# Attach the three t-SNE coordinates as columns. `assign` builds a new frame,
# so this works without a SettingWithCopyWarning even when `csv3` is a slice of
# another frame.
csv3 = csv3.assign(
    x=X_embedded[:, 0].tolist(),
    y=X_embedded[:, 1].tolist(),
    z=X_embedded[:, 2].tolist(),
)

In [27]:
# Inspect the 10,000-song sample together with its x/y/z t-SNE coordinates.
csv3

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,x,y,z
4,08y9GfoqCWfOGsKdwojr5e,lady of the evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,-0.972855,-1.524419,...,-0.366127,1.545529,0.062013,0.526595,-1.382742,-0.512189,4,7.392858,-3.106366,-14.064507
5,0BRXJHRNGQ3W4v9frnSfhu,ave maria,0,178933,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,-2.026420,-1.115563,...,-0.370574,1.559862,0.500432,-0.630596,-1.934221,0.014317,4,12.618358,-5.836942,-0.099653
6,0Dd9ImXtAtGwsmsAD69KZT,la butte rouge,0,134467,0,['Francis Marty'],['2nuMRGzeJ5jJEKlfS7rZ0W'],1922,-0.322655,-0.742433,...,0.106377,1.476729,-0.425120,-0.319733,0.678026,-1.099003,5,-1.800437,4.847038,-6.792212
7,0IA0Hju8CAgYfV1hwhidBH,la java,0,161427,0,['Mistinguett'],['4AxgXfD7ISvJSTObqm4aIE'],1922,-0.003575,-1.421212,...,-0.298308,1.556996,-0.425062,0.602548,0.394719,0.491301,3,-3.045279,2.480563,-13.148711
8,0IgI1UCz84pYeVetnl1lGP,old fashioned girl,0,310073,0,['Greg Fieler'],['5nWlsH5RDgFuRAiDeOFVmf'],1922,-0.455103,-0.266097,...,-0.361124,0.487728,-0.400950,-0.580142,-0.032182,0.721915,4,1.231329,-3.237891,-2.544635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28415,3P8QzQgt9SAEyLDWJslqAD,the drummer and the cook,5,126710,0,['Harry Belafonte'],['6Tw1ktF4xMmzaLLbe98I2z'],1954-01-23,0.279383,-0.512204,...,0.034112,0.952128,-0.425120,-0.504189,1.128212,0.516903,4,-9.099499,5.055138,-0.166650
28416,3Q8KLS8WsQyj5pN1UAIqk2,when you're in love - (demo recording),6,134600,0,['Stanley Donen'],['1TZU00O7kZJBCxXNO1HSfl'],1954-07-15,-0.190207,-1.920571,...,-0.362235,1.539796,-0.425102,-0.363135,-0.222347,0.416379,4,6.409470,11.985744,-3.598875
28417,3QLlX5dkrzo7Q8FDWjPy3c,you're my greatest love,5,163907,0,['Jackie Gleason'],['2F0kPpQ5mtta1ORIyO2xex'],1954-05-03,-2.809069,-0.285945,...,-0.382803,-0.750674,3.018532,-0.075600,-1.289600,1.424775,3,7.039084,-17.233980,10.183969
28418,3QgNuC9njQlEocR6UZpO47,cotton crop blues,5,177362,0,['James Cotton'],['6mY93oNfUaUwZq67yn3R8k'],1954-01-01,-1.021018,0.011766,...,0.184201,1.301862,-0.400126,-0.422812,0.305458,-1.577466,3,-1.127101,-1.077574,-2.332590


In [29]:
# Write the 10,000-song sample with its coordinates to disk. With to_csv's
# default settings the row index is written as the first, unnamed column.
csv3.to_csv("public/tracks10k.csv")