In [0]:
import pandas as pd

In [0]:
import json
import pprint as pprint
import numpy as np

In [0]:
with open('./slice.json', 'r') as f:
    datastore = json.load(f)

In [0]:
sample_playlists = datastore['playlists']

In [0]:
processed_data = {}

In [0]:
for playlist in sample_playlists:
    
    cumulative_track_features = {}    
    # go through each track and get the features
    for track in playlist['tracks']:
        
        if track['track_features'] is None:
            continue
            
        for feature in track['track_features']:

            if feature in ['type', 'id', 'uri', 'track_href', 'analysis_url']:
                continue # we skip these since they're non-numeric
            
            # if we haven't seen the feature, init a list
            if not feature in cumulative_track_features:
                cumulative_track_features[feature] = []
            
            cumulative_track_features[feature].append(track['track_features'][feature])
    
    if cumulative_track_features == {}:
        continue # skip, no features found 
        
    playlist_track_features = {}
    for feature in cumulative_track_features:
        playlist_track_features[feature] = sum(cumulative_track_features[feature])/len(cumulative_track_features[feature])
    
    processed_data[playlist['name']] = playlist_track_features

In [22]:
feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
playlist_names = []
data_matrix = []
for playlist in processed_data:
    playlist_names.append(playlist)
    row = []
    data = processed_data[playlist]
    for i in range(len(feature_keys)):
        row.append(data[feature_keys[i]])
    data_matrix.append(row)

print("Created data_matrix")

Created data_matrix


In [0]:
X = np.array(data_matrix)


In [0]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

In [25]:
model_knn.fit(X)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [0]:
distances, indices = model_knn.kneighbors(X)

In [27]:
print("Similar Playlists for {}".format(playlist_names[0]))
indices_for_playlist_0_neighbors = indices[0]
distances_for_playlist_0_neighbors = distances[0]
for index in indices_for_playlist_0_neighbors:
    print("Name: {}".format(playlist_names[index]))

Similar Playlists for r/Techno | Top weekly posts
Name: r/Techno | Top weekly posts
Name: Broken & Experimental Techno
Name: Pots n Pans
Name: Top Trip Hop and Downtempo
Name: r/CyberPunk_Music | Top weekly posts
Name: Instrumental Psychedelic Stoner Rock
Name: NIGHT DRIVES
Name: Desert Hearts 2019
Name: 2007-2010
Name: DKFM Shoegaze Finds


In [35]:
import pandas as pd

#Convert matrix to dataframe
df = pd.DataFrame(data_matrix, index=playlist_names,columns = feature_keys)
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
r/Techno | Top weekly posts,0.64668,0.79414,5.42,-9.96658,0.56,0.057056,0.052438,0.848076,0.16466,0.285762,131.44796
r/Frisson | Top weekly posts,0.49654,0.539598,5.0,-9.01446,0.62,0.071046,0.324839,0.250639,0.204546,0.292772,114.62492
r/ShoeGaze | Top weekly posts,0.3796,0.674,4.38,-8.43058,0.8,0.039974,0.139148,0.561841,0.206018,0.28524,130.58724
Noodle's Top Korean Standouts 2019,0.619658,0.657451,6.005435,-5.038103,0.548913,0.073001,0.365478,0.013186,0.190513,0.498738,122.132625
r/DeathCore | Top weekly posts,0.35594,0.94394,6.18,-4.68844,0.54,0.145508,0.001172,0.315881,0.256538,0.129884,124.27628


In [36]:
#We need to scale each feature from 0-1, else while calculating neighbours one dimension will contribute more distance than others. 
#Eg. mode will contribute 0.1 to distance, while tempo might contribute 20.
from sklearn.preprocessing import MinMaxScaler
tmp = MinMaxScaler().fit_transform(df)
df_scaled = pd.DataFrame(tmp, index=playlist_names,columns = feature_keys)
df_scaled.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
r/Techno | Top weekly posts,0.721064,0.789753,0.589333,0.72975,0.497143,0.031624,0.05368,0.925847,0.10699,0.271925,0.613814
r/Frisson | Top weekly posts,0.522396,0.507011,0.533333,0.761431,0.565714,0.048465,0.332881,0.273623,0.158004,0.279918,0.452684
r/ShoeGaze | Top weekly posts,0.367658,0.656303,0.450667,0.780859,0.771429,0.011061,0.142554,0.613363,0.159886,0.271329,0.60557
Noodle's Top Korean Standouts 2019,0.685308,0.637921,0.667391,0.893741,0.484472,0.050818,0.374534,0.014395,0.140056,0.514787,0.524593
r/DeathCore | Top weekly posts,0.33635,0.95615,0.690667,0.905376,0.474286,0.1381,0.001134,0.344849,0.224502,0.094173,0.545124


In [37]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)
knn.fit(df)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [55]:
testPlaylist = df.iloc[0]
#testPlaylist = df.loc['r/Techno | Top weekly posts']
distances, indices = knn.kneighbors([testPlaylist])

print("Similar Playlists for {}".format(testPlaylist.name))
for index in indices[0][1:]:
  print("Name : {}".format(df.iloc[index].name))

Similar Playlists for r/Techno | Top weekly posts
Name : Broken & Experimental Techno
Name : Pots n Pans
Name : Top Trip Hop and Downtempo
Name : r/CyberPunk_Music | Top weekly posts
Name : Instrumental Psychedelic Stoner Rock
Name : NIGHT DRIVES
Name : Desert Hearts 2019
Name : 2007-2010
Name : DKFM Shoegaze Finds
