# k-NN

Here we construct a playlist similarity matrix using the similarity proposed by [Kelen et al](https://dl.acm.org/doi/pdf/10.1145/3267471.3267477).

In [1]:
from scipy.sparse import lil_matrix
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.notebook import tqdm
import glob
import numpy as np
import os
import pandas as pd
import random
import requests
import spotipy
import time

auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=auth_manager)

## Playlists treatment

Gather playlists and filter those with more than 5 and less than 250 tracks.

In [2]:
playlists_df = pd.read_pickle('../data/sp_playlists.pkl')
playlists_df = playlists_df[
    (playlists_df.tracks.apply(lambda x: x['total']) >= 5) &
    (playlists_df.tracks.apply(lambda x: x['total']) <= 250)
]
playlist_ids = list(set(playlists_df.id))

## Tracks treatment

Gather tracks and filter only those from the playlists already filtered.

In [3]:
tracks_df = pd.concat(
    [pd.read_pickle(file)[['playlist_id', 'id']] for file in glob.glob('../data/sp_tracks_ready_*.pkl')],
    ignore_index=True
)
tracks_df = tracks_df[tracks_df.playlist_id.apply(lambda x: x in playlist_ids)]

For each playlist, create a list of its tracks.

In [4]:
tracks_list = []
for playlist_id in tqdm(playlist_ids):
    tracks_list.append(tracks_df[tracks_df.playlist_id == playlist_id].id.to_list())

HBox(children=(FloatProgress(value=0.0, max=9212.0), HTML(value='')))




## Similarity matrix

Here we build the similarity matrix.

In [5]:
similar_playlists = lil_matrix((len(playlist_ids), len(playlist_ids)))

In [6]:
def similarity(u, v):
    tracks_u = tracks_list[u]
    tracks_v = tracks_list[v]
    summation = len(set(tracks_u) & set(tracks_v))
    norm_R_u = np.sqrt(len(tracks_u))
    norm_R_v = np.sqrt(len(tracks_v))
    return summation/(norm_R_u*norm_R_v)

In [7]:
for u in tqdm(range(len(playlist_ids))):
    for v in range(u + 1, len(playlist_ids)):
        s_uv = similarity(u, v)
        similar_playlists[u, v] = s_uv
        similar_playlists[v, u] = s_uv

HBox(children=(FloatProgress(value=0.0, max=9212.0), HTML(value='')))




## Sanity check

What is the most similar pair of playlists?

In [32]:
# Total similarity
np.max(similar_playlists.toarray())

1.0000000000000002

In [47]:
# First index
index_1 = int(np.floor(np.argmax(similar_playlists.toarray()) / similar_playlists.shape[1]))

In [48]:
# Second index
index_2 = np.argmax(similar_playlists.toarray()) % similar_playlists.shape[1]

In [49]:
similar_playlists[index_1, index_2]

1.0000000000000002

In [50]:
sp.playlist(playlist_ids[index_1])['name']

'Violent Femmes'

In [51]:
sp.playlist(playlist_ids[index_2])['name']

'Violent Femmes – Violent Femmes'

They are basically the same playlist! It seems a good similarity metric.

In [74]:
(similar_playlists.toarray().sum(axis=0)[[len(playlist) >= 25 for playlist in tracks_list]] == 0).sum()

263

In [73]:
np.array([len(playlist) >= 25 for playlist in tracks_list]).sum()

4366

In [64]:
tracks_list

[['5h5UlRPjHnpJusIF55agAM',
  '0qZXIaxOMvMUV5fmbSlNwu',
  '06ZVHXmJC2PYUBdxVuyhxR',
  '0MrTLfKkf0JpcfM5TSnozw',
  '0MbKF2v5uHSYYeWvh88lQ4',
  '3upy9qZlOMB9jGqLkhtWCf',
  '6Y90rMOEhnJavYMCp6hFiw',
  '4Xf4pNwXp6uAa5VmvsVMHq',
  '5apdjhdMPtbE77STuhTVcn',
  '5dcDztOvK18ZEeY5U3SSG6',
  '0IiPs8XPYZrLiKnnTtH32F',
  '4leiertH92Zz5As5J77dMO'],
 ['7vuSGejUXpD365Nl9tyehq',
  '3nFJbZCHP4d9vduKjJLdBL',
  '0qgYfRBF5mSCPOdHdgus66',
  '3hbh7f3xwRy1H573UXW9jC',
  '2ezqQeBiC72gwMJoO4w1hA',
  '7h4X53Z6RTBsLTCcmXISI3',
  '3ht2PBKbkqSJ0XEkQuNHnZ',
  '6hAN9n5pUEbgmDCqgQ25sd',
  '6Ycf7Ch2VlEKlORbz7yfpJ',
  '62LJFaYihsdVrrkgUOJC05',
  '0ew27xRdxSexrWbODuLfeE',
  '3zBhihYUHBmGd2bcQIobrF',
  '0bNPzbyaT9npwhIP8d2Rsi',
  '6Rj0GknoniiXlHdL10MpJF',
  '1hLNOxFIiEAgApBstfwONV',
  '13toFl1UwJPsRxDiD9jgtn',
  '3VZmChrnVW8JK6ano4gSED',
  '4gNeiiz86Y5wekGwfIybXW',
  '1YisXNr4DPHcPB4v5XfQ0t',
  '29rQJydAlO0uMyWvRIZxQg',
  '5hJFhO9dvhJoDvUZZ9iWSw',
  '2RcanAJpudPNDkyIe9DzKS',
  '1Gnufs5iuI3h5ow29J7tOO',
  '3QgSJ9CqgLPk9B4x