In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the per-track counts, a mapping indexed by track URI further down in
# this notebook.
# NOTE(review): presumably the number of playlist appearances per track;
# confirm against whatever produced song_counts.json.
# The context manager closes the file handle deterministically, and the
# explicit encoding avoids depending on the platform's default locale.
with open("data/song_counts.json", 'r', encoding="utf-8") as song_counts_file:
    SongCounts = json.load(song_counts_file)

In [3]:
# Read the saved array of track URIs and wrap it in a Series so every track
# gets a positional integer index.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so only use this with a trusted top_songs.npy file.
top_song_uris = np.load("data/top_songs.npy", allow_pickle=True)
top_songs = pd.Series(top_song_uris)

In [4]:
# Preview the loaded track URIs (pandas truncates long Series when displayed).
top_songs

0        spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1        spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2        spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3        spotify:track:1AWQoqb9bSvzTjaLralEkT
4        spotify:track:1lzr43nnXAijIGYnCT8M8H
                         ...                 
18357    spotify:track:6p1qQzLFaJ8st27GqTB3Bm
18358    spotify:track:3iStkMXWw7Y4bqF00Avc5z
18359    spotify:track:689r5XA95FWNKPUfrvLN0K
18360    spotify:track:2DZ02lWzt1yTnTNyRIMtnu
18361    spotify:track:5WUtmER3igDXpNDqAS2zEa
Length: 18362, dtype: object

In [5]:
# Forward lookup: track URI -> row/column position in the square matrices
# built later in this notebook. If a URI were duplicated in top_songs, the
# last position would win.
track2idx_lookup = dict(zip(top_songs.values, top_songs.index))
# NOTE(review): this displays the entire mapping; consider previewing only a
# few entries to keep the notebook output small.
track2idx_lookup

{'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI': 0,
 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak': 1,
 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv': 2,
 'spotify:track:1AWQoqb9bSvzTjaLralEkT': 3,
 'spotify:track:1lzr43nnXAijIGYnCT8M8H': 4,
 'spotify:track:0XUfyU2QviPAs6bxSpXYG4': 5,
 'spotify:track:68vgtRHr7iZHpzGpon6Jlo': 6,
 'spotify:track:3BxWKCI06eQ5Od8TY2JBeA': 7,
 'spotify:track:7H6ev70Weq6DdpZyyTmUXk': 8,
 'spotify:track:2PpruBYCo4H7WOBJ7Q2EwM': 9,
 'spotify:track:2gam98EZKrF9XuOkU13ApN': 10,
 'spotify:track:4Y45aqo9QMa57rDsAJv40A': 11,
 'spotify:track:1HwpWwa6bnqqRhK8agG4RS': 12,
 'spotify:track:20ORwCJusz4KS2PbTPVNKo': 13,
 'spotify:track:7k6IzwMGpxnRghE7YosnXT': 14,
 'spotify:track:1Bv0Yl01xBDZD4OQP93fyl': 15,
 'spotify:track:4omisSlTk6Dsq2iQD7MA07': 16,
 'spotify:track:7xYnUQigPoIDAMPVK79NEq': 17,
 'spotify:track:6d8A5sAx9TfdeseDvfWNHd': 18,
 'spotify:track:4pmc2AxSEq6g7hPVlJCPyP': 19,
 'spotify:track:215JYyyUnrJ98NK3KEwu6d': 20,
 'spotify:track:0uqPG793dkDDN7sCUJJIVC': 21,
 'spotify:track:19Js

In [6]:
# Reverse lookup: matrix position -> track URI (inverse of the URI -> index
# mapping built from the same Series).
idx2track_lookup = dict(zip(top_songs.index, top_songs.values))
# NOTE(review): this displays the entire mapping; consider previewing only a
# few entries to keep the notebook output small.
idx2track_lookup

{0: 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
 1: 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak',
 2: 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv',
 3: 'spotify:track:1AWQoqb9bSvzTjaLralEkT',
 4: 'spotify:track:1lzr43nnXAijIGYnCT8M8H',
 5: 'spotify:track:0XUfyU2QviPAs6bxSpXYG4',
 6: 'spotify:track:68vgtRHr7iZHpzGpon6Jlo',
 7: 'spotify:track:3BxWKCI06eQ5Od8TY2JBeA',
 8: 'spotify:track:7H6ev70Weq6DdpZyyTmUXk',
 9: 'spotify:track:2PpruBYCo4H7WOBJ7Q2EwM',
 10: 'spotify:track:2gam98EZKrF9XuOkU13ApN',
 11: 'spotify:track:4Y45aqo9QMa57rDsAJv40A',
 12: 'spotify:track:1HwpWwa6bnqqRhK8agG4RS',
 13: 'spotify:track:20ORwCJusz4KS2PbTPVNKo',
 14: 'spotify:track:7k6IzwMGpxnRghE7YosnXT',
 15: 'spotify:track:1Bv0Yl01xBDZD4OQP93fyl',
 16: 'spotify:track:4omisSlTk6Dsq2iQD7MA07',
 17: 'spotify:track:7xYnUQigPoIDAMPVK79NEq',
 18: 'spotify:track:6d8A5sAx9TfdeseDvfWNHd',
 19: 'spotify:track:4pmc2AxSEq6g7hPVlJCPyP',
 20: 'spotify:track:215JYyyUnrJ98NK3KEwu6d',
 21: 'spotify:track:0uqPG793dkDDN7sCUJJIVC',
 22: 'spotify:track:

In [7]:
# Square float accumulator with one row and one column per top song; it starts
# at zero and is filled in by the playlist-parsing cell below.
n_top_songs = len(top_songs)
overlaps = np.zeros((n_top_songs, n_top_songs))

In [8]:
# Accumulate playlist co-occurrence counts into `overlaps`.
#
# For every occurrence of a top song in a playlist, each occurrence of a
# *different* top song in the same playlist adds 1 to overlaps[idx1, idx2],
# and the diagonal entry overlaps[idx1, idx1] is bumped once, so the diagonal
# ends up holding how many times the track itself was seen.
# Tracks that are not in track2idx_lookup are ignored.

# Sorted so the processing (and progress output) order is deterministic;
# os.listdir returns entries in arbitrary order.
slices = sorted(f for f in os.listdir('data/MPD') if f.endswith(".json"))

# Reset the accumulator in place so re-running this cell does not double-count
# (a no-op on a fresh run, and it avoids allocating a second large matrix).
overlaps.fill(0)

for slc in slices:
    print(f"Parsing: {slc}")
    # Context manager so each slice file is closed as soon as it is parsed.
    with open(f"data/MPD/{slc}", 'r', encoding="utf-8") as slice_file:
        slice_json = json.load(slice_file)

    for playlist in slice_json['playlists']:
        # Resolve URIs to matrix indices once per playlist instead of once per
        # (track1, track2) pair; unknown tracks are dropped here.
        playlist_idxs = [
            track2idx_lookup[track['track_uri']]
            for track in playlist['tracks']
            if track['track_uri'] in track2idx_lookup
        ]
        for idx1 in playlist_idxs:
            for idx2 in playlist_idxs:
                if idx1 != idx2:
                    overlaps[idx1, idx2] += 1
            # Count this occurrence of the track itself on the diagonal.
            overlaps[idx1, idx1] += 1

Parsing: mpd.slice.0-999.json
Parsing: mpd.slice.1000-1999.json
Parsing: mpd.slice.2000-2999.json
Parsing: mpd.slice.3000-3999.json


In [11]:
# Turn co-occurrence counts into a distance-like score:
#     distances[i, j] = 1 - overlaps[i, j] / SongCounts[track_i]
# Each row is normalised by that row's own track count, so the matrix is
# generally asymmetric (distances[i, j] != distances[j, i]).
# NOTE(review): a zero count in SongCounts would produce inf/nan here, and the
# notebook suppresses warnings globally, so that would go unnoticed.
track_counts = np.array(
    [SongCounts[idx2track_lookup[i]] for i in range(len(overlaps))]
)

# Broadcast the per-row divisor across columns, then compute 1 - x in place so
# no extra full-size temporary is allocated.
distances = overlaps / track_counts[:, np.newaxis]
np.subtract(1, distances, out=distances)

In [12]:
# Display the distance matrix (NumPy abbreviates large arrays with "...").
distances

array([[0.        , 0.66666667, 0.61904762, ..., 1.        , 1.        ,
        1.        ],
       [0.85106383, 0.        , 0.57446809, ..., 1.        , 1.        ,
        1.        ],
       [0.90243902, 0.75609756, 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        0.        ]])