# Data preparation for harmonic compatibility reordering subjective evaluation
Author: Enric Gusó Muñoz, Music Technology Group, Universitat Pompeu Fabra, enric.guso@upf.edu
This notebook selects the most popular playlists from the Spotify Million Playlist Dataset, keeps only the tracks that have an audio preview available, and downloads those previews.

In [None]:
from os.path import join as pjoin
import os
import json
import numpy as np
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import wget

In [None]:
# Path to the root folder of a local copy of Spotify's Million Playlist Dataset.
# The playlist slice files are read from its 'data' subfolder; adjust this
# path to match your own machine.
data_path = '/home/enricguso/datasets/spotify_million_playlist_dataset'

## Task 1: get the top-1000 most popular playlists

In [None]:
# Create the output folder for everything this notebook writes.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs('spotify_data', exist_ok=True)

In [None]:
# Folder holding the dataset's playlist slice files.
slices_dir = pjoin(data_path, 'data')
files = os.listdir(slices_dir)

# First pass: collect the id and follower count of every playlist in every slice.
pids = []
num_followers = []
for file in tqdm(files):
    with open(pjoin(slices_dir, file), 'r') as f:
        data = json.load(f)
    pids.extend(entry['pid'] for entry in data['playlists'])
    num_followers.extend(entry['num_followers'] for entry in data['playlists'])
pids = np.array(pids)

# Ids of the 2000 most-followed playlists, most popular first.
indxs = np.argsort(num_followers)[::-1][:2000]
top_pids = pids[indxs]

# Order the slice files by the first number of the range in their name
# (third dot-separated field), so that files[k] can be indexed by pid // 1000.
slice_starts = [int(name.split('.')[2].split('-')[0]) for name in files]
files = np.array(files)[np.argsort(slice_starts)]

# Second pass: fetch the full record of each selected playlist. The dict keeps
# the popularity order of top_pids because entries are inserted in that order.
playlists = {}
for pid in tqdm(top_pids):
    slice_idx, offset = divmod(pid, 1000)
    with open(pjoin(slices_dir, files[slice_idx]), 'r') as f:
        data = json.load(f)
    playlists[str(pid)] = data['playlists'][offset]

## Filter out songs without audio available

In [None]:
# Authentication -> you need to register with the Spotify API and fill in
# your own client id and client secret below.
# NOTE(review): a client id is hard-coded here; consider keeping credentials
# out of the notebook.
cid = 'b569f6b9399545fcb0b97e821ac7434f'
secret = ''
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
# Client used by the following cells to query track metadata.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Keep, in every playlist, only the tracks for which the Spotify API reports
# a preview URL.
for playlist in tqdm(playlists):
    kept = []
    for track in playlists[playlist]['tracks']:
        try:
            has_preview = sp.track(track['track_uri'])['preview_url'] is not None
        except Exception:
            # Best effort: a failed lookup is treated like a missing preview.
            # Catching Exception (not a bare except) lets Ctrl-C still stop
            # the loop instead of silently discarding the current track.
            has_preview = False
        if has_preview:
            kept.append(track)
    # Replace the contents in place so the playlist keeps the same list object.
    playlists[playlist]['tracks'][:] = kept

In [None]:
# Take the first 1000 playlists (in dictionary order) that have more than
# 19 tracks and write them to disk.
new_playlists = {}
for pid, contents in playlists.items():
    if len(contents['tracks']) <= 19:
        continue
    new_playlists[pid] = contents
    if len(new_playlists) == 1000:
        break

out_file = pjoin('spotify_data', 'top1000_playlists.json')
with open(out_file, 'w') as f:
    json.dump(new_playlists, f)
# Notebook output: how many playlists were actually kept.
len(new_playlists)

## Download audio for all tracks

In [None]:
# Load the selected playlists back from the JSON file in 'spotify_data'.
with open(pjoin('spotify_data', 'top1000_playlists.json')) as fp:
    playlists = json.load(fp)

In [None]:
# Folder that receives the preview files; create it up front so that the
# downloads and the directory listing below have somewhere to work.
previews_dir = pjoin('spotify_data', 'previews')
os.makedirs(previews_dir, exist_ok=True)


def _download_preview(track):
    """Download the preview of *track* unless its file already exists.

    The track is looked up once through the Spotify client; the file is named
    after the 'uri' field of the response with an '.mp3' suffix. Any exception
    raised by the lookup or the download propagates to the caller.
    """
    info = sp.track(track['track_uri'])
    outpath = pjoin(previews_dir, info['uri'] + '.mp3')
    if not os.path.exists(outpath):
        wget.download(info['preview_url'], out=outpath, bar=False)


# First attempt: remember every track whose download failed.
errors = []
for playlist in tqdm(playlists):
    for track in playlists[playlist]['tracks']:
        try:
            _download_preview(track)
        except Exception:
            errors.append({"playlist": playlist, "track": track})
print(errors)

# Retry the failed downloads once, using the track stored in each error entry
# (not whatever track the loop above happened to end on).
still_failing = []
for e in errors:
    try:
        _download_preview(e['track'])
    except Exception:
        still_failing.append(e)
errors = still_failing
if errors:
    # Show what could not be downloaded even after the retry.
    print(errors)

# List what is on disk and write two index files: the preview paths and the
# corresponding ids (file names up to the first dot).
files = os.listdir(previews_dir)

sids = [x.split('.')[0] for x in files]

with open('spoty_audios.txt', 'w') as f:
    for row in ['spotify_data/previews/'+x for x in files]:
        f.write(str(row)+'\n')

with open('spoty_sids.txt', 'w') as f:
    for row in sids:
        f.write(str(row)+'\n')

In [None]:
# Final check that all tracks have their preview: drop every track whose
# preview file is missing from 'spotify_data/previews'.
previews_dir = pjoin('spotify_data', 'previews')
for playlist in tqdm(playlists):
    tracks = playlists[playlist]['tracks']
    # Build the filtered list first and then replace the contents in place.
    # Calling remove() on the list while iterating over it skips the element
    # that follows each removal, which would leave some missing tracks behind.
    tracks[:] = [
        track for track in tracks
        if os.path.exists(pjoin(previews_dir, track['track_uri'] + '.mp3'))
    ]

In [None]:
# Save: overwrite the playlist file with the current contents of `playlists`.
out_file = pjoin('spotify_data', 'top1000_playlists.json')
with open(out_file, 'w') as fp:
    json.dump(playlists, fp)