In [3]:
import glob
import json
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

The Million Playlist Dataset contains, for each track, the song name, artist name and album name. It has no other attributes (such as genre, danceability or mood) that you might expect to use for making recommendations. Each .json file contains 1000 playlists, and each playlist has a name.

## Reading in the files

In [7]:
# Folder holding the dataset's JSON files (relative to the notebook).
directory = "spotify_million_playlist_dataset/data/"

# glob returns matches in arbitrary order, so sort (lexicographically) to make
# the selection below deterministic across runs and machines.
file_paths = glob.glob(f"{directory}*.json")
file_paths.sort()

# Work on a subset only: the first 50 files of the sorted listing.
files = file_paths[:50]

In [8]:
data = []  # parsed JSON content, one entry per selected file

for file_path in files:
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can garble or reject non-ASCII names
    # (e.g. "Beyoncé") on systems whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
        data.append(json_data)

In [9]:
len(data)  # sanity check: number of JSON files loaded into `data`

50

## Extracting the data from all the playlists of all the selected files

In [10]:
# Accumulators: each gets one inner list per playlist, and every inner list
# holds one value per track of that playlist. All eleven therefore stay
# aligned position-by-position and can be flattened into DataFrame columns.
artists_list = []
tracks_list = []
albums_list = []
track_uri_list = []
pids_list = []
play_name_list = []
play_track_list = []
play_follow_list = []
play_album_list = []
play_collab_list = []
play_artist_list = []

for file_content in data:                 # one parsed JSON file
    for pl in file_content['playlists']:  # one playlist within that file
        pl_tracks = pl['tracks']          # all the tracks within this playlist
        n_tracks = len(pl_tracks)

        # Track-level fields: one value per track.
        artists_list.append([track['artist_name'] for track in pl_tracks])
        tracks_list.append([track['track_name'] for track in pl_tracks])
        albums_list.append([track['album_name'] for track in pl_tracks])
        track_uri_list.append([track['track_uri'] for track in pl_tracks])

        # Playlist-level fields are the same for every track of the playlist,
        # so each is looked up once and repeated n_tracks times instead of
        # being re-read from the nested dict for every track.
        pids_list.append([pl['pid']] * n_tracks)
        play_name_list.append([pl['name']] * n_tracks)
        play_track_list.append([pl['num_tracks']] * n_tracks)
        play_album_list.append([pl['num_albums']] * n_tracks)
        play_follow_list.append([pl['num_followers']] * n_tracks)
        play_collab_list.append([pl['collaborative']] * n_tracks)
        play_artist_list.append([pl['num_artists']] * n_tracks)

## Converting the resulting lists of lists into single flat lists

In [11]:
# Flatten each list of per-playlist lists into a single flat list. Every
# source list has the same nested lengths, so position i in each flat list
# refers to the same (playlist, track) pair.
# `chain` is imported in the notebook's import cell.
artists = list(chain.from_iterable(artists_list))
tracks = list(chain.from_iterable(tracks_list))
albums = list(chain.from_iterable(albums_list))
pids = list(chain.from_iterable(pids_list))
track_uris = list(chain.from_iterable(track_uri_list))
play_names = list(chain.from_iterable(play_name_list))
play_tracks = list(chain.from_iterable(play_track_list))
play_albums = list(chain.from_iterable(play_album_list))
play_follows = list(chain.from_iterable(play_follow_list))
play_collabs = list(chain.from_iterable(play_collab_list))
play_artists = list(chain.from_iterable(play_artist_list))

## Naming the columns the same way as the top 10000 songs

In [12]:
# Column label -> flat list with one entry per (playlist, track) pair.
# Insertion order here determines the column order of the DataFrame.
play_data = {
    "Artist Name(s)": artists,
    "Track Name": tracks,
    "Album Name": albums,
    "Track URI": track_uris,
    "Playlist": play_names,
    "Num_Tracks": play_tracks,
    "Num_Albums": play_albums,
    "Num_Artists": play_artists,
    "Follow": play_follows,
    "Collab": play_collabs,
    "Pid": pids,
}

# One row per track occurrence, with the playlist-level fields repeated on
# each of that playlist's rows.
playlists = pd.DataFrame(data=play_data)
playlists

Unnamed: 0,Artist Name(s),Track Name,Album Name,Track URI,Playlist,Num_Tracks,Num_Albums,Num_Artists,Follow,Collab,Pid
0,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),The Cookbook,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Throwbacks,52,47,37,1,false,0
1,Britney Spears,Toxic,In The Zone,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Throwbacks,52,47,37,1,false,0
2,Beyoncé,Crazy In Love,Dangerously In Love (Alben für die Ewigkeit),spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Throwbacks,52,47,37,1,false,0
3,Justin Timberlake,Rock Your Body,Justified,spotify:track:1AWQoqb9bSvzTjaLralEkT,Throwbacks,52,47,37,1,false,0
4,Shaggy,It Wasn't Me,Hot Shot,spotify:track:1lzr43nnXAijIGYnCT8M8H,Throwbacks,52,47,37,1,false,0
...,...,...,...,...,...,...,...,...,...,...,...
3348253,Jonathan Groff,Reindeer(s) Are Better Than People,Frozen,spotify:track:4C463MZrXGxq3t7A3wlGuV,Disney,42,16,31,1,false,142999
3348254,Josh Gad,In Summer,Frozen,spotify:track:7bG6SQBGZthPDG5QJL5Gf7,Disney,42,16,31,1,false,142999
3348255,Maia Wilson,Fixer Upper,Frozen,spotify:track:5covTs92HnHJwAWrXF7l14,Disney,42,16,31,1,false,142999
3348256,Lebo M.,Circle Of Life,The Lion King: Original Broadway Cast Recording,spotify:track:3cgnaW34QpP1Y8Wvoz4D56,Disney,42,16,31,1,false,142999


"Playlist" = name of the playlist

## Generating the csv file

In [13]:
# Write the table next to the notebook; index=False keeps the row index out
# of the CSV so the file contains only the named columns.
playlists.to_csv(path_or_buf="playlists.csv", index=False)