In [1]:
import os
import re
import sys
import json
import time
import spotipy
import pandas as pd

from tqdm import tqdm
from datetime import datetime
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

In [2]:
# Change the path to where your Spotify Million Playlist is located. (Need to download the dataset first from 
# https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge) 
# The expected file structure is to have a data folder /data .csv files will be placed there.

path = 'data/spotify_million_playlist_dataset/data'

def loop_slices(path, num_slices=20):
    """
    Each slice is a .json file containing 1000 playlists i.e.: 1 slice is 1000 playlists 20 slices is: 20,000 playlists.
    Parameters:
        num_slices (int): Number of slices to return, max 1000.
        path (str): Path to the Spotify Million Playlist.
        
    Output:
        mpd_playlists (list): a list of dictionaries of all the playlists.
    """
    cnt=0
    mpd_playlists = []
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        cnt+=1
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            current_slice = json.loads(js)
            
            # Create a list of all playlists
            for playlist in current_slice['playlists']:
                mpd_playlists.append(playlist)


            if cnt == num_slices:
                break
    return mpd_playlists

In [3]:
def create_csv(playlists, extended=False):
    """
    This function will construct a datafrme and write to .csv file for all the input playlists, this .csv will have a single cell with all the tracks
    in the playlist. 
    An extended parameter is available to extend the tracks in a list to be a single cell per song, this will return an 
    additional .csv file
    Parameters:
        playlists (list): a list of dictionaries such as that from the loop_slices() function.
        extended (boolean): boolean to enable the extended .csv file generation
        
    Output:
        MPD.csv: .csv file with the playlists
        MPD.csv: .csv file with extended song columns
    """
    df = pd.DataFrame(playlists)
    df.to_csv('data/MPD.csv', index=False)
    
    if extended:
        df_list = []
        for playlist in playlists:
            df_list.append(pd.DataFrame(playlist))
            
        df_extended = pd.concat(df_list, axis=0)
         
        cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 
                'num_followers','num_edits', 'duration_ms', 'num_artists']
        df_extended = df_extended.reset_index().pivot(values='tracks',index=cols_to_keep, columns='index')
        df_extended.reset_index(inplace=True)
        df_extended = df_extended.rename_axis(None, axis=1)    
        df_extended.sort_values('pid', inplace=True)
        df_extended.to_csv('data/{}_MPD_Extended.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S")), index=False) 

In [None]:
playlists = loop_slices(path, num_slices=1000)
create_csv(playlists, extended=True)

# Save json file - This file will be used as database
with open('data/{}_playlists_{}.json'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(playlists)), 'w') as outfile:
    json.dump(playlists, outfile, indent=2)

In [None]:
# # Read MPD
df = pd.read_csv('data/MPD.csv')
df

In [None]:
# # Read MPD_Extended
df = pd.read_csv('data/MPD_Extended.csv')
df

In [3]:
# Spotify credentials
os.environ["SPOTIPY_CLIENT_ID"] = config.SPOTIPY_CLIENT_ID
os.environ["SPOTIPY_CLIENT_SECRET"] = config.SPOTIPY_CLIENT_SECRET
os.environ['SPOTIPY_REDIRECT_URI'] = config.SPOTIPY_REDIRECT_URI   # Needed for user authorization
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [14]:
# Code to retrieve and add data to an existing df
df_list = []
feats_df = pd.read_csv('Playlist_Feats.csv')
idx = len(feats_df)
df_list.append(feats_df)
feats_df

In [2]:
client_credentials_manager = SpotifyClientCredentials('087713e2aba34fb9a9c0e7ef2e93cf1e', 'fb273f014821463b9aa829461d9cef3b', None)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
hello = pd.read_pickle("hello3.pkl")

In [4]:
hello.head()

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description,num_holdouts
0,Birthday Party,False,3000,1485993600,16,16,1,"[{'pos': 0, 'artist_name': 'Lost Frequencies',...",4,3484812,16,,
1,Chill,False,3001,1465084800,26,20,1,"[{'pos': 0, 'artist_name': 'Erykah Badu', 'tra...",4,6432550,13,,
2,Run For Your Life,False,3002,1501459200,12,12,1,"[{'pos': 0, 'artist_name': 'Mako', 'track_uri'...",5,2570961,11,,
3,chill,False,3003,1508716800,245,210,1,"[{'pos': 0, 'artist_name': 'Lord Huron', 'trac...",174,58826366,193,,
4,riding,False,3004,1509235200,53,50,1,"[{'pos': 0, 'artist_name': 'Kweku Collins', 't...",24,12117712,38,,


In [None]:

import pickle
playlists = hello.to_dict('records')
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
playlists2 = playlists[:10]

cols_to_keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
dfs = pd.DataFrame()
count = 0



for playlist in tqdm(playlists):
    count = count + 1
    audio_feats = []
    all_uris_in_plylst = []
    genres = []
    for track in playlist['tracks']:
        uri = track['track_uri'].split("k:")[1]
        all_uris_in_plylst.append(uri)

    chunks_uris = [all_uris_in_plylst[i:i + 100] for i in range(0, len(all_uris_in_plylst), 100)]
    for chunk in  chunks_uris:
        trial = audio_feats
        for attempt in range(10):
            try:
                chunk_audio_feats = sp.audio_features(chunk)
                audio_feats.append(chunk_audio_feats)
            except Exception as e: 
                print(e)
                print('playlist: {}, chunk: {}'.format(playlist['name'], chunk))
            else:
                break
        else:
            print('Everything failed')


    
    playlist_audio_feats = [item for sublist in audio_feats for item in sublist]
    name = playlist['name']
    pid = playlist['pid']
    s1 = pd.Series([name, pid], index=['name', 'pid'])
    try: # Try/Except for when there is a None in playlist
        s2 = pd.DataFrame(playlist_audio_feats)[cols_to_keep].mean()
    except:
        print('Playlist "{}" has a None. PID: {}'.format(name, pid))
        s2 = pd.DataFrame([i for i in playlist_audio_feats if i])[cols_to_keep].mean() # List comprehension to remove None from list

    dfs.append(pd.DataFrame(s1.append(s2)).T)
    if count%5000 == 0:
        with open('audio.pkl', 'wb') as f:
            pickle.dump(dfs, f)
# Time to beat: 5% in 5min

In [62]:
yo = pd.read_pickle("audio.pkl")

In [32]:
yo.head()

Unnamed: 0,0
name,Mom
pid,33999
danceability,0.606128
energy,0.610436
key,4.358974


In [None]:
yo =  pd.concat(dfs, axis=0)

In [None]:
df = dfs.to_pickle("more3.pkl")
dfs.head()
hello = pd.read_pickle("more3.pkl")
print(hello)


In [14]:
df1.head()

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Birthday Party,3000,0.7075,0.784312,7.0,-5.141437,0.25,0.070969,0.050844,0.003832,0.174956,0.648,120.545375,217801.0625,4.0
0,Chill,3001,0.556615,0.456808,4.807692,-8.912923,0.538462,0.128712,0.453395,0.006813,0.185838,0.368769,105.053654,247439.461538,3.923077
0,Run For Your Life,3002,0.704083,0.694,6.5,-5.42675,0.333333,0.087425,0.105692,0.010698,0.09555,0.586408,112.604417,214247.083333,4.0
0,chill,3003,0.531698,0.489971,4.526531,-9.746731,0.759184,0.049581,0.445193,0.158944,0.13709,0.282592,120.624567,240114.065306,3.902041
0,riding,3004,0.609189,0.667453,5.339623,-6.554283,0.584906,0.157898,0.211708,0.016296,0.201206,0.512,128.137547,228636.471698,3.981132
