In [1]:
import json
import os
from datetime import date
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

With the Create_DF class a user can create two data sets. The add_audio_features method takes a user's personal listening JSON files and creates a dataframe with audio features from the Spotify API and a target variable whose value is 0 for songs listened to once and 1 for songs listened to more than once. The merge_personal_kaggle method merges the dataframe created by add_audio_features with a random sample from the Kaggle Spotify data set; the target variable 'y' is changed to 0 for songs from the random sample and 1 for songs from personal listening history. This code can easily be adapted to create a dataset from a different user's personal listening data: request your personal listening data from Spotify and download the Kaggle Spotify dataset. The client_id, client_secret, spotify_token, and spotify_user_id values can all be obtained from Spotify's website. Comments have been provided below to assist in making the necessary changes.

In [2]:
class Create_DF:
    """Build modelling datasets from a user's Spotify listening history.

    Two dataframes can be produced:

    * ``add_audio_features`` -- one row per distinct (artist, track) in the
      personal ``StreamingHistory*.json`` files, enriched with Spotify audio
      features, with ``y = 0`` for tracks played once and ``y = 1`` for tracks
      played more than once.
    * ``merge_personal_kaggle`` -- the personal dataframe (``y = 1``) stacked
      on a random sample of the Kaggle Spotify dataset (``y = 0``).

    Parameters
    ----------
    client_id, secret : str
        Spotify application credentials used for the spotipy client.
    spotify_token : str
        Bearer token sent with the raw search requests.
    spotify_user_id : str
        Stored on the instance; not read by any method in this class.
    history_dir : str, default 'MyData'
        Directory containing the ``StreamingHistory<N>.json`` files.
    history_file_numbers : iterable of int, default (0, 1, 2)
        The ``<N>`` suffixes of the history files to load.
    kaggle_path : str, default 'data/data.csv'
        Location of the Kaggle Spotify CSV.
    """

    # User-specific corrections for tracks whose stored name/artist made the
    # Spotify search fail. These will be different for every user.
    _ARTIST_FIXES = {'BG Kenny Lou': 'Rudy Alexander'}
    _SONG_FIXES = {
        'Could You Be Loved - 12" Mix': 'Could You Be Loved',
        'Open Your Mouth and Say Something (album Mix)': 'Open Your Mouth and Say Something',
        'Better (feat. Hezekiah Walker)': 'Better',
    }
    # (artist, song) pairs that are skipped entirely; artist None = any artist.
    _SKIPPED_TRACKS = frozenset({
        ('China Mac', 'Traffic'),
        ('China Mac', 'Wu Tang'),
        (None, 'No Twirl Zone'),
    })
    # Number of lookup attempts per track before it is reported as an error.
    _MAX_ATTEMPTS = 4

    def __init__(self, client_id, secret, spotify_token, spotify_user_id,
                 history_dir='MyData', history_file_numbers=(0, 1, 2),
                 kaggle_path='data/data.csv'):
        self.client_id = client_id
        self.secret = secret
        self.spotify_token = spotify_token
        self.spotify_user_id = spotify_user_id
        self.history_dir = history_dir
        self.history_file_numbers = tuple(history_file_numbers)
        self.kaggle_path = kaggle_path
        self.df_personal = None
        self.df_personal_kaggle = None

    def _parse_json(self):
        """Load every configured StreamingHistory file into one dataframe."""
        records = []
        for num in self.history_file_numbers:
            path = f'{self.history_dir}/StreamingHistory{num}.json'
            with open(path, encoding='utf-8') as f:
                records.extend(json.load(f))
        return pd.DataFrame(records)

    def _play_count_list(self):
        """Return one dict per distinct (artist, track) with its target ``y``.

        ``y`` is 0 when the track appears once in the history and 1 when it
        appears more than once. Tracks played once are listed first, followed
        by the repeated tracks (each group in groupby order).
        """
        history = self._parse_json()
        if history.empty:
            return []

        # Number of plays per (artist, track), counted on non-null endTime.
        play_counts = history.groupby(['artistName', 'trackName'])['endTime'].count()

        # Iterate over *every* row; the previous ``range(n - 1)`` loops
        # silently dropped the last track of each group.
        played_once = [
            {'artistName': artist, 'trackName': song, 'y': 0}
            for (artist, song), plays in play_counts.items() if plays == 1
        ]
        played_again = [
            {'artistName': artist, 'trackName': song, 'y': 1}
            for (artist, song), plays in play_counts.items() if plays > 1
        ]
        return played_once + played_again

    def _get_spotify_uri(self, song_name, artist): #partially adapted from https://github.com/TheComeUpCode/SpotifyGeneratePlaylist/blob/master/create_playlist.py
        """Search Spotify for a track and return its URI plus basic metadata.

        Returns
        -------
        (str, dict)
            The URI of the first search hit and a dict with keys ``year``
            (album release year), ``popularity`` and ``explicit`` (0/1).

        Raises
        ------
        requests.HTTPError
            If Spotify answers with an error status (e.g. an expired token).
        IndexError
            If the search returns no tracks.
        """
        # Let requests build the query string so characters such as '&', '#'
        # or '+' in names are percent-encoded instead of corrupting the URL.
        # For plain names this yields the same URL as the hand-formatted one.
        response = requests.get(
            "https://api.spotify.com/v1/search",
            params={
                "query": "track:{} artist:{}".format(song_name, artist),
                "type": "track",
                "offset": 0,
                "limit": 20,
            },
            headers={
                "Content-Type": "application/json",
                # Use the token given to this instance, not a notebook global.
                "Authorization": "Bearer {}".format(self.spotify_token),
            },
            timeout=30,
        )
        response.raise_for_status()
        songs = response.json()["tracks"]["items"]
        if not songs:
            raise IndexError(
                "no Spotify search result for {!r} by {!r}".format(song_name, artist)
            )

        best_match = songs[0]
        # release_date is 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'; only the year is kept.
        year = int(best_match['album']['release_date'][:4])
        details = {
            'year': year,
            'popularity': best_match['popularity'],
            'explicit': int(best_match['explicit']),
        }
        return best_match["uri"], details

    def _search_terms(self, track):
        """Return the (artist, song) to search for, or None if the track is skipped."""
        artist, song = track['artistName'], track['trackName']
        artist = self._ARTIST_FIXES.get(artist, artist)
        song = self._SONG_FIXES.get(song, song)
        if (artist, song) in self._SKIPPED_TRACKS or (None, song) in self._SKIPPED_TRACKS:
            return None
        return artist, song

    def add_audio_features(self):
        """Creates a df with songs from personal listening history.  Audio features obtained through Spotify's API

        Each track is looked up at most ``_MAX_ATTEMPTS`` times; tracks that
        still fail are reported on stdout and left out of the dataframe. The
        result is stored in ``self.df_personal`` and returned.
        """
        spotify = spotipy.Spotify(
            client_credentials_manager=SpotifyClientCredentials(
                client_id=self.client_id, client_secret=self.secret
            )
        )
        full_tracks_data = []

        for count, track in enumerate(self._play_count_list(), start=1):
            if count % 200 == 0:
                print(f'working on track {count}')

            terms = self._search_terms(track)
            if terms is None:
                continue
            artist, song = terms

            last_error = None
            for _ in range(self._MAX_ATTEMPTS):
                try:
                    uri, date_pop_expl = self._get_spotify_uri(
                        song.replace("'", ""), artist.replace("'", "")
                    )
                    data = spotify.audio_features(uri)[0]
                    full_tracks_data.append({**track, **data, **date_pop_expl})
                except Exception as exc:
                    # Best effort: remember why it failed and retry. Catching
                    # Exception (not a bare except) keeps Ctrl-C / kernel
                    # interrupt working during the long loop.
                    last_error = exc
                    continue
                break
            else:
                print('error. . . ')
                print('  ', track['artistName'], track['trackName'])
                print('   reason:', repr(last_error))

        self.df_personal = pd.DataFrame(full_tracks_data)
        return self.df_personal

    def merge_personal_kaggle(self, match_personal=True, rs=11, n_size=2797):
        """Creates a dataset with personal listening data merged with a random sample of songs from 
        the Kaggle Spotify dataset.  Target is 0 for songs from Kaggle random sample, 1 for song from 
        personal listening data.  match_personal=True means the random sample will be the same size as 
        the number of songs from your personal listening data, meaning that the classification problem 
        will be completely balanced. If a user sets match_personal=False, the size of the random sample 
        can be change with the variable n_size which can be set to any integer <= the size of the Kaggle 
        dataset.  rs sets the random state for the random sampling from the Kaggle dataset.

        Raises RuntimeError if no personal dataframe exists yet (run
        add_audio_features first). The result is stored in
        self.df_personal_kaggle and returned."""
        if self.df_personal is None:
            raise RuntimeError(
                'No personal dataframe available; call add_audio_features() first.'
            )

        # Any truthy/falsy value is accepted, so n is always defined.
        n = self.df_personal.shape[0] if match_personal else n_size

        df_kaggle = pd.read_csv(self.kaggle_path)
        df_kaggle = df_kaggle.drop(columns=['id', 'release_date'])
        df_kaggle['y'] = np.zeros(df_kaggle.shape[0])
        # Strip the leading "['" and trailing "']" of the stringified artist
        # list. NOTE(review): rows with several artists keep their inner
        # quotes/commas, exactly as before -- confirm whether that is wanted.
        df_kaggle['artists'] = (
            df_kaggle['artists']
            .str.replace("['", "", regex=False)
            .str.replace("']", "", regex=False)
        )
        df_kaggle = df_kaggle.rename(columns={'artists': 'artistName', 'name': 'trackName'})
        df_kaggle = df_kaggle.sample(n=n, random_state=rs)

        df_personal = self.df_personal.copy()
        df_personal['y'] = np.ones(df_personal.shape[0])
        df_personal = df_personal.drop(
            columns=['type', 'id', 'uri', 'track_href', 'analysis_url', 'time_signature']
        )
        self.df_personal_kaggle = pd.concat([df_personal, df_kaggle])
        return self.df_personal_kaggle

In [3]:
# Credentials are read from environment variables so that no secret is ever
# saved inside the notebook. Set these before starting the kernel:
#   SPOTIPY_CLIENT_ID, SPOTIPY_CLIENT_SECRET  (the names spotipy itself documents)
#   SPOTIFY_TOKEN, SPOTIFY_USER_ID
client_id = os.environ.get('SPOTIPY_CLIENT_ID', '')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', '')
spotify_token = os.environ.get('SPOTIFY_TOKEN', '')
spotify_user_id = os.environ.get('SPOTIFY_USER_ID', '')

# Point out missing values here rather than failing later during API calls.
for _env_name, _env_value in [
    ('SPOTIPY_CLIENT_ID', client_id),
    ('SPOTIPY_CLIENT_SECRET', secret),
    ('SPOTIFY_TOKEN', spotify_token),
    ('SPOTIFY_USER_ID', spotify_user_id),
]:
    if not _env_value:
        print(f'warning: environment variable {_env_name} is not set')

In [6]:
# Create the dataset builder from the credentials defined in the cell above,
# then build the personal-listening dataframe with Spotify audio features.
cd = Create_DF(
    client_id=client_id,
    secret=secret,
    spotify_token=spotify_token,
    spotify_user_id=spotify_user_id,
)
df = cd.add_audio_features()

In [7]:
# Stack the personal tracks on the Kaggle sample (default arguments) and save
# the combined frame as a tab-separated file without the index column.
df_master = cd.merge_personal_kaggle()
df_master.to_csv(
    'data/spotify_personal_kaggle.csv',
    sep='\t',
    index=False,
)