Parameters

In [None]:
#Spotify API requires you to set-up an app to use its developer tools
#Find out more at the following link and use it to fill in secret and cid in the cell below
print('https://developer.spotify.com/documentation/web-api/concepts/apps')

In [2]:
#Spotify login info

secret =    #Fill in
cid =       #Fill in

#Your name
your_name = 'My_Name'

#CSV file names
your_listening_csv = your_name + '_listen_data.csv'
audio_details_csv = your_name + '_audio_data.csv'
track_det_csv = your_name + '_track_data.csv'
art_csv = your_name + '_art_data.csv'


Upload personal data

In [13]:
import os, json
import pandas as pd

# Identify all json files in current directory
json_files = [pos_json for pos_json in os.listdir(os.getcwd()) if pos_json.endswith('.json')]

# Add all json data to dataframe
listening_df = None
for file_name in json_files:
    single_file_listening=pd.read_json(file_name)
    listening_df=pd.concat(objs=[listening_df,single_file_listening], ignore_index=True)

#Eport listening data to csv
print(listening_df.shape)
listening_df = listening_df[['ts','conn_country',
       'master_metadata_track_name', 'master_metadata_album_artist_name',
       'master_metadata_album_album_name', 'spotify_track_uri','reason_end', 'ms_played']]

listening_df.rename(columns={'ts': 'Time Played', 'master_metadata_track_name': 'Track',
                   'master_metadata_album_artist_name': 'Artist', 'master_metadata_album_album_name': 'Album'}
                    , inplace=True)

listening_df.to_csv(your_listening_csv, encoding='utf-8', index=False)
len(listening_df)

(60076, 21)


60076

In [3]:
import os, json
import pandas as pd

# Identify all json files in current directory
path_to_json = 'json/'
json_files = [pos_json for pos_json in os.listdir(os.getcwd()) if pos_json.endswith('.json')]

# Add all json data to dataframe
listening_df = None
for file_name in json_files:
    single_file_listening=pd.read_json(file_name)
    listening_df=pd.concat(objs=[listening_df,single_file_listening], ignore_index=True)

#Eport listening data to csv
print(listening_df.columns.values.tolist())

['ts', 'username', 'platform', 'ms_played', 'conn_country', 'ip_addr_decrypted', 'user_agent_decrypted', 'master_metadata_track_name', 'master_metadata_album_artist_name', 'master_metadata_album_album_name', 'spotify_track_uri', 'episode_name', 'episode_show_name', 'spotify_episode_uri', 'reason_start', 'reason_end', 'shuffle', 'skipped', 'offline', 'offline_timestamp', 'incognito_mode']


Create chunked list of unique songs

In [4]:
import pandas as pd
import numpy as np

unique_songs_array = pd.unique(listening_df['spotify_track_uri'])

#investigate pd is not null
unique_songs_array = unique_songs_array[unique_songs_array!=None]
unique_songs_array = [i for i in unique_songs_array if i is not np.nan]

chunks = [unique_songs_array[x:x+100] for x in range(0, len(unique_songs_array), 100)]
print(len(chunks))
# print(unique_songs_array)

200


Retrieve audio data from spotify

In [6]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep


#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager, requests_timeout=10, retries=10)
audio_data=[]
for batch in chunks:
    for track in sp.audio_features(batch):
        try: 
            audio_data.append(track)
        except:
            break
    sleep(1.5)


audio_data = [i for i in audio_data if i is not None]    
audio_df = pd.DataFrame(audio_data)
audio_df_concise = audio_df[['danceability','energy', 'uri']]
# print(song_data)
print(audio_df_concise.shape)
audio_df_concise.to_csv(audio_details_csv, encoding='utf-8', index=False)

(19929, 3)


Make smaller chunks for track details

In [7]:
# Make smaller chunks
import numpy as np
from scipy.special import entr

unique_songs_array = pd.unique(listening_df['spotify_track_uri'])
unique_songs_array = unique_songs_array[unique_songs_array!=None]
unique_songs_array = [i for i in unique_songs_array if i is not np.nan]
small_chunks = [unique_songs_array[x:x+50] for x in range(0, len(unique_songs_array), 50)]

Import track data

In [9]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret) 
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager, requests_timeout=10, retries=10)

track_det_data=[]
for small_batch in small_chunks:
    for track_det in sp.tracks(small_batch)['tracks']:
        try: 
            track_det_data.append(track_det)
        except:
            break
    sleep(1.5)


track_det_data = [i for i in track_det_data if i is not None]
track_det_df = pd.DataFrame(track_det_data)
track_det_data_concise =  track_det_df[['popularity', 'uri','artists']]
# print(song_data)
print(track_det_data_concise.shape)
track_det_data_concise.to_csv(track_det_csv, encoding='utf-8', index=False)

(19929, 3)


Retrieve Artist IDs

In [4]:
import pandas as pd
track_det_data=pd.read_csv(track_det_csv)

art_ids=pd.DataFrame()
art_ids['artist_ids']=track_det_data['artists'].str.extract(r"((?<=id': ').*?(?='))")



Make artist chunks

In [5]:
# Make smaller chunks
import numpy as np
import pandas as pd
from scipy.special import entr

unique_art_array = pd.unique(art_ids['artist_ids'])
unique_art_array = unique_art_array[unique_art_array!=None]
unique_art_array = [i for i in unique_art_array if i is not np.nan]
art_chunks = [unique_art_array[x:x+50] for x in range(0, len(unique_art_array), 50)]

Import artist details

In [6]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret) 
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager, requests_timeout=10, retries=10)

art_data=[]
for art_batch in art_chunks:
    for art_det in sp.artists(art_batch)['artists']:
        try: 
            art_data.append(art_det)
        except:
            break
    sleep(1.5)


art_data = [i for i in art_data if i is not None]
art_df = pd.DataFrame(art_data)

art_concise_df =  art_df[['genres','id']]
art_concise_df.rename(columns={'genres': 'genre', 'id': 'artist_id'}, inplace=True)

art_concise_df.to_csv(art_csv, encoding='utf-8', index=False)
   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  art_concise_df.rename(columns={'genres': 'genre', 'id': 'artist_id'}, inplace=True)


Import data from CSVs

In [None]:
audio_data=pd.read_csv(audio_details_csv)
listen_data = pd.read_csv(your_listening_csv)
track_det_data=pd.read_csv(track_det_csv)

In [None]:
from pyexcelerate import Workbook

track_temp_table=audio_data.merge(track_det_data, left_on='uri', right_on='uri')
master_table = track_temp_table.merge(listen_data, left_on='uri', right_on='spotify_track_uri')

master_table.to_csv(your_name + '_master_data.csv', encoding='utf-8', index=False)

Prep artist dataset

In [None]:
#Import Data from CSVs (specified in parameters)
track_det_data=pd.read_csv(track_det_csv)
track_det_data['artists']=track_det_data['artists'].str.extract(r"((?<=id': ').*?(?='))")   #isolate artist
listen_data = pd.read_csv(your_listening_csv)
art_data=pd.read_csv(art_csv)

#Create Genre List
full_genre_list=[]
for i in range(art_data.shape[0]):
    temp_genre_list = art_data['genre'][i].strip("][").split(', ')
    temp_genre_list = [s.strip("'") for s in temp_genre_list]
    art_data['genre'][i]= temp_genre_list
    for j in range(len(temp_genre_list)):
        full_genre_list.append(temp_genre_list[j])
genre_list=list(set(full_genre_list))

#Create a row for every genre each track has (5 genres, 5 rows)
art_genre_data = pd.DataFrame({
      col:np.repeat(art_data[col].values, art_data['genre'].str.len())
      for col in art_data.columns.drop('genre')}
    ).assign(**{'genre':np.concatenate(art_data['genre'].values)})[art_data.columns]

Create artist dataset and export to csv

In [None]:
art_temp_table_1=art_genre_data.merge(track_det_data, left_on='artist_id', right_on='artists')
art_temp_table_2=art_temp_table_1.merge(audio_data, left_on='uri', right_on='uri')
art_full_table = art_temp_table_2.merge(listen_data, left_on='uri', right_on='spotify_track_uri')

art_full_table.to_csv(your_name + '_art_master_data.csv', encoding='utf-8', index=False)