In [None]:
# !pip3 install openpyxl

In [1]:
import requests
import base64
import os
import numpy as np
import pandas as pd
import json
import time

In [162]:
class Spotify():
    '''
    Small client for the Spotify Web API: resolves ISRC/UPC codes to Spotify
    ids and fetches audio features, track and album metadata in batches.

    Example use:
        spotify_instance = Spotify(CLIENT_ID, CLIENT_SECRET)
        features = spotify_instance.get_audio_features(list_of_track_ids)
    '''

    TOKEN_URL = 'https://accounts.spotify.com/api/token'
    API_URL = 'https://api.spotify.com/v1'
    # Seconds to wait for any single HTTP call before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, 
                 client_id=None, 
                 client_secret=None):
        '''
        Store the app credentials and immediately request an access token.
        '''
        self.client_id = client_id
        self.client_secret = client_secret

        self.get_access_token()

    def get_access_token(self):
        '''
        Request a client-credentials token and store it in self.access_token.

        Raises requests.HTTPError when the token endpoint rejects the
        credentials, instead of failing later with an opaque KeyError.
        '''
        secret_bytes = '{}:{}'.format(self.client_id, self.client_secret).encode('utf-8')
        secret_enc = base64.b64encode(secret_bytes).decode('utf-8')

        r = requests.post(self.TOKEN_URL,
                          headers={'Authorization': 'Basic {}'.format(secret_enc)},
                          data={'grant_type': 'client_credentials'},
                          timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()

        self.access_token = r.json()['access_token']

    def _get(self, url):
        '''
        GET `url` with the bearer token and return the response.

        On HTTP 429 the request is re-sent after sleeping for the number of
        seconds given in the Retry-After header. On HTTP 401 the token is
        refreshed once and the request is re-sent.
        '''
        refreshed = False
        while True:
            r = requests.get(url,
                             headers={'Authorization': 'Bearer ' + self.access_token},
                             timeout=self.REQUEST_TIMEOUT)
            if r.status_code == 429:
                retry_secs = int(r.headers.get('Retry-After', 1))
                print('sleeping for {}'.format(retry_secs))
                time.sleep(retry_secs)
                continue  # the request must be re-issued, not just waited on
            if r.status_code == 401 and not refreshed:
                self.get_access_token()
                refreshed = True
                continue
            return r

    def _search_first_id(self, item_type, field, value):
        '''
        Search for `field:value` and return the id of the first hit, or None
        when there is no hit or the response cannot be parsed.
        '''
        r = self._get('{}/search?type={}&q={}:{}'.format(self.API_URL, item_type, field, value))
        try:
            return r.json()[item_type + 's']['items'][0]['id']
        except (KeyError, IndexError, TypeError, ValueError):
            return None

    def _fetch_batch(self, endpoint, ids, key):
        '''
        GET `endpoint?ids=<ids>` and return the non-null entries of the list
        stored under `key`. Raises requests.HTTPError on an error response.
        '''
        r = self._get('{}/{}?ids={}'.format(self.API_URL, endpoint, ids))
        r.raise_for_status()
        return [item for item in r.json()[key] if item]

    def lookup_spotify_id(self, id_list):
        '''
        this takes in ISRC and returns ISRC to Spotify ID map

        Returns a DataFrame with columns 'isrc' and 'spotify_id', one row per
        input id; 'spotify_id' is None when no track was found.
        '''
        # Build all rows first: DataFrame.append was removed in pandas 2.0
        # and re-copied the whole frame on every call.
        rows = [{'isrc': str(i), 'spotify_id': self._search_first_id('track', 'isrc', i)}
                for i in id_list]
        return pd.DataFrame(rows, columns=['isrc', 'spotify_id'])

    def lookup_album_id(self, id_list):
        '''
        this takes in UPC and returns UPC to Spotify Album ID

        Returns a DataFrame with columns 'upc' and 'spotify_album_id', one row
        per input id; 'spotify_album_id' is None when no album was found.
        '''
        rows = [{'upc': str(i), 'spotify_album_id': self._search_first_id('album', 'upc', i)}
                for i in id_list]
        return pd.DataFrame(rows, columns=['upc', 'spotify_album_id'])

    def group_ids(self, list_of_ids, bucket_size=100):
        '''
        Split ids into consecutive buckets of at most `bucket_size` and join
        each bucket with commas.

        Returns a Series named 'id' (index named 'bucket', float-valued as
        before) holding one comma-separated string per bucket; it is empty
        for empty input.
        '''
        ids = [str(i) for i in list_of_ids]
        chunks = [','.join(ids[start:start + bucket_size])
                  for start in range(0, len(ids), bucket_size)]
        bucket_index = pd.Index(np.arange(len(chunks), dtype=float), name='bucket')
        return pd.Series(chunks, index=bucket_index, name='id', dtype=object)

    def get_audio_features(self, id_list, bucket_size=100):
        '''
        Fetch audio features for Spotify track ids, `bucket_size` ids per request.

        Returns one row per track that Spotify returned features for; any
        column in the expected list that is missing from a row is set to None.
        NOTE(review): Spotify announced restrictions on this endpoint for
        newly registered apps in late 2024 - confirm it is still available.
        '''
        col_list = ['acousticness', 'liveness', 'instrumentalness', 
                    'analysis_url', 'uri', 'time_signature', 'loudness', 
                    'speechiness', 'duration_ms', 'danceability', 'mode', 
                    'id', 'energy', 'key', 'track_href', 
                    'valence', 'type', 'tempo']

        rows = []
        for ids in self.group_ids(id_list, bucket_size).values:
            for feats in self._fetch_batch('audio-features', ids, 'audio_features'):
                for c in col_list:
                    feats.setdefault(c, None)
                rows.append(feats)

        if not rows:
            return pd.DataFrame(columns=col_list)
        return pd.DataFrame(rows)

    def get_tracks(self, id_list, bucket_size=50):
        '''
        Fetch track metadata for Spotify track ids, `bucket_size` ids per request.

        Returns a DataFrame with columns id, popularity, album_id and
        release_date (the last two taken from the track's album object).
        '''
        out_cols = ['id', 'popularity', 'album_id', 'release_date']

        records = []
        for ids in self.group_ids(id_list, bucket_size).values:
            for track in self._fetch_batch('tracks', ids, 'tracks'):
                album = track.get('album') or {}
                records.append({'id': track.get('id'),
                                'popularity': track.get('popularity'),
                                'album_id': album.get('id'),
                                'release_date': album.get('release_date')})

        # Passing columns keeps the schema stable even when nothing came back.
        return pd.DataFrame(records, columns=out_cols)

    def get_albums(self, id_list, bucket_size=20):
        '''
        Fetch album metadata for Spotify album ids, `bucket_size` ids per request.

        Returns a DataFrame with columns id, release_date,
        release_date_precision, total_tracks and type; a field missing from
        the response becomes None.
        '''
        col_list = ['id','release_date', 'release_date_precision','total_tracks','type']

        records = []
        for ids in self.group_ids(id_list, bucket_size).values:
            for album in self._fetch_batch('albums', ids, 'albums'):
                records.append({c: album.get(c) for c in col_list})

        return pd.DataFrame(records, columns=col_list)

In [163]:
# Credentials come from the environment so they are never stored in the
# notebook or in version control. The pair that used to be hardcoded here
# should be treated as leaked and rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']

# The constructor already requests an access token, so no separate
# get_access_token() call is needed.
sp = Spotify(client_id, client_secret)

In [None]:
data_path = r'data/'
dataset_path = os.path.join(data_path, 'mri_full_data.csv')

# Read the UPC as text from the start: casting after numeric inference turns
# values into '657481106983.0' whenever the column contains NaN, and drops
# leading zeros, both of which break the UPC search later on.
music_data = (
    pd.read_csv(dataset_path, dtype={'Display Upc': 'string'})
    .drop(columns='Unnamed: 0')
)

In [165]:
# Distinct, non-missing identifiers to send to the Spotify search endpoint.
isrc_ids = music_data['ISRC'].dropna().unique().tolist()
upc_ids = music_data['Display Upc'].dropna().unique().tolist()

In [None]:
# Resolve each ISRC to a Spotify track id (one search request per ISRC).
isrc_to_spotify_id = sp.lookup_spotify_id(isrc_ids)

In [None]:
# Resolve each UPC to a Spotify album id (one search request per UPC).
upc_to_spotify_album_id = sp.lookup_album_id(upc_ids)

In [None]:
# index=False: the default RangeIndex carries no information and otherwise
# comes back as an 'Unnamed: 0' column when the file is read again.
isrc_to_spotify_id.to_csv(os.path.join(data_path, 'isrc_to_spotify_id.csv'), index=False)
upc_to_spotify_album_id.to_csv(os.path.join(data_path, 'upc_to_spotify_album_id.csv'), index=False)

In [None]:
# Distinct Spotify track ids for the ISRCs that were found.
spotify_ids = isrc_to_spotify_id['spotify_id'].dropna().unique()

In [None]:
# Fetch audio features for every resolved track id (batched requests).
track_audio_feats = sp.get_audio_features(spotify_ids)

In [None]:
# index=False avoids writing the RangeIndex, which would otherwise reappear as
# an 'Unnamed: 0' column each time the file is read back and saved again.
track_audio_feats.to_csv(os.path.join(data_path, 'track_audio_feats.csv'), index=False)

In [None]:
# Distinct Spotify album ids for the UPCs that were found.
spotify_album_ids = (
    upc_to_spotify_album_id['spotify_album_id']
    .dropna()
    .unique()
    .tolist()
)

In [None]:
# Fetch release date, track count and type for every resolved album id.
album_information = sp.get_albums(spotify_album_ids)

In [None]:
# index=False keeps the meaningless RangeIndex out of the saved file.
album_information.to_csv(os.path.join(data_path, 'album_information.csv'), index=False)

# join all data

In [None]:
# Keep the joined frame in a variable (it was previously computed and thrown
# away) and preview only a few rows instead of rendering the whole frame.
# validate='many_to_one' makes the merge fail loudly if a lookup table ever
# contains duplicate keys, which would silently multiply revenue rows.
music_with_ids = music_data.merge(
    isrc_to_spotify_id,
    how = 'left',
    left_on = 'ISRC',
    right_on = 'isrc',
    validate = 'many_to_one'
).merge(
    upc_to_spotify_album_id,
    how = 'left',
    left_on = 'Display Upc',
    right_on = 'upc',
    validate = 'many_to_one'
)
music_with_ids.head()

In [154]:
# Preview the first two rows of the raw revenue data.
music_data.head(2)

Unnamed: 0,Statement Month,Store Name,Label Name,Artist Name,Release Name,Track Artist (Performer),Track Name,ISRC,Display Upc,Country Code,Continent,"Stream Ad-Supp, Stream Prem, Download, Physical, Locker, Other",Units,Gross Revenue USD,Net Revenue USD
0,2018-07,iTunes/Apple,MRI Entertainment,Rebelution,Free Rein,,,,657481106983,US,North America,Download,6650,"$46,550.00","$41,895.00"
1,2021-07,iTunes/Apple,MRI Entertainment,Rebelution,In the Moment,,,,657481109182,US,North America,Download,1723,"$12,061.00","$10,854.90"


In [156]:
# Preview the ISRC -> Spotify track id mapping.
isrc_to_spotify_id.head(2)

Unnamed: 0,isrc,spotify_id
0,US4CL1410008,4TyCnstYu6LGrjka5WW6ft
1,US4CL1410011,49CdYBpfABUa0ZfT8FizQZ


In [155]:
# Preview the UPC -> Spotify album id mapping.
upc_to_spotify_album_id.head(2)

Unnamed: 0,isrc,spotify_album_id
0,657481100000.0,5FfbqVo0OapeCvoJvjWlOp
1,657481100000.0,4ZO52CTQ2EzwDmAIrFjHaW


In [21]:
# Total number of rows in the revenue data.
music_data.shape[0]

369017

In [29]:
# Number of distinct ISRC values; a missing value counts as one value.
music_data['ISRC'].nunique(dropna=False)

1182

In [30]:
# Number of distinct UPC values; a missing value counts as one value.
music_data['Display Upc'].nunique(dropna=False)

287

In [28]:
# Number of distinct stores; a missing value counts as one value.
music_data['Store Name'].nunique(dropna=False)

47

In [37]:
# Distinct values of the sale-type column.
sale_type_col = 'Stream Ad-Supp, Stream Prem, Download, Physical, Locker, Other'
music_data[sale_type_col].unique()

array(['Stream Premium', 'Stream Ad-Supported', 'Download', 'Locker',
       'Other'], dtype=object)

In [None]:
# NOTE(review): this cell held the incomplete expression `audio_feats.`, a
# SyntaxError that stops Restart & Run All. A preview is assumed to be the
# intent - confirm. `audio_feats` is not defined in the visible cells.
audio_feats.head()

In [20]:
# Rows that have no ISRC.
missing_isrc = music_data['ISRC'].isna()
music_data.loc[missing_isrc]

Unnamed: 0,Statement Month,Store Name,Artist Name,Release Name,Track Artist (Performer),Track Name,ISRC,Display Upc,Country Code,Continent,"Stream Ad-Supp, Stream Prem, Download, Physical, Locker, Other",Units,Gross Revenue USD
176,2021-02,iTunes/Apple,Rebelution,Count Me In,,,,886444576840,US,North America,Download,51,357.00
225,2021-01,iTunes/Apple,Double Tiger,The Journey,,,,657481107980,US,North America,Download,40,280.00
233,2021-03,iTunes/Apple,Rebelution,Count Me In,,,,886444576840,US,North America,Download,39,273.00
256,2021-01,iTunes/Apple,Rebelution,Count Me In,,,,886444576840,US,North America,Download,35,245.00
304,2021-02,iTunes/Apple,Rebelution,Falling into Place,,,,886445840513,US,North America,Download,59,206.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368399,2021-01,Amazon Music,Unified Highway,Headlines,,,,657481107881,US,North America,Download,0,0.00
368622,2021-02,iTunes/Apple,Cas Haley,La Si Dah,,,,886443999954,US,North America,Download,0,0.00
368912,2021-03,Amazon Music,Jesse Royal,Lily of da Valley,,,,886446677620,GB,Europe,Download,0,0.00
369015,2021-03,Musictoday,Easy Star All-Stars,Easy Star's Thrillah,Easy Star All-Stars,,,886443510814,US,North America,Download,1,6.79


In [None]:
# 369017 rows in data
# 399 rows without ISRCs
# 

In [22]:
# Number of rows in audio_feats.
# NOTE(review): audio_feats is not defined in the visible cells - confirm where it is loaded.
audio_feats.shape[0]

953

In [18]:
# Number of distinct ISRC values; a missing value counts as one value.
music_data['ISRC'].nunique(dropna=False)

1062

In [15]:
# Number of rows in audio_feats (repeats an earlier cell).
audio_feats.shape[0]

953

In [12]:
# Preview the audio-feature table.
# NOTE(review): audio_feats is not defined in the visible cells; presumably
# it is track_audio_feats.csv read back from disk - confirm.
audio_feats.head()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0,0.92,0.56,8,-9.53,1,0.0576,0.0551,4.1e-05,0.097,0.625,140.004,audio_features,49CdYBpfABUa0ZfT8FizQZ,spotify:track:49CdYBpfABUa0ZfT8FizQZ,https://api.spotify.com/v1/tracks/49CdYBpfABUa...,https://api.spotify.com/v1/audio-analysis/49Cd...,273996,4
1,1,0.685,0.54,9,-9.701,1,0.0253,0.181,0.015,0.114,0.235,109.926,audio_features,4TyCnstYu6LGrjka5WW6ft,spotify:track:4TyCnstYu6LGrjka5WW6ft,https://api.spotify.com/v1/tracks/4TyCnstYu6LG...,https://api.spotify.com/v1/audio-analysis/4TyC...,209253,4
2,2,0.877,0.618,5,-5.597,0,0.0456,0.0654,1e-05,0.056,0.846,90.997,audio_features,548ddxWI4sZKK1aCLuMR16,spotify:track:548ddxWI4sZKK1aCLuMR16,https://api.spotify.com/v1/tracks/548ddxWI4sZK...,https://api.spotify.com/v1/audio-analysis/548d...,242723,4
3,3,0.8,0.661,7,-8.291,1,0.0304,0.0204,0.000107,0.0687,0.59,92.015,audio_features,0BweE3lWBMXRPWWLtLV5z8,spotify:track:0BweE3lWBMXRPWWLtLV5z8,https://api.spotify.com/v1/tracks/0BweE3lWBMXR...,https://api.spotify.com/v1/audio-analysis/0Bwe...,189613,4
4,4,0.775,0.824,11,-2.558,0,0.047,0.00397,0.0,0.0509,0.683,137.989,audio_features,60j4KO4XiFxLM6qy2Enic2,spotify:track:60j4KO4XiFxLM6qy2Enic2,https://api.spotify.com/v1/tracks/60j4KO4XiFxL...,https://api.spotify.com/v1/audio-analysis/60j4...,187710,4


In [40]:
# Preview the ISRC -> Spotify id table.
# NOTE(review): id_map is not defined in the visible cells; presumably it is
# isrc_to_spotify_id.csv read back from disk - confirm.
id_map.head()

Unnamed: 0,isrc,spotify_id
0,US4CL1410011,49CdYBpfABUa0ZfT8FizQZ
1,US4CL1410008,4TyCnstYu6LGrjka5WW6ft
2,US4CL1610057,548ddxWI4sZKK1aCLuMR16
3,US4CL1410004,0BweE3lWBMXRPWWLtLV5z8
4,US4CL1610055,60j4KO4XiFxLM6qy2Enic2


In [43]:
# Attach the Spotify track id to every revenue row; rows without a matching
# ISRC keep a missing spotify_id because this is a left join.
# NOTE(review): id_map is not defined in the visible cells - confirm its source.
merged_df = pd.merge(
    music_data,
    id_map,
    how='left',
    left_on='ISRC',
    right_on='isrc',
)

In [44]:
# Row count after the join.
merged_df.shape[0]

369017

In [49]:
# Number of revenue rows that were matched to a Spotify track id.
int(merged_df['spotify_id'].notna().sum())

364671