In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import requests
import bs4
import time
import sqlite3
from getpass import getpass
from urllib.parse import urljoin

# SPOTIFY DATA EXTRACTION VIA API AND WEB SCRAPING

## STEP 1: Create Access Token
Spotify's API calls require an access/bearer token.<br>
The token can be acquired by using the **<i>"token"</i>** API call of Spotify.

In [2]:
# Prompt (without echoing the input) for the client_id obtained after creating
# a Spotify developer account. The explicit label tells the user which of the
# two credentials is being requested instead of the generic default prompt.
client_id = getpass('Spotify client_id: ')

 ········


In [3]:
# Prompt (without echoing the input) for the client_secret obtained after
# creating a Spotify developer account. The explicit label distinguishes this
# prompt from the client_id prompt.
client_secret = getpass('Spotify client_secret: ')

 ········


In [4]:
# Exchange the client credentials for a bearer token (client-credentials
# grant). The token is stored in `api_key` and used by the later API cells.
url = "https://accounts.spotify.com/api/token"
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
params = {
    'grant_type': 'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret
}
token_response = requests.post(url, headers=headers, data=params, timeout=30)
# Fail here with the HTTP status (e.g. wrong credentials) rather than with a
# KeyError on 'access_token' one line later.
token_response.raise_for_status()
response = token_response.json()
api_key = response['access_token']

## STEP 2: Retrieve a Pool of Filipino Artist IDs

Since the website is dynamic, scraping it is not as straightforward. Using developer tools we can inspect the network activity of the website and look for a specific query API call and recreate that API call in the notebook. The API call will then return a response containing a JSON formatted page that we can extract information from.

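The request headers used below are specific to each machine and browser session. Before running the cell, capture fresh header values (in particular the `Authorization` and `Client-Token` values) from the developer-tools network tab, as described above, and supply them to the request.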

In [5]:
# Query the web player's "searchArtists" operation for the keyword "filipino",
# one page at a time, and collect the Spotify ID of every artist returned.
#
# The Authorization and Client-Token values belong to a browser session, so
# they are prompted for instead of being committed to the notebook.
web_bearer_token = getpass('Web-player Authorization token: ')
if web_bearer_token.startswith('Bearer '):
    # Accept the header value pasted with or without its "Bearer " prefix.
    web_bearer_token = web_bearer_token[len('Bearer '):]
web_client_token = getpass('Web-player Client-Token: ')

result = []
n_artist = 120
page_size = 30  # used for both the paging step and the "limit" query variable
host = 'api-partner.spotify.com'

# Headers that do not change between pages. 'authority', 'method', 'path' and
# 'scheme' are presumably copied from the browser's captured request --
# NOTE(review): confirm the server actually needs them.
base_headers = {
    'authority': host,
    'method': 'GET',
    'scheme': 'https',
    'Accept': 'application/json',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-GB',
    'App-Platform': 'WebPlayer',
    'Authorization': f'Bearer {web_bearer_token}',
    'Client-Token': web_client_token,
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'https://open.spotify.com',
    'Referer': 'https://open.spotify.com/',
    'Sec-Ch-Ua': ('"Microsoft Edge";v="119", '
                  '"Chromium";v="119", '
                  '"Not?A_Brand";v="24"'),
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': "Windows",
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Spotify-App-Version': '1.2.26.607.g1b6449bf',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'),
}

for offset in range(0, n_artist, page_size):
    # Percent-encoded query string; only "offset" and "limit" vary.
    path = ('/pathfinder/v1/query?'
            'operationName=searchArtists&variables=%7B%22searchTerm'
            f'%22%3A%22filipino%22%2C%22offset%22%3A{offset}%2C%22limit%'
            f'22%3A{page_size}%2C%22numberOfTopResults%22%3A20%2C%22includeAu'
            'diobooks%22%3Atrue%7D&extensions=%7B%22persistedQuery%'
            '22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%224e7c'
            'dd33163874d9db5e08e6fabc51ac3a1c7f3588f4190fc04c5b863f'
            '6b82bd%22%7D%7D')
    url = f'https://{host}{path}'
    # Keep the 'path' header in step with the page actually being requested.
    headers = {**base_headers, 'path': path}
    page_response = requests.get(url, headers=headers, timeout=30)
    # Surface expired tokens / throttling as an HTTP error instead of a
    # KeyError while digging into the JSON below.
    page_response.raise_for_status()
    content = page_response.json()
    result.extend(content['data']['searchV2']['artists']['items'])

# Each item's URI is split on ':' and the third field is kept as the artist ID.
ids = [item['data']['uri'].split(':')[2] for item in result]

## STEP 3: Get a list of tracks with audio features

### Retrieve top tracks for each artist
The **<i>top tracks</i>** API call can give us the top tracks for each of the artist IDs that we retrieved earlier.

In [6]:
# Fetch the top tracks (market 'PH') of every artist in `ids`, then flatten
# the responses into a list of track IDs.
headers = {'Authorization': f'Bearer {api_key}'}
# The artist is identified by the URL path, so only 'market' is sent as a
# query parameter.
params = {'market': 'PH'}

tracks = []
for spotify_id in ids:
    url = f'https://api.spotify.com/v1/artists/{spotify_id}/top-tracks'
    top_tracks_response = requests.get(url, headers=headers,
                                       params=params, timeout=30)
    # Stop on an HTTP error instead of storing an error payload that would
    # later fail with KeyError: 'tracks'.
    top_tracks_response.raise_for_status()
    tracks.append(top_tracks_response.json())

# Each track URI is split on ':' and the third field is kept as the track ID.
track_ids = [inner_track['uri'].split(':')[2]
             for track in tracks
             for inner_track in track['tracks']]

# Add track id of exemplar song (for IR sanity check)
track_ids.append('14cz01nTqAPQ9uXzIlH9qD')

### Get the audio features per track
The **<i>audio-features</i>** API call will give us the audio features of each track upon providing its id.

In [7]:
# Download the audio-features record of every collected track into `stats`
# (one JSON dictionary per track).
headers = {'Authorization': f'Bearer {api_key}'}

stats = []
# dict.fromkeys removes repeated ids while keeping first-seen order, so a
# track shared by several artists is requested only once.
for track_id in dict.fromkeys(track_ids):
    # The track is identified by the URL path; no query parameters are needed.
    url = f'https://api.spotify.com/v1/audio-features/{track_id}'
    features_response = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error rather than silently storing an error payload
    # among the feature records.
    features_response.raise_for_status()
    stats.append(features_response.json())

In [8]:
# Convert the audio-features records to a dataframe with one row per track id.
# Every record must carry an 'id'; anything else (for example an API error
# payload) cannot be tied to a track, so fail with a message that shows it.
malformed = [stat for stat in stats if 'id' not in stat]
if malformed:
    raise ValueError(
        f'{len(malformed)} audio-features record(s) have no "id" field; '
        f'first offender: {malformed[0]!r}')

# The DataFrame constructor aligns the dictionaries by key, so a field that is
# missing from one record becomes NaN instead of breaking the column lengths.
df_tracks = pd.DataFrame(stats).drop_duplicates(subset='id',
                                                ignore_index=True)

In [9]:
# Retrieve artists of tracks in df_tracks by scraping each track's public page.
artists = []
base_url = 'https://open.spotify.com/track/'
links = [urljoin(base_url, id_) for id_ in df_tracks['id']]
for link in links:
    page = requests.get(link, timeout=30)
    # Do not parse an error page as if it were a track page.
    page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    # The text of the first <a> element is taken as the artist name.
    # NOTE(review): this relies on the current page layout -- confirm.
    first_anchor = soup.find('a')
    # Record None when the page has no anchor so the row can be inspected
    # later instead of aborting the whole scrape with an AttributeError.
    artists.append(first_anchor.text if first_anchor is not None else None)

df_tracks['artist'] = artists

Since we only queried the search API with the keyword 'Filipino', the resulting pool of artists is not perfectly filtered to Filipino artists only. To improve the pool, we did a quick scan of the artist names, identified a few foreign artists, and dropped their tracks from the dataframe.

In [10]:
# List the distinct artist names in df_tracks so they can be scanned by eye.
print(df_tracks['artist'].unique())

['Eraserheads' 'Ben&Ben' 'Flow G' 'Bosx1ne' 'PDL' 'Ex Battalion'
 'Shanti Dope' 'Dilaw' 'Cup of Joe' 'Hale' 'Moira Dela Torre'
 'Maximillian' 'mrld' 'Maki' 'Cesca' 'KaixAaron' 'Filipino Music'
 'Hope Filipino Worship' 'Skusta Clee' 'MC Einstein'
 'Philippine Madrigal Singers' 'Ian Filipino'
 'Filipino American Symphony Orchestra' 'Filipino Music Productions'
 'DJ Jester The Filipino Fist' 'The Noise Revival Orchestra'
 'Endorfino Filipino' 'Paskong Pinoy' 'Cornerstone Filipino Worship Team'
 'Borben Dallas & His Filipino Cupids' 'Calein'
 'Priscilla Lee & The Filipino Rockets' 'Jed Baruelo'
 'Grace Filipino Worship' 'Cornerstone Filipino Worship'
 'Filipino Christian Church Abu Dhabi' 'Kiyo' 'syd hartha'
 'The Itchyworms' 'Mayonnaise' 'Asin' 'Belle Mariano' 'SB19' 'JVKE' 'Zild'
 'MYMP' 'Cueshé' 'Filipino Boxing' 'Kamikazee' 'Bandang Lapis' 'Callalily'
 'Freestyle' 'Malayang Pilipino Music' 'Regine Velasquez' 'Ogie Alcasid'
 'Umuusbong Na Samahang May Atikha Sa Filipino' 'Two Filipinos'

In [11]:
# Artist names to exclude. Matching is on the exact, full artist name.
# NOTE(review): 'Philippine' on its own equals none of the names printed
# above -- confirm which artist was intended.
foreign = [
    'Álvaro De Luna',
    'JVKE',
    'Philippine',
    'Borben Dallas & His Filipino Cupids',
]

# Boolean mask of rows whose artist is in the exclusion list; keep the rest
# and renumber the index from zero.
is_foreign = df_tracks['artist'].isin(foreign)
df_tracks = df_tracks[~is_foreign].reset_index(drop=True)

## STEP 4: Write to CSV for use in the Main Report Notebook

In [12]:
# Write the filtered tracks to 'Tracks.csv' in the working directory, without
# the dataframe index column.
df_tracks.to_csv('Tracks.csv', index=False)

## STEP 5: Retrieve information on each artist (followers and popularity)

The code below uses the **<i>artists</i>** API call to get information on each artist, such as popularity and follower count.

In [13]:
# Look up the artists in `ids`. `dict_artists` ends up as a list of response
# payloads, each holding its artist objects under the 'artists' key.
end_point = 'https://api.spotify.com/v1/artists'
headers = {'Authorization': f'Bearer {api_key}'}
# This endpoint takes a comma-separated list of ids (documented maximum: 50),
# so the ids are sent in batches instead of one request per artist.
batch_size = 50

dict_artists = []
for start in range(0, len(ids), batch_size):
    params = {'ids': ','.join(ids[start:start + batch_size])}
    artists_response = requests.get(end_point, headers=headers,
                                    params=params, timeout=30)
    # Stop on an HTTP error instead of storing a payload without 'artists'.
    artists_response.raise_for_status()
    dict_artists.append(artists_response.json())

In [14]:
# Flatten the response payloads into a single list of artist objects.
artist_items = [artist
                for payload in dict_artists
                for artist in payload['artists']]

# One parallel list per field of interest (None when a field is absent).
followers = [artist.get('followers') for artist in artist_items]
names = [artist.get('name') for artist in artist_items]
popularity = [artist.get('popularity') for artist in artist_items]
id_ = [artist.get('id') for artist in artist_items]

df = pd.DataFrame({
    'followers': followers,
    'artist_name': names,
    'popularity': popularity,
    'id_for_artist': id_,
})


def _total_followers(entry):
    """Return entry['total'] when entry is a dict, otherwise None."""
    return entry.get('total') if isinstance(entry, dict) else None


# Replace each raw 'followers' object with its 'total' value.
df['followers'] = df['followers'].apply(_total_followers)
# Final table: sorted by artist name, index renumbered from zero.
df_followers = df.sort_values('artist_name').reset_index(drop=True)

In [15]:
# Write the artist table to 'Artists_info.csv' in the working directory. No
# index argument is given, so the dataframe index is written as the first,
# unnamed column.
df_followers.to_csv('Artists_info.csv')