# Scrape Artist Song Clips

This notebook looks through the top 10000 artists in the provided dataset, and for each artist, finds the best possible matches on Spotify using iTunes as a fallback, then collects 30 second track preview URLs for each artist and saves them all to tracks.json. The later half of the notebook uses the track preview lists to download a specified number of preview clips for each artist, then saves them to the tracks directory, with the filename and directory based on the artist's MusicBrainz ID.

In [6]:
import pandas as pd
import numpy as np
import json
import sys
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
from multiprocessing import Pool
from tqdm import tnrange, tqdm_notebook
import musicbrainzngs
import requests
import time
from slugify import slugify
import os
import hashlib
from utils import load_json, make_logger
import logging

params = load_json('params.json')
logger = make_logger('scrapeartists', 'log/scrapeartists.log')
    
musicbrainzngs.set_useragent('CS230 Scaper', '0.1', 'naallen@stanford.edu')

client_id = params['spotifyClientID']
client_secret = params['spotifyClientSecret']

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Load data and get unique artists

This part presumes that you have downloaded the dataset at http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz and extracted the contents to the dataset directory.

In [3]:
playscols = ['usersha1', 'mbid', 'artistname', 'plays']
playsdf = pd.read_csv('dataset/usersha1-artmbid-artname-plays.tsv', sep='\t', names=playscols, index_col=False)
profilecols = ['usersha1', 'gender', 'age', 'country', 'registration']
profiledf = pd.read_csv('dataset/usersha1-profile.tsv', sep='\t', names=profilecols, index_col=False)
df = playsdf.merge(profiledf, on=['usersha1'], how='left')

logger.info('Loading Last.FM data...')
# Clean the data by removing artists without mbid (usually nonsense)
# and profiles without registration (also usually nonsense, maybe dataset artifact)
# Also delete entries with unknown artists, and artists that only show up once
df = df[df['mbid'].notnull() & df['registration'].notnull() 
        & (df['artistname'] != '[unknown]') & df.duplicated(subset='artistname', keep=False)]
logger.info('Loaded Last.FM data')

In [1]:
uniqueartists = df['mbid'].value_counts()
uniqueartists = uniqueartists.to_frame(name='listeners')
uniqueartists.head(10)
logger.info('{} unique artists in dataset'.format(len(uniqueartists)))

NameError: name 'df' is not defined

## Save/load unique artists

In [3]:
uniqueartists.to_csv('dataset/uniqueartists.csv')
logger.info('Saved uniqueartists.csv')

NameError: name 'uniqueartists' is not defined

In [8]:
uniqueartists = pd.read_csv('dataset/uniqueartists.csv', index_col = 0)
logger.info('Loaded uniqueartists.csv')

Loaded uniqueartists.csv


## MBID to Artist Mapping

In [6]:
mbid_to_artist = df[['mbid', 'artistname']].drop_duplicates('mbid').set_index('mbid')
mbid_to_artist.head(10)
logger.info('MBID to artist name mapping done')

Unnamed: 0_level_0,artistname
mbid,Unnamed: 1_level_1
3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch
f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte
b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge
3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking
bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks
8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers
6531c8b1-76ea-4141-b270-eb1ac5b41375,magica
21f3573f-10cf-44b3-aeaa-26cccd8448b5,the black dahlia murder
c5db90c4-580d-4f33-b364-fbaa5a3a58b5,the murmurs
0639533a-0402-40ba-b6e0-18b067198b73,lunachicks


In [7]:
mbid_to_artist.to_csv('dataset/mbid_to_artist.csv')
logger.info('Saved mbid_to_artist.csv')

In [3]:
mbid_to_artist = pd.read_csv('dataset/mbid_to_artist.csv', index_col=0)
logger.info('Loaded mbid_to_artist.csv')

## Get preview URLs for all artists

Using the spotify API, get recordings for each artist with a given name, then get artist ID from whichever recording first matches the list we have for the given artist. Then, get a list of top tracks for that artist, and get all preview URLs.

In [40]:
def get_track_urls(mbid):
    try:
        # Get artist name
        artistname = mbid_to_artist[mbid_to_artist.index == mbid].values[0][0]
        if len(mbid_to_artist[mbid_to_artist['artistname'] == artistname]) > 1:
            # We have more than 1 artist with that name, so first we get release data from 
            # MB, then search Spotify for matches.
            # This is done because the musicbrainz API has a rather low rate limit, so we don't
            # want to use it unless we have to.
            recording_data = musicbrainzngs.get_artist_by_id(mbid, includes=['release-groups', 'url-rels'])
            found_spotify = False
            for relation in recording_data['artist'].get('url-relation-list', []):
                if 'spotify' in relation['target']:
                    found_spotify = True
                    artistid = relation['target'].split('/artist/')[1]
            
            if found_spotify:
                # If we found a spotify URL for the given artist, we can just search for the Spotify ID directly
                artistid = relation['target'].split('/artist/')[1]
                tracks = sp.artist_top_tracks(artistid)
                urls = []
                tracknames = []
                for track in tracks['tracks']:
                    urls.append(track['preview_url'])
                    tracknames.append(track['name'])
                return {'mbid': mbid, 'urls': [url for url in urls if url is not None], 'tracknames': tracknames}
            else:
                # If we didn't find a Spotify URL, we should just search Spotify for the artist name,
                # then use the data we got from MB earlier to match album names and download tracks 
                # with matching album names
                releasenames = [recording['title'].lower() for recording in recording_data['artist']['release-group-list']]
                tracks = sp.search('artist:{}'.format(artistname), type='track')

                urls = []
                tracknames = []
                for track in tracks['tracks']['items']:
                    if track['album']['name'].lower() in releasenames:
                        urls.append(track['preview_url'])
                        tracknames.append(track['name'])
                return {'mbid': mbid, 'urls': [url for url in urls if url is not None], 'tracknames': tracknames}
        
        # Search all artists with the given name, search for an exact match, and take the most popular
        # The MB API has a low rate limit, so we do it like this to save time
        artists = sp.search('artist:{}'.format(artistname), type='artist')
        artistid = None
        maxpop = 0

        for artist in artists['artists']['items']:
            if artist['name'].lower() == artistname.lower() and artist['popularity'] > maxpop:
                artistid = artist['id']
                maxpop = artist['popularity']

        # Get all track names and preview URLs, if they exist
        tracks = sp.artist_top_tracks(artistid)

        urls = []
        tracknames = []
        
        for track in tracks['tracks']:
            urls.append(track['preview_url'])
            tracknames.append(track['name'])

        return {'mbid': mbid, 'urls': [url for url in urls if url is not None], 'tracknames': tracknames}
    except Exception as e:
        raise e
        return {'mbid': mbid, 'urls': [], 'tracknames': []}

# Get top artists as per the params
mbids = uniqueartists.sort_values(ascending=False, by='listeners').head(params['numArtists']).index
logger.info('Collected top {} artists'.format(len(mbids)))
logger.info('Beginning Spotify URL scraping...')
p = Pool(15)
track_urls = list(tqdm_notebook(p.imap(get_track_urls, mbids), total=len(mbids)))
logger.info('Spotify URLs scraped')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs


AttributeError: 'NoneType' object has no attribute 'split'

retrying ...7secs
retrying ...7secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs
retrying ...6secs


Process ForkPoolWorker-66:
Process ForkPoolWorker-71:
Process ForkPoolWorker-73:
Process ForkPoolWorker-65:
Process ForkPoolWorker-63:
Process ForkPoolWorker-72:
Process ForkPoolWorker-61:
Process ForkPoolWorker-69:
Process ForkPoolWorker-67:
Process ForkPoolWorker-62:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-68:
  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 119, in _internal_call
    r.raise_for_status()
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 119, in _internal_call
    r.raise_for_status()
Traceback (most recent call last):
Process ForkPoolWorker-64:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 119, in _internal_call
    r.raise_for_status()
Process F

requests.exceptions.HTTPError: 429 Client Error: Too Many Requests for url: https://api.spotify.com/v1/search?q=artist%3Aenya&limit=10&offset=0&type=artist
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):

During handling of the above exception, another exception occurred:


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
Traceback (most recent call last):

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
Traceback (most recent call last):

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 146, in _get
    return self._internal_call('GET', url, payload, kwargs)

During handling of the above exception, another exception occurred:

  File "/home/ubuntu/.loc

Traceback (most recent call last):

During handling of the above exception, another exception occurred:

spotipy.client.SpotifyException: http status: 429, code:-1 - https://api.spotify.com/v1/artists/6Ghvu1VvMGScGpOUJBAHNH/top-tracks?country=US:
 API rate limit exceeded

During handling of the above exception, another exception occurred:


During handling of the above exception, another exception occurred:

  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):

During handling of the above exception, another exception occurred:


During handling of the above exception, another exception occurred:

  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 124, in _internal_call
    headers=r.headers)
  File "/home/ubuntu/anac

  File "<ipython-input-40-47b74ef248cf>", line 47, in get_track_urls
    tracks = sp.artist_top_tracks(artistid)
  File "<ipython-input-40-47b74ef248cf>", line 47, in get_track_urls
    tracks = sp.artist_top_tracks(artistid)
  File "<ipython-input-40-47b74ef248cf>", line 37, in get_track_urls
    artists = sp.search("artist:{}".format(artistname), type='artist')
  File "<ipython-input-40-47b74ef248cf>", line 37, in get_track_urls
    artists = sp.search("artist:{}".format(artistname), type='artist')
  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 339, in search
    return self._get('search', q=q, limit=limit, offset=offset, type=type, market=market)
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/ubuntu/.local/lib/python3.6/site-packages/spotipy/client.py", line 157, in _get
    time.sleep(sleep_seconds)
  File "/home/ubuntu/.local/lib/python3.6/site-packa

## Reorganize the list and find artists with empty lists

In [37]:
# Reorganize the multiprocessing results from a list of dicts to a flat dict of mbid:[urls]
flat_track_urls = {}
for entry in track_urls:
    artist = entry['mbid']
    urls = entry['urls']
    flat_track_urls[artist] = urls

In [38]:
emptyartists = [artist for artist, urls in flat_track_urls.items() if urls != None and len(urls) == 0]
logger.info('{} artists without Spotify data'.format(len(emptyartists)))

1580

## Fill gaps in the spotify URLs using iTunes

Here we employ a similar approach, except we use the iTunes API. The iTunes API is usually more complete, but it has a lower rate limit than Spotify. I use lots of sleep statements so we don't run into API limits. Note that this part is really finnicky, since the iTunes API doesn't play as nicely as the Spotify API, so it may have to be run several times and tweaked to get things working.

In [16]:
itunes_url = 'https://itunes.apple.com/search?term={}&entity=song'
itunes_url_id = 'https://itunes.apple.com/lookup?id={}&entity=song'

emptyartists = [artist for artist, urls in flat_track_urls.items() if urls != None and len(urls) == 0]
nonefound = list()

logger.info('Scraping iTunes data...')
for mbid in tqdm_notebook(emptyartists):
    try:
        try:
            recording_data = musicbrainzngs.get_artist_by_id(mbid, includes=['release-groups', 'url-rels'])
        except MusicBrainzError:
            # Sometimes, the artist can't be found on MB for whatever reason. 
            # Print the missing artist and delete them from the list.
            logger.info('MBID {} (artist name {}) not found on MusicBrainz API, removing'.format(
                mbid,mbid_to_artist.loc[mbid].values[0]))
            del flat_track_urls[mbid]
            time.sleep(5)
            continue
        
        releasenames = [recording['title'].lower() for recording in recording_data['artist']['release-group-list']]
        
        found_itunes_id = False
        for relation in recording_data['artist'].get('url-relation-list', []):
            # As before, see if the MB data has an iTunes URL. If so, get the artist info directly
            if 'itunes' in relation['target'] and 'artist' in relation['target']:
                itunesid = relation['target'].split('/artist/id')[1]
                found_itunes_id = True
                break
        
        urls_tmp = list()
        if found_itunes_id:
            url = itunes_url_id.format(itunesid)
            tracks = requests.get(url).json()
            for result in tracks['results']:
                if 'previewUrl' in result.keys():
                    urls_tmp.append(result['previewUrl'])
        else:
            # If we can't find an iTunes URL in the MB data, match release names
            artistname = mbid_to_artist.loc[mbid].values[0]
            url = itunes_url.format(artistname)
            tracks = requests.get(url)
            tracks = tracks.json() 
            urls_tmp = list()
            for result in tracks['results']:
                if result['collectionName'].lower() in releasenames and 'previewUrl' in result.keys():
                    urls_tmp.append(result['previewUrl'])

        if len(urls_tmp) == 0:
            # This artist doesn't exist on either iTunes or Spotify. Remove them from the list.
            nonefound.append(mbid)
            del flat_track_urls[mbid]
        flat_track_urls[mbid] = urls_tmp
        time.sleep(5)
    except ValueError as e:
        # Occasionally, iTunes will act up and not return anything. Sometimes this indicates that 
        # the rate limit has been hit, sometimes it's a weird issue with the API. If this happens,
        # print the artist and url and exit
        print(mbid)
        print(url)
        raise e


HBox(children=(IntProgress(value=0, max=445), HTML(value='')))

o.n.a.
nintendo
増田俊郎
derek and the dominos
ultra bra
christina stürmer
fats domino
kat
sacred spirit
platero y tú
clazziquai
2000 maniacs
opgezwolle
bodies without organs
juan luis guerra
satanic warmaster
cock and ball torture
eins zwo
hide
grajdanskaya oborona
jimmy smith
cinema bizarre
pete & the pirates
the royal philharmonic orchestra
ancafe
the onion
cajun dance party
kat-tun
mamonas assassinas
the la's
last days of humanity
andre 3000
news
konami
sérgio mendes & brasil 66
Ляпис Трубецкой
the brilliant green
#####
いきものがかり
laura
rentrer en soi
peste noire
tomaso giovanni albinoni
川井憲次
lюк
edward shearmur
tiktak
connie francis
the mighty boosh
evan rachel wood
j.k. rowling
m
stella
regina
virginia jetzt!
queen + paul rodgers
miranda
近藤浩治
fintelligens
下村陽子
danny
saïan supa crew
porcelain and the tramps
meryl streep
amanda woodward
girlicious
sweet noise
loituma
fish
die drei ???
acidman
larrikin love
kaenżet
emc
irina
angela aki
new radicals
antsy pants
3
pur
kelly bailey
ornatos vi

KeyboardInterrupt: 

In [41]:
# Sanity check, remove any remaining artists that may be missing audio data
new_urls = {}
new_urls = flat_track_urls.copy()
for mbid, urls in flat_track_urls.items():
    if len(urls) == 0:
        del new_urls[mbid]
        
logger.info('Total number of artists for which we have audio data: {}'.format(len(new_urls)))
flat_track_urls = new_urls

9484


## Save/Load Tracks data

In [42]:
logger.info('Saving tracks.json')
f = open('dataset/tracks.json','w+')
json.dump(flat_track_urls, f)
f.close()

In [5]:
logger.info('Loading tracks.json')
f = open('dataset/tracks.json','r')
flat_track_urls = json.load(f)
f.close()

## Download tracks

Collect a list of all mbid:track mappings (cap at a specific number of tracks per artist, or however much is available), then download them in parallel. There don't seem to be any rate limits, so this step is very fast.

In [6]:
alltracks = []

logger.info('Collecting mbid:track pairings')
for artist, urls in flat_track_urls.items():
    if urls is not None and len(urls) > 0:
        for url in urls[0:min(params['numTracksPerArtist'], len(urls))]:
            alltracks.append((artist, url))

logger.info('Collected {} mbid:track pairings'.format(len(alltracks)))

27472

In [9]:
def mbid_to_filename(mbid, suffix, m4a = False):
    if m4a:
        extension = 'm4a'
    else:
        extension = 'mp3'
    return os.path.join('tracks', *[mbid[i] for i in range(4)], '{}-{}.{}'.format(mbid, suffix, extension))

def download_track(mbid_url):
    # Download a given track, this gets saved to nested folders based on the mbid, and the filename itself
    # is appended with the first 8 characters of the file's md5 sum
    (mbid, url) = mbid_url
    r = requests.get(url)
    suffix = hashlib.md5(r.content).hexdigest()[0:8]
    # Some files are m4a files, some are mp3 files. Preserve the extension when saving
    if url.endswith('m4a'):
        path = mbid_to_filename(mbid, suffix, m4a=True)
    else:
        path = mbid_to_filename(mbid, suffix)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    f = open(path, 'wb')
    f.write(r.content)
    f.close()

logger.info('Downloading all tracks...')
p = Pool(25)
r = list(tqdm_notebook(p.imap(download_track, alltracks), total=len(alltracks)))
logger.info('Tracks downloaded')