In [1]:
# Pull Billboard Charts and Details from 2000-2016

In [2]:
import billboard
import pandas as pd
import spotipy
import spotipy.oauth2
import more_itertools
import time
credentials = spotipy.oauth2.SpotifyClientCredentials('XXX', 'XXX')
spotify = spotipy.Spotify(client_credentials_manager=credentials)

In [None]:
import discogs_client
import requests
import json
import collections

import operator
d = discogs_client.Client('ExampleApplication/0.1', user_token="XXX")
from tqdm import tqdm, tqdm_pandas
tqdm.pandas(tqdm())

import numpy as np

import datetime
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF

In [5]:
# pull all charts from present thru "first year" starting
# with current chart
# outputs a dataframe of all charts ever
def loop_through_charts(chart, first_year):
    columns = ['chartDate', 'title', 'artist', 'peakPos', 'lastPos', 'weeks', 'rank', 'change', 'spotifyID', 'spotifyLink', 'videoLink']
    all_charts  = pd.DataFrame(columns=columns)

    while chart.previousDate[:4] != first_year:
        chart = billboard.ChartData('hot-100', chart.previousDate)
        for entry in chart.entries:
            details = pd.Series([chart.date, entry.title,  entry.artist, entry.peakPos, entry.lastPos, entry.weeks, entry.rank,
                                 entry.change, entry.spotifyID, entry.spotifyLink, entry.videoLink], index=columns)
            all_charts = all_charts.append(details, ignore_index=True)
    return(all_charts)

In [6]:
def get_artist_names(dataframe):
    distinct_ids = list(dataframe['spotifyID'].unique())
    distinct_ids = list(filter(None, distinct_ids)) 
    chunks = more_itertools.chunked(distinct_ids, 49)
    
    artist_names = pd.DataFrame(columns=['spotifyID', 'artist_list'])
    for chunk in chunks:
        for track in spotify.tracks(tracks = chunk)['tracks']:
            artist_id = track['id']
            all_names = []
            for artists in track['artists']:
                all_names.append(artists['name'])
            artist_names = artist_names.append(pd.Series({'spotifyID': artist_id, 'artist_list': all_names}), ignore_index=True)
    return(artist_names)

In [21]:
if __name__ == '__main__':
    start_time = time.time()
    all_charts = loop_through_charts(billboard.ChartData('hot-100'), '1999')
    print('Done retrieving:\n', 
          '# Charts                   :', all_charts['chartDate'].nunique(), '\n',
          '# Artists                  :', all_charts['artist'].nunique(), '\n',
          '# Tracks                   :', len(all_charts.groupby(['artist', 'title']).count()),'\n',
          '# Tracks w/ Audio Features :', all_charts['spotifyID'].nunique(), '\n', 
          'Time Elapsed               :', time.time() - start_time, ' seconds\n',
          '... Now onto the features ..')
    artist_names = get_artist_names(all_charts)
    # join in artist names
    all_charts = pd.merge(all_charts, artist_names, how='left', on=['spotifyID', 'spotifyID'])
    # split artist names
    all_charts = pd.concat([all_charts, all_charts.artist_list.apply(pd.Series).add_prefix('art_')], axis=1)
    all_charts.to_csv('all_charts.csv')
    start_time = time.time()
    print('Time Elapsed :', time.time() - start_time, ' seconds\nAll Done!')

Done retrieving:
 # Charts                   : 879 
 # Artists                  : 3094 
 # Tracks                   : 6636 
 # Tracks w/ Audio Features : 5817 
 Time Elapsed               : 2605.6148800849915  seconds
 ... Now onto the features ..
Time Elapsed : 4.0531158447265625e-06  seconds
All Done!


In [299]:
all_charts = pd.read_csv('all_charts.csv', low_memory=False, index_col=0)
# look at artist names that weren't split out
#thefile = open('test.txt', 'w')
#for item in list(all_charts[all_charts.art_0.isnull()]['artist'].unique()):
#  thefile.write("%s\n" % item)

In [300]:
# check for artists that werent identified via. spotify artist lookup
separators = [' Featuring ', ' & ', ', ', ' Vs. ', ' Wtih ', ' Duet With ', ' Introducing ', ' With ']
exceptions = ['Prince And The Revolution', 'Alvin And The Chipmunks', 'Alvin And The Chipmunks Featuring Chris Classic']

In [301]:
# extract names to clean remove exceptions
dirty_names = pd.DataFrame(all_charts[all_charts.art_0.isnull() & 
                                      -all_charts.artist.isin(exceptions)]['artist'].unique(), 
                          columns=['artist'])
dirty_names['clean_names'] = dirty_names['artist']

In [302]:
# replace patterns from seperators list
for pattern in separators:
    dirty_names['clean_names'] = dirty_names['clean_names'].str.replace(pattern,'//')

In [303]:
# create name lookup
dirty_names = dirty_names.join(dirty_names['clean_names']
                               .apply(lambda x: pd.Series(x.split('//')))
                               .add_prefix('art_')).drop('clean_names', 1)

In [304]:
all_charts_final = all_charts.merge(dirty_names, on='artist', how='left', suffixes=('', '_y'))
for i in range(0,8):
    all_charts_final['art_'+str(i)] = all_charts_final['art_'+str(i)+'_y'].fillna(all_charts_final['art_'+str(i)])
    all_charts_final.drop('art_'+str(i)+'_y', inplace=True, axis=1)

In [306]:
list(all_charts_final[all_charts_final.art_0.isnull()]['artist'].unique())

['Prince And The Revolution',
 'Alvin And The Chipmunks Featuring Chris Classic',
 'Alvin And The Chipmunks']

In [307]:
# list of boilerplate names to ignore
plate_names = ["Unknown Artist","Various", "Traditional", "Folk", "Anonymous"]

In [338]:
# this method searches for all versions associated with a master release and pulls the distinct
# "credits" and then extracts "extraartist" fields from the api's release page
# the latter should match credits but have additional role info and is (apparently?) not in the api
def ghostbuster(row):
    ghosts = []; roles = []; names = []
    headers = {'accept-encoding': 'gzip, deflate', 'user-agent': 'cde456'} 
    params = {} 
    search = d.search(artist=row['art_0'], title=row['title'], type='master')
    for release in search:
        for version in release.versions:
            names+=[art.real_name for art in version.artists if art.name not in plate_names and art.real_name!=None]
            variations=[art.name_variations for art in version.artists if art.name not in plate_names and art.name_variations!=None]
            if len(variations)>0:
                names+=variations[0]
            for credit in version.credits:
                ghosts.append(credit.name)
            json_data = json.loads(requests.get('https://api.discogs.com/releases/'+str(version.id), params=params,headers=headers).text)
            roles.extend([(extra['name'], extra['role']) for extra in json_data['extraartists']])
    return pd.Series({'ghosts': list(set(ghosts)), 'roles': list(set(roles)), 'other_names': list(set(names))})

In [339]:
# this method performs a broader search where the first method fails (takes a bit longer and has some mis-hits)
# some searches like r. kelly - fiesta and bieber's one dance did not return results either due to 
# not having a master identifed or other search limitations of the API - use the "q" method instead of artist
def ghostbuster2(row):
    ghosts = []
    roles = []
    names = []
    headers = {'accept-encoding': 'gzip, deflate', 'user-agent': 'cde456'} 
    params = {} 
    search = d.search(q=row['art_0'], title=row['title'], type='release')
    for release in search:
        names+=[art.real_name for art in release.artists if art.name not in plate_names and art.real_name!=None]
        variations=[art.name_variations for art in release.artists if art.name not in plate_names and art.name_variations!=None]
        if len(variations)>0:
            names+=variations[0]
        for credit in release.credits:
            ghosts.append(credit.name)
        json_data = json.loads(requests.get('https://api.discogs.com/releases/'+str(release.id), params=params,headers=headers).text)
        roles.extend([(extra['name'], extra['role']) for extra in json_data['extraartists']])
    return pd.Series({'ghosts': list(set(ghosts)), 'roles': list(set(roles)), 'other_names': list(set(names))})

In [348]:
# extract artists and songs
bb = all_charts_final[all_charts_final['peakPos'] <11][['artist', 'title', 'art_0']].drop_duplicates()

In [None]:
bb[['ghosts', 'other_names', 'roles']] = bb.progress_apply(lambda row: ghostbuster2(row),axis=1)