In [None]:
# %pip install billboard.py

In [1]:
import billboard
import pandas as pd
import numpy as np
import datetime

In [None]:
# %reset

In [None]:
# The official Billboard API has fallen out of use, see: https://stackoverflow.com/questions/7835398/how-to-get-billboard-hot-100-chart-listing-via-billboard-api/9613219
# This script uses the billboard.py package as documented here: https://github.com/guoguo12/billboard-charts

In [2]:
# Build the list of weekly chart dates from date_start through date_end (the chart updates weekly).
# We start in 2015 because we want to give some prior history to artists appearing in 2017
# (when the Spotify Top 200 Chart begins); otherwise their mean rank/frequency would be undefined.

date_start = datetime.datetime(2015, 1, 1)
date_end = datetime.datetime(2020, 12, 12)  # can change this to whatever today's date is
d = datetime.timedelta(days = 7)

# Step a separate cursor forward instead of advancing `date_start` in place, so that
# `date_start` still holds the first chart date after this cell has run.
date_list = []
current_date = date_start
while current_date <= date_end:
    date_list.append(current_date)
    current_date += d

In [4]:
# Download the 'artist-100' chart for every weekly date in date_list, keeping the results in date order
chart_list = []
for week in date_list:
    week_label = week.strftime('%Y-%m-%d')
    chart_list.append(billboard.ChartData('artist-100', date = week_label))
    print('Downloaded', week)
    # NOTE(review): the original author states that billboard.ChartData() already rests ~5 seconds
    # between every ~5 calls, so no explicit sleep is added here -- confirm against the package docs

Downloaded 2015-01-01 00:00:00
Downloaded 2015-01-08 00:00:00
Downloaded 2015-01-15 00:00:00
Downloaded 2015-01-22 00:00:00
Downloaded 2015-01-29 00:00:00
Downloaded 2015-02-05 00:00:00
Downloaded 2015-02-12 00:00:00
Downloaded 2015-02-19 00:00:00
Downloaded 2015-02-26 00:00:00
Downloaded 2015-03-05 00:00:00
Downloaded 2015-03-12 00:00:00
Downloaded 2015-03-19 00:00:00
Downloaded 2015-03-26 00:00:00
Downloaded 2015-04-02 00:00:00
Downloaded 2015-04-09 00:00:00
Downloaded 2015-04-16 00:00:00
Downloaded 2015-04-23 00:00:00
Downloaded 2015-04-30 00:00:00
Downloaded 2015-05-07 00:00:00
Downloaded 2015-05-14 00:00:00
Downloaded 2015-05-21 00:00:00
Downloaded 2015-05-28 00:00:00
Downloaded 2015-06-04 00:00:00
Downloaded 2015-06-11 00:00:00
Downloaded 2015-06-18 00:00:00
Downloaded 2015-06-25 00:00:00
Downloaded 2015-07-02 00:00:00
Downloaded 2015-07-09 00:00:00
Downloaded 2015-07-16 00:00:00
Downloaded 2015-07-23 00:00:00
Downloaded 2015-07-30 00:00:00
Downloaded 2015-08-06 00:00:00
Download

Downloaded 2020-01-30 00:00:00
Downloaded 2020-02-06 00:00:00
Downloaded 2020-02-13 00:00:00
Downloaded 2020-02-20 00:00:00
Downloaded 2020-02-27 00:00:00
Downloaded 2020-03-05 00:00:00
Downloaded 2020-03-12 00:00:00
Downloaded 2020-03-19 00:00:00
Downloaded 2020-03-26 00:00:00
Downloaded 2020-04-02 00:00:00
Downloaded 2020-04-09 00:00:00
Downloaded 2020-04-16 00:00:00
Downloaded 2020-04-23 00:00:00
Downloaded 2020-04-30 00:00:00
Downloaded 2020-05-07 00:00:00
Downloaded 2020-05-14 00:00:00
Downloaded 2020-05-21 00:00:00
Downloaded 2020-05-28 00:00:00
Downloaded 2020-06-04 00:00:00
Downloaded 2020-06-11 00:00:00
Downloaded 2020-06-18 00:00:00
Downloaded 2020-06-25 00:00:00
Downloaded 2020-07-02 00:00:00
Downloaded 2020-07-09 00:00:00
Downloaded 2020-07-16 00:00:00
Downloaded 2020-07-23 00:00:00
Downloaded 2020-07-30 00:00:00
Downloaded 2020-08-06 00:00:00
Downloaded 2020-08-13 00:00:00
Downloaded 2020-08-20 00:00:00
Downloaded 2020-08-27 00:00:00
Downloaded 2020-09-03 00:00:00
Download

In [5]:
# Flatten the weekly charts into parallel long (stacked) lists: one element per (week, chart entry).
artist_list = []
rank_list = []
peakpos_list = []
lastpos_list = []
isnew_list = []
date_list_ext = []

# Walk each chart's own entries rather than assuming exactly 100 rows per week, so a week
# whose chart has fewer entries does not raise IndexError.
for chart_date, chart in zip(date_list, chart_list):
    for entry in chart:
        # Append the date (each date will be repeated as many times as there are artists)
        date_list_ext.append(chart_date)
        # Append the artist's name and rank
        artist_list.append(entry.artist)
        rank_list.append(entry.rank)
        # Append the artist's peak position, last position and new-entry flag as of that date
        peakpos_list.append(entry.peakPos)
        lastpos_list.append(entry.lastPos)
        isnew_list.append(entry.isNew)

In [6]:
# Assemble the parallel lists into one long dataframe: one row per (date, artist) chart entry
bb_columns = ['Date', 'Artist', 'BB Pos', 'Peak BB Pos', 'Last BB Pos', 'New']
bb_rows = list(zip(date_list_ext, artist_list, rank_list,
                   peakpos_list, lastpos_list, isnew_list))
df = pd.DataFrame(bb_rows, columns = bb_columns)

In [7]:
# Display the long chart dataframe (one row per date/artist entry) for a visual check
df

Unnamed: 0,Date,Artist,BB Pos,Peak BB Pos,Last BB Pos,New
0,2015-01-01,Taylor Swift,1,1,1,False
1,2015-01-01,Nicki Minaj,2,2,12,False
2,2015-01-01,Ed Sheeran,3,3,4,False
3,2015-01-01,Pentatonix,4,2,3,False
4,2015-01-01,Sam Smith,5,1,5,False
...,...,...,...,...,...,...
31095,2020-12-10,Lady A,96,6,94,False
31096,2020-12-10,Elton John,97,11,96,False
31097,2020-12-10,Gwen Stefani,98,4,0,False
31098,2020-12-10,Michael Jackson,99,20,0,False


In [8]:
# Save the long chart dataframe to CSV (the row index is written as the first, unnamed column)
chart_csv_path = '2020.12.12 Billboard Top 100 Artist.csv'
df.to_csv(chart_csv_path)

In [None]:
# Per-artist summary of chart position over all available observations:
# mean, number of appearances, max and min of 'BB Pos', ordered by artist name
rank_stats = ['mean', 'count', 'max', 'min']
df2 = (
    df.groupby(['Artist'])[['BB Pos']]
      .agg(rank_stats)
      .reset_index()
      .sort_values(by=['Artist'])
)

In [None]:
# Preview the first 25 rows of the per-artist rank summary
df2.head(25)

In [10]:
# Keep only the weekly dates on or after 2017-01-01
first_kept_date = datetime.datetime(2017, 1, 1)
date_list2 = []
for week in date_list:
    if week >= first_kept_date:
        date_list2.append(week)

# Pair every kept date with every distinct artist (date varies slowest, artist fastest)
outlist = []
for week in date_list2:
    for artist_name in df.Artist.unique():
        outlist.append((week, artist_name))

In [None]:
# For each pair of date i - artist j, calculate the cumulative mean of artist j's rank
# and how many times artist j has appeared on the Top 100 chart prior to and including date i.
# This step will take a while -- it will create ~ 225,364 observations.

mean_rank = []
freq = []

# Hoisted out of the loops. The artist order here is the same df.Artist.unique() order that
# `outlist` is built with, so the results line up with it row by row.
artists = df.Artist.unique()
total_pairs = len(date_list2) * len(artists)

# Split the chart rows by artist once, so each lookup filters only that artist's rows
# instead of scanning the whole dataframe twice per (date, artist) pair.
rows_by_artist = {artist: rows for artist, rows in df.groupby('Artist', sort=False)}

for i in date_list2:
    for j in artists:
        artist_rows = rows_by_artist[j]
        # Chart positions of artist j observed on or before date i
        positions = artist_rows.loc[artist_rows['Date'] <= i, 'BB Pos']
        mean_rank.append(positions.mean())  # NaN when the artist has not charted yet
        freq.append(len(positions))

    # Print the fraction completed once per date. The denominator is the number of pairs
    # actually processed (date_list2, not the full date_list), so the last value is 1.0.
    if total_pairs:
        print(len(freq) / total_pairs)

In [None]:
# Combine the (date, artist) pairs with each artist's cumulative mean rank and appearance count
df3 = pd.DataFrame(data = outlist, columns = ['Date', 'Artist']).assign(
    Cumu_mean_rank = mean_rank,
    Cumu_frequency = freq,
)

In [None]:
# Display the cumulative mean rank / frequency dataframe for a visual check
df3

In [None]:
# df3.to_csv('2020.12.12 Billboard Cumu Rank.csv')