In [None]:
# %pip install billboard.py

In [1]:
import billboard
import pandas as pd
import numpy as np
import datetime

In [None]:
# %reset

In [None]:
# The official Billboard API has fallen out of use, see: https://stackoverflow.com/questions/7835398/how-to-get-billboard-hot-100-chart-listing-via-billboard-api/9613219
# This script uses the billboard.py package as documented here: https://github.com/guoguo12/billboard-charts

# When do charts update each week?
# Almost all weekly charts update each Tuesday morning; in weeks with a Monday holiday, they update on Wednesday instead.

In [2]:
# Create a list of dates from 2015-01-01 up to date_end in weekly steps (since this chart updates weekly).
# We start in 2015 because we want to give some prior history to artists appearing in 2017
# (when the Spotify Top 200 Chart begins), otherwise their mean rank/frequency would be undefined.

date_start = datetime.datetime(2015, 1, 1)
date_end = datetime.datetime(2020, 12, 12)  # inclusive upper bound; change this to today's date to refresh
d = datetime.timedelta(days = 7)

# Derive every date from the fixed start instead of advancing date_start in place,
# so date_start still holds the first chart date after this cell has run.
n_weeks = (date_end - date_start) // d + 1  # <= 0 when date_end precedes date_start -> empty list
date_list = [date_start + k * d for k in range(n_weeks)]

In [None]:
# Download the Artist 100 chart for each weekly date, from the start date to the end date
chart_list = []
for week in date_list:
    # The chart date is passed to billboard.py as a 'YYYY-MM-DD' string
    week_str = week.strftime('%Y-%m-%d')
    chart = billboard.ChartData('artist-100', date = week_str)
    chart_list.append(chart)
    print('Downloaded', week)
    # NOTE(review): the original author observed that billboard.ChartData() rests ~5 seconds
    # roughly every 5 calls on its own; this is not verified here.

In [5]:
# Flatten the weekly charts into parallel "long" (stacked) lists:
# one element per (week, chart entry), all six lists kept in lockstep.
artist_list = []
rank_list = []
peakpos_list = []
lastpos_list = []
isnew_list = []
date_list_ext = []

# Pair each chart with its date and walk the entries the chart actually holds.
# Indexing a fixed range(100) raised IndexError for a chart with fewer than 100 entries
# and silently dropped anything past the 100th.
# Iterating a chart relies on the same integer indexing (chart[j]) the notebook already used.
for week, week_chart in zip(date_list, chart_list):
    for entry in week_chart:
        # The date is repeated once for every artist on that week's chart
        date_list_ext.append(week)
        # Artist name and rank for that week
        artist_list.append(entry.artist)
        rank_list.append(entry.rank)
        # Peak position, last position and new-entry flag as of that date
        peakpos_list.append(entry.peakPos)
        lastpos_list.append(entry.lastPos)
        isnew_list.append(entry.isNew)

In [6]:
# Assemble the parallel lists into one long dataframe (one row per date/artist pair)
bb_columns = ['Date', 'Artist', 'BB Pos', 'Peak BB Pos', 'Last BB Pos', 'New']
bb_rows = list(zip(date_list_ext, artist_list, rank_list,
                   peakpos_list, lastpos_list, isnew_list))
df = pd.DataFrame(bb_rows, columns = bb_columns)

In [7]:
# Preview the long-format chart data (one row per date/artist pair)
df

Unnamed: 0,Date,Artist,BB Pos,Peak BB Pos,Last BB Pos,New
0,2015-01-01,Taylor Swift,1,1,1,False
1,2015-01-01,Nicki Minaj,2,2,12,False
2,2015-01-01,Ed Sheeran,3,3,4,False
3,2015-01-01,Pentatonix,4,2,3,False
4,2015-01-01,Sam Smith,5,1,5,False
...,...,...,...,...,...,...
31095,2020-12-10,Lady A,96,6,94,False
31096,2020-12-10,Elton John,97,11,96,False
31097,2020-12-10,Gwen Stefani,98,4,0,False
31098,2020-12-10,Michael Jackson,99,20,0,False


In [8]:
# Save the long-format chart data; with to_csv's defaults the row index is written as an unnamed first column
df.to_csv('2020.12.12 Billboard Top 100 Artist.csv')

In [None]:
# Summarize the rank based on all available observations
# df2 = df.groupby(['Artist'])[['BB Pos']].agg(['mean', 'count', 'max', 'min']).reset_index()
# df2 = df2.sort_values(by=['Artist'])
# df2.head(25)

In [10]:
# Keep only the chart dates from 2017-01-01 onward
cutoff = datetime.datetime(2017, 1, 1)
date_list2 = [week for week in date_list if week >= cutoff]

# Create a list of every combination of date - artist (dates outermost, artists innermost).
# The unique-artist array is computed once up front rather than once per date.
unique_artists = df.Artist.unique()
outlist = [ (week, artist)
    for week in date_list2
    for artist in unique_artists ]

In [None]:
# For each pair of date i - artist j, compute the cumulative mean of artist j's rank and how
# many times artist j has appeared on the chart prior to and including date i.
# The results are two flat lists with one value per (date, artist) pair, dates outermost and
# artists innermost in df.Artist.unique() order -- the same order as the date/artist pair list.
#
# This is computed in one vectorized pass (per-week sums and counts -> running totals) rather
# than re-filtering the whole dataframe twice for every pair and printing a progress line each time.

artists = df['Artist'].unique()

# Per-week sum of ranks and number of appearances for each artist; weeks without the artist get 0
weekly = (df.groupby(['Date', 'Artist'])['BB Pos']
            .agg(['sum', 'count'])
            .unstack('Artist', fill_value = 0))

# Running totals over all chart weeks (including those before the first requested date), then
# looked up at the requested dates. Forward-fill reproduces the "Date <= i" condition for a
# requested date that has no chart of its own; dates before any chart become 0.
running = weekly.cumsum().reindex(index = date_list2, method = 'ffill').fillna(0)

cum_count = running['count'].reindex(columns = artists, fill_value = 0).astype(int)
cum_sum = running['sum'].reindex(columns = artists, fill_value = 0)

# Mean rank is undefined (NaN) until the artist has appeared at least once
cum_mean = cum_sum / cum_count.where(cum_count > 0)

# Row-major flattening gives date-outer / artist-inner ordering
mean_rank = cum_mean.to_numpy().ravel().tolist()
freq = cum_count.to_numpy().ravel().tolist()

In [12]:
# Build the dataframe of each artist's cumulative mean rank and appearance count on each available date;
# the two value lists line up row-for-row with the (date, artist) pairs
df3 = pd.DataFrame(outlist, columns = ['Date', 'Artist']).assign(
    Cumu_mean_rank = mean_rank,
    Cumu_frequency = freq,
)

In [13]:
# Preview the cumulative mean rank / frequency table (one row per date/artist pair)
df3

Unnamed: 0,Date,Artist,Cumu_mean_rank,Cumu_frequency
0,2017-01-05,Taylor Swift,12.122642,106
1,2017-01-05,Nicki Minaj,41.093750,96
2,2017-01-05,Ed Sheeran,30.620000,100
3,2017-01-05,Pentatonix,19.923077,26
4,2017-01-05,Sam Smith,30.411765,68
...,...,...,...,...
284275,2020-12-10,King Von,42.500000,4
284276,2020-12-10,System Of A Down,94.000000,1
284277,2020-12-10,Aesop Rock,88.000000,1
284278,2020-12-10,KISS,89.000000,1


In [14]:
# df3.to_csv('2020.12.12 Billboard Cumu Rank.csv')