## Billboard Initial Data Collection

This notebook describes how data was collected from the Billboard Year-End Hot 100 charts for 2017 and 2020, including song titles, artist names, and year-end rankings, and how each charting song was paired with track-level metrics retrieved from Spotify.

In [1]:
# Importing modules
import os
import time

import billboard
import matplotlib.pyplot as plt
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# Downloading the Billboard year-end Hot 100 chart for each year of interest
chart_name = 'hot-100-songs'
year_end_2017 = billboard.ChartData(chart_name, year=2017)
year_end_2020 = billboard.ChartData(chart_name, year=2020)

In [3]:
# Flattening the 2017 chart into one [title, artist, rank] row per entry, then into a DataFrame
song_list_2017 = [[entry.title, entry.artist, entry.rank] for entry in year_end_2017]
song_data_2017 = pd.DataFrame(song_list_2017, columns=['Title', 'Artist', 'Rank'])
song_data_2017

Unnamed: 0,Title,Artist,Rank
0,Shape Of You,Ed Sheeran,1
1,Despacito,Luis Fonsi & Daddy Yankee Featuring Justin Bieber,2
2,That's What I Like,Bruno Mars,3
3,Humble.,Kendrick Lamar,4
4,Something Just Like This,The Chainsmokers & Coldplay,5
...,...,...,...
95,Havana,Camila Cabello Featuring Young Thug,96
96,What Lovers Do,Maroon 5 Featuring SZA,97
97,Do Re Mi,blackbear,98
98,Look At Me!,XXXTENTACION,99


In [4]:
# Flattening the 2020 chart into one [title, artist, rank] row per entry, then into a DataFrame
song_list_2020 = [[entry.title, entry.artist, entry.rank] for entry in year_end_2020]
song_data_2020 = pd.DataFrame(song_list_2020, columns=['Title', 'Artist', 'Rank'])
song_data_2020

Unnamed: 0,Title,Artist,Rank
0,Blinding Lights,The Weeknd,1
1,Circles,Post Malone,2
2,The Box,Roddy Ricch,3
3,Don't Start Now,Dua Lipa,4
4,Rockstar,DaBaby Featuring Roddy Ricch,5
...,...,...,...
95,More Than My Hometown,Morgan Wallen,96
96,Lovin' On You,Luke Combs,97
97,Said Sum,Moneybagg Yo,98
98,Slide,H.E.R. Featuring YG,99


In [5]:
# Reading the Spotify client ID and secret from the environment.
# Credentials must not be stored in the notebook: export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel. A missing variable raises a
# KeyError naming it, so the failure is immediate rather than a later auth error.
# NOTE(review): any key pair that was ever committed with this notebook should be
# revoked and regenerated in the Spotify developer dashboard.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
# Collecting track IDs based on playlist URL
def getTrackIDs(user, playlist_id):
    """Return the Spotify track IDs of every track in a playlist, in playlist order.

    Parameters
    ----------
    user : str
        Owner of the playlist, passed through to ``sp.user_playlist``.
    playlist_id : str
        Spotify ID of the playlist.

    Returns
    -------
    list of str
        Track IDs in the order the playlist lists them. Entries that have no
        track object or no ID are skipped, so the result may be shorter than
        the playlist.
    """
    playlist = sp.user_playlist(user, playlist_id)
    page = playlist['tracks']
    ids = []
    while page:
        for item in page['items']:
            track = item['track']
            # An entry can come back without a track object or without an ID;
            # neither can be looked up later, so leave them out.
            if track and track.get('id'):
                ids.append(track['id'])
        # The playlist's tracks arrive one page at a time; keep following the
        # "next" link until there is none so longer playlists are not truncated.
        page = sp.next(page) if page.get('next') else None
    return ids

# Track IDs for the two playlists (owned by the 'spotify' account) used for 2017 and 2020
playlist_owner = 'spotify'
ids_2017 = getTrackIDs(playlist_owner, '2XPEN88QyrPQ9zGqS8uS2x')
ids_2020 = getTrackIDs(playlist_owner, '1WBljFutuk7uLQtfqfmjWV')

In [7]:
# Collecting track features for each song
def getTrackFeatures(id):
    """Fetch metadata and audio features for a single Spotify track.

    Parameters
    ----------
    id : str
        Spotify track ID, passed to ``sp.track`` and ``sp.audio_features``.

    Returns
    -------
    list
        18 values in this order: name, album, artist, release_date,
        duration_ms, popularity, danceability, acousticness, danceability,
        energy, instrumentalness, liveness, loudness, speechiness, tempo,
        time_signature, positiveness (Spotify's ``valence``), explicit.
        The eleven audio-feature values are ``None`` when Spotify returns no
        audio features for the track.
    """
    meta = sp.track(id)
    features = sp.audio_features(id)

    audio_keys = ('acousticness', 'danceability', 'energy', 'instrumentalness',
                  'liveness', 'loudness', 'speechiness', 'tempo',
                  'time_signature', 'valence')
    # audio_features answers with a list; its entry is None when no features
    # exist for the track. Substitute an all-None record so the row is still
    # produced instead of raising TypeError on the first lookup.
    audio = features[0] if features else None
    if audio is None:
        audio = dict.fromkeys(audio_keys)

    album = meta['album']
    return [
        meta['name'],
        album['name'],
        album['artists'][0]['name'],  # first artist listed on the album, not on the track
        album['release_date'],
        meta['duration_ms'],
        meta['popularity'],
        audio['danceability'],
        audio['acousticness'],
        # danceability appears a second time: the cells that call this function
        # build an 18-column DataFrame, so the row length must stay at 18.
        audio['danceability'],
        audio['energy'],
        audio['instrumentalness'],
        audio['liveness'],
        audio['loudness'],
        audio['speechiness'],
        audio['tempo'],
        audio['time_signature'],
        audio['valence'],
        meta['explicit'],
    ]

In [8]:
# Building the 2017 track-level dataset: one row of Spotify metrics per track ID
tracks_2017 = []
for track_id in ids_2017:
    # brief pause before every lookup (presumably to stay under Spotify's rate limit)
    time.sleep(.5)
    tracks_2017.append(getTrackFeatures(track_id))
tracks_2017_df = pd.DataFrame(
    tracks_2017,
    columns=[
        'name', 'album', 'artist', 'release_date', 'duration_ms', 'popularity',
        'danceability', 'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature',
        'positiveness', 'explicit',
    ],
)
tracks_2017_df

Unnamed: 0,name,album,artist,release_date,duration_ms,popularity,danceability,acousticness,danceability.1,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,positiveness,explicit
0,Shape of You,÷ (Deluxe),Ed Sheeran,2017-03-03,233712,86,0.825,0.581000,0.825,0.652,0.000000,0.0931,-3.183,0.0802,95.977,4,0.931,False
1,Despacito - Remix,Despacito Feat. Justin Bieber (Remix),Luis Fonsi,2017-04-17,228826,72,0.653,0.228000,0.653,0.816,0.000000,0.0967,-4.353,0.1670,178.085,4,0.816,False
2,That's What I Like,24K Magic,Bruno Mars,2016-11-17,206693,82,0.853,0.013000,0.853,0.560,0.000000,0.0944,-4.961,0.0406,134.066,4,0.860,False
3,HUMBLE.,DAMN.,Kendrick Lamar,2017-04-14,177000,82,0.908,0.000282,0.908,0.621,0.000054,0.0958,-6.638,0.1020,150.011,4,0.421,True
4,Something Just Like This,Memories...Do Not Open,The Chainsmokers,2017-04-07,247160,83,0.617,0.049800,0.617,0.635,0.000014,0.1640,-6.769,0.0317,103.019,4,0.446,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Havana (feat. Young Thug),Havana (feat. Young Thug),Camila Cabello,2017-08-03,216896,1,0.768,0.186000,0.768,0.517,0.000038,0.1040,-4.323,0.0312,104.992,4,0.418,False
96,What Lovers Do (feat. SZA),Red Pill Blues (Deluxe),Maroon 5,2017-11-03,199849,0,0.792,0.080500,0.792,0.612,0.000004,0.0852,-5.212,0.0693,109.959,4,0.420,False
97,do re mi,digital druglord,blackbear,2017-04-21,212027,45,0.745,0.005220,0.745,0.593,0.000005,0.1230,-6.350,0.0526,111.002,3,0.170,True
98,Look At Me!,Look At Me!,XXXTENTACION,2017-02-20,126345,83,0.763,0.259000,0.763,0.726,0.000000,0.0976,-6.405,0.2820,139.059,4,0.349,True


In [9]:
# Building the 2020 track-level dataset: one row of Spotify metrics per track ID
tracks_2020 = []
for track_id in ids_2020:
    # brief pause before every lookup (presumably to stay under Spotify's rate limit)
    time.sleep(.5)
    tracks_2020.append(getTrackFeatures(track_id))
tracks_2020_df = pd.DataFrame(
    tracks_2020,
    columns=[
        'name', 'album', 'artist', 'release_date', 'duration_ms', 'popularity',
        'danceability', 'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature',
        'positiveness', 'explicit',
    ],
)
tracks_2020_df

Unnamed: 0,name,album,artist,release_date,duration_ms,popularity,danceability,acousticness,danceability.1,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,positiveness,explicit
0,Blinding Lights,Blinding Lights,The Weeknd,2019-11-29,201573,19,0.513,0.00147,0.513,0.796,0.000209,0.0938,-4.075,0.0629,171.017,4,0.345,False
1,Circles,Hollywood's Bleeding,Post Malone,2019-09-06,215280,87,0.695,0.19200,0.695,0.762,0.002440,0.0863,-3.497,0.0395,120.042,4,0.553,False
2,The Box,Please Excuse Me for Being Antisocial,Roddy Ricch,2019-12-06,196652,84,0.896,0.10400,0.896,0.586,0.000000,0.7900,-6.687,0.0559,116.971,4,0.642,True
3,Don't Start Now,Don't Start Now,Dua Lipa,2019-10-31,183290,83,0.794,0.01250,0.794,0.793,0.000000,0.0952,-4.521,0.0842,123.941,4,0.677,False
4,ROCKSTAR (feat. Roddy Ricch),BLAME IT ON BABY,DaBaby,2020-04-17,181733,84,0.746,0.24700,0.746,0.690,0.000000,0.1010,-7.956,0.1640,89.977,4,0.497,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,More Than My Hometown,More Than My Hometown,Morgan Wallen,2020-04-17,216573,62,0.621,0.60100,0.621,0.882,0.000000,0.1320,-5.010,0.0459,126.014,4,0.574,False
96,Lovin' On You,What You See Is What You Get,Luke Combs,2019-11-08,194866,70,0.572,0.00165,0.572,0.949,0.000195,0.1630,-4.865,0.0600,118.974,4,0.530,False
97,Said Sum,Code Red,Moneybagg Yo,2020-09-18,155168,69,0.929,0.01850,0.929,0.667,0.000000,0.1000,-6.789,0.3530,126.998,4,0.274,True
98,Slide,Slide,H.E.R.,2019-09-27,238321,68,0.832,0.08070,0.832,0.469,0.000008,0.2070,-9.141,0.3390,97.023,4,0.197,True


In [10]:
# Joining the 2017 Billboard chart rows with the 2017 Spotify metrics side by side and exporting to CSV.
# The join is by row index, so it relies on both frames listing the songs in the same order.
frames_2017 = [song_data_2017, tracks_2017_df]
charts_merged_2017 = pd.concat(frames_2017, axis='columns')
charts_merged_2017.to_csv("../data/billboard2017.csv", sep=',')

In [11]:
# Joining the 2020 Billboard chart rows with the 2020 Spotify metrics side by side and exporting to CSV.
# The join is by row index, so it relies on both frames listing the songs in the same order.
frames_2020 = [song_data_2020, tracks_2020_df]
charts_merged_2020 = pd.concat(frames_2020, axis='columns')
charts_merged_2020.to_csv("../data/billboard2020.csv", sep=',')