In [5]:
# import the necessary packages
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
from datetime import datetime

In [7]:
# function to get a week's Billboard Hot 100 songs, artists, and rankings
def get_top_100(url):
    """Scrape one weekly Billboard Hot 100 chart page into a DataFrame.

    The page at ``url`` is downloaded and parsed with BeautifulSoup. Pages
    whose text does not contain "Taylor Swift" are skipped.

    Parameters
    ----------
    url : str
        Address of a weekly chart page (the caller builds it from the chart
        base URL followed by a ``YYYY-MM-DD`` date).

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when "Taylor Swift" does not occur anywhere in the page
        text. Otherwise one row per scraped chart entry with the columns
        ``song``, ``artist``, ``rank`` (1-based, in page order) and ``date``
        (parsed from the page's "Week of ..." heading).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the expected chart markup (week heading, number-one entry, or a
        matching number of songs and artists) is not found on the page.
    """
    # CSS class strings used to locate chart entries. The number-one entry is
    # marked up differently from the remaining entries, hence two sets.
    # NOTE(review): these mirror the site's markup at the time of writing and
    # will silently stop matching if the page layout changes.
    c_top_artist = "c-tagline a-font-primary-l a-font-primary-m@mobile-max lrv-u-color-black u-color-white@mobile-max lrv-u-margin-tb-00 lrv-u-padding-t-025 lrv-u-margin-r-150"
    c_top_song = "c-title__link lrv-a-unstyle-link"
    c_top99_artists = "c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only"
    c_top99_songs = "c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only"

    # tags that carry the classes above
    t_top_artist = "p"
    t_top_song = "a"
    t_top99_artists = "span"
    t_top99_songs = "h3"

    # A timeout keeps one stalled connection from hanging a long scraping
    # run; raise_for_status stops an error page from being parsed as a chart.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    # Skip weeks without the artist before doing any further parsing.
    if "Taylor Swift" not in soup.text:
        return None

    # The second element with id "section-heading" holds "Week of <date>".
    headings = soup.find_all(id="section-heading")
    if len(headings) < 2:
        raise ValueError(f"no 'Week of' heading found at {url}")
    week_text = headings[1].text.strip()
    # Remove the literal prefix. str.strip("Week of ") would instead strip
    # any of the characters W, e, k, o, f and space from both ends.
    prefix = "Week of "
    if week_text.startswith(prefix):
        week_text = week_text[len(prefix):]
    date = pd.to_datetime(week_text.strip())

    top_artist = soup.find(t_top_artist, c_top_artist)
    top_song = soup.find(t_top_song, c_top_song)
    if top_artist is None or top_song is None:
        raise ValueError(f"number-one entry not found at {url}")

    # Number-one entry first, then the remaining entries in page order.
    artists = [top_artist.text.strip()]
    songs = [top_song.text.strip()]
    artists.extend(
        tag.text.strip() for tag in soup.find_all(t_top99_artists, c_top99_artists)
    )
    songs.extend(
        tag.text.strip() for tag in soup.find_all(t_top99_songs, c_top99_songs)
    )

    if len(songs) != len(artists):
        raise ValueError(
            f"scraped {len(songs)} songs but {len(artists)} artists at {url}"
        )

    # Size the ranks from what was actually scraped instead of assuming
    # exactly 100 rows, which made DataFrame construction fail on short pages.
    rank = np.arange(1, len(songs) + 1)

    df = pd.DataFrame({"song": songs, "artist": artists, "rank": rank})
    df["date"] = date

    return df

In [14]:
# Run get_top_100 for every weekly chart in the window ending today.
# The window is N_WEEKS Mondays long, counted back from the run date (the
# original note dates this as "June 2006 to now" for the run that produced
# this notebook), so the first week drifts when the notebook is re-run later.
N_WEEKS = 860

dfs = []
base_url = "https://www.billboard.com/charts/hot-100/"

# "W-MON" is the canonical spelling of the Monday-anchored weekly frequency.
b = pd.date_range(end=datetime.now(), periods=N_WEEKS, freq="W-MON")
# Format each timestamp as YYYY-MM-DD, the form appended to the chart URL.
dates = b.strftime("%Y-%m-%d").tolist()

for chart_date in dates:
    url = base_url + chart_date
    df = get_top_100(url)
    # get_top_100 returns None for weeks it skips; keep only real charts.
    if df is not None:
        dfs.append(df)

2022-09-12
2022-09-19
2022-09-26
2022-10-03
2022-10-10
2022-10-17
2022-10-24
2022-10-31
2022-11-07
2022-11-14


In [15]:
# stack the weekly charts end to end and save them as one tab-separated file
pd.concat(dfs).to_csv(
    path_or_buf="long_df_swift.txt",
    index=False,
    sep="\t",
)