# SI 649 Narrative Viz Project

Title: Billboard 100

Members: Nan-Hsin Lin, Jia-Tong Choo, Yi-Chun Wang

Last update: 04/28/2024

## Data retrieval and cleaning

### Functions for retrieving data

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, date
import pandas as pd
import os
from dotenv import load_dotenv
import musicbrainzngs as mbz


# Billboard Table

def scrape_billboard(date):
    '''Scrape the Billboard Hot 100 chart for a given date.

    Args:
        date: Chart date. It is converted with ``str()`` and appended to the
            chart URL, so a ``datetime.date`` or a ``YYYY-MM-DD`` string
            both work.

    Returns:
        A list of dicts, one per chart row found on the page, with the keys
        ``date`` (the argument as given), ``rank`` (1-based position of the
        row on the page), ``song`` and ``artist``. The list is empty when
        the page contains no chart rows. ``song`` / ``artist`` are ``None``
        for a row whose title or artist element is missing.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    '''
    url = "https://www.billboard.com/charts/hot-100/" + str(date) + "/"
    # Bound the wait so one stalled connection cannot hang a multi-week scrape.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # find_all is the supported spelling; findAll is a deprecated alias.
    rows = soup.find_all("ul", class_="o-chart-results-list-row")

    billboard_list = []
    for rank, row in enumerate(rows, start=1):
        title_tag = row.find("h3")
        artist_tag = row.find("span", class_="a-font-primary-s")
        billboard_list.append({
            "date": date,
            "rank": rank,
            # Keep the row (and therefore the rank sequence) even when the
            # markup lacks one of the elements, instead of raising.
            "song": title_tag.text.strip() if title_tag is not None else None,
            "artist": artist_tag.text.strip() if artist_tag is not None else None,
        })

    return billboard_list

def get_billboard_n_weeks(weeks=52, end_date=None):
    '''Get the Billboard Hot 100 chart for the past n weeks.

    Args:
        weeks: Number of weekly charts to scrape, walking backwards in time.
            Zero or a negative number scrapes nothing.
        end_date: Optional anchor day for the most recent chart, given as a
            ``datetime.date``, a ``datetime.datetime`` or a ``YYYY-MM-DD``
            string. Defaults to today.

    Returns:
        A single flat list containing the rows returned by
        ``scrape_billboard`` for each week, most recent week first.
    '''
    if end_date is None:
        anchor = datetime.today().date()
    elif isinstance(end_date, str):
        anchor = date.fromisoformat(end_date)
    elif isinstance(end_date, datetime):
        # Drop the time part so str() of the value stays a plain YYYY-MM-DD.
        anchor = end_date.date()
    else:
        anchor = end_date

    # weekday() is 0 for Monday and 5 for Saturday, so this lands on the
    # Saturday of the anchor's Monday-to-Sunday week (one day back on Sunday).
    saturday = anchor + timedelta(days=5 - anchor.weekday())

    billboard = []
    for _ in range(weeks):
        billboard.extend(scrape_billboard(saturday))
        saturday -= timedelta(days=7)
    return billboard


# Artist Table

def get_infobox(artist):
    '''Get the infobox data for a given artist from Wikipedia.

    Args:
        artist: Artist name; spaces are replaced with underscores to form
            the English Wikipedia page title.

    Returns:
        A dict with the keys ``artist`` (the name as given), ``birthplace``
        and ``birthdate``. The last two are ``None`` when the page has no
        infobox table or the infobox lacks the corresponding element.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    '''
    from urllib.parse import quote

    info = {"artist": artist, "birthplace": None, "birthdate": None}

    # Percent-encode the title so characters such as "?" or "#" stay part of
    # the page name instead of starting a query string or fragment.
    title = quote(artist.replace(" ", "_"))
    url = "https://en.wikipedia.org/wiki/" + title
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        # Nothing to read from; return the all-None record right away.
        return info

    birthplace = infobox.find("div", {"class": "birthplace"})
    if birthplace is not None:
        info["birthplace"] = birthplace.text.strip()

    birthdate = infobox.find("span", {"class": "bday"})
    if birthdate is not None:
        info["birthdate"] = birthdate.text.strip()

    return info


def get_artist_info(artist_list):
    '''Get the artist info from MusicBrainz.

    Each name is searched with ``mbz.search_artists`` and the first hit is
    used.

    Args:
        artist_list: Iterable of artist names to look up.

    Returns:
        A list with one dict per input name, in input order, holding the
        keys ``artist`` (the name searched for), ``name``, ``type``,
        ``country``, ``birthdate`` and ``deathdate``. All fields except
        ``artist`` are ``None`` when the search returns no match, and a
        notice is printed for that artist.
    '''
    mbz.set_useragent("billboard_info_viz", "0.1", "nanhsin@umich.edu")

    info = []
    for artist in artist_list:
        matches = mbz.search_artists(artist=artist).get("artist-list") or []
        if not matches:
            info.append({"artist": artist,
                         "name": None,
                         "type": None,
                         "country": None,
                         "birthdate": None,
                         "deathdate": None})
            print(f"Artist {artist} not found.")
            continue

        result = matches[0]
        # Not every hit carries a life-span entry; fall back to an empty
        # dict so the date lookups yield None instead of raising.
        life_span = result.get("life-span") or {}
        info.append({"artist": artist,
                     "name": result.get("name"),
                     "type": result.get("type"),
                     "country": result.get("country"),
                     "birthdate": life_span.get("begin"),
                     # "end" is the date; "ended" is only a true/false flag
                     # in the MusicBrainz life-span object.
                     "deathdate": life_span.get("end")})
    return info


# Spotify Features Table

def get_spotify_token():
    '''Request a Spotify access token using the client-credentials grant.

    Calls ``load_dotenv()`` and then reads the client id and secret from the
    ``SPOTIFY_CID`` and ``SPOTIFY_SECRET`` environment variables.

    Returns:
        The ``access_token`` value of the JSON reply.
    '''
    load_dotenv()
    credentials = {
        "grant_type": "client_credentials",
        "client_id": os.getenv("SPOTIFY_CID"),
        "client_secret": os.getenv("SPOTIFY_SECRET"),
    }
    reply = requests.post("https://accounts.spotify.com/api/token", data=credentials)
    payload = reply.json()
    return payload["access_token"]

def get_spotify_id(token, song, artist):
    '''Get the Spotify ID of a song.

    Searches the US market for a track matching "<song> <artist>" and takes
    the first hit.

    Args:
        token: Spotify access token (sent as a Bearer token).
        song: Song title.
        artist: Artist name.

    Returns:
        The ID of the first matching track, or ``None`` when the reply
        contains no usable track.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    '''
    headers = {"Authorization": "Bearer " + token}
    # Let requests encode the query: titles containing "&", "#" or "?" would
    # otherwise be split into separate parameters or cut off.
    params = {
        "q": f"{song} {artist}",
        "type": "track",
        "market": "US",
        "limit": 1,
    }
    response = requests.get(
        "https://api.spotify.com/v1/search",
        headers=headers,
        params=params,
        timeout=30,
    ).json()
    try:
        return response["tracks"]["items"][0]["id"]
    except (KeyError, IndexError, TypeError):
        # Error payload, empty result list, or a null entry in the list.
        return None

def get_spotify_features(token, song_id):
    '''Fetch the audio-features record of a track from Spotify.

    Args:
        token: Spotify access token (sent as a Bearer token).
        song_id: Spotify track ID, interpolated into the endpoint URL.

    Returns:
        The decoded JSON body of the reply, or ``None`` when the request
        raises a ``requests`` exception.
    '''
    auth_header = {"Authorization": "Bearer " + token}
    url = f"https://api.spotify.com/v1/audio-features/{song_id}"
    try:
        reply = requests.get(url, headers=auth_header)
        payload = reply.json()
    except requests.exceptions.RequestException:
        return None
    return payload

def get_spotify_track(token, song_id):
    '''Get the Spotify info of a song.

    Args:
        token: Spotify access token (sent as a Bearer token).
        song_id: Spotify track ID.

    Returns:
        A dict with the keys ``id``, ``song_spotify``, ``artist_spotify``
        (name of the first listed artist), ``album``, ``release_date`` and
        ``popularity``; or ``None`` when the request fails or the reply
        carries no artist or album data (for example an error payload).
    '''
    headers = {"Authorization": "Bearer " + token}
    url = f"https://api.spotify.com/v1/tracks/{song_id}"
    try:
        response = requests.get(url, headers=headers, timeout=30).json()
    except requests.exceptions.RequestException:
        return None

    if not isinstance(response, dict):
        return None

    # An error reply has neither key; indexing or chaining .get() on the
    # missing value would raise TypeError / AttributeError.
    artists = response.get("artists")
    album = response.get("album")
    if not artists or not album:
        return None

    return {"id": response.get("id"),
            "song_spotify": response.get("name"),
            "artist_spotify": artists[0].get("name"),
            "album": album.get("name"),
            "release_date": album.get("release_date"),
            "popularity": response.get("popularity")}


### Main execution for retrieving, cleaning, and storing data

In [None]:
# Scrape five years of weekly Hot 100 charts (52 weeks x 5), add a
# "<song>_<artist>" key column named "pk", and save the table to billboard.csv.
df = pd.DataFrame(get_billboard_n_weeks(weeks=52 * 5))
df = df.assign(pk=df["song"] + "_" + df["artist"])
df.to_csv("billboard.csv", index=False)

In [None]:
# Look up a Spotify ID for every distinct (song, artist) pair.
# Only the last chart row of each pair is kept; the per-week columns
# ("date", "rank") are dropped because they no longer apply.
spotify_id_df = (
    df.drop_duplicates(subset=["song", "artist"], keep="last")
    .drop(columns=["date", "rank"])
    .reset_index(drop=True)
)
token = get_spotify_token()
spotify_id_df["spotify_id"] = spotify_id_df.apply(
    lambda row: get_spotify_id(token, row["song"], row["artist"]),
    axis="columns",
)
spotify_id_df.to_csv("spotify_id.csv", index=False)

In [None]:
# # Handle missing Spotify ID
# missing_loc = 5472
# missing_id = "1HOlb9rdNOmy9b1Fakicjo"
# spotify_id_df[spotify_id_df["spotify_id"].isnull()]
# spotify_id_df.iloc[missing_loc]
# spotify_id_df.at[missing_loc, "spotify_id"] = missing_id
# spotify_id_df.to_csv("spotify_id.csv", index=False)

In [None]:
# Get the Spotify features for each song
# spotify_id_df = pd.read_csv("spotify_id.csv")
id_list = spotify_id_df["spotify_id"].to_list()
token = get_spotify_token()
# Request each track once and skip songs without a Spotify ID; a null ID
# would otherwise be sent to the API as ".../audio-features/None".
unique_ids = spotify_id_df["spotify_id"].dropna().unique()
features = [get_spotify_features(token, song_id) for song_id in unique_ids]
# Keep only replies that carry an "id": failed requests come back as None,
# and error payloads presumably have no "id" key (verify against the API).
features = [feature for feature in features if feature and feature.get("id")]
features_df = pd.DataFrame(features)
if "id" not in features_df.columns:
    # No usable reply at all: give the merge an empty key column to join on.
    features_df["id"] = pd.Series(dtype="object")
# One row per id on the right side, so the left merge cannot multiply rows.
features_df = features_df.drop_duplicates(subset="id")
spotify_id_df = spotify_id_df.merge(features_df, left_on="spotify_id", right_on="id", how="left")
spotify_id_df.drop(columns=["spotify_id"], inplace=True)
spotify_id_df.to_csv("features19.csv", index=False)

In [None]:
# Get the track info from Spotify
token = get_spotify_token()
tracks = []
for song_id in id_list:
    # Songs whose search found no Spotify ID have a null entry in id_list;
    # there is nothing to look up for them.
    if pd.isna(song_id):
        continue
    track = get_spotify_track(token, song_id)
    # get_spotify_track returns None on failure, and a None entry cannot be
    # turned into a table row.
    if track is not None:
        tracks.append(track)
tracks_df = pd.DataFrame(tracks)
tracks_df.to_csv("tracks.csv", index=False)

In [None]:
# Look up every distinct Spotify artist name on MusicBrainz and save the
# result to artists.csv.
# tracks_df = pd.read_csv("tracks.csv")
artist_list = list(tracks_df["artist_spotify"].drop_duplicates())
artist_df = pd.DataFrame(data=get_artist_info(artist_list))
artist_df.to_csv("artists.csv", index=False)

In [None]:
# # Update the Spotify features & track info for missing songs
# id = "1rqqCSm0Qe4I9rUvWncaom"
# token = get_spotify_token()
# feature = get_spotify_features(token, id)
# feature_df = pd.DataFrame([feature])
# feature_df.to_csv("feature_highhopes.csv", index=False)
# track = [get_spotify_track(token, id)]
# track_df = pd.DataFrame(track)
# track_df.to_csv("track_highhopes.csv", index=False)