In [5]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrapeBillboard(year, timeout=30):
    '''
    Scrape the Billboard Year-End Hot 100 chart for a given year from
    Wikipedia, handling artist cells that are merged across rows.

    Parameters:
        year (int): The year to scrape the chart for.
        timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
        pd.DataFrame: One row per chart entry with the columns
        year, rank, title, and artist. Rank, title and artist are the
        stripped cell texts (strings).

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page contains no table with class "wikitable".
    '''
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("table", class_="wikitable")
    if table is None:
        raise ValueError(f"No wikitable chart found at {url}")

    data = []
    last_artist = None  # Most recent artist seen; reused for merged rows

    for row in table.find_all("tr")[1:]:  # Skip header row
        cells = row.find_all("td")

        # Rows without at least a rank and a title cell (e.g. extra header
        # rows made only of <th>) carry no chart entry.
        if len(cells) < 2:
            continue

        rank = cells[0].text.strip()

        # Prefer the link text for the title; fall back to the raw cell text.
        title_tag = cells[1].find("a")
        title = title_tag.text.strip() if title_tag else cells[1].text.strip()

        # A row that has an artist cell defines the current artist; a row
        # with only rank and title sits under a merged artist cell and
        # inherits the previous artist.
        if len(cells) >= 3:
            artist = cells[2].text.strip()
            last_artist = artist
        else:
            artist = last_artist

        data.append([year, rank, title, artist])

    return pd.DataFrame(data, columns=["year", "rank", "title", "artist"])

In [11]:
# Scrape every year-end chart from 1959 through 2024 and stack the yearly
# frames into a single DataFrame with a fresh 0..n-1 index.
years = range(1959, 2025)  # upper bound is exclusive; adjust years as needed
all_data = pd.concat(list(map(scrapeBillboard, years)), ignore_index=True)

  tr = table.findAll("tr")
  td = row.findAll("td")


In [12]:
# Save the combined chart data to disk without the index column, then
# preview the first five rows.
all_data.to_csv(path_or_buf="billboard_hot100_1959-2024.csv", index=False)
all_data.head(n=5)

Unnamed: 0,year,rank,title,artist
0,1959,1,The Battle of New Orleans,Johnny Horton
1,1959,2,Mack the Knife,Bobby Darin
2,1959,3,Personality,Lloyd Price
3,1959,4,Venus,Frankie Avalon
4,1959,5,Lonely Boy,Paul Anka


In [17]:
# Sanity check: count the distinct chart years present in the scraped data.
all_data["year"].nunique(dropna=True)

66

In [None]:
# Genius API
import os
from dotenv import load_dotenv


def init():
    '''
    Initialize the environment.

    Calls load_dotenv() and then reads the SPOTIFY_CID, SPOTIFY_SECRET and
    GENIUS_TOKEN environment variables into the module-level globals
    spotify_cid, spotify_secret and genius_token. Also builds the global
    `headers` dict carrying the Genius bearer token.

    Raises:
        RuntimeError: If GENIUS_TOKEN is unset or empty, since the
            Authorization header cannot be built without it.
    '''
    load_dotenv()

    global spotify_cid
    global spotify_secret
    global genius_token
    global headers

    spotify_cid = os.getenv("SPOTIFY_CID")
    spotify_secret = os.getenv("SPOTIFY_SECRET")
    genius_token = os.getenv("GENIUS_TOKEN")

    # Without this check a missing token surfaces as an opaque
    # "can only concatenate str" TypeError on the line below.
    if not genius_token:
        raise RuntimeError(
            "GENIUS_TOKEN is not set; add it to the environment or the .env file."
        )

    headers = {"Authorization": "Bearer " + genius_token}


def getGeniusURL(title, artist, timeout=30):
    '''
    Get the Genius URL of a song.

    Queries the Genius search endpoint with "<title> <artist>" and returns
    the URL of the first hit. Relies on the module-level `headers` dict,
    which init() sets up.

    Parameters:
        title (str): Song title.
        artist (str): Artist name.
        timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
        str or None: URL of the top search hit, or None when the search
        returns no hits.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    '''
    url = "https://api.genius.com/search"
    params = {"q": f"{title} {artist}"}
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()  # surface auth/rate-limit failures directly

    hits = response.json()["response"]["hits"]
    if not hits:
        # No match for this title/artist; let the caller decide what to do.
        return None
    return hits[0]["result"]["url"]

def getLyrics(url, timeout=30):
    '''
    Get the lyrics of a song from its Genius URL.

    Collects the text of every <div data-lyrics-container="true"> on the
    page, because a page may hold the lyrics in more than one such
    container, and joins them with newlines.

    Parameters:
        url (str): Genius song page URL. A falsy value (None or "") is
            treated as "no page".
        timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
        str or None: The lyrics text, or None when no URL was given or the
        page has no lyrics container.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    '''
    if not url:
        return None

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    containers = soup.find_all("div", {"data-lyrics-container": "true"})
    if not containers:
        return None

    # Reading only the first container would drop any lyrics held in later ones.
    return "\n".join(block.get_text(separator="\n") for block in containers)