In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv

# Billboard Hot 100 on Wikipedia

In [None]:
def scrapeBillboard(year):
    '''
    Scrape the Billboard Year-End Hot 100 chart for a given year from Wikipedia.

    The first table with class "wikitable" on the page is parsed row by row.
    When a row has no artist cell of its own (merged cell), the most recently
    seen artist is reused.

    Parameters:
        year (int): The year to scrape the chart for.

    Returns:
        pd.DataFrame: One row per chart entry with the columns
            "year", "rank", "title" and "artist" (in that order).

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page contains no "wikitable" table.
    '''
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"
    # Bound the wait so one stalled connection cannot hang a multi-year scrape.
    html = requests.get(url, timeout=30)
    # Surface the real HTTP failure instead of a confusing error further down.
    html.raise_for_status()
    soup = BeautifulSoup(html.content, "html.parser")

    table = soup.find("table", class_="wikitable")
    if table is None:
        raise ValueError(f"No wikitable found for {year}: {url}")

    data = []
    last_artist = None  # Most recent artist seen, reused for merged rows

    for row in table.find_all("tr")[1:]:  # Skip header row
        td = row.find_all("td")
        if len(td) < 2:
            # A row without rank and title cells carries no chart entry and
            # would otherwise raise IndexError below.
            continue

        rank = td[0].text.strip()

        # Use the link text as the title when the cell contains a link,
        # otherwise the full cell text.
        title_tag = td[1].find("a")
        title = title_tag.text.strip() if title_tag is not None else td[1].text.strip()

        # Three cells means the row has its own artist cell; otherwise the
        # artist cell is merged with an earlier row.
        if len(td) == 3:
            artist = td[2].text.strip()
            last_artist = artist
        else:
            artist = last_artist

        data.append([year, rank, title, artist])

    return pd.DataFrame(data, columns=["year", "rank", "title", "artist"])

In [None]:
# Scrape every year-end chart in the range and stack the per-year frames
# into one frame with a continuous index.
years = range(1960, 2025)  # Adjust years as needed
billboard = pd.concat(list(map(scrapeBillboard, years)), ignore_index=True)

In [None]:
# Persist the scraped chart so later cells can reload it from disk instead of
# scraping Wikipedia again.
billboard.to_csv("billboard_hot100_1960-2024.csv", index=False)
billboard.head()  # Preview the first rows of the combined chart

In [None]:
billboard['year'].nunique()  # Number of distinct years scraped; range(1960, 2025) spans 65 years

In [None]:
billboard.isnull().sum()  # Count of missing values in each column

# Genius API

In [3]:
def init():
    '''
    Initialize the environment for Genius API calls.

    Calls load_dotenv(), then sets two module-level globals:
    genius_token (read from the GENIUS_TOKEN environment variable) and
    headers (the Authorization header dict built from that token).

    Raises:
        RuntimeError: If GENIUS_TOKEN is not set or is empty.
    '''
    global genius_token
    global headers

    load_dotenv()

    token = os.getenv("GENIUS_TOKEN")
    if not token:
        # Without this check, "Bearer " + None fails with an opaque TypeError,
        # and an empty token would silently build an unusable header.
        raise RuntimeError(
            "GENIUS_TOKEN is not set; define it in the environment or in a .env file."
        )

    genius_token = token
    headers = {"Authorization": "Bearer " + token}

def getGeniusURL(title, artist):
    '''
    Get the Genius URL of a song.

    Queries the Genius search endpoint with "<title> <artist>" and returns the
    URL of the first hit. Requires the module-level `headers` set by init().

    NOTE: the first hit is taken as-is; it is not checked against the requested
    artist, so it may point at a different recording of the same title.

    Parameters:
        title: Song title.
        artist: Artist credit to include in the search query.

    Returns:
        The "url" field of the first search hit, or None when the response has
        no hits or lacks the expected structure.
    '''
    url = "https://api.genius.com/search"
    params = {"q": f"{title} {artist}"}
    # Bound the wait so one stalled request cannot hang a long row-by-row lookup.
    response = requests.get(url, params=params, headers=headers, timeout=30).json()
    try:
        return response["response"]["hits"][0]["result"]["url"]
    except (KeyError, IndexError, TypeError):
        # Only "no usable hit" is treated as a miss; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return None

def getLyrics(url):
    '''
    Get the lyrics of a song from its Genius URL.

    Downloads the page and extracts the text of its lyric container(s), with
    text nodes separated by newlines.

    Parameters:
        url (str): Genius song page URL.

    Returns:
        str | None: The lyrics text, or None when no lyric container is found.
    '''
    # Bound the wait so one stalled page cannot hang a long scraping run.
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.content, "html.parser")

    # NOTE(review): Genius pages are understood to mark each lyric block with
    # data-lyrics-container="true" and may split one song across several such
    # blocks - confirm against a live page. Collecting all of them avoids
    # returning only the first block.
    containers = soup.find_all("div", attrs={"data-lyrics-container": "true"})
    if not containers:
        # Fall back to the exact class string this notebook originally used;
        # it looks build-generated, so it is kept only as a secondary selector.
        legacy = soup.find("div", {"class": "Lyrics-sc-37019ee2-1 jRTEBZ"})
        containers = [legacy] if legacy is not None else []

    if not containers:
        return None
    return "\n".join(block.get_text(separator="\n") for block in containers)

## Get Genius URL

In [None]:
init()  # sets the global `headers` that getGeniusURL sends with each request

df = pd.read_csv("billboard_hot100_1960-2024.csv")
# Row-wise lookup: one Genius search per (title, artist) pair.
df["url"] = df.apply(
    lambda song: getGeniusURL(song["title"], song["artist"]),
    axis=1,
)
df.to_csv("billboard_url_1960-2024.csv", index=False)

# Null count per column; the "url" entry is the number of songs without a URL.
print(df.isna().sum())

In [None]:
df[df["url"].isnull()]  # Songs for which getGeniusURL returned None (no usable search hit)

## Get Genius lyrics

In [4]:
# Reload the chart with Genius URLs from the CSV written by the URL-lookup
# step, so lyrics scraping can start from disk without repeating the searches.
df = pd.read_csv("billboard_url_1960-2024.csv")

In [12]:
# Split the frame into four 1000-row chunks plus a final chunk holding the
# remaining rows; each chunk is an independent copy.
df_1, df_2, df_3, df_4 = (
    df.iloc[start:start + 1000].copy() for start in range(0, 4000, 1000)
)
df_5 = df.iloc[4000:].copy()

In [25]:
# Fetch lyrics for every chunk, not only the last one, so that a fresh
# Restart & Run All yields all five part files and every chunk carries a
# "lyrics" column before the chunks are concatenated. Each chunk is written
# as soon as it finishes, so completed work survives an interruption.
# Rows without a URL get None instead of triggering a request.
for part_number, chunk in enumerate([df_1, df_2, df_3, df_4, df_5], start=1):
    chunk["lyrics"] = chunk["url"].apply(lambda x: getLyrics(x) if pd.notna(x) else None)
    chunk.to_csv(f"billboard_lyrics_1960-2024_{part_number}.csv", index=False)

In [26]:
df_5[df_5['lyrics'].isnull()]  # Songs in the last chunk with no URL or for which getLyrics returned None

Unnamed: 0,year,rank,title,artist,url,lyrics
4931,2009,31,Run This Town,Jay-Z featuring Rihanna and Kanye West,https://genius.com/Dj-redo-run-this-town-jay-z...,
5058,2010,58,Meet Me Halfway,The Black Eyed Peas,https://genius.com/Black-eyed-peas-meet-me-hal...,
5154,2011,54,Stereo Love,Edward Maya featuring Vika Jigulina,https://genius.com/Dj-redo-stereo-love-edward-...,
5326,2013,26,Don't You Worry Child,Swedish House Mafia featuring John Martin,https://genius.com/Swedish-house-mafia-dont-yo...,
5417,2014,17,Say Something,A Great Big World and Christina Aguilera,https://genius.com/Say-something-say-something...,
5440,2014,40,Wiggle,Jason Derulo featuring Snoop Dogg,https://genius.com/The-greatest-bits-wiggle-8-...,
5515,2015,15,Bad Blood,Taylor Swift featuring Kendrick Lamar,https://genius.com/8-bit-arcade-bad-blood-8-bi...,
5542,2015,42,FourFiveSeconds,"Rihanna, Kanye West and Paul McCartney",https://genius.com/2015-o-t-c-fourfiveseconds-...,
5680,2016,80,Middle,DJ Snake featuring Bipolar Sunshine,https://genius.com/Dj-snake-middle-dj-snake-in...,
5826,2018,26,Friends,Marshmello and Anne-Marie,https://genius.com/Smooth-jazz-all-stars-frien...,


In [27]:
# Stack the five chunks back into a single frame with a fresh 0..n-1 index
# and write the combined result to disk.
# NOTE: a chunk only has populated lyrics if they were fetched for it beforehand.
df_concat = pd.concat(
    (df_1, df_2, df_3, df_4, df_5),
    ignore_index=True,
)
df_concat.to_csv("billboard_lyrics_1960-2024.csv", index=False)