In [None]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import re

import deezer
import requests

import os

# Function Definitions
These helper functions should eventually be moved into a separate module and imported here instead of being defined inline.

In [None]:
# Shared Deezer API client, created once and reused by get_song_bpm below.
client = deezer.Client()


def get_song_bpm(track, artist):
    """Look up a song's tempo (BPM) through the Deezer search API.

    The lookup is best-effort: every failure is reported with ``print`` and
    turned into ``None`` so that a batch loop over many songs keeps going.

    Args:
        track: Track title passed to the search as ``track``.
        artist: Artist name passed to the search as ``artist``.

    Returns:
        The ``bpm`` attribute of the first search hit, or ``None`` when the
        search returns nothing, the hit's BPM is falsy (missing or 0), or an
        exception is raised while querying the API.
    """
    try:
        result = client.search(track=track, artist=artist)

        if not result:
            print("Song not found")
            return None

        print(f"getting bpm for {track} by {artist}")
        # Only the first search hit is considered.
        bpm = result[0].bpm
        if not bpm:
            print("BPM not found")
            return None
        return bpm

    # GPT-4 was used to generate error handling code here:
    except requests.exceptions.HTTPError as http_err:
        # `response` may be None on an HTTPError raised without one; reading
        # `.status_code` directly would then raise inside this handler and
        # escape the function instead of being reported.
        status_code = getattr(http_err.response, "status_code", None)
        if status_code == 403:
            print(
                "403 Error: Access Forbidden. Please check your API key or authentication."
            )
        else:
            print(f"HTTP error occurred: {http_err}")  # For other HTTP errors
    except Exception as err:
        print(f"An error occurred: {err}")

    # Every error path ends here: report "no BPM" explicitly.
    return None

In [None]:
def _genius_slug(text):
    """Turn free text into a lowercase, hyphen-separated ASCII slug."""
    import unicodedata  # local import so this cell does not rely on a new top-level import

    # Fold accented letters to their ASCII base ("é" -> "e") instead of
    # dropping them; characters with no ASCII decomposition are still removed.
    ascii_text = (
        unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    )
    # Spell out "&" so it survives the punctuation filter below.
    ascii_text = ascii_text.lower().replace("&", " and ")

    # remove non alphanumeric characters + keep spaces using a regular expression
    # reference: https://flexiple.com/python/remove-non-alphanumeric-characters-python
    cleaned = re.sub(r"[^a-z0-9 ]", "", ascii_text)

    # split() collapses runs of spaces and trims the ends, so removed
    # punctuation can no longer leave "--" or a leading/trailing "-".
    return "-".join(cleaned.split())  # convert to kebab case


def prepare_url(artist, song):
    """Build the Genius lyrics page URL for a song.

    Both names are lowercased, reduced to ASCII letters, digits and single
    hyphens, and combined as ``<Artist>-<song>-lyrics`` with only the first
    letter of the artist part capitalised.

    NOTE(review): folding accents and writing "&" as "and" is assumed to match
    how Genius builds its slugs -- confirm against a few live URLs.

    Args:
        artist: Artist name as it appears in the dataset.
        song: Track title as it appears in the dataset.

    Returns:
        The URL string ``https://genius.com/<Artist>-<song>-lyrics``.
    """
    artist_url = _genius_slug(artist).capitalize()
    song_url = _genius_slug(song)

    # prepare for genius format:
    return "https://genius.com/" + artist_url + "-" + song_url + "-lyrics"


def get_lyrics(url, timeout=10):
    """Download a lyrics page and return its lyrics as plain text.

    The page is parsed with BeautifulSoup and the text of every ``<div>``
    carrying ``data-lyrics-container="true"`` is collected. Text enclosed in
    parentheses or square brackets on a single line is removed, and empty
    lines are dropped.

    reference: https://medium.com/@rachit.lsoni/scraping-song-lyrics-a-fun-and-practical-guide-c0b07e8e7312

    Args:
        url: Address of the lyrics page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled request would block the caller forever.

    Returns:
        The cleaned lyrics joined with ``os.linesep``, or ``None`` when the
        request fails or the page contains no lyrics container.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # Best-effort scraping: report the failure and let the caller treat
        # it like a song without lyrics instead of aborting a whole batch.
        print(f"Could not fetch {url}: {err}")
        return None

    html = BeautifulSoup(page.text, "html.parser")
    lyrics_divs = html.find_all("div", attrs={"data-lyrics-container": "true"})

    if not lyrics_divs:
        print(f"Could not find lyrics for {url}")
        return None

    lyrics = "\n".join(div.get_text(separator="\n") for div in lyrics_divs)
    # Strip bracketed / parenthesised spans; "." does not cross newlines, so
    # only spans that open and close on the same line are removed.
    lyrics = re.sub(r"[\(\[].*?[\)\]]", "", lyrics)
    # Drop the empty lines left behind by the removal above.
    lyrics = os.linesep.join(s for s in lyrics.splitlines() if s)

    return lyrics


def compute_lyricalness(artist, song):
    """Return the number of words in a song's lyrics.

    The lyrics are looked up via ``prepare_url`` and ``get_lyrics``.

    Args:
        artist: Artist name used to build the lyrics URL.
        song: Track title used to build the lyrics URL.

    Returns:
        The whitespace-separated word count of the lyrics, or ``None`` when
        ``get_lyrics`` returns nothing (``None`` or an empty string).
    """
    lyrics = get_lyrics(prepare_url(artist, song))

    # for initial implementation use simple measure of number of words per song
    return len(lyrics.split()) if lyrics else None

In [None]:
# Load the listening history and discard the columns this notebook does not use.
df = pd.read_csv("music-listening-data.csv").drop(
    columns=["id", "timestamp", "model-version", "track-id"]
)

tracks = df["track-title"]
artists = df["artist"]

# Taking .unique() of the track and artist columns separately was considered
# for listing the songs that need metadata, but it might cause problems when
# several tracks come from the same artist, so the pairs are kept together.

# https://www.geeksforgeeks.org/python-iterate-multiple-lists-simultaneously/
# Map each track title to its artist. Dict keys are unique, so every song
# appears once; repeated artists are unaffected because they are values.
# NOTE: if one title occurs with different artists, the last artist seen wins.
track_dict = dict(zip(tracks, artists))

# Lyricalness

In [None]:
# Word count for every unique track (None where no lyrics were found).
tracks = list(track_dict)
lyrics = [compute_lyricalness(artist, track) for track, artist in track_dict.items()]

df_dict = {"track": tracks, "lyric-count": lyrics}

# Raw results, missing values included.
lyric_df = pd.DataFrame(df_dict)
lyric_df.to_csv("lyric-data.csv")

# https://www.geeksforgeeks.org/python-remove-none-values-from-list/
# Average over the songs that actually have a count.
lyrics_without_na = [count for count in lyrics if count is not None]
avg = np.average(lyrics_without_na)
print(avg)

# https://saturncloud.io/blog/how-to-replace-none-with-nan-in-pandas-dataframe/
# Replace missing values with that average and save the filled copy separately.
lyrics_df_without_na = lyric_df.fillna(value=avg)
lyrics_df_without_na.to_csv("lyric-data-filled.csv")

In [None]:
# BPM
# Look up the tempo of every unique track (None where the lookup failed).
tracks = []
bpms = []

for track, artist in track_dict.items():
    tracks.append(track)
    bpms.append(get_song_bpm(track, artist))

df_dict = {"track": tracks, "bpm": bpms}

# Raw results, missing values included.
bpm_df = pd.DataFrame(df_dict)
bpm_df.to_csv("bpm-data.csv")

# Average over the songs that actually have a BPM.
bpms_without_na = [x for x in bpms if x is not None]
avg = np.average(bpms_without_na)

# Replace missing values with that average and persist the filled copy:
# the dataset-preparation cell further down reads "bpm-data-filled.csv",
# which was previously never written.
bpm_df_without_na = bpm_df.fillna(value=avg)
bpm_df_without_na.to_csv("bpm-data-filled.csv")

# Prepare Main Dataset

In [None]:
# Reload the filled lyric counts and build a title -> count lookup table.
lyric_dataset = pd.read_csv("lyric-data-filled.csv")
tracks = lyric_dataset["track"]
lyrics = lyric_dataset["lyric-count"]

lyrics_reference = dict(zip(tracks, lyrics))

# One lyric count per row of the listening data, in row order.
total_tracks = df["track-title"]
lyric_column = [lyrics_reference[track] for track in total_tracks]

In [None]:
# Reload the filled BPM values and build a title -> BPM lookup table.
bpm_dataset = pd.read_csv("bpm-data-filled.csv")
tracks = bpm_dataset["track"]
bpms = bpm_dataset["bpm"]

bpm_reference = dict(zip(tracks, bpms))

# One BPM value per row of the listening data, in row order.
total_tracks = df["track-title"]
bpm_column = [bpm_reference[track] for track in total_tracks]

In [None]:
# Attach both metadata columns to the listening data (bpm first, then lyric-count).
df = df.assign(**{"bpm": bpm_column, "lyric-count": lyric_column})

In [None]:
# Save the listening data, which by now carries the bpm and lyric-count columns.
output_csv = "music-listening-data-with-metadata.csv"
df.to_csv(output_csv)