In [1]:
import logging
import os
import re
import sys
import time

import lyricsgenius # Genius lyrics provider API
import numpy as np
import pandas as pd
import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from textblob import TextBlob # language detector

### Setup basic logging

In [2]:
# Route log records of level INFO and above to a log file; every record is
# prefixed with its timestamp and level name.
# NOTE(review): the log path is absolute and machine-specific — adjust it before
# running the notebook on another machine.
logging.basicConfig(filename="/Users/lucamasserano/Desktop/BOCCONI/nlp/final_project/lyrics_project/log_files/lyrics_scraping.log",
                   level=logging.INFO,
                   format='%(asctime)s %(levelname)s %(message)s')

### Functions for scraping artists and songs

In [3]:
def metrolyrics_retriever(output_df,
                          driver,
                          root_url="https://www.metrolyrics.com/top-artists.html",
                          per_artist_limit=None,
                          overall_limit=None,
                          output_path="/Users/lucamasserano/Desktop/BOCCONI/nlp/final_project/lyrics_project/songs.csv"):
    """Scrape artists and song titles from MetroLyrics into a DataFrame.

    Walks every letter page returned by ``retrieve_alphabet``, every artist
    returned by ``retrieve_artists`` and every title returned by
    ``retrieve_songs``, appending one row per song to ``output_df``.

    Parameters
    ----------
    output_df : pandas.DataFrame
        Frame the scraped rows are appended to (it is not modified in place).
    driver : object
        Browser driver forwarded to the ``retrieve_*`` helpers.
    root_url : str
        Page listing the alphabet links.
    per_artist_limit : int or None
        Forwarded to ``retrieve_songs``.
    overall_limit : int or None
        When set, scraping stops as soon as at least this many rows were
        collected and exactly the first ``overall_limit`` rows are returned.
        In that case the function returns before the pending CSV write.
    output_path : str
        CSV file rewritten after each completed letter.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist``, ``song``, ``album``, ``release_date``, ``genre``,
        ``lyrics`` (plus any extra columns of ``output_df``) with a fresh
        0..n-1 index. ``album``, ``release_date`` and ``lyrics`` are ``None``.
    """
    dataset = output_df
    # Per-artist frames are collected and concatenated once per checkpoint
    # instead of re-copying the whole dataset for every artist.
    frames = [output_df]
    total_songs = output_df.shape[0]
    alphabet, urls = retrieve_alphabet(root_url, driver)

    for letter, url in zip(alphabet, urls):

        artists = retrieve_artists(letter, url, driver)
        for artist, (artist_url, artist_genre) in artists.items():
            songs = retrieve_songs(artist, artist_url, driver, per_artist_limit=per_artist_limit)
            n_songs = len(songs)

            frames.append(pd.DataFrame({"artist": [artist] * n_songs,
                                        "song": songs,
                                        "album": [None] * n_songs,
                                        "release_date": [None] * n_songs,
                                        "genre": [artist_genre] * n_songs,
                                        "lyrics": [None] * n_songs}))
            total_songs += n_songs

            print(f"\rSongs retrieved: {total_songs}", end='\r')
            sys.stdout.flush()

            # ">=" stops as soon as the limit is reached; the previous ">"
            # scraped one more artist only to discard its rows.
            if overall_limit is not None and total_songs >= overall_limit:
                return pd.concat(frames, ignore_index=True).iloc[:overall_limit, :]

        logging.info("Writing to csv")
        dataset = pd.concat(frames, ignore_index=True)
        frames = [dataset]
        dataset.to_csv(output_path, index=False)

    return dataset

def retrieve_alphabet(url, driver):
    """Collect the alphabet links from the top-artists page.

    Parameters
    ----------
    url : str
        Page to load through ``driver``.
    driver : object
        Exposes ``get(url)`` and ``page_source``.

    Returns
    -------
    tuple of (list, list)
        Link texts and the matching ``href`` values, in page order. Both
        lists are empty when the ``p.artist-letters`` element is not found.
    """
    driver.get(url)
    html = driver.page_source
    logging.info("Alphabet html source loaded")
    soup = BeautifulSoup(html, "lxml")

    # Initialised up front so a missing container yields empty results
    # instead of an UnboundLocalError further down.
    letters = []
    urls = []

    try:
        alphabet = soup.find("p", class_="artist-letters").find_all("a")
    except AttributeError:
        logging.error("Unable to retrieve urls for letters. Html structure not aligned")
        alphabet = []

    for idx, letter in enumerate(alphabet):
        try:
            letter_text = letter.get_text()
            letter_url = letter.get("href")
        except AttributeError:
            logging.warning(f"Unable to retrieve artists for letter number {idx}. Html structure not aligned")
            continue
        letters.append(letter_text)
        urls.append(letter_url)

    logging.info(f"Got letters {letters}")

    return letters, urls
            
def retrieve_artists(letter, letter_url, driver):
    """Scrape the artists listed on one letter page.

    Two page sections are read: the highlighted "top artists" boxes and the
    ``songs-table`` listing the remaining artists. A section or entry whose
    HTML does not have the expected structure is logged and skipped.

    Parameters
    ----------
    letter : str
        Letter label, used only in log messages.
    letter_url : str
        Page to load through ``driver``.
    driver : object
        Exposes ``get(url)`` and ``page_source``.

    Returns
    -------
    dict
        Maps artist name to ``(artist_url, artist_genre)``. Top artists get
        ``""`` as genre; table artists get the text of the row's second cell,
        or ``None`` when that text is empty. If a name occurs in both
        sections the table entry wins because it is stored last.
    """
    logging.info(f"\nRetrieving artists for letter {letter}")

    driver.get(letter_url)
    html = driver.page_source
    logging.info("Html source loaded")
    soup = BeautifulSoup(html, "lxml")

    # Compiled once (raw string: "\s" is not a valid escape in a plain literal).
    lyrics_suffix = re.compile(r"\s*(L|l)yrics.*")

    def clean_name(raw_name):
        # Drop the trailing "Lyrics..." suffix and layout whitespace.
        return lyrics_suffix.sub("", raw_name).replace("\n", "").replace("\t", "")

    artists = dict()

    # top six artists first
    logging.info("Retrieving top artists")
    try:
        top_artists = soup.find("div", class_="module top-artists clearfix")\
                            .find_all("div", class_=['artist grid_2 alpha', 'artist grid_2 ', "artist grid_2 omega"])
    except AttributeError:
        logging.warning("Unable to retrieve top artists. Html structure not aligned")
        top_artists = []

    for idx, artist in enumerate(top_artists):
        try:
            box = artist.find("a", class_="image")
            artist_url = box.get("href")
            artist_name = clean_name(box.find("span", class_="name").get_text())
            artist_genre = ""
            artists[artist_name] = artist_url, artist_genre
            logging.info(f"Got {artist_name}")
        except AttributeError:
            logging.warning(f"Unable to retrieve top artist number {idx}. Html structure not aligned")

    # other artists
    logging.info("Retrieving other artists")
    try:
        other_artists = soup.find("table", class_="songs-table").find("tbody").find_all("tr")
    except AttributeError:
        logging.warning("Unable to retrieve other artists. Html structure not aligned")
        other_artists = []

    for idx, artist in enumerate(other_artists):
        try:
            box = artist.find("a")
            artist_url = box.get("href")
            artist_name = box.get_text()
            artist_genre = artist.find_all("td")[1].get_text().replace("\n", "").replace("\t", "")
            if artist_genre == "":
                artist_genre = None
            artist_name = clean_name(artist_name)
            artists[artist_name] = artist_url, artist_genre
            logging.info(f"Got {artist_name}")
        except (AttributeError, IndexError):
            # IndexError: the row has fewer than two cells, so there is no
            # genre cell to read; skip the row instead of aborting the scrape.
            logging.warning(f"Unable to retrieve other artist number {idx}. Html structure not aligned")

    logging.info(f"Retrieved {len(artists.keys())} artists for letter {letter}")

    return artists
        
        
def retrieve_songs(artist, artist_url, driver, per_artist_limit):
    """Scrape the song titles listed in the "popular" table of an artist page.

    Parameters
    ----------
    artist : str
        Artist name, used only in log messages.
    artist_url : str
        Page to load through ``driver``.
    driver : object
        Exposes ``get(url)`` and ``page_source``.
    per_artist_limit : int or None
        Maximum number of table rows to inspect; ``None`` inspects all rows.

    Returns
    -------
    list of str
        Cleaned titles in page order. Rows without a matching title link are
        logged and skipped; the list is empty when the table is not found.
    """
    logging.info(f"\nRetrieving songs for {artist}")

    driver.get(artist_url)
    html = driver.page_source
    logging.info("Html source loaded")
    soup = BeautifulSoup(html, "lxml")

    try:
        titles_soup = soup.find("div", id="popular").find("div", class_="content").find("tbody").find_all("tr")
    except AttributeError:
        logging.warning(f"Unable to retrieve songs for {artist}. Html structure not aligned")
        # Without this the loop below failed with an UnboundLocalError.
        titles_soup = []

    # Compiled once (raw string: "\s" is not a valid escape in a plain literal).
    lyrics_suffix = re.compile(r"\s*(L|l)yrics.*")

    titles = []
    # Slicing with None keeps every row, so no special case is needed.
    for idx, song in enumerate(titles_soup[:per_artist_limit]):
        try:
            title = song.find("a", class_=["title hasvidtable", "title "]).get_text()
            title = lyrics_suffix.sub("", title).replace("\n", "").replace("\t", "")
            titles.append(title)
            logging.info(f"Got {title}")
        except AttributeError:
            logging.warning(f"Unable to retrieve song number {idx}. Html structure not aligned")

    logging.info(f"Retrieved {len(titles)} songs for {artist}")

    return titles

### Use Genius API to get lyrics 

In [75]:
def retrieve_lyrics(df_songs, genius_api, output_filename, avoid_artists=None):
    """Fill in lyrics, album and release date for every song via the Genius API.

    Songs are processed artist by artist. A song row is removed when the
    search raises, when no lyrics can be read from the result, when the
    detected language is not ``"en"``, or when the lyrics contain 10000 words
    or more. After each artist the whole dataset is rewritten to
    ``<output_filename>.csv`` in the project directory.

    Parameters
    ----------
    df_songs : pandas.DataFrame
        Needs ``artist`` and ``song`` columns. It is not modified; the
        function works on a copy with a fresh 0..n-1 index.
    genius_api : object
        Exposes ``search_song(title, artist=...)`` returning an object with
        ``lyrics``, ``album`` and ``year`` attributes.
    output_filename : str
        Base name (without extension) of the checkpoint CSV.
    avoid_artists : sequence or None
        Artists to skip, e.g. those already processed in an earlier run.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with ``album``, ``lyrics`` and ``release_date``
        filled in for the processed artists.
    """

    def describe(error):
        # Keeps the original log format: use ``.message`` when the exception has one.
        if hasattr(error, "message"):
            return f"{error.message} {error.args}"
        return f"{error}"

    def report_progress(retrieved, total):
        print(f"\rLyrics retrieved: {retrieved} Total songs: {total}", end='\r')
        sys.stdout.flush()

    if avoid_artists is None:
        avoid_artists = []

    # Copy with unique positional labels: rows are tracked by label below.
    dataset = df_songs.reset_index(drop=True)
    # Columns read back from CSV as all-NaN are float64; cast them so string
    # values can be stored without relying on implicit upcasting.
    for column in ("album", "lyrics", "release_date"):
        if column in dataset.columns:
            dataset[column] = dataset[column].astype(object)

    artists = [name for name in dataset.artist.unique() if name not in avoid_artists]
    # Indexing [-1] on an empty list raised IndexError before any work started.
    if len(avoid_artists) > 0:
        logging.info(f"Starting after {avoid_artists[-1]}")

    counter = 0
    counter_overall = 0
    # Compiled once (raw string: "\[" is not a valid escape in a plain literal).
    bracket_tags = re.compile(r"\[.+\]")
    # avoid outliers --> surely not song lyrics
    word_limit = 10000

    for artist in artists:
        logging.info(f"\nRetrieving lyrics for {artist}")
        artist_songs = dataset.loc[dataset.artist == artist, "song"]
        kept_rows = []
        dropped_rows = []
        album = []
        lyrics = []
        release_date = []

        # Rows are tracked by index label, not by (artist, title), so a
        # duplicated title cannot remove rows whose data was already
        # collected and the collected lists always match the kept rows.
        for row_label, song_title in artist_songs.items():
            counter_overall += 1

            try:
                song = genius_api.search_song(song_title, artist=artist)
            except Exception as e:
                logging.warning(f"{song_title}, {artist} not found: {describe(e)}")
                dropped_rows.append(row_label)
                report_progress(counter, counter_overall)
                continue

            try:
                text = bracket_tags.sub("", song.lyrics).replace("\n", " ").replace("\t", " ")
            except (AttributeError, TypeError):
                # AttributeError: no result / no ``lyrics`` attribute.
                # TypeError: ``lyrics`` is present but not a string (e.g. None).
                logging.warning(f"No lyrics for {song_title}. Dropping")
                dropped_rows.append(row_label)
                report_progress(counter, counter_overall)
                continue

            # NOTE(review): any failure of detect_language() makes the song
            # count as English; if the installed TextBlob release no longer
            # provides this method, no language filtering happens at all.
            try:
                language = TextBlob(text).detect_language()
            except Exception as e:
                logging.warning(f"Problem detecting language for {song_title}, {artist}: {describe(e)}")
                language = "en"

            if language != "en":
                logging.warning(f"Excluded {song_title}, {artist}. Lyrics language: {language}")
                dropped_rows.append(row_label)
                report_progress(counter, counter_overall)
                continue

            if len(text.split()) >= word_limit:
                logging.warning(f"Excluded {song_title}, {artist}. Lyrics longer than {word_limit} words")
                dropped_rows.append(row_label)
                report_progress(counter, counter_overall)
                continue

            try:
                song_album = song.album.replace("\n", "").replace("\t", "")
            except AttributeError:
                song_album = None

            try:
                song_year = song.year.replace("\n", "").replace("\t", "")
            except AttributeError:
                song_year = None

            kept_rows.append(row_label)
            lyrics.append(text)
            album.append(song_album)
            release_date.append(song_year)

            logging.info(f"Got data for {song_title}")
            counter += 1
            report_progress(counter, counter_overall)

        if kept_rows:
            dataset.loc[kept_rows, "album"] = album
            dataset.loc[kept_rows, "lyrics"] = lyrics
            dataset.loc[kept_rows, "release_date"] = release_date
        if dropped_rows:
            dataset = dataset.drop(index=dropped_rows).reset_index(drop=True)

        logging.info(f"Retrieved lyrics for {artist}")
        logging.info(f"Artist Dataset shape: {dataset.loc[dataset.artist == artist, :].shape}, Albums: {len(album)}, Lyrics: {len(lyrics)}, Release Date: {len(release_date)}")

        logging.info(f"Writing {artist} to csv")
        logging.info(f"Global Dataset shape: {dataset.shape}")
        dataset.to_csv(f"/Users/lucamasserano/Desktop/BOCCONI/nlp/final_project/lyrics_project/{output_filename}.csv", index=False)

    return dataset

### Final dataframe

In [5]:
# Empty seed frame holding the six dataset columns; the scraper appends rows to it.
df_initial = pd.DataFrame({column: [] for column in
                           ("artist", "song", "album", "release_date", "genre", "lyrics")})
df_initial.head()

Unnamed: 0,artist,song,album,release_date,genre,lyrics


### Get songs

In [None]:
# Launch Firefox, scrape the artist/song catalogue, and always close the
# browser afterwards (even when the scrape fails part-way).
# NOTE(review): `executable_path` is not accepted by recent Selenium releases —
# confirm against the installed version.
driver = webdriver.Firefox(executable_path="/Users/lucamasserano/Desktop/BOCCONI/Business Analytics/geckodriver")
try:
    songs = metrolyrics_retriever(df_initial, driver)
finally:
    driver.quit()
songs.shape

In [None]:
# Reload the song catalogue from the CSV checkpoint written by
# metrolyrics_retriever, so the scrape does not have to be repeated.
songs = pd.read_csv("/Users/lucamasserano/Desktop/BOCCONI/nlp/final_project/lyrics_project/songs.csv")

In [None]:
# (rows, columns) of the song catalogue currently held in `songs`.
songs.shape

### Get lyrics

In [78]:
# The Genius access token is read from the GENIUS_ACCESS_TOKEN environment
# variable so that no credential is stored in the notebook. A token that was
# ever committed in plain text should be treated as compromised and revoked.
api = lyricsgenius.Genius(os.environ["GENIUS_ACCESS_TOKEN"], sleep_time=0.01, verbose=False)

In [None]:
# `output_filename` is a required argument of retrieve_lyrics (omitting it
# raises TypeError); the dataset is checkpointed to "<output_filename>.csv".
lyrics = retrieve_lyrics(songs, api, output_filename="lyrics", avoid_artists=[])
lyrics.shape