In [2]:
import requests
from bs4 import BeautifulSoup,Comment
import time
import random
import os

# Set the user agent to a popular web browser
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'       
headers = {
    'User-Agent': 
    user_agent}
#Use ScraperAPI for rotating proxies to not get blocked by page. Get all artists, songs and lyrics from https://www.azlyrics.com
base_url = 'http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com'

def get_lyrics(artist_name):
    """
    This function takes an artist name and scrapes https://www.azlyrics.com for all the songs and lyrics from that artist.

    Parameters:
    artist_name (str): A string representing the name of the artist to scrape lyrics from.

    Returns:
    lyrics: A list where each element represents the lyrics of a song in a string.
    """    
    artist_name = artist_name.replace(" ", "").lower() #Remove spaces and make lower case to match artist name in the url.
    artist_url = f"{base_url}/{artist_name[0]}/{artist_name}.html" #Format url to match the way it links to artists which is for example https://www.azlyrics.com/d/drake.html
    reqs = requests.get(artist_url, headers=headers)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    lyrics = []
    listalbum_divs = soup.find_all('div', {'class': 'listalbum-item'}) #All the songs links are in divs with class listalbum-item.
    for div_tag in listalbum_divs:
        a_tag = div_tag.a # Find the a_tag which is within the listalbum-item divs
        song_url = a_tag["href"] # Find the song url from the href in the a_tag
        if song_url.startswith("/lyrics/"): #Feature songs include the whole url instead of just the ending which is in the format /lyrics/{artist}/{song_name}.hmtl. Decided to stay away from features the artist has done, as that will include more lyrics from other artists than the artist we are trying to generate similar lyrics to.
            song_url = base_url + song_url
            lyrics.append(get_song_lyrics(song_url))
            # Add a delay to avoid overloading the server
            delay = random.randint(1, 2)
            time.sleep(delay)
    write_to_file(artist_name, lyrics)
    return lyrics


def get_song_lyrics(song_url):
    """
    This function gets the lyrics for a song from a song url and returns it as a string.

    Parameters:
    song_url (str): A string representing url of the song to be scraped.

    Returns:
    song_lyrics: A string for the requested song.
    """    
    print(song_url)
    song_reqs = requests.get(song_url, headers=headers)
    song_soup = BeautifulSoup(song_reqs.text, 'html.parser')
    #Every html element in azlyrics.com ends with the following line before the lyrics
    prohibited_comment = song_soup.find(string=lambda text: isinstance(text, Comment) and "prohibited by our licensing agreement. Sorry about that." in text)
    #Find the parent div of the comment as this is the same div where all the lyrics are.
    lyric_div = prohibited_comment.find_parent("div")
    #Lyrics are already split by \n so we can directly return the text of the div
    return lyric_div.text

def write_to_file(artist_name, lyrics):
    """
    This function writes all the lyrics to a file with the name of the artist name.

    Parameters:
    artist_name (str): A string representing name of the artist.
    lyrics (str): A string with the lyrics.
    """    

    folder = 'data'
    filename = f'{artist_name}.txt'
    filepath = os.path.join(folder, filename)

    if not os.path.exists(folder):
        os.makedirs(folder)

    if not os.path.exists(filepath):
        with open(filepath, 'w') as f:
    # Write each string in the list to the file
            for line in lyrics:
                f.write(line)
    else:
        with open(filepath, 'a') as f:
    # Write each string in the list to the file
            for line in lyrics:
                f.write(line)

def lyrics_from_file(artist_name):
    """
    This function gets all the lyrics from a file.

    Parameters:
    artist_name (str): A string representing name of the artist.

    Returns:
    lines: A string with all the lyrics if the file exists or None if not.
    """    
    # Open the file in read mode and read the contents into the lines list
    artist_name = artist_name.replace(" ", "").lower() #Remove spaces and make lower case to match artist name in the file system.
    print(artist_name)
    try:
        with open(f'data/{artist_name}.txt', 'r') as f:
            lines = f.readlines()
            return lines
    except:
        return None


In [64]:
artist_name = "J Cole"
lyrics = get_lyrics(artist_name)

http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/simba.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/imtheman.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/schooldaze.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/dollaandadream.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/throwitup.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/quoteme.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/collegeboy.html
http://api.scraperapi.com/?api_key=0537b1295305cf331cdf77c6eaede212&url=https://www.azlyrics.com/lyrics/jcole/splityouup.html

In [62]:
lyrics = lyrics_from_file("Kendrick Lamar")
print(lyrics)

kendricklamar


In [3]:
lyrics = get_lyrics("Ed Sheeran")
type(lyrics)

list