# Lyrics Scraping

In [1]:
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# User-Agent string of a desktop Firefox browser, passed with the HTTP requests
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"

# HTTP headers for the requests below; only the User-Agent is overridden
headers = {"User-Agent": user_agent}

In [2]:
# Artist (Alt-J) page URL
site_url = "https://www.lyrics.com"
artist_url = "/artist/Alt-J/2570284"

# Getting response from HTTP GET request.
# requests applies no timeout by default, so without one an unresponsive
# server would block this cell indefinitely.
response = requests.get(site_url + artist_url, headers=headers, timeout=10)

# Getting status code from the response (displayed as the cell output)
response.status_code

200

In [3]:
# Making soup
artist = BeautifulSoup(response.text, "html.parser")

# Getting artist name
artist_name = artist.find("h1", attrs={"class":"artist"}).text

# Getting all the lyric links from the artist page.
# href=True restricts the search to <a> tags that actually carry an href
# attribute; indexing item["href"] on a tag without one raises KeyError.
lyric_urls = [
    item["href"]
    for item in artist.find_all("a", href=True)
    if item["href"].startswith("/lyric/")
]


In [6]:
# Now let's crawl

# Folder for storing the files; created here so that open() below does not
# fail with FileNotFoundError when the folder is missing
folder_name = "data/"
os.makedirs(folder_name, exist_ok=True)

# Seconds to wait for the server before giving up on a single request
request_timeout = 10

# Spaces and slashes are replaced with "_" so the song title cannot be
# interpreted as a sub-directory and the file name contains no blanks
unsafe_chars = str.maketrans({" ": "_", "/": "_"})

# The artist part of every file name is identical, so it is cleaned once
# instead of on every iteration
file_artist = artist_name.translate(unsafe_chars)

# Storing counter for progress tracking (stays 0 if nothing is fetched)
counter = 0

limit = input(f"How many lyrics out of {len(lyric_urls)} to fetch?")

for counter, item_url in enumerate(lyric_urls[0:int(limit)], start=1):

    lyric_url = site_url + item_url

    # Delay 2 secs between requests
    time.sleep(2)

    # Getting response from HTTP GET request; a network failure on one song
    # is reported and skipped instead of aborting the whole crawl
    try:
        response = requests.get(lyric_url, headers=headers, timeout=request_timeout)
    except requests.RequestException:
        print(f"{counter}/{len(lyric_urls)} Something wrong happened at {lyric_url}")
        continue

    # Making lyric soup
    lyric = BeautifulSoup(response.text, "html.parser")

    # Getting song name and lyric text.
    # find() returns None when the element is missing, so .text raises
    # AttributeError; only that is caught, so Ctrl+C still stops the crawl.
    try:
        lyric_name = lyric.find("h1", attrs={"id":"lyric-title-text"}).text
        lyric_text = lyric.find("pre", attrs={"id":"lyric-body-text"}).text
    except AttributeError:
        print(f"{counter}/{len(lyric_urls)} Something wrong happened at {lyric_url}")
        continue

    # Transforming file name, so it has no spaces or slashes (replaced with "_")
    #   and there's double ("__") between artist and song name
    lyric_name = lyric_name.translate(unsafe_chars)
    file_name = file_artist + "__" + lyric_name + ".txt"

    # Actually writing down the file; the encoding is explicit because the
    # platform default may be unable to encode non-ASCII characters
    with open(os.path.join(folder_name, file_name), "w", encoding="utf-8") as f:
        f.write(lyric_text)

    print(f"{counter}/{len(lyric_urls)}. {file_name} is finished.")

1/147. Alt-J__3WW.txt is finished.
2/147. Alt-J__Adeline.txt is finished.
3/147. Alt-J__Hares_on_the_Mountain.txt is finished.
4/147. Alt-J__Deadcrush_[Alchemist_x_Trooko_Version].txt is finished.
5/147. Alt-J__In_Cold_Blood_[Twin_Shadow_Version].txt is finished.
6/147. Alt-J__3WW.txt is finished.
7/147. Alt-J__In_Cold_Blood.txt is finished.
8/147. Alt-J__House_Of_The_Rising_Sun.txt is finished.
9/147. Alt-J__Hit_Me_Like_That_Snare.txt is finished.
10/147. Alt-J__Deadcrush.txt is finished.
11/147. Alt-J__Adeline.txt is finished.
12/147. Alt-J__Last_Year.txt is finished.
13/147. Alt-J__Pleader.txt is finished.
14/147. Alt-J__3WW.txt is finished.
15/147. Alt-J__In_Cold_Blood.txt is finished.
16/147. Alt-J__House_Of_The_Rising_Sun.txt is finished.
17/147. Alt-J__Hit_Me_Like_That_Snare.txt is finished.
18/147. Alt-J__Deadcrush.txt is finished.
19/147. Alt-J__Adeline.txt is finished.
20/147. Alt-J__Last_Year.txt is finished.
21/147. Alt-J__Pleader.txt is finished.
22/147. Alt-J__Hunger_of_t

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cfd8b0ae-bede-4fbf-8370-46f1acdfcc89' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>