# Scraping the web for corpus data
This notebook contains the code used to gather song lyrics for our corpus. 

* Hamilton lyrics are sourced from http://www.themusicallyrics.com
* All other lyrics are sourced from https://www.azlyrics.com

These lyrics are being used for research purposes only

In [63]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment, NavigableString
import re
import time
import random
import os

nonos = re.compile("[^A-Za-z0-9]")

### Grabbing data from azlyrics.com

In [74]:
import sys
def get_links(url, album):
    BASE_URL = url.rsplit("/",2)[0] # https://www.azlyrics.com
    PREFIX = ".."
    links_list = []
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    if album is not None:
        catalog = soup.find("div", class_="album", text=album)
    else: 
        catalog = soup.find("div", id="listAlbum")
    links = catalog.findAll("a")
    print(links)
    sys.exit
    for link in links:
        tag = link.get("href")
        if tag is not None and tag.startswith(PREFIX):
            links_list.append(BASE_URL + tag.lstrip(PREFIX))
    return links_list 

In [75]:
def scrape_lyrics(url):
    COMMENT_KEY = "Usage of azlyrics.com"
    
    lines = []
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    parent = None

    # find parent of lyrics div
    for comment in soup.findAll(text=lambda x: isinstance(x, Comment)):
        if COMMENT_KEY in comment:
            parent = comment.parent

    for line in parent:
        if type(line) == NavigableString and len(line) > 1:
            lines.append(line.lstrip("\n")) 
            
    title = soup.title.contents[0].split("-")[1]
    title = re.sub(nonos, "", title)
    
    return title, lines

In [76]:
def get_corpus(discography_url, output_dir, album=None):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    song_links = get_links(discography_url, album)
    for link in song_links:
        title, lines = scrape_lyrics(link)
        filename = output_dir + title + ".txt"
        with open(filename, "w") as f:
            for line in lines:
                f.write(line + "\n")
        time.sleep(random.randint(5, 12))

In [77]:
get_corpus("https://www.azlyrics.com/k/kendricklamar.html", "lyrics/drake/", album="Hamilton: An American Musical")
get_corpus("https://www.azlyrics.com/d/drake.html", "lyrics/drake/")
get_corpus("https://www.azlyrics.com/c/cardi-b.html", "lyrics/cardi-b/")
get_corpus("https://www.azlyrics.com/j/jayz.html", "lyrics/jayz/")

AttributeError: 'NoneType' object has no attribute 'findAll'

In [None]:
BASE_URL = "http://www.themusicallyrics.com"

conn = urlopen("http://www.themusicallyrics.com/h/351-hamilton-the-musical.html")
html = conn.read()
soup = BeautifulSoup(html, "html5lib")

# get links to lyrics for all songs

body = soup.find("tbody")
tbody = body.find("tbody") # lyrics are in nested tbody element
links = tbody.findAll("a")

# write links to a file
with open("hamilton-links", "w") as hl:
    for tag in links:
        link = tag.get("href")
        if link is not None:
            if link.startswith("/h/") and "synopsis" not in link:
                lyrics_page = BASE_URL + link
                hl.write(lyrics_page + "\n")

### Gathering Links for Hamilton Songs

### Parsing each link and scraping lyrics

Note: the lyric files genereated below will still need to undergo some preprocessing before they're ready to be imported into the corpus

In [80]:
# open links on by one and strip lyrics
# I separated this step out to improve clarity
with open("hamilton-links", "r") as hlf:
    for link in hlf:
        conn = urlopen(link)
        soup = BeautifulSoup(conn.read(), "html5lib")
        title = soup.title.contents[0]
        filename = "lyrics/hamilton/raw/"
        filename += "".join(title.split()[2:-1])
        filename += ".txt"
        filename = re.sub(nonos, "", filename)
        p = soup.p
        with open(filename, "w") as hll:
            for item in p:
                if type(item) == NavigableString:   
                    hll.write(item + "\n")

NameError: name 'bs4' is not defined

### Gathering Links for Kendrick Lamar

In [91]:
BASE_URL = "https://www.azlyrics.com/"
CATALOG_EXT = "k/kendricklamar.html"

html = urlopen(BASE_URL + CATALOG_EXT).read()

soup = BeautifulSoup(html, "html5lib")

catalog = soup.find("div", id="listAlbum")
songs = catalog.findAll("a")

with open("song_lyric_links/kendrick_lamar-links.txt", "w") as kll:
    for song in songs:
        link = song.get("href")
        if link is not None:
            if link.startswith(".."):
                link = BASE_URL + link.lstrip("../") + "\n"
                kll.write(link)

### Scraping the lyrics

In [10]:
BASE_DIR = "lyrics/kendrick_lamar/"
with open("song_lyric_links/kendrick_lamar-links.txt", "r") as kll:
    for link in kll:
        html = urlopen(link).read()
        soup = BeautifulSoup(html, "html5lib")
        comments = soup.findAll(text=lambda text:isinstance(text, Comment))
        title = soup.title.contents[0].split()
        title = "".join(title[4:])
        title = re.sub(nonos, "", title)
        filename = BASE_DIR + title + ".txt"
        print(filename)
        for comment in comments:
            if "Usage of azlyrics.com" in comment: # lyrics follow this comment
                parent = comment.parent
                with open(filename, "w") as f:
                    for item in parent:
                        if type(item) == NavigableString:
                            if len(item) > 1: # ignore blank lines
                                f.write(item)
        time.sleep(random.randint(5, 12))

lyrics/kendrick_lamar/IsItLove.txt
lyrics/kendrick_lamar/Celebration.txt
lyrics/kendrick_lamar/PP.txt
lyrics/kendrick_lamar/SheNeedsMe.txt
lyrics/kendrick_lamar/IAmInterlude.txt
lyrics/kendrick_lamar/WannaBeHeard.txt
lyrics/kendrick_lamar/IDoThis.txt
lyrics/kendrick_lamar/UncleBobbyJasonKeaton.txt
lyrics/kendrick_lamar/Faith.txt
lyrics/kendrick_lamar/Trip.txt
lyrics/kendrick_lamar/VanitySlaves.txt
lyrics/kendrick_lamar/FarFromHere.txt
lyrics/kendrick_lamar/Thanksgiving.txt
lyrics/kendrick_lamar/LetMeBeMe.txt
lyrics/kendrick_lamar/Determined.txt
lyrics/kendrick_lamar/TheHeartPt2.txt
lyrics/kendrick_lamar/GrowingApartToGetCloser.txt
lyrics/kendrick_lamar/NightOfTheLivingJunkies.txt
lyrics/kendrick_lamar/PP15.txt
lyrics/kendrick_lamar/AlienGirlTodaywHer.txt
lyrics/kendrick_lamar/OppositesAttractTomorrowwoHer.txt
lyrics/kendrick_lamar/MichaelJordan.txt
lyrics/kendrick_lamar/IgnoranceIsBliss.txt
lyrics/kendrick_lamar/ROTCInterlude.txt
lyrics/kendrick_lamar/BarbedWire.txt
lyrics/kendrick_lam

In [43]:
links = get_links("https://www.azlyrics.com/n/nickiminaj.html", "out.txt")
with open("song_lyric_links/nicki_minaj-links.txt", "w") as nml:
    for link in links:
        nml.write(link + "\n")

In [122]:
with open("song_lyric_links/nicki_minaj-links.txt", "r") as links:
    for url in links:
        title, lines = scrape_lyrics(url)
        with open("lyrics/nicki_minaj/" + title + ".txt", "w") as file:
            for line in lines:
                file.write(line + "\n")
        time.sleep(random.randint(5, 12)) # avoid making azlyrics mad

In [117]:
html = urlopen("https://www.azlyrics.com/l/linmanuelmiranda.html").read()
album = "Hamilton: An American Musical"
album_regex = re.compile(album)
soup = BeautifulSoup(html, "html5lib")
album = soup.find(string=album_regex).find_parent("div", class_="album")
for track in album:
    print(track)
# print(a)
# albums = soup.findAll("div", class_="album")
# for album in albums:
    # print(type(album))
    # print(album.string)
# print(albums)



album: 
<b>"Hamilton: An American Musical"</b>
 (2015)
