The goal is to build a catalogue of URLs of song lyrics from the OHHLA corpus. The algorithm walks through the link structure of the website to first get all artists, then their albums, and then the text files of the songs on those albums. Then the text items of the pages containing the lyrics are read out and the lyrics are extracted.  
This notebook produces a txt file which holds a list of the acquired song lyrics. 

In [None]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import requests
import urllib.error


In [None]:
def getlinks(url, filters=None):
    """Collect the links found on each of the given pages.

    Args:
        url: Iterable of page URLs to fetch. Each accepted ``href`` is
            appended verbatim to the URL of the page it was found on, so
            the page URLs are expected to be usable as a prefix.
        filters: Optional iterable of substrings. An anchor whose HTML
            markup contains any of them is skipped. ``None`` (the default)
            disables filtering.

    Returns:
        A list of strings (page URL + ``href``) in the order the anchors
        were encountered.

    Raises:
        urllib.error.HTTPError: For any HTTP error other than 404.
    """
    # Avoid a mutable default argument; an empty tuple means "no filter".
    if filters is None:
        filters = ()

    newurl = []

    for item in url:
        # Some links on the website are broken (404) and can be skipped;
        # every other HTTP error is propagated to the caller.
        try:
            response = urlopen(item)
        except urllib.error.HTTPError as e:
            if e.getcode() == 404:
                continue
            raise

        # Parse inside the context manager so the connection is closed as
        # soon as the page has been read.
        with response:
            soup = BeautifulSoup(response, "lxml")

        for link in soup.find_all('a'):
            href = link.get('href')
            # Drop anchors without a target and anchors matching a filter
            # (the filter is applied to the whole tag markup, not just href).
            if href is None or any(t in str(link) for t in filters):
                continue
            newurl.append(item + href)

    return newurl

In [None]:
"""Walks through OHHLA.com and makes a list of links to all websites with lyrics listed on the site. 
Then reads out the text of the website to get the lyrics"""
def getlyrics(starturl,baseurl,notalbums,size):
    """Crawl one OHHLA artist index page and return the lyrics it leads to.

    The crawl goes index page -> artist pages -> album pages -> lyrics pages,
    using ``getlinks`` for the two intermediate levels.

    Args:
        starturl: URL of an index page listing artists.
        baseurl: Prefix prepended to every ``href`` found on the index page.
        notalbums: Substrings passed to ``getlinks`` as filters on both the
            artist level and the album level.
        size: Number of leading links on the index page to drop because
            they do not point to artists. The last two links are always
            dropped as well.

    Returns:
        A list of strings, one per lyrics page that contained any text: the
        longest text node of that page.
    """
    # The URL leads to a list of artists listed on OHHLA. Parse inside the
    # context manager so the connection is released right after reading.
    req = Request(starturl)
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    # Markers of links that do not belong to artists.
    filters = ["YFA", "anonymous//"]

    # Collect all links of the index page except the filtered ones.
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None or any(q in str(link) for q in filters):
            continue
        links.append(baseurl + href)

    # Some links at the start and at the end of the page do not point to
    # artists and are cut off.
    end = len(links) - 2
    artlinks = links[size:end]

    # Artist pages -> album links.
    albumurl = getlinks(artlinks, notalbums)

    # Album pages -> links to the individual lyrics pages.
    txturls = getlinks(albumurl, notalbums)

    textall = []

    for l in txturls:
        res = requests.get(l)
        soup = BeautifulSoup(res.content, 'html.parser')
        text = soup.find_all(string=True)

        # A page without any text node has no lyrics; max() would raise
        # ValueError on the empty result, so skip it.
        if not text:
            continue

        # Among the text nodes of a page the lyrics are always the longest
        # item. Convert to a plain str so the stored value does not keep the
        # whole parse tree of the page alive.
        textall.append(str(max(text, key=len)))
    return textall

In [None]:
# Driver code
# The artist index of the corpus is spread over five pages, each with its
# own URL.
baseurl = "http://ohhla.com/"
starturl = [
    "http://ohhla.com/all.html",
    "http://ohhla.com/all_two.html",
    "http://ohhla.com/all_three.html",
    "http://ohhla.com/all_four.html",
    "http://ohhla.com/all_five.html",
]
notalbums = [
    "/anonymous/",
    "?C=N;O=D",
    "?C=M;O=A",
    "?C=S;O=A",
    "?C=D;O=A",
]

# Number of leading non-artist links to skip on each index page; the count
# differs per page, so the entries line up with starturl by position.
sizes = [31, 27, 27, 27, 28]

# One list of lyrics is collected per index page.
textall = []
for url, size in zip(starturl, sizes):
    textall.append(getlyrics(url, baseurl, notalbums, size))
    

In [None]:
# Merge the per-index-page lists in textall into one flat list of lyrics.
flat_list = []
for sublist in textall:
    flat_list.extend(sublist)

In [None]:
#Print function to look at the output
# Iterate over the flattened list of lyrics: textall holds one list per
# start page, so comparing its elements with '\n' could never match and the
# newline-only filter below was dead code.
for i in flat_list:
    # Skip items that consist of a single newline only.
    if i == '\n':
        continue
    print()
    print()
    print("item", i)
    print()

In [None]:
#Write the lyrics to a file
# The context manager closes the file even if encoding or writing fails.
# Items are written back to back without a separator; characters that are
# not ASCII are silently dropped by the 'ignore' error handler.
with open('lyricsall.txt', 'wb') as target:
    for item in flat_list:
        target.write(item.encode('ascii', 'ignore'))

In [None]:
# textall holds one list of lyrics per start page (see the driver loop), so
# this reports the number of start pages processed, not the number of songs.
n_start_pages = len(textall)
print(n_start_pages)