In [1]:
GENIUS_API_TOKEN='genius_credentials'

In [8]:
# Make HTTP requests
import requests
# Scrape data from an HTML document
from bs4 import BeautifulSoup
# I/O
import os
# Search and manipulate strings
import re

import numpy as np
import pandas as pd
import pickle

In [3]:
# Get artist object from Genius API
def request_artist_info(artist_name, page):
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    search_url = base_url + '/search?per_page=10&page=' + str(page)
    data = {'q': artist_name}
    response = requests.get(search_url, data=data, headers=headers)
    return response

# Get Genius.com song url's from artist object
def request_song_url(artist_name, song_cap):
    page = 1
    songs = []
    
    while True:
        response = request_artist_info(artist_name, page)
        json = response.json()
        # Collect up to song_cap song objects from artist
        song_info = []
        for hit in json['response']['hits']:
            if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
                song_info.append(hit)
    
        # Collect song URL's from song objects
        for song in song_info:
            if (len(songs) < song_cap):
                url = song['result']['url']
                songs.append(url)
            
        if (len(songs) == song_cap):
            break
        else:
            page += 1
        
    print('Found {} songs by {}'.format(len(songs), artist_name))
    return songs
    
# DEMO
abba = request_song_url('ABBA', 2)
abba

Found 2 songs by ABBA


['https://genius.com/Abba-mamma-mia-lyrics',
 'https://genius.com/Abba-dancing-queen-lyrics']

In [4]:
# Scrape lyrics from a Genius.com song URL
def scrape_song_lyrics(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics = html.find('div', class_='lyrics').get_text()
    #remove identifiers like chorus, verse, etc
    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
    #remove empty lines
    lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])         
    return lyrics

In [117]:
# Get artist object from Genius API
def request_song_info(song_title, artist_name, page):
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    search_url = base_url + '/search?per_page=10&page=' + str(page)
    data = {'q': song_title + ' ' + artist_name}
    response = requests.get(search_url, data=data, headers=headers)
    return response


# Get Genius.com lyrics from song object
def request_song_lyrics(song_title, artist_name):
    page = 1
    response = request_song_info(song_title, artist_name, page)
    json = response.json()

    # Collect song objects
    song = json['response']['hits'][0]['result']
    artist_first_name = re.findall(r'\w+', artist_name)[0]
    if song_title.lower() in song['title'].lower() and artist_first_name.lower() in song['primary_artist']['name'].lower():
        url = song['url']
        page = requests.get(url)
        html = BeautifulSoup(page.text, 'html.parser')
        
        lyrics = html.find('div', class_='lyrics').get_text()
        #remove identifiers like chorus, verse, etc
        lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
        #remove empty lines
        lyrics = lyrics.replace('\n', ' ')

    else:
        lyrics = np.NaN

    return lyrics


# Create dataframe of all lyrics
def create_lyrics_df(df, song_list):
    for song in song_list:
        artist, track = song.split(' - ')[0], song.split(' - ')[1]
        artist_name_list = re.findall(r'\w+', artist)
        artist_first_name = ' '.join(artist_name_list[:2])
        print(artist_first_name, track)
        
        try:
            l =  request_song_lyrics(track, artist_first_name)
        except:
            l = np.NaN
            
        df = df.append({'artist': artist, 'title': track, 'lyrics': l}, ignore_index=True)
        
    return df

In [118]:
request_song_lyrics('Napalmregen', 'Morlockk Dilemma')

AttributeError: 'NoneType' object has no attribute 'get_text'

In [119]:
request_song_lyrics('110' ,'Capital Bra, Samra & Lea')

AttributeError: 'NoneType' object has no attribute 'get_text'

In [120]:
with open("song_list.data", "rb") as f:   # Unpickling the song list
    songs = pickle.load(f)

In [121]:
lyrics = pd.DataFrame(columns=['artist', 'title', 'lyrics'])

In [122]:
test_run = songs[20:24]
test_run
lyrics = create_lyrics_df(lyrics, test_run)

Outkast feat The Way You Move
Princess I'll Keep On Loving You
Capital Bra Fightclub
Jimmy Somerville Ain't No Mountain High Enough


In [123]:
lyrics

Unnamed: 0,artist,title,lyrics
0,Outkast feat. Sleepy Brown,The Way You Move,
1,Princess,I'll Keep On Loving You,
2,Capital Bra feat. Samra & AK Ausserkontrolle,Fightclub,
3,Jimmy Somerville,Ain't No Mountain High Enough,


In [None]:
for song in arr:
    try:
        artist_from_list = re.findall(r'\w+', song)[0]   # artist name from song_list.data file

        track = genius.search_song(song, artist_from_list)

        artist_from_genius = re.findall(r'\w+', track.artist)[0]    # artist name from genius

        if artist_from_list.lower() != artist_from_genius.lower():   # compare first word in artist name
            l = np.NaN
        else:
            l = track.lyrics
            l = re.sub(r'[\(\[].*?[\)\]]', '', l)
            l = re.sub('\n', ' ', l)

    except:
        l = np.NaN

    df = df.append({'title': song, 'lyrics': l}, ignore_index=True)
        
    return df

In [None]:
import lyricsgenius as lg

genius = lg.Genius(GENIUS_API_TOKEN,  # Client access token from Genius Client API page
                             skip_non_songs=True,
                             remove_section_headers=True)

In [None]:
file = open("./lyrics_1.txt", "w")  # File to write lyrics to

artists = ['ABBA']


def get_lyrics(arr, k):  # Write lyrics of k songs by each artist in arr
    c = 0  # Counter
    for name in arr:
        try:
            songs = (genius.search_artist(name, max_songs=k, sort='popularity')).songs
            s = [song.lyrics for song in songs]
            file.write("\n \n   <|endoftext|>   \n \n".join(s))  # Deliminator
            c += 1
            print(f"Songs grabbed:{len(s)}")
        except:  #  Broad catch which will give us the name of artist and song that threw the exception
            print(f"some exception at {name}: {c}")


get_lyrics(artists, 3)

In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
with open("song_list.data", "rb") as f:   # Unpickling the song list
    songs = pickle.load(f)

In [None]:
test_run = songs[25:30]

In [None]:
lyrics = pd.DataFrame(columns=['title', 'lyrics'])

In [None]:
def get_lyrics(df, arr):  # Write lyrics of k songs by each artist in arr
    for song in arr:
        
        try:
            artist_from_list = re.findall(r'\w+', song)[0]   # artist name from song_list.data file
            
            track = genius.search_song(song, artist_from_list)
            
            artist_from_genius = re.findall(r'\w+', track.artist)[0]    # artist name from genius

            if artist_from_list.lower() != artist_from_genius.lower():   # compare first word in artist name
                l = np.NaN
            else:
                l = track.lyrics
                l = re.sub(r'[\(\[].*?[\)\]]', '', l)
                l = re.sub('\n', ' ', l)
            
        except:
            l = np.NaN
            
        df = df.append({'title': song, 'lyrics': l}, ignore_index=True)
        
    return df

In [None]:
lyrics = get_lyrics(lyrics, test_run)

In [None]:
lyrics

In [None]:
lyrics = get_lyrics(lyrics, ['Angel One - Hold Me Tonight'])

In [None]:
lyrics

In [None]:
lyrics = get_lyrics(lyrics, ['Capital Bra x Samra & Lea - 110'])

In [None]:
lyrics