### Scraping song lyrics using Genius API

Import necessary packages

In [1]:
import json
import requests
import pandas as pd
from scrapy import Selector
from pprint import pprint

Open JSON file containing credentials

In [2]:
credentials_file_path = "./credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

Initialise a new session

In [3]:
# start a new session
session = requests.Session()

I created a custom function `scrape_lyrics` to scrape song lyrics from the Genius page for any given song. 

Note that the lyrics returned are formatted such that each line of lyric appears in a new line, similar to how it is displayed on the Genius page.

In [4]:
def scrape_lyrics(song_url):
    '''
    Returns a string of song lyrics, with each line separated by a new line

        Parameters:
            song_url (str): The URL of the Genius page for the song

        ReturnsL
            lyrics (str): The lyrics of the song
    '''
    response = session.get(song_url)
    sel = Selector(text=response.text)
    lyrics = '\n'.join(sel.css('div.Lyrics__Container-sc-1ynbvzw-1.kUgSbL ::text').getall())

    return lyrics

For example:

In [5]:
print(scrape_lyrics("https://genius.com/Onerepublic-i-aint-worried-lyrics"))

[Verse 1]
I don't know what you've been told
But time is running out, no need to take it slow
I'm stepping to you toe-to-toe
I should be scared, honey, maybe so
[Chorus]
But I ain't worried 'bout it right now (Right now)
Keeping dreams alive, 1999, heroes
I 
ain't worried 'bout it right now (Right now)
Swimmin' in the floods, 
dancing on the clouds below
[Post-Chorus]
I ain't worried 'bout it
I ain't worried 'bout it (Hey)
[Verse 2]
I don't know what you've been told
But time is running out so spend it like it's gold
I'm living like I'm nine-zeros
Got no regrets even when I am broke (Yeah)
I'm at my best when I got something I'm wanting to steal
Way too busy for them problems and problems to feel (Yeah, yeah)
No stressing, just obsessing with sealing the deal
I'll take it in and let it go
[Chorus]
But I ain't worried 'bout it right now (Right now)
Keeping dreams alive, 1999, heroes
I ain't worried 'bout it right now (Right now)
Swimmin' in the floods, dancing on the clouds below
[Post-

At this point of data collection, we will have a pandas dataframe of already selected and filtered songs from using the YouTube API. Critically, the dataframe will have information on the name and artist of each song.

We now want to add the lyrics of each song into the dataframe.

In [6]:
# create placeholder dataframe for testing

songs_data = {
    'Name': ['Yellow', 'Warriors', 'You Belong With Me'],
    'Artist': ['Coldplay', 'Imagine Dragons', 'Taylor Swift']
}

songs_df = pd.DataFrame(songs_data)

In [7]:
songs_df

Unnamed: 0,Name,Artist
0,Yellow,Coldplay
1,Warriors,Imagine Dragons
2,You Belong With Me,Taylor Swift


In [8]:
# Replace 'YOUR_API_KEY' with your actual Genius API key
GENIUS_API_KEY = credentials['client_access_token']
BASE_URL = 'https://api.genius.com'

def search_song_url(song_name, artist_name):
    search_url = BASE_URL + '/search'
    headers = {'Authorization': f'Bearer {GENIUS_API_KEY}'}
    params = {'q': f'{song_name} {artist_name}'}

    response = requests.get(search_url, headers=headers, params=params)
    data = response.json()

    # Check if the request was successful
    if response.status_code == 200:
        hits = data['response']['hits']
        if hits:
            # Extract the URL of the first search result
            song_url = hits[0]['result']['url']
            return song_url
        else:
            print('Song not found.')
    else:
        print(f'Error: {data["meta"]["message"]}')


song_name = 'Yellow'
artist_name = 'Coldplay'
song_url = search_song_url(song_name, artist_name)

In [9]:
songs_df['Genius_URL'] = songs_df.apply(lambda row: search_song_url(row['Name'], row['Artist']), axis=1)

In [16]:
songs_df['Genius_lyrics'] = songs_df.apply(lambda row: scrape_lyrics(row['Genius_URL']), axis=1)

In [23]:
pprint(songs_df)

                 Name           Artist  \
0              Yellow         Coldplay   
1            Warriors  Imagine Dragons   
2  You Belong With Me     Taylor Swift   

                                          Genius_URL  \
0          https://genius.com/Coldplay-yellow-lyrics   
1  https://genius.com/Imagine-dragons-warriors-ly...   
2  https://genius.com/Taylor-swift-you-belong-wit...   

                                       Genius_lyrics  
0  [Verse 1: Chris Martin]\nLook at the stars\nLo...  
1  [Verse 1]\nAs a child, you would wait and watc...  
2  [Verse 1]\nYou're on the phone with your girlf...  


Search by artist name

In [None]:
search_term = "Yellow by Coldplay"
genius_search_url = f"http://api.genius.com/search?q={search_term}&access_token={credentials['client_access_token']}"

In [None]:
response = requests.get(genius_search_url)
json_data = response.json()

In [None]:
json_file_path = './output.json'

with open(json_file_path,'w') as f:
    json.dump(json_data, f)

Get song titles

In [None]:
for song in json_data['response']['hits']:
    print(song['result']['full_title'])

Get song titles and page view counts

In [None]:
coldplay_songs = []
for song in json_data['response']['hits']:
    coldplay_songs.append([song['result']['title_with_featured'], song['result']['url']])
    
# make a Pandas dataframe from a list
coldplay_df = pd.DataFrame(coldplay_songs)
coldplay_df.columns = ['song_title', 'song_url']
coldplay_df