In [95]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from collections import namedtuple
from collections import defaultdict
import os
import pickle
from time import time, sleep
import random

In [11]:
def start_driver(url, chromedriver="/Applications/chromedriver"):
    """Open a Chrome browser at the given URL.

    Args:
        url: a valid URL as a string.
        chromedriver: path to the chromedriver executable. The default is
            the location this notebook was originally written against;
            pass a different path on other machines.

    Doesn't return anything, but creates (or replaces) the global
    variable *driver* so that it can be called upon elsewhere.
    """

    # Using the global keyword so that we can call upon the driver
    # further down in the code.
    global driver

    # If an earlier call already opened a browser, shut it down first so
    # that calling this function again does not leave an orphaned Chrome
    # session behind.
    previous_driver = globals().get("driver")
    if previous_driver is not None:
        try:
            previous_driver.quit()
        except Exception:
            # Best effort only: the old window may already have been
            # closed by hand, in which case there is nothing to clean up.
            pass

    os.environ["webdriver.chrome.driver"] = chromedriver

    driver = webdriver.Chrome(chromedriver)
    driver.get(url)

In [4]:
# Start by opening the all rap songs page.
# start_driver stores the browser in the global `driver`, which the
# cells below use directly.

all_rap_songs_genius = "https://genius.com/tags/rap/all"
start_driver(all_rap_songs_genius)

In [40]:
# Jump to the bottom of the page ten times, pausing one second after
# each jump (presumably so more results can load -- TODO confirm).
# Caveat: this only works if you manually focus on the Chrome browser.

for i in range(10):
    driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)

In [41]:
# Collect the links to individual songs from the
# https://genius.com/tags/rap/all page that the driver has open.
# It appears that 1000 is the most search results
# that genius.com will display at one time.

link_selector = '//a[@class= " song_link"]'
top_songs = driver.find_elements_by_xpath(link_selector)

# One href per matched anchor element, in page order.
link_list = [anchor.get_attribute('href') for anchor in top_songs]

In [44]:
# Persist the scraped links to disk as a pickle,
# so the listing page never has to be scraped again.

with open('links.pkl', mode='wb') as links_out:
    pickle.dump(link_list, links_out)

In [3]:
# When picking the work up again, restore the links from the pickle
# instead of re-scraping. (links.pkl is the file written by this
# notebook; only unpickle files you created yourself.)

with open('links.pkl', mode='rb') as links_in:
    link_list = pickle.load(links_in)

In [68]:
# Define the Song data structure: one record per scraped song page,
# with the fields header, verified, metadata and lyrics.
# Remember: a namedtuple requires a value for every field
# when an instance is created.

Song = namedtuple('Song', ['header', 'verified', 'metadata', 'lyrics'])

### Categories to scrape
- [x] Header
    - [x] Title
    - [x] artist
    - [x] featuring
    - [x] producer
- [x] Verified contribution by artist?
- [x] text of lyrics
- [x] Metadata
    - [x] Contributors
    - [x] \# of views
    - [x] tags


In [96]:
def scrape_info(list_of_urls, pause_every=100, pause_seconds=320):
    """Visit every Genius page in a list of links and gather song info.

    Uses the global Chrome *driver* (created by start_driver), so a
    browser must already be open before this is called.

    Args:
        list_of_urls: a list of links to Genius song pages,
            otherwise it won't work.
        pause_every: take a long break after every this-many pages.
            Pass 0 or None to disable the long breaks.
        pause_seconds: length of that long break, in seconds.

    Returns:
        A plain dict mapping each URL to a Song named tuple
        (header, verified, metadata, lyrics).

    Note: the header is looked up with find_element_by_class_name, so a
    page without that element is expected to raise and abort the run
    (Selenium behaviour -- confirm against the installed version).
    """

    # Compile the metadata filters once rather than on every page.
    # The tag pattern is a raw string so that \w and \s reach the regex
    # engine without Python treating them as (invalid) string escapes.
    metadata_filters = (
        re.compile('views', re.IGNORECASE),
        re.compile('contributors', re.IGNORECASE),
        re.compile(r'(\w+,\s\w+)+'),
    )

    # A plain dict: the previous defaultdict(Song) could never build a
    # default value, because Song() needs all four fields.
    dict_of_songs = {}

    for i, url in enumerate(list_of_urls):
        driver.get(url)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Short randomised delay (0.5 - 2.5 s) between page loads.
        sleep(.5 + 2 * random.random())

        # Gather and filter metadata. Each element's text is read once,
        # then kept only if it mentions views, contributors, or looks
        # like a comma-separated tag list.
        metadata_texts = [
            element.text for element in
            driver.find_elements_by_class_name('metadata_with_icon')]
        filtered_metadata = [
            text for text in metadata_texts
            if any(pattern.search(text) for pattern in metadata_filters)]

        # Scrape raw lyrics
        lyrics_string = ''.join(
            lyric.text for lyric in
            driver.find_elements_by_class_name('song_body-lyrics'))

        # Scrape header
        header_string = driver.find_element_by_class_name(
            'header_with_cover_art-primary_info').text

        # Determine whether artist has contributed to their song page.
        # find_elements returns a (possibly empty) list, so no exception
        # handling is needed and genuine driver errors are not hidden.
        verified_bool = bool(driver.find_elements_by_class_name(
            'song_verified_artists-section'))

        # Create Song named tuple and add to dictionary
        dict_of_songs[url] = Song(header=header_string, lyrics=lyrics_string,
                                  verified=verified_bool,
                                  metadata=filtered_metadata)

        # Long break after every `pause_every` pages. The parentheses
        # matter: `i+1 % 100` parses as `i + (1 % 100)`. `sleep` is used
        # directly because the name `time` is rebound to the time()
        # function by `from time import time, sleep`.
        if pause_every and (i + 1) % pause_every == 0:
            sleep(pause_seconds)

    return dict_of_songs

In [None]:
# Open a browser on a single song page. start_driver (re)binds the global
# `driver`, which scrape_info relies on.
start_driver('https://genius.com/Lil-nas-x-old-town-road-remix-lyrics')

In [99]:
# Scrape every collected link. This loads each page in the browser with a
# delay in between, so expect it to take a long time on the full list.
song_dict = scrape_info(link_list)

In [104]:
# Spot-check one scraped entry; the dict is keyed by song URL.
song_dict['https://genius.com/Eminem-kim-lyrics']

'Kim\nEminem\nProduced by Bass Brothers\nAlbum The Marshall Mathers LP'

In [102]:
# Save the scraped song dict to disk so the scrape need not be repeated.
with open('songs_dict', mode='wb') as songs_out:
    pickle.dump(song_dict, songs_out)

In [3]:
# Restore the list of links from links.pkl
# (the pickle written earlier in this notebook).
with open('links.pkl', mode='rb') as links_in:
    link_list = pickle.load(links_in)

In [10]:
url = 'https://genius.com/Eminem-rap-god-lyrics'

# Fetch the page with plain requests (no browser needed here).
# The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() stops us from parsing an error page as lyrics.
response = requests.get(url, timeout=30)
response.raise_for_status()

# BeautifulSoup is imported by name at the top of the notebook; there is
# no `bs4` module object in scope. Naming the parser explicitly keeps the
# parse consistent between machines and avoids the guessed-parser warning.
tag_soup = BeautifulSoup(response.text, 'html.parser')

lyrics = ''.join(lyric.text
                 for lyric in tag_soup.find_all(class_='song_body-lyrics'))

# Removes the bracketed sections like [Intro: Ariana Grande].
# The match is non-greedy so that two bracketed tags on the same line do
# not swallow the words between them.
# Still need to work on removing header and footer
# header = title, i.e. MONOPOLY by Ariana Grande
# Footer = advertisement at the end: 'More on Genius'

cleaned_lyrics = re.sub(r'\[.+?\]', '', lyrics)
cleaned_lyrics = re.sub(r'[\'\"\(\),\?\“\”]', '', cleaned_lyrics.lower())

# Get a list of the words in the lyrics (already lower-cased above)
words_list = cleaned_lyrics.split()

# Freq dict: word -> number of occurrences
freq_dict = {}
for word in words_list:
    freq_dict[word] = freq_dict.get(word, 0) + 1

In [115]:
genius_url = "https://genius.com/Eminem-rap-god-lyrics"

# `chromedriver` only exists as a local variable inside start_driver, so
# building webdriver.Chrome(chromedriver) here fails on a fresh kernel.
# start_driver creates the global `driver` and loads the page in one step.
start_driver(genius_url)

In [103]:
# Scroll the open song page to the bottom before reading its metadata.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [28]:
# Grab every element carrying the 'metadata_with_icon' class on the
# open page and keep just the visible text of each one.
all_metadata = driver.find_elements_by_class_name('metadata_with_icon')

metadata_list = [entry.text for entry in all_metadata]

In [53]:
# Pick individual metadata strings out by position.
# NOTE(review): assumes the page lists views, contributors and tags at
# indexes 1, 2 and 3 -- TODO confirm; this breaks if the page layout differs.
views1 = metadata_list[1]
contrib = metadata_list[2]
tags1 = metadata_list[3]

In [58]:
# The Song namedtuple defined for the scraper only has the fields
# header / verified / metadata / lyrics, so it cannot hold these
# per-song statistics. Use a dedicated record type for them instead.
SongStats = namedtuple('SongStats', [
    'title', 'total_words', 'unique_words', 'lyrics',
    'views', 'contributors', 'tags', 'words_per_minute'])

# Still need to figure out how to scrape duration of song
# so that I can produce words_per_minute
# (123 below is a placeholder value, not a measurement).
rap_god = SongStats(title='Rap God',
                    total_words=len(words_list),
                    unique_words=len(freq_dict),
                    lyrics=cleaned_lyrics,
                    views=views1,
                    contributors=contrib,
                    tags=tags1,
                    words_per_minute=123)

In [65]:
# Eyeball the cleaned, lower-cased lyrics stored on the record.
print(rap_god.lyrics)


rap god lyrics




look i was gonna go easy on you not to hurt your feelings.
but im only going to get this one chance.
somethings wrong i can feel it.
six minutes. six minutes. six minutes slim shady youre on!
just a feeling ive got. like somethings about to happen but i dont know what. if that means what i think it means were in trouble big trouble; and if he is as bananas as you say im not taking any chances.
you are just what the doc ordered.


im beginnin to feel like a rap god rap god
all my people from the front to the back nod back nod
now who thinks their arms are long enough to slap box slap box
they said i rap like a robot so call me rap-bot


but for me to rap like a computer it must be in my genes
i got a laptop in my back pocket
my penll go off when i half-cock it
got a fat knot from that rap profit
made a livin and a killin off it
ever since bill clinton was still in office
with monica lewinsky feelin on his nutsack
im an mc still as honest
but as rude and as indecent a

In [105]:
# Absolute XPath to the second playback-time element inside the
# apple-music-player widget (presumably the track duration -- unconfirmed).
xpath = '/html/body/apple-music-player/div/div/div[2]/playback-time[2]/div'

In [106]:
# This didn't get me anywhere: the lookup comes back empty
# (see the [] output below), so nothing is printed.
# But the duration should live in this little player!
music_player = driver.find_elements_by_xpath(xpath)

for elem in music_player:
    print(elem.text)

In [107]:
# Show what the XPath lookup returned.
music_player

[]

In [108]:
# Hand the browser-rendered HTML to BeautifulSoup for offline inspection.
html = driver.page_source
# Name the parser explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so results can differ
# between machines.
soup = BeautifulSoup(html, 'html.parser')