In [2]:
import os
import re
import random
import numpy as np

from time import time, sleep
from selenium import webdriver
from collections import namedtuple
from collections import defaultdict

from work.rap_machine_genius_code import read_pickle, write_pickle, Song

In [3]:
# Song is the record type that holds everything scraped for one track.
# A namedtuple has no default values, so all four fields must be
# supplied every time a Song is built.
# NOTE(review): this rebinds the `Song` name that the imports cell pulls in
# from work.rap_machine_genius_code -- confirm the two are interchangeable.

Song = namedtuple('Song', 'header verified metadata lyrics')

In [4]:
def start_driver(url, chromedriver_path="/Applications/chromedriver"):
    """Open a Chrome browser window at the given url.

    Args:
      url (string): a valid URL as a string.
      chromedriver_path (string): location of the chromedriver executable.
        Defaults to "/Applications/chromedriver", the path this notebook
        originally hard-coded, so existing calls behave exactly as before.

    Returns:
      None

    Doesn't return anything, but creates (or rebinds) the global variable
    *driver* that can be called upon elsewhere."""

    # Using the global keyword so that we can call upon the driver
    # further down in the code.
    global driver

    # The executable's location is recorded in the process environment
    # as well as being handed to the Chrome constructor.
    os.environ["webdriver.chrome.driver"] = chromedriver_path

    # NOTE(review): passing the path positionally looks like the Selenium 3
    # calling convention -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(chromedriver_path)

    driver.get(url)

In [5]:
# Start by opening the all rap songs page.
# I might spend some time exploring the architecture of Genius.com
# to identify other places to grab song links.
# start_driver creates the global `driver` that the cells below rely on.

all_rap_songs_genius = "https://genius.com/tags/rap/all"
start_driver(all_rap_songs_genius)

In [11]:
# Jump the Chrome window to the bottom of the page 60 times,
# waiting one second after each jump.
# Click on the Chrome browser first so that it has focus.
# If the page still has not reached its end afterwards,
# run this cell again.

scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"

for _ in range(60):
    driver.execute_script(scroll_to_bottom)
    sleep(1)

In [12]:
# Collect the href of every song link currently rendered
# on the https://genius.com/tags/rap/all page.
# It appears that 1000 is the most search results
# that genius.com will display at one time.

link_selector = '//a[@class= " song_link"]'
top_songs = driver.find_elements_by_xpath(link_selector)

link_list = [song_anchor.get_attribute('href') for song_anchor in top_songs]

In [13]:
# I save the list of links as a pickle.
# That way, I don't have to do this again.
# The recorded output of this cell shows write_pickle aborting rather than
# overwriting when data/links.pkl already exists.

write_pickle(link_list, 'data/links.pkl')

Pickling aborted because data/links.pkl is already a file.


In [14]:
# If I pick up again I can just pull from the pickle
# instead of re-running the scroll-and-collect cells.

link_list = read_pickle('data/links.pkl')

In [15]:
def scrape_info(list_of_urls, pause_every=100, pause_seconds=320):

    """Visit every Genius link in a list and gather the info I want to analyze.

    Uses the global Chrome *driver* to load each page, then reads the
    header, the lyrics, selected metadata and whether a verified-artist
    section is present.
    Warning: this function can take an hour or more to run,
    because it pauses on each page to avoid getting your IP banned.

    Args:
      list_of_urls:
        a list of strings. Must be links on Genius.com
        otherwise it won't work.
      pause_every:
        take a long break after every this-many pages (default 100).
        Pass 0 or None to never take the long break.
      pause_seconds:
        length of that long break, in seconds (default 320).

    Returns:
      dict_of_songs:
        a dictionary where each key is a url to a Genius page
        for a given song, and the value is a namedtuple
        of the Song format (defined at the top of this notebook).

    Raises:
      Whatever the driver raises when a page cannot be loaded or has no
      header element; only the verified-artist lookup is treated as
      optional."""

    # The patterns never change, so build them once instead of once per page.
    views_regex = re.compile('views', re.IGNORECASE)
    contrib_regex = re.compile('contributors', re.IGNORECASE)
    tag_regex = re.compile(r'(\w+,\s\w+)+')
    metadata_patterns = (views_regex, contrib_regex, tag_regex)

    # NOTE(review): Song cannot be built without arguments, so looking up a
    # url that was never scraped raises TypeError rather than yielding a
    # default. The defaultdict is kept so the returned type is unchanged.
    dict_of_songs = defaultdict(Song)

    for i, url in enumerate(list_of_urls):
        driver.get(url)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Randomized wait of 0.5 to 2.5 seconds on every page.
        sleep(.5+2*random.random())

        # Gather and filter metadata: keep an entry when any pattern matches.
        # Each element's text is read once and reused for all three checks.
        all_metadata = driver.find_elements_by_class_name('metadata_with_icon')
        metadata_texts = [element.text for element in all_metadata]
        filtered_metadata = [
            text for text in metadata_texts
            if any(pattern.search(text) for pattern in metadata_patterns)]

        # Scrape raw lyrics

        lyrics = driver.find_elements_by_class_name('song_body-lyrics')
        lyrics_string = ''.join(lyric.text for lyric in lyrics)

        # Scrape header

        header_string = driver.find_element_by_class_name(
            'header_with_cover_art-primary_info').text

        # Determine whether artist has contributed to their song page.
        # A failed lookup means "not verified". Catching Exception (not a
        # bare except) keeps Ctrl-C able to interrupt the long-running loop.

        try:
            driver.find_element_by_class_name(
                'song_verified_artists-section')
            verified_bool = True
        except Exception:
            verified_bool = False

        # Create Song named tuple and add to dictionary

        dict_of_songs[url] = Song(header=header_string, lyrics=lyrics_string,
                                  verified=verified_bool, metadata=filtered_metadata)

        # Long break after every `pause_every` pages. The parentheses matter:
        # `i+1 % 100` parses as `i + (1 % 100)`, which is never zero here.
        if pause_every and (i + 1) % pause_every == 0:
            sleep(pause_seconds)

    return dict_of_songs

In [None]:
# Run this cell to get it to visit each link in the list!
# Careful--this may take hours!
# scrape_info drives the global `driver`, so the Chrome window opened by
# start_driver has to be available when this runs.
song_dict = scrape_info(link_list)

In [18]:
# Save the scraped songs so the long scrape does not have to be repeated.
# The recorded output of this cell shows write_pickle aborting rather than
# overwriting when data/songs_dict.pkl already exists.
write_pickle(song_dict, 'data/songs_dict.pkl')

Pickling aborted because data/songs_dict.pkl is already a file.


In [17]:
# When picking up again, load the previously saved songs from the pickle
# instead of re-running the scrape.
song_dict = read_pickle('data/songs_dict.pkl')