# Web Scrape Raw Data

In this notebook, we will scrape the Ultimate Guitar page for each song and save it for downstream parsing and analysis.

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from itertools import chain
import os

## Setup

### Defining Variables

In [2]:
# where do we want to save the output?
output_dir = "/Volumes/SECONDDRIVE/prog/ug/raw_data/"

# what decades do we want to save data for?
decades = ['1970', '1980', '1990', '2000', '2010']

# what page do we want to start at?
# 1 is the lowest, but use a higher number to get less popular songs
start_index = 1
# how many pages do we want to save?
pages_to_save = 10

### Setting up the driver

In [3]:
driver = webdriver.Firefox(executable_path=r'/Users/morganoneka/Documents/PersonalProjects/geckodriver')

## Running geckodriver

In [4]:
for decade in decades:
    
    # this will create subdirectories for each decade if one doesn't exist in our main output directory yet
    if not os.path.exists(output_dir + decade):
        os.mkdir(output_dir + decade)

        
    # this will create a sub-subdirectory for the genre
    if not os.path.exists(output_dir + decade ):
        os.mkdir(output_dir + decade )
    else:
        for page in range(start_index, start_index+pages_to_save+1):
            print (decade + " " + str(page))
            # define the url using user-input variables
            url = "https://www.ultimate-guitar.com/explore?decade[]=" + decade + "&part[]=&type[]=Chords&page=" + str(page)

            print(url)

            # if the url doesn't work, it's likely we ran out of pages, so break this loop
            if not requests.head(url).status_code == requests.codes.ok:
                break

            # get html using driver
            driver.get(url)

            # parse the page
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # get urls
            song_list = soup.find_all("div", class_="_2vvgp")
            song_spans = [x.find_all("a") for x in song_list]
            links = [x['href'] for x in list(chain.from_iterable(song_spans))]

            for link in links:
                driver.get(link)
#                 print(link)
                html = driver.page_source
                if (len(link.split("/")) < 6):
                    continue
                artist = link.split("/")[4]
                song = link.split("/")[5].split("-chords")[0]
                output_file_name = output_dir + decade + "/" + "/" + artist + "_" + song + ".txt"
                if not os.path.isfile(output_file_name):
                    with open(output_file_name, "w") as text_file:
                        text_file.write(html)
        
                

1970 1
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=1
1970 2
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=2
1970 3
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=3
1970 4
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=4
1970 5
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=5
1970 6
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=6
1970 7
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=7
1970 8
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=8
1970 9
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=9
1970 10
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=10
1970 11
https://www.ultimate-guitar.com/explore?decade[]=1970&part[]=&type[]=Chords&page=11
1980 1

## Possible Errors
- **TimeoutException** TimedPromise timed out after 300000 ms