In [2]:
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

## Extracting data from book corpus
The function below gathers the price of each fiction book and
returns the mean price, in £, of a fiction book.

In [3]:
def scrape_books(start_page="index.html"):
    """Crawl the fiction category of http://books.toscrape.com and return
    the mean book price.

    Starting at ``start_page``, each listing page is downloaded, the price of
    every book on it is recorded, and the "next" pagination link is followed
    until a page has no such link.

    Parameters:
        start_page (str): File name of the first listing page, relative to
            the fiction category URL.

    Returns:
        (float): Mean of all prices collected from the crawled pages.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ZeroDivisionError: If no prices were found on any crawled page.
    """
    base_url = "http://books.toscrape.com/catalogue/category/books/fiction_10/"
    next_page_finder = re.compile(r"next")          # Text of the "next" link.
    # Pull out just the number so the result does not depend on how many
    # characters the currency symbol occupies after decoding.
    price_finder = re.compile(r"\d+(?:\.\d+)?")

    prices = []
    visited = set()                 # Guards against a pagination cycle.
    page = base_url + start_page    # Complete page URL.

    while page is not None and page not in visited:
        visited.add(page)

        # Download the page source and PAUSE before continuing.
        response = requests.get(page)
        response.raise_for_status()
        time.sleep(1)
        soup = BeautifulSoup(response.text, "html.parser")

        # Navigate from each book's <h3> to its price tag and extract the price.
        for book in soup.find_all(class_="product_pod"):
            price_text = book.h3.next_sibling.next_sibling.p.string
            prices.append(float(price_finder.search(price_text).group()))

        # Find the URL of the page with the next data; stop when there is none.
        next_link = soup.find("a", href=True, string=next_page_finder)
        page = None if next_link is None else base_url + next_link["href"]

    return sum(prices) / len(prices)

## Scraping bank metadata
The function below extracts the total consolidated assets ("Consol
Assets") for JPMorgan Chase, Bank of America, and Wells Fargo recorded each December from
2004 to the present. It returns a list of three lists, one per bank, each containing that bank's assets.

In [5]:
def bank_data():
    """Collect December asset figures for three banks from the Federal
    Reserve large-bank release pages.

    The index page is searched for links whose text contains "December 31, "
    not followed by 2003. On each linked page, the table cell whose text
    starts with a bank's name is located and the value ten sibling nodes
    later is read as an integer (commas removed).

    Returns:
        (list): Three lists of ints, in the order JPMorgan Chase,
            Bank of America, Wells Fargo; one entry per page visited.
    """
    root = "https://www.federalreserve.gov/releases/lbr/"

    # Patterns for the release links and for each bank's name cell.
    december_link = re.compile(r"December 31, (?!2003)")
    bank_patterns = (
        re.compile(r"^JPMORGAN CHASE BK"),
        re.compile(r"^BANK OF AMER"),
        re.compile(r"^WELLS FARGO BK"),
    )

    # Build the list of report URLs from the index page.
    index_soup = BeautifulSoup(requests.get(root).text, "html.parser")
    report_urls = [
        root + anchor.attrs["href"]
        for anchor in index_soup.find_all(name='a', href=True,
                                          string=december_link)
    ]

    # One result list per bank, filled page by page.
    assets = [[] for _ in bank_patterns]
    for url in report_urls:
        time.sleep(1)               # PAUSE, then request the page.
        report = BeautifulSoup(requests.get(url).text, "html.parser")

        for pattern, series in zip(bank_patterns, assets):
            # Start at the bank's name cell and walk ten siblings along.
            cell = report.find(name="td", string=pattern)
            steps_left = 10
            while steps_left > 0:
                cell = cell.next_sibling
                steps_left -= 1

            # Strip thousands separators before converting.
            series.append(int(cell.string.replace(',', '')))

    return assets

## Pulling data about basketball
The Basketball Reference website at `https://www.basketball-reference.com`
contains data on NBA athletes, including which player led different categories for each season.
For the past ten seasons, we identify which player had the most season points and find how many
points they scored during that season. The function below returns a list of triples consisting of the season, the
player, and the points scored: `("season year", "player name", points scored)`.

In [7]:
def basketball_data(first_year=2010, num_seasons=10):
    """Find the season points leader for consecutive NBA seasons.

    For each season, the page
    https://www.basketball-reference.com/leagues/NBA_<year>_leaders.html
    is downloaded and the first entry of the "Points" leaderboard is read.
    The leader-page URLs are built directly; the site's home page is not
    needed to discover them.

    Parameters:
        first_year (int): Year used in the URL of the first season's page.
            The default, 2010, gives the season labelled "2009-10".
        num_seasons (int): Number of consecutive seasons to collect.

    Returns:
        (list): Triples ("season year", "player name", points scored), one
            per season, with the season formatted like "2009-10".
    """
    base_url = "https://www.basketball-reference.com"

    # Crawl through the individual pages and record one triple from each.
    triples = []
    for year in range(first_year, first_year + num_seasons):
        page = "{}/leagues/NBA_{}_leaders.html".format(base_url, year)
        time.sleep(1)               # PAUSE, then request the page.
        soup = BeautifulSoup(requests.get(page).text, "html.parser")

        # The leaderboard's first row is two siblings after the tag that
        # holds the "Points" caption; look it up once and reuse it.
        row = soup.find(string="Points").parent.next_sibling.next_sibling
        cells = list(row.children)
        player_name = cells[3].a.text
        score = cells[5].text.strip()

        season_string = str(year - 1) + "-" + str(year)[-2:]
        triples.append((season_string, player_name, int(score)))

    return triples
        
    
    

## Movie data
The website IMDB contains a variety of information on movies. Specifically,
information on the top 10 box office movies of the week can be found at
`https://www.imdb.com/chart/boxoffice`. Using `BeautifulSoup`, the function below returns a list of the top 10
movies of the week and orders the list according to the total grossing of the movies, from most
money to the least.

In [10]:
def imdb_data():
    """
    Rank the movies listed on https://www.imdb.com/chart/boxoffice by
    total gross, highest first.

    Each chart row is located through its element with height="67"; the
    title is read from the cell two siblings after that element's parent,
    and the gross from the cell four siblings further on, with the
    surrounding '$' and 'M' characters stripped.

    Returns:
        titles (list): Movie titles sorted by total grossing, descending
    """
    # Fetch and parse the chart page.
    html = requests.get("https://www.imdb.com/chart/boxoffice").text
    soup = BeautifulSoup(html, "html.parser")

    titles = []
    grosses = []
    for marker in soup.find_all(height="67"):
        # Title cell first, then walk on to the total-gross cell.
        title_cell = marker.parent.next_sibling.next_sibling
        titles.append(title_cell.a.text.strip())

        gross_cell = title_cell
        for _ in range(4):
            gross_cell = gross_cell.next_sibling
        gross_text = gross_cell.span.text.strip()
        grosses.append(float(gross_text.strip('$').strip('M')))

    # Order the titles by descending gross.
    titles = np.array(titles)
    ranking = np.argsort(np.array(grosses))[::-1]
    return list(titles[ranking])
    

## Searching arXiv with selenium
### Ensure you have up-to-date web-driver
The arXiv (pronounced "archive") is an online repository of scientific publications,
hosted by Cornell University. Write a function that accepts a string to serve as a search
query, defaulting to `"linkedin"`. Use `Selenium` to enter the query into the search bar of
`https://arxiv.org` and press Enter. The resulting page has up to 50 links to the PDFs of technical
papers that match the query. Gather these URLs, then continue to the next page (if there are
more results) and continue gathering links until obtaining at most 150 URLs. Return the list
of URLs.

In [12]:
def prob5(search_query="linkedin"):
    """Use Selenium to enter the given search query into the search bar of
    https://arxiv.org and press Enter, then gather result links.

    The results page is switched to 200 results per page, so a single page
    is enough to reach the limit of 100 URLs and no "Next" navigation is
    performed. A link is collected when its text matches "arXiv:" followed
    by a digit.

    NOTE(review): the collected hrefs are those of the "arXiv:<id>" links;
    confirm whether these or the separate PDF links are wanted.

    Parameters:
        search_query (str): Text typed into the arXiv search bar.

    Returns:
        (list): Up to 100 URLs taken from the search results.

    Raises:
        NoSuchElementException: If the search bar, the results-per-page
            option, or the submit button cannot be found.
    """
    max_urls = 100
    link_regex = re.compile(r'arXiv:\d+?')
    list_urls = []

    browser = webdriver.Chrome()
    try:
        browser.get("https://arxiv.org")

        # Go to the search bar.
        try:
            search_bar = browser.find_element(By.TAG_NAME, 'input')
        except NoSuchElementException:
            print("Could not find the search bar!")
            raise
        search_bar.clear()
        search_bar.send_keys(search_query)
        # Submit the query with the Enter key.
        search_bar.send_keys(Keys.RETURN)

        # Change the number of results per page to 200 and apply it.
        option = browser.find_element(By.XPATH, "//option[@value='200']")
        option.click()
        go = browser.find_element(
            By.XPATH, "//button[@class='button is-small is-link']")
        go.click()

        # Re-download the results page and collect the matching links.
        page_source = requests.get(browser.current_url).text
        soup = BeautifulSoup(page_source, "html.parser")
        for a in soup.find_all(string=link_regex, href=True):
            list_urls.append(a["href"])
            # Stop once the limit is reached (">=" so that exactly
            # max_urls, not max_urls + 1, are kept).
            if len(list_urls) >= max_urls:
                break
    finally:
        # quit() ends the whole driver session, not just the window.
        browser.quit()

    return list_urls

## Scrape church-member-pertinent data from congregation database

In [None]:
def ward_list():
    """Scrape names from https://directory.churchofjesuschrist.org/169633
    and return them ordered by an accompanying numeric value, largest first.

    NOTE(review): the element selection (height="67", sibling navigation,
    '$'/'M' stripping) is the same as in imdb_data and has not been
    confirmed against this site's markup; it likely needs to be adapted.
    The page may also require authentication -- TODO confirm.

    Returns:
        (list): Names found on the page, sorted by the parsed numeric
            value in descending order; empty if no rows match.
    """
    # We want to collect names and the value used for ordering.
    ward_names = []
    ward_values = []

    # Open the website.
    url = "https://directory.churchofjesuschrist.org/169633"
    page_source = requests.get(url).text

    # Make some soup.
    soup = BeautifulSoup(page_source, "html.parser")
    for row_tag in soup.find_all(height="67"):
        # The name is two siblings after the tag's parent.
        name_cell = row_tag.parent.next_sibling.next_sibling
        ward_names.append(name_cell.a.text.strip())

        # The numeric value is four siblings further on.
        value_cell = name_cell.next_sibling.next_sibling.next_sibling.next_sibling
        value_text = value_cell.span.text.strip().strip('$').strip('M')
        ward_values.append(float(value_text))

    # Sort the names by descending value.
    ward_names, ward_values = np.array(ward_names), np.array(ward_values)
    order = np.argsort(ward_values)[::-1]
    return list(ward_names[order])
    