In [1]:
import pickle
import re
import time
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

### Problem 1
Modify `scrape_books()` so that it gathers the price for each fiction book and
returns the mean price, in £, of a fiction book.

In [2]:
def scrape_books(start_page = "index.html"):
    """Crawl the fiction category of http://books.toscrape.com and average the prices.

    Starting at ``start_page``, each listing page is downloaded, the number in
    every ``price_color`` tag is collected, and the "next" link is followed
    until a page has no such link.

    Parameters:
        start_page (str): Listing page to start from, relative to the fiction
            category's base URL.

    Returns:
        (float): Mean price, in £, of the fiction books found (nan if no
            prices were found).

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ValueError: If a price tag does not contain a number.
    """
    base_url = "http://books.toscrape.com/catalogue/category/books/fiction_10/"
    # Only the numeric part of the price is parsed, so the result does not
    # depend on how the currency symbol happens to be decoded.
    price_finder = re.compile(r"\d+(?:\.\d+)?")
    # The pager link is an anchor whose whole text is "next"; anchoring the
    # pattern keeps other text that merely contains "next" from matching.
    next_page_finder = re.compile(r"^\s*next\s*$")

    prices = []
    page = base_url + start_page                # Complete page URL.

    while page is not None:
        # Download the page source and PAUSE before continuing.
        response = requests.get(page)
        response.raise_for_status()
        time.sleep(1)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract the price of every book listed on this page.
        for price_tag in soup.find_all(class_="price_color"):
            price_text = price_tag.get_text()
            match = price_finder.search(price_text)
            if match is None:
                raise ValueError("Could not parse price {!r}".format(price_text))
            prices.append(float(match.group()))

        # Follow the "next" link if there is one; otherwise we are done.
        next_link = soup.find(name="a", href=True, string=next_page_finder)
        page = None if next_link is None else base_url + next_link["href"]

    if not prices:
        # Avoid numpy's "mean of empty slice" warning; the result is nan anyway.
        return float("nan")
    return float(np.mean(prices))

In [3]:
# Compute the Problem 1 answer and serialize it to the file "ans1".
value = scrape_books()
with open("ans1", mode="wb") as fp:
    pickle.dump(value, file=fp)

### Problem 2
Modify `bank_data()` so that it extracts the total consolidated assets ("Consol
Assets") for JPMorgan Chase, Bank of America, and Wells Fargo recorded each December from
2004 to the present. Return a list of lists where each list contains the assets of each bank.

In [4]:
def _consolidated_assets(soup, bank_finder):
    """Return the consolidated assets listed for the bank matching ``bank_finder``.

    Parameters:
        soup (BeautifulSoup): Parsed page of one large-commercial-banks release.
        bank_finder (compiled regex): Pattern matching the bank's name cell.

    Returns:
        (int): The value of the asset cell, with thousands separators removed.

    Raises:
        ValueError: If the bank's row or its asset cell cannot be located.
    """
    cell = soup.find(name="td", string=bank_finder)
    if cell is None:
        raise ValueError(
            "No table cell matches {!r}".format(bank_finder.pattern))

    # Step ten siblings to the right of the name cell.
    # NOTE(review): presumably whitespace text nodes sit between the <td>
    # cells, so ten hops land on the "Consol Assets" column; confirm against
    # the page markup.
    for _ in range(10):
        cell = cell.next_sibling
        if cell is None:
            raise ValueError(
                "Row for {!r} is shorter than expected".format(bank_finder.pattern))

    # Extract the data, removing commas.
    return int(cell.string.replace(',', ''))


def bank_data():
    """Crawl through the Federal Reserve site and extract bank data.

    Every release linked from the base page as "December 31, 20XX" (2004 and
    later) is visited, and the consolidated assets of JPMorgan Chase, Bank of
    America, and Wells Fargo are read from it.

    Returns:
        (list): Three lists of ints, ``[chase, bank_of_america, wells_fargo]``,
            each holding one value per December release, in the order the
            release links appear on the base page.

    Raises:
        ValueError: If one of the banks cannot be found on a release page.
    """
    # Compile regular expressions for finding certain tags.
    # Any December release from 2004 onward, not only those through 2019.
    link_finder = re.compile(r"December 31, 20(0[4-9]|[1-9][0-9])")
    bank_finders = [
        re.compile(r"^JPMORGAN CHASE BK"),
        re.compile(r"^BANK OF AMER"),
        re.compile(r"^WELLS FARGO BK"),
    ]

    # Get the base page and find the URLs to all other relevant pages.
    base_url = "https://www.federalreserve.gov/releases/lbr/"
    base_soup = BeautifulSoup(requests.get(base_url).text, "html.parser")
    link_tags = base_soup.find_all(name='a', href=True, string=link_finder)
    # urljoin resolves relative, root-relative, and absolute hrefs alike.
    pages = [urljoin(base_url, tag.attrs["href"]) for tag in link_tags]

    # Crawl through the individual pages and record the data.
    assets = [[] for _ in bank_finders]
    for page in pages:
        time.sleep(1)               # PAUSE, then request the page.
        soup = BeautifulSoup(requests.get(page).text, "html.parser")
        for bank_assets, bank_finder in zip(assets, bank_finders):
            bank_assets.append(_consolidated_assets(soup, bank_finder))

    return assets

In [5]:
# Compute the Problem 2 answer (one asset list per bank) and serialize it to "ans2".
value = bank_data()
with open("ans2", mode="wb") as fp:
    pickle.dump(value, file=fp)

### Problem 3
The Basketball Reference website at `https://www.basketball-reference.com`
contains data on NBA athletes, including which player led different categories for each season.
For the past ten seasons, identify which player had the most season points and find how many
points they scored during that season. Return a list of triples consisting of the season, the
player, and the points scored, ("season year", "player name", points scored).

In [6]:
def prob3():
    '''Find the season points leader for each of ten NBA seasons.

    The Basketball Reference website at https://www.basketball-reference.com
    hosts data on NBA athletes, including which player led different
    categories. Each season's leaders page linked from the home page is
    visited and the top entry of its "Points" table is recorded.

    Returns:
        (list): One triple ``(year, player name, points scored)`` per season,
            where ``year`` and ``points scored`` are ints and the name is a str.

    Raises:
        ValueError: If a leaders page has no caption matching "Points".
    '''
    # Compile regular expressions for finding certain tags.
    # NOTE(review): these two patterns pin the crawl to entries whose text
    # ends in "1X Leaders" and to a year of the form 201X in the URL;
    # update them to cover other seasons.
    link_finder = re.compile(r"1[0-9] Leaders$")
    year_finder = re.compile(r"201[0-9]")
    most_points_finder = re.compile(r"Points")
    most_points = []

    # Get the base page and find the URLs to all other relevant pages.
    base_url = "https://www.basketball-reference.com"
    base_soup = BeautifulSoup(requests.get(base_url).text, "html.parser")
    link_tags = base_soup.find_all(name='option', value=True, string=link_finder)
    pages = [base_url + tag.attrs["value"] for tag in link_tags]

    # Crawl through the individual pages and record the data.
    for page in pages:
        time.sleep(1)               # PAUSE, then request the page.
        soup = BeautifulSoup(requests.get(page).text, "html.parser")

        # Find the caption of the points table; fail with a clear message
        # rather than an AttributeError on None if it is missing.
        caption = soup.find(name="caption", string=most_points_finder)
        if caption is None:
            raise ValueError("No 'Points' table found on {}".format(page))

        # The leader's entry is two siblings after the caption.
        # NOTE(review): presumably the first sibling is a whitespace text
        # node; confirm against the page markup.
        leader = caption.next_sibling.next_sibling

        # Get the name, points, and year.
        name = leader.find('a').string
        points = int(leader.find('td', class_='value').string)
        year = int(year_finder.findall(page)[0])
        most_points.append((year, str(name), points))

    return most_points

In [7]:
# Compute the Problem 3 answer (list of tuples) and serialize it to "ans3".
value = prob3()
with open("ans3", mode="wb") as fp:
    pickle.dump(value, file=fp)

### Problem 4
The website IMDb contains a variety of information on movies. Specifically,
information on the top 10 box office movies of the week can be found at
`https://www.imdb.com/chart/boxoffice`. Using `BeautifulSoup`, `Selenium`, or both, return a list of the top 10
movies of the week and order the list according to the total gross of the movies, from most
money to the least.

In [8]:
def _gross_in_millions(text):
    """Convert a gross such as "$123.4M" into a float number of millions of dollars.

    A trailing K, M, or B scales the amount; an amount with no suffix is
    taken to be plain dollars.

    Raises:
        ValueError: If ``text`` is not a recognizable dollar amount.
    """
    match = re.match(r"^\$?\s*([\d,]*\.?\d+)\s*([KMB]?)$", text.strip(), re.IGNORECASE)
    if match is None:
        raise ValueError("Unrecognized gross amount: {!r}".format(text))
    amount = float(match.group(1).replace(",", ""))
    scale = {"": 1e-6, "K": 1e-3, "M": 1.0, "B": 1e3}[match.group(2).upper()]
    return amount * scale


def prob4():
    """
    Sort the Top 10 movies of the week by Total Grossing, taken from 
    https://www.imdb.com/chart/boxoffice?ref_=nv_ch_cht.

    Titles are read from the ``td.titleColumn`` cells and total grosses from
    the ``span.secondaryInfo`` tags of the page; the two are paired by
    position.

    Returns:
        titles (list): Top 10 movies of the week sorted by total grossing,
            from most money to least

    Raises:
        ValueError: If a gross cannot be parsed, or if the page yields a
            different number of titles and grosses.
    """
    # Download and parse the chart page.
    base_url = "https://www.imdb.com/chart/boxoffice"
    base_soup = BeautifulSoup(requests.get(base_url).text, "html.parser")

    # Get all the titles and total grosses.
    title_cells = base_soup.find_all(name='td', class_="titleColumn")
    gross_tags = base_soup.find_all(name='span', class_='secondaryInfo')
    titles = [str(cell.a.string) for cell in title_cells]
    grosses = [_gross_in_millions(str(tag.string)) for tag in gross_tags]

    # Pairing by position is only meaningful if both lists line up.
    if len(titles) != len(grosses):
        raise ValueError(
            "Found {} titles but {} gross amounts".format(len(titles), len(grosses)))

    # Return the titles in descending order of total gross.
    ranked = sorted(zip(titles, grosses), key=lambda pair: pair[1], reverse=True)
    return [title for title, _ in ranked]

In [9]:
# Compute the Problem 4 answer (sorted movie titles) and serialize it to "ans4".
value = prob4()
with open("ans4", mode="wb") as fp:
    pickle.dump(value, file=fp)

### Problem 5
The arXiv (pronounced "archive") is an online repository of scientific publications,
hosted by Cornell University. Write a function that accepts a string to serve as a search
query, defaulting to `"linkedin"`. Use `Selenium` to enter the query into the search bar of
`https://arxiv.org` and press Enter. The resulting page has up to 50 links to the PDFs of technical
papers that match the query. Gather these URLs, then continue to the next page (if there are
more results) and continue gathering links until obtaining at most 150 URLs. Return the list
of URLs.

In [10]:
def prob5(search_query='linkedin', max_urls=150,
          driver_path='/home/mark/chromedriver'):
    """Use Selenium to enter the given search query into the search bar of
    https://arxiv.org and press Enter. The resulting page has up to 50 links
    to the PDFs of technical papers that match the query. Gather these URLs,
    then continue to the next page (if there are more results) and continue
    gathering links until obtaining at most ``max_urls`` URLs.

    Parameters:
        search_query (str): Text to type into the arXiv search bar.
        max_urls (int): Maximum number of URLs to return.
        driver_path (str): Location of the chromedriver executable.
            TODO: the default is a machine-specific path; pass your own.

    Returns:
        (list): Up to ``max_urls`` URLs taken from the search results. The
            list is empty if the search bar could not be found.
    """
    # Create a list and initiate the browser.
    url_list = []
    browser = webdriver.Chrome(driver_path)
    try:
        browser.get("https://arxiv.org")

        # Try to use the search bar and search the given query.
        try:
            search_bar = browser.find_element_by_name('query')
        except NoSuchElementException:
            print("Could not find the search bar!")
            return url_list
        search_bar.clear()
        search_bar.send_keys(search_query)
        search_bar.send_keys(Keys.RETURN)

        # Keep going from page to page until we have enough URLs or there
        # are no more pages.
        while True:
            page_soup = BeautifulSoup(browser.page_source, 'html.parser')
            tech_tags = page_soup.find_all(name='p', class_="list-title is-inline-block")

            # Save all of the links found on the page.
            for tag in tech_tags:
                try:
                    url_list.append(str(tag.span.a['href']))
                except (AttributeError, KeyError, TypeError):
                    # This result has no <span><a href=...> link; skip it.
                    continue

            # Return the list if it is already long enough.
            if len(url_list) >= max_urls:
                return url_list[:max_urls]

            try:
                # Otherwise click the next button to go to the next page.
                browser.find_element_by_class_name('pagination-next').click()
            except Exception:
                # The button is missing or cannot be clicked: no next page.
                return url_list
            time.sleep(1)           # PAUSE before reading the next page.
    finally:
        # Always shut the browser down, whichever way the function exits.
        browser.quit()

In [11]:
# Compute the Problem 5 answer (list of URLs) and serialize it to "ans5".
value = prob5()
with open("ans5", mode="wb") as fp:
    pickle.dump(value, file=fp)