In [None]:
'''
Before importing the libraries below, make sure they are installed. You can use Anaconda-Navigator's GUI or its command line program called conda. You can also use the pip command.
Example install commands:

conda install pandas
pip install pandas

conda install beautifulsoup4
pip install beautifulsoup4
'''

# Import library to make HTTP requests, i.e., fetch a URL similar to a browser request
import requests
# Import web scraping library
from bs4 import BeautifulSoup
# Import data analysis and export library
import pandas as pd

In [None]:
# Request Yahoo Finance URL for a ticker's history (date range defaults to the past year).
# https://finance.yahoo.com/quote/SPY/history

# Specify ticker to scrape.
ticker = 'SPY'

yahoo_finance_url = 'https://finance.yahoo.com/quote/' + ticker + '/history'

# Send a browser-like User-Agent instead of the library default ("python-requests/...").
# NOTE(review): some sites reject the default value; confirm this header is still sufficient for Yahoo Finance.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
}

# timeout (in seconds) keeps the cell from hanging indefinitely if the server never responds;
# requests does not apply a timeout unless one is given.
page = requests.get(yahoo_finance_url, headers=request_headers, timeout=30)

# Raise requests.HTTPError on a 4xx/5xx response so a failed request stops here
# instead of silently producing an empty scrape further down.
# page is already assigned at this point, so it can still be inspected after a failure.
page.raise_for_status()

In [None]:
# Display the response object to check whether the URL request was successful.
# Its representation includes the HTTP status code, e.g. <Response [200]> for a successful request.
# See https://httpstatuses.com/ for a description of HTTP status codes.
page

In [None]:
# Display the beginning of the raw HTML content (bytes) of the URL request.
# Only a slice is shown: the full page body is very large and would bloat the notebook output.
# Increase the slice size, or evaluate page.content on its own, to see more.
page.content[:2000]

In [None]:
# Parse the downloaded HTML into a BeautifulSoup object, which is what the scraping cells below search.
# 'html.parser' selects one of the four parser libraries BeautifulSoup can use.
# Description of each parser library: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [None]:
# Convert HTML content into a nicely formatted (indented) string for better readability.
# Print only the beginning of it: the full document is very large and would flood the cell output.
# Raise preview_chars to see more of the page.
preview_chars = 2000
print(soup.prettify()[:preview_chars])

In [None]:
# Collect every HTML table row (tr tag) that holds historical data, identified by the exact value of its class attribute.
# To discover that value, open the page in Chrome, right click the desired element and choose "Inspect."
# Raw HTML of the table row:
# <tr class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)">
# The CSS attribute selector below matches rows whose class attribute equals that string exactly.
history_row_selector = 'tr[class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)"]'
historical_prices = soup.select(history_row_selector)

In [None]:
# Confirm the rows were found by displaying the matched HTML table rows.
# An empty list means the selector matched nothing, e.g. the request did not return the expected page
# or the rows no longer carry the class value used in the selector.
historical_prices

In [None]:
# Inspect the first found row.
# The index starts at 0: 0 is the first element, 1 is the second element, etc.
# This raises an IndexError if no rows were found.
historical_prices[0]

In [None]:
# Create an empty Python dictionary data structure to keep track of the found price dates and the adjusted closing price for each row.
# The value for each dictionary key is an empty list.
# The number of items in the value list for the price_date and price_adj_close keys must be identical,
# otherwise the dictionary cannot be turned into a DataFrame later.
# Dictionary initialization must be in the same cell as the loop so it is cleared each time you re-run the scrape;
# data is appended, so re-running the loop alone would duplicate the entries.
historical_data = {
    'price_date' : [],
    'price_adj_close' : []
}

# Price Adj Close is the 5th HTML table cell among the cells sharing the price class value (zero-based index 4).
adj_close_index = 4

# "for" loop through the found rows of historical data. Each row represents a single date.
for historical_price in historical_prices:

    # Display a horizontal line to easily distinguish between each row.
    print("\n----------------\n")

    # Price cells
    # Raw HTML of one price cell:
    # <td class="Py(10px) Pstart(10px)"><span>278.25</span></td>
    # Find the HTML table cell tags (td) with the class value of "Py(10px) Pstart(10px)".
    # There are multiple cells with this class value in a full pricing row.
    price_cells = historical_price.select('td[class="Py(10px) Pstart(10px)"]')

    # Price Date
    # The price date has its own unique class value for the HTML table cell that it is in.
    # Raw HTML of the desired table cell:
    # <td class="Py(10px) Ta(start) Pend(10px)"><span>Mar 11, 2019</span></td>
    date_cells = historical_price.select('td[class="Py(10px) Ta(start) Pend(10px)"]')

    # Only record a row when BOTH values are present, and always record them together.
    # This skips rows without the full set of price cells (e.g. a row with just a dividend value),
    # avoids an IndexError when fewer than 5 price cells are found,
    # and guarantees the two lists in historical_data stay the same length.
    if len(price_cells) <= adj_close_index or not date_cells:
        continue

    # The following print statements are for debugging to show how the values were found.
    # .text will exclude the HTML tags and only get the text.
    # "\n" is a newline to format the debugging output.
    print("price_adj_close HTML:",price_cells[adj_close_index])
    print("price_adj_close text:",price_cells[adj_close_index].text,"\n")
    price_adj_close = price_cells[adj_close_index].text

    print("price_date HTML", date_cells)
    print("price_date text:",date_cells[0].text,"\n")
    price_date = date_cells[0].text

    # Append both values to their dictionary keys, which we'll reference later when exporting the results.
    historical_data['price_date'].append(price_date)
    historical_data['price_adj_close'].append(price_adj_close)


In [None]:
# Inspect the contents of the historical_data dictionary.
# The list entries should line up: the first value under the price_adj_close key belongs to the first value under the price_date key, and so on.
# Check that the first pair matches the first row shown on https://finance.yahoo.com/quote/SPY/history
historical_data

In [None]:
# The scraped results were collected in a Python dictionary so they can be loaded into pandas in one step.
# Each dictionary key becomes a DataFrame column and each list becomes that column's values.
# Holding the data in a DataFrame lets us manipulate it and then export the manipulated results.
# df is the variable name used for the DataFrame in the rest of the notebook.
df = pd.DataFrame(data=historical_data)

In [None]:
# Output the first 5 rows of the DataFrame (5 is the default row count for head()).
df.head()

In [None]:
# Verify the data types for each DataFrame column.
# Both columns hold the scraped text at this point, so they are expected to show as object.
df.dtypes

In [None]:
# The price_adj_close column holds the scraped text, so it is seen as an object and not a float.
# We want to convert price_adj_close to a float to allow for float operations within Excel.
# Thousands separators are removed first: text such as "1,234.56" cannot be parsed as a float
# and would make a plain astype(float) raise a ValueError.
# astype(str) keeps this cell safe to re-run after the column has already been converted to float.
df['price_adj_close'] = (
    df['price_adj_close']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [None]:
# price_date is still an object column, which Excel would treat as plain text.
# Parse it into pandas datetime values so date-related operations work, then store the result back in the same column.
price_dates_parsed = pd.to_datetime(df['price_date'])
df['price_date'] = price_dates_parsed

In [None]:
# Confirm the data types were updated by the two conversion cells above:
# price_adj_close should now be a float type and price_date a datetime type.
df.dtypes

In [None]:
# Converting price_date to datetime changes how it is displayed: from a string with the month name abbreviation (e.g. Mar 11, 2019) to YYYY-MM-DD.
# Output the first 5 rows to confirm the price_date change.
df.head()

In [None]:
# Build the .csv filename: historical_data_ as a prefix, followed by the ticker value.
csv_filename = 'historical_data_{}.csv'.format(ticker)
# Show the resulting filename for debugging.
print(csv_filename)
# Write the DataFrame's contents to that .csv file.
# index=False leaves out the index column, i.e. the unnamed first column in df.head() that starts with a 0 value.
df.to_csv(path_or_buf=csv_filename, index=False)

# Verify the results by clicking on the generated .csv file from http://localhost:8885/tree or from File Explorer (Windows)/Finder (Mac).

In [None]:
# Export the DataFrame to an Excel file named historical_data.xlsx.
# The workbook gets a single sheet, named after the ticker; index=False leaves out the index column.
excel_filename = 'historical_data.xlsx'
df.to_excel(excel_writer=excel_filename, index=False, sheet_name=ticker)

# Open historical_data.xlsx in Excel to verify the export.
# The filename is a relative path, so the file is written to the notebook's working directory
# (normally the same directory as this notebook).