In [1]:
'''
Before importing the libraries below, make sure they are installed. You can use Anaconda-Navigator's GUI or its command-line program, conda. You can also use the pip command.
Example install commands:

conda install pandas
pip install pandas

conda install beautifulsoup4
pip install beautifulsoup4
'''

# Import library to make HTTP requests, i.e fetch URL similar to a browser request
import requests
# Import web scraping library
from bs4 import BeautifulSoup
# Import data analysis and export library
import pandas as pd

In [2]:
# Request the Yahoo Finance history page for a ticker (the date range defaults to the past year).
# Example URL: https://finance.yahoo.com/quote/SPY/history

# Specify ticker to scrape.
ticker = 'SPY'

# Build the history URL for the chosen ticker.
yahoo_finance_url = 'https://finance.yahoo.com/quote/' + ticker + '/history'

# Always pass a timeout (in seconds): without one, requests waits indefinitely when the
# server never responds, which would hang this cell and any "Restart & Run All".
# On timeout a requests.exceptions.Timeout error is raised instead.
page = requests.get(yahoo_finance_url, timeout=30)

In [3]:
# Display the response object to check whether the URL request was successful.
# The notebook shows its repr, e.g. <Response [200]>, where the number is the HTTP status code.
# See https://httpstatuses.com/ for a description of HTTP status codes.
page

<Response [200]>

In [4]:
# Display the raw HTML content of the URL request.
# The content is shown as a bytes object (note the b'...' prefix in the output), not as a decoded string.
page.content

b'<!DOCTYPE html><html id="atomic" class="NoJs featurephone" lang="en-US"><head prefix="og: http://ogp.me/ns#"><script>window.performance && window.performance.mark && window.performance.mark(\'PageStart\');</script><meta charset="utf-8"/><title>SPDR S&amp;P 500 (SPY) Stock Historical Prices &amp; Data</title><meta name="keywords" content="SPY, SPDR S&amp;P 500, SPY historical prices, SPDR S&amp;P 500 historical prices, historical prices, stocks, quotes, finance"/><meta http-equiv="x-dns-prefetch-control" content="on"/><meta property="twitter:dnt" content="on"/><meta property="fb:app_id" content="90376669494"/><meta name="theme-color" content="#400090"/><meta name="viewport" content="width=device-width, initial-scale=1"/><meta name="description" lang="en-US" content="Discover historical prices for SPY stock on Yahoo Finance. View daily, weekly or monthly format back to when SPDR S&amp;P 500 stock was issued."/><link rel="manifest" href="/manifest.json"/><link rel="dns-prefetch" href="/

In [5]:
# Parse the downloaded HTML into a BeautifulSoup object so that it can be searched when scraping.
# 'html.parser' names the parser to use; it is one of four parser libraries.
# Description of each parser library: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
soup = BeautifulSoup(page.content, features='html.parser')

In [6]:
# Convert the parsed HTML into a nicely formatted string for better readability.
# prettify() puts each tag on its own line and indents nested tags, as seen in the output below.
print(soup.prettify())

<!DOCTYPE html>
<html class="NoJs featurephone" id="atomic" lang="en-US">
 <head prefix="og: http://ogp.me/ns#">
  <script>
   window.performance && window.performance.mark && window.performance.mark('PageStart');
  </script>
  <meta charset="utf-8"/>
  <title>
   SPDR S&amp;P 500 (SPY) Stock Historical Prices &amp; Data
  </title>
  <meta content="SPY, SPDR S&amp;P 500, SPY historical prices, SPDR S&amp;P 500 historical prices, historical prices, stocks, quotes, finance" name="keywords"/>
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <meta content="on" property="twitter:dnt"/>
  <meta content="90376669494" property="fb:app_id"/>
  <meta content="#400090" name="theme-color"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="Discover historical prices for SPY stock on Yahoo Finance. View daily, weekly or monthly format back to when SPDR S&amp;P 500 stock was issued." lang="en-US" name="description"/>
  <link href="/manifest.json" re

In [7]:
# Find all of the HTML table rows containing the historical data by looking for an HTML tag and its defining attribute.
# You can inspect the HTML elements in the Chrome browser by right-clicking the desired element and clicking "Inspect."
# Raw HTML of the table row:
# <tr class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)">
# The [class="..."] selector below matches the complete class attribute value exactly as written,
# so the scrape returns no rows if the site changes these class names.
historical_prices = soup.select('tr[class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)"]')

In [8]:
# Confirm that the rows were found by displaying the matching HTML table rows.
# Each entry is one <tr> element; an empty result would mean that no row matched the selector.
historical_prices

[<tr class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)" data-reactid="49"><td class="Py(10px) Ta(start) Pend(10px)" data-reactid="50"><span data-reactid="51">Mar 15, 2019</span></td><td class="Py(10px) Pstart(10px)" data-reactid="52"><span data-reactid="53">280.54</span></td><td class="Py(10px) Pstart(10px)" data-reactid="54"><span data-reactid="55">282.21</span></td><td class="Py(10px) Pstart(10px)" data-reactid="56"><span data-reactid="57">280.33</span></td><td class="Py(10px) Pstart(10px)" data-reactid="58"><span data-reactid="59">281.31</span></td><td class="Py(10px) Pstart(10px)" data-reactid="60"><span data-reactid="61">281.31</span></td><td class="Py(10px) Pstart(10px)" data-reactid="62"><span data-reactid="63">81,251,300</span></td></tr>,
 <tr class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)" data-reactid="64"><td class="Py(10px) Ta(start) Pend(10px)" data-reactid="65"><span data-reactid="66">Mar 15, 2019</span></td><td class="Ta(c) Py(10px) Pstart(10px)" colspan="6" dat

In [9]:
# Inspect the first found row.
# The index starts at 0: 0 is the first element, 1 is the second element, etc.
historical_prices[0]

<tr class="BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)" data-reactid="49"><td class="Py(10px) Ta(start) Pend(10px)" data-reactid="50"><span data-reactid="51">Mar 15, 2019</span></td><td class="Py(10px) Pstart(10px)" data-reactid="52"><span data-reactid="53">280.54</span></td><td class="Py(10px) Pstart(10px)" data-reactid="54"><span data-reactid="55">282.21</span></td><td class="Py(10px) Pstart(10px)" data-reactid="56"><span data-reactid="57">280.33</span></td><td class="Py(10px) Pstart(10px)" data-reactid="58"><span data-reactid="59">281.31</span></td><td class="Py(10px) Pstart(10px)" data-reactid="60"><span data-reactid="61">281.31</span></td><td class="Py(10px) Pstart(10px)" data-reactid="62"><span data-reactid="63">81,251,300</span></td></tr>

In [10]:
# Create a Python dictionary to keep track of the price date and the adjusted closing price found in each row.
# The value for each dictionary key starts out as an empty list.
# The two lists must always have the same number of items, otherwise building a DataFrame from the dictionary fails.
# The dictionary is initialized in the same cell as the loop so that it is cleared each time the scrape is re-run;
# the loop only appends, so re-running it against an old dictionary would duplicate the data.
historical_data = {
    'price_date' : [],
    'price_adj_close' : []
}

# "for" loop through the found rows of historical data. Each row represents a single date.
for historical_price in historical_prices:

    # Display a horizontal line to easily distinguish between each row.
    print("\n----------------\n")

    # Price cells
    # Raw HTML of one price cell:
    # <td class="Py(10px) Pstart(10px)"><span>278.25</span></td>
    # Several HTML table cells (td) in a row share the class value "Py(10px) Pstart(10px)".
    # Rows that only announce a dividend have no cell with exactly this class value.
    price_cells = historical_price.select('td[class="Py(10px) Pstart(10px)"]')

    # Date cell
    # The price date has its own unique class value for the HTML table cell that it is in.
    # Raw HTML of the desired table cell:
    # <td class="Py(10px) Ta(start) Pend(10px)"><span>Mar 11, 2019</span></td>
    date_cells = historical_price.select('td[class="Py(10px) Ta(start) Pend(10px)"]')

    # Skip the row unless BOTH values are available:
    # - Price Adj Close is the 5th matching price cell (index 4), so at least 5 price cells are required.
    # - The date cell must exist.
    # Checking both before appending anything keeps the two lists in historical_data the same length
    # and avoids an IndexError on rows that have fewer price cells than expected.
    if len(price_cells) < 5 or not date_cells:
        continue

    adj_close_cell = price_cells[4]

    # The following print statements are for debugging to show how the values were found.
    # .text excludes the HTML tags and only gets the text.
    # "\n" is a newline to format the debugging output.
    print("price_adj_close HTML:",adj_close_cell)
    print("price_adj_close text:",adj_close_cell.text,"\n")
    print("price_date HTML", date_cells)
    print("price_date text:",date_cells[0].text,"\n")

    # Large numbers on the page are written with comma separators (see the volume cell, e.g. 81,251,300).
    # Strip any commas so that a price such as 1,234.56 can later be converted to a float.
    price_adj_close = adj_close_cell.text.replace(',', '')
    price_date = date_cells[0].text

    # Append both values together; they are referenced later when exporting the results.
    historical_data['price_date'].append(price_date)
    historical_data['price_adj_close'].append(price_adj_close)



----------------

price_adj_close HTML: <td class="Py(10px) Pstart(10px)" data-reactid="60"><span data-reactid="61">281.31</span></td>
price_adj_close text: 281.31 

price_date HTML [<td class="Py(10px) Ta(start) Pend(10px)" data-reactid="50"><span data-reactid="51">Mar 15, 2019</span></td>]
price_date text: Mar 15, 2019 


----------------


----------------

price_adj_close HTML: <td class="Py(10px) Pstart(10px)" data-reactid="82"><span data-reactid="83">279.93</span></td>
price_adj_close text: 279.93 

price_date HTML [<td class="Py(10px) Ta(start) Pend(10px)" data-reactid="72"><span data-reactid="73">Mar 14, 2019</span></td>]
price_date text: Mar 14, 2019 


----------------

price_adj_close HTML: <td class="Py(10px) Pstart(10px)" data-reactid="97"><span data-reactid="98">280.11</span></td>
price_adj_close text: 280.11 

price_date HTML [<td class="Py(10px) Ta(start) Pend(10px)" data-reactid="87"><span data-reactid="88">Mar 13, 2019</span></td>]
price_date text: Mar 13, 2019 


--

In [11]:
# Inspect the contents of the historical_data dictionary.
# The list entries should line up: the first value in the price_adj_close list belongs to the first value in the price_date list.
# Compare the first entries against the first row shown on https://finance.yahoo.com/quote/SPY/history
historical_data

{'price_date': ['Mar 15, 2019',
  'Mar 14, 2019',
  'Mar 13, 2019',
  'Mar 12, 2019',
  'Mar 11, 2019',
  'Mar 08, 2019',
  'Mar 07, 2019',
  'Mar 06, 2019',
  'Mar 05, 2019',
  'Mar 04, 2019',
  'Mar 01, 2019',
  'Feb 28, 2019',
  'Feb 27, 2019',
  'Feb 26, 2019',
  'Feb 25, 2019',
  'Feb 22, 2019',
  'Feb 21, 2019',
  'Feb 20, 2019',
  'Feb 19, 2019',
  'Feb 15, 2019',
  'Feb 14, 2019',
  'Feb 13, 2019',
  'Feb 12, 2019',
  'Feb 11, 2019',
  'Feb 08, 2019',
  'Feb 07, 2019',
  'Feb 06, 2019',
  'Feb 05, 2019',
  'Feb 04, 2019',
  'Feb 01, 2019',
  'Jan 31, 2019',
  'Jan 30, 2019',
  'Jan 29, 2019',
  'Jan 28, 2019',
  'Jan 25, 2019',
  'Jan 24, 2019',
  'Jan 23, 2019',
  'Jan 22, 2019',
  'Jan 18, 2019',
  'Jan 17, 2019',
  'Jan 16, 2019',
  'Jan 15, 2019',
  'Jan 14, 2019',
  'Jan 11, 2019',
  'Jan 10, 2019',
  'Jan 09, 2019',
  'Jan 08, 2019',
  'Jan 07, 2019',
  'Jan 04, 2019',
  'Jan 03, 2019',
  'Jan 02, 2019',
  'Dec 31, 2018',
  'Dec 28, 2018',
  'Dec 27, 2018',
  'Dec 26, 201

In [12]:
# The scraped results were collected in a Python dictionary so that they can be handed to Pandas directly:
# each dictionary key becomes a DataFrame column and each list becomes that column's values.
# Storing the data in a Pandas DataFrame lets us manipulate it and then export the manipulated results.
# df is the variable name used for the DataFrame in the rest of this notebook.
df = pd.DataFrame(data=historical_data)

In [13]:
# Output the first 5 rows of the DataFrame.
# The unnamed leftmost column that starts at 0 is the DataFrame's index, not scraped data.
df.head()

Unnamed: 0,price_date,price_adj_close
0,"Mar 15, 2019",281.31
1,"Mar 14, 2019",279.93
2,"Mar 13, 2019",280.11
3,"Mar 12, 2019",278.26
4,"Mar 11, 2019",277.22


In [14]:
# Verify the data types for each DataFrame column.
# Both columns are reported as "object" at this point because the scraped values are still text.
df.dtypes

price_date         object
price_adj_close    object
dtype: object

In [15]:
# price_adj_close is currently stored as an object (text) column rather than as numbers.
# Cast it to float so that numeric operations work on it, including within Excel after the export.
df = df.astype({'price_adj_close': float})

In [16]:
# price_date is still an object column, which Excel would treat as plain text.
# Parse it into a datetime column so that date-related operations are possible.
df = df.assign(price_date=pd.to_datetime(df['price_date']))

In [17]:
# Confirm the data types were updated.
# Expected after the two conversions: price_date is datetime64[ns] and price_adj_close is float64.
df.dtypes

price_date         datetime64[ns]
price_adj_close           float64
dtype: object

In [18]:
# Converting price_date to datetime changes how it is displayed: from a string with the abbreviated month name
# (e.g. Mar 15, 2019) to the YYYY-MM-DD form (e.g. 2019-03-15).
# Output the first 5 rows to confirm the price_date change.
df.head()

Unnamed: 0,price_date,price_adj_close
0,2019-03-15,281.31
1,2019-03-14,279.93
2,2019-03-13,280.11
3,2019-03-12,278.26
4,2019-03-11,277.22


In [19]:
# Build the .csv filename: historical_data_ as a prefix, followed by the ticker value.
csv_filename = 'historical_data_{}.csv'.format(ticker)
# Print the csv_filename for debugging.
print(csv_filename)
# Write the DataFrame's contents to that .csv file.
# index=False omits the index column. Notice in df.head() the first column does not have a column name and starts with a 0 value. That is the index column.
df.to_csv(path_or_buf=csv_filename, index=False)

# Verify the results by clicking on the generated .csv file from http://localhost:8885/tree or from File Explorer (Windows)/Finder (Mac).

historical_data_SPY.csv


In [20]:
# Export the DataFrame to an Excel file named historical_data.xlsx.
# The file holds a single sheet, and that sheet is named after the ticker.
# index=False leaves the DataFrame's index column out of the sheet.
excel_filename = 'historical_data.xlsx'
df.to_excel(excel_writer=excel_filename, index=False, sheet_name=ticker)

# Open historical_data.xlsx in Excel to verify the export.
# historical_data.xlsx is written to the same directory as this notebook.