### Obtaining Pricing Data

https://www.quantstart.com/articles/Securities-Master-Database-with-MySQL-and-Python/

In order to obtain the historical data for the current S&P500 constituents, we must first query the database for the list of all the symbols. Once the list of symbols (along with the symbol IDs) has been returned, it is possible to call the Yahoo Finance API and download the historical pricing data for each symbol. Once we have the data for each symbol, we can insert it into the database in turn. Here's the Python code to carry this out:

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import datetime as dt
import MySQLdb as mdb # https://github.com/PyMySQL/mysqlclient-python/blob/master/doc/user_guide.rst#cursor-objects
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import sqlalchemy as sql
import time
import random

# Data-vendor switches (not read by any code visible in this section).
get_quandl_data = False
get_yahoo_data = False

# Connection settings for the MySQL securities master database.
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file kept out of version control.
db_host = 'localhost'
db_user = 'sec_user'
db_pass = 'securities'
db_name = 'securities_master'
unix_socket = "/var/run/mysqld/mysqld.sock"

# Raw DB-API connection, used for hand-written SQL queries.
dbconn = mdb.connect(host=db_host, unix_socket=unix_socket, user=db_user,
                     passwd=db_pass, db=db_name)

#connect_string = 'mysql://root:southpark@localhost/test'
# SQLAlchemy URL built from the same settings so both connections always
# point at the same database.
connect_string = 'mysql://%s:%s@%s/%s?unix_socket=%s' % (
    db_user, db_pass, db_host, db_name, unix_socket)
sql_engine = sql.create_engine(connect_string)  # Engine passed to DataFrame.to_sql


def obtain_list_of_db_tickers():
    """Obtains a list of the ticker symbols in the database.

    Runs ``SELECT id, ticker FROM symbol`` on the module-level ``dbconn``
    connection.

    Returns:
        A list of ``(id, ticker)`` tuples, one per row returned by the query.
    """
    cur = dbconn.cursor()
    try:
        cur.execute("SELECT id, ticker FROM symbol")
        return [(row[0], row[1]) for row in cur.fetchall()]
    finally:
        # Release the cursor even when the query raises.
        cur.close()

# Legacy Yahoo downloader kept for reference only. The whole definition below
# is wrapped in a triple-quoted string, so Python evaluates it as a bare string
# expression: the function is never defined and none of this code runs.
# NOTE(review): it refers to urllib3, requests and bs, none of which are
# imported in the visible part of this file -- check before re-enabling.
'''def get_daily_historic_data_yahoo_old(ticker,
                      start_date=(2000,1,1),
                      end_date=datetime.date.today().timetuple()[0:3]):
    """Obtains data from Yahoo Finance returns and a list of tuples.

  ticker: Yahoo Finance ticker symbol, e.g. "GOOG" for Google, Inc.
  start_date: Start date in (YYYY, M, D) format
  end_date: End date in (YYYY, M, D) format"""

    # Construct the Yahoo URL with the correct integer query parameters
    # for start and end dates. Note that some parameters are zero-based!
    yahoo_url = "http://ichart.finance.yahoo.com/table.csv?s=%s&a=%s&b=%s&c=%s&d=%s&e=%s&f=%s" % \
      (ticker, start_date[1] - 1, start_date[2], start_date[0], end_date[1] - 1, end_date[2], end_date[0])
    print('URL for %s: %s' % (ticker,yahoo_url))
    # Try connecting to Yahoo Finance and obtaining the data
    # On failure, print an error message.
    try:
        yf_data = urllib3.urlopen(yahoo_url).readlines()[1:] # Ignore the header
        resp = requests.get('yahoo_url')
        soup = bs.BeautifulSoup(resp.text, 'lxml')
        table = soup.find('table', {'class': 'wikitable sortable'})
        
        prices = []
        for y in yf_data:
          p = y.strip().split(',')
          prices.append( (datetime.datetime.strptime(p[0], '%Y-%m-%d'),
                  p[1], p[2], p[3], p[4], p[5], p[6]) )
    except (OSError, RuntimeError, TypeError, NameError):
        print("Could not download Yahoo data: %s\n" % RuntimeError)
    return prices
'''

def get_daily_historic_data_yahoo(ticker, start_date="2000,1,1", end_date=None):
    """Download daily historical prices for one ticker from Yahoo Finance.

    Args:
        ticker: Yahoo Finance ticker symbol, e.g. "GOOG".
        start_date: First date to request. Accepts a "YYYY,M,D" string, a
            (YYYY, M, D) tuple/list, or a date/datetime object (passed
            through unchanged).
        end_date: Last date to request. ``None`` (the default) means today,
            resolved on every call rather than once when the function is
            defined, so a long-running session never uses a stale date.

    Returns:
        The object returned by ``pandas_datareader.data.get_data_yahoo``, or
        ``None`` when the download raises one of the handled exceptions (the
        error is printed).

    Raises:
        ValueError: If ``start_date`` is a string that does not match
            "%Y,%m,%d".
    """
    from pandas_datareader import data as pdr
    # Imported only for its import-time effect; presumably it patches
    # pandas_datareader's Yahoo reader -- TODO confirm for the installed version.
    import fix_yahoo_finance  # noqa: F401

    if isinstance(start_date, str):
        start_date = dt.datetime.strptime(start_date, "%Y,%m,%d")
    elif isinstance(start_date, (tuple, list)):
        start_date = dt.datetime(*start_date)
    if end_date is None:
        end_date = dt.date.today()

    try:
        # e.g. pdr.get_data_yahoo('AAPL', start=datetime(2006, 10, 1), end=datetime(2012, 1, 1))
        return pdr.get_data_yahoo(ticker, start=start_date, end=end_date)
    except (OSError, RuntimeError, TypeError, NameError, KeyError) as e:
        print("Could not download Yahoo data: %s\n" % e)
        return None



In [2]:
#yf_data.tail()

Note that there are certainly ways we can optimise this procedure. If we made use of the Python Scrapy library, for instance, we would gain high concurrency for the downloads, as Scrapy is built on the event-driven Twisted framework. At the moment each download is carried out sequentially.

### Table Details for daily_price:
CREATE TABLE daily_price ( id int AUTO_INCREMENT, data_vendor_id int, symbol_id int, price_date datetime, created_date datetime, last_updated_date datetime, open_price decimal(19,4), high_price decimal(19,4), low_price decimal(19,4), close_price decimal(19,4), adj_close_price decimal(19,4), volume bigint, PRIMARY KEY (id), KEY index_data_vendor_id (data_vendor_id), KEY index_synbol_id (symbol_id) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

In [3]:
def insert_daily_data_into_db(data_vendor_id, symbol_id, daily_data):
    """Append one symbol's price rows to the ``daily_price`` table.

    The rows are written through the module-level ``sql_engine`` with
    ``if_exists='append'``, and the frame's index is written as a column too.

    Args:
        data_vendor_id: Value stored in the ``data_vendor_id`` column of
            every row.
        symbol_id: Value stored in the ``symbol_id`` column of every row.
        daily_data: pandas DataFrame of price rows. It is not modified; the
            bookkeeping columns are added to a copy.

    NOTE(review): the frame is written with its existing column names and
    index name -- confirm these match the columns of ``daily_price``.
    """
    now = dt.datetime.utcnow()
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns.
    rows = daily_data.assign(
        data_vendor_id=data_vendor_id,
        symbol_id=symbol_id,
        created_date=now,
        last_updated_date=now,
    )
    # if_exists applies to the table. Options are 'fail', 'append', 'replace'.
    rows.to_sql(name='daily_price', con=sql_engine, if_exists='append', index=True)
    

In [13]:
if __name__ == "__main__":
    # Loop over the tickers and insert the daily historical
    # data into the database
    for symbol_id, ticker in obtain_list_of_db_tickers():
        print("Getting data for %s" % ticker)
        yf_data = get_daily_historic_data_yahoo(ticker)
        # The downloader returns None on failure, so only store real frames.
        if isinstance(yf_data, pd.DataFrame):
            print("Adding data for %s" % ticker)
            insert_daily_data_into_db('1', symbol_id, yf_data)
        # Random 1-5 second pause between downloads (presumably to avoid
        # hammering the remote service -- confirm intent).
        sleeptime = random.randint(1, 5)
        print("Sleeping for %s" % sleeptime)
        time.sleep(sleeptime)

Getting data for MMM
Adding data for MMM
Sleeping for 4
Getting data for ABT
Adding data for ABT
Sleeping for 1
Getting data for ABBV
Adding data for ABBV
Sleeping for 3
Getting data for ABMD
Adding data for ABMD
Sleeping for 3
Getting data for ACN
Adding data for ACN
Sleeping for 4
Getting data for ATVI
Adding data for ATVI
Sleeping for 3
Getting data for ADBE
Adding data for ADBE
Sleeping for 1
Getting data for AMD
Adding data for AMD
Sleeping for 2
Getting data for AAP
Adding data for AAP
Sleeping for 3
Getting data for AES
Adding data for AES
Sleeping for 5
Getting data for AMG
Adding data for AMG
Sleeping for 3
Getting data for AFL
Adding data for AFL
Sleeping for 2
Getting data for A
Adding data for A
Sleeping for 5
Getting data for APD
Adding data for APD
Sleeping for 2
Getting data for AKAM
Adding data for AKAM
Sleeping for 4
Getting data for ALK
Adding data for ALK
Sleeping for 2
Getting data for ALB
Adding data for ALB
Sleeping for 4
Getting data for ARE
Adding data for ARE
S