In [None]:
import requests
import json
import time
import datetime
import sys
from pathlib import Path
from dotenv import load_dotenv
import os
import yfinance as yf
import pandas as pd
import requests_cache
import nasdaqdatalink
from requests import Session
from requests_cache import CacheMixin, SQLiteCache
from requests_ratelimiter import LimiterMixin, MemoryQueueBucket
from pyrate_limiter import Duration, RequestRate, Limiter
class CachedLimiterSession(CacheMixin, LimiterMixin, Session):
    """Session type that mixes response caching and rate limiting into ``Session``.

    The class adds no members of its own; everything comes from ``CacheMixin``,
    ``LimiterMixin`` and ``Session``.
    """

# One shared HTTP session, configured with a SQLite cache backend and a
# request limiter.
session = CachedLimiterSession(
    limiter=Limiter(RequestRate(2, Duration.SECOND*5)),  # max 2 requests per 5 seconds
    bucket_class=MemoryQueueBucket,
    backend=SQLiteCache("yfinance.cache"),
)
# Configure the header on this session. Do not rebind `session` to a plain
# requests_cache.CachedSession here: that would drop the rate limiter.
session.headers['User-agent'] = 'moneybot/1.0'

# Read settings from a local .env file into the process environment.
load_dotenv()

PROXY_SERVER=os.getenv("PROXY_SERVER")  # None when the variable is not set
api_key=os.getenv("NASDAQ_DATA_LINK_API_KEY")  # None when the variable is not set
nasdaqdatalink.ApiConfig.api_key = api_key

# Sharadar Daily Data Run

In [None]:
# Fetch the SHARADAR/TICKERS table, passing table=['SEP'] as a filter argument.
tickers_data = nasdaqdatalink.get_table('SHARADAR/TICKERS', table = ['SEP'], paginate=True)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
tickers_data.info()

In [None]:
# Load the daily metrics indexed by (date, ticker).
daily_metrics = pd.read_csv(
    "data/SHARADAR_DAILY.csv",
    parse_dates=["date"],
    index_col=["date", "ticker"]
).sort_index()

# Find the most recent date present in the file.
max_date = daily_metrics.index.get_level_values('date').max()

# Keep only that date's rows. Selecting on the first index level drops it, so
# the result is already indexed by 'ticker' alone (no reset/set_index needed).
# The name is rebound on purpose so the full frame can be garbage-collected.
daily_metrics = daily_metrics.loc[max_date]

print(daily_metrics.head())
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print().
daily_metrics.info()

# yfinance Minute Data Run

In [None]:
 # enter the Sharadar table you would like to retrieve 
def display_menu():
    """Print the Sharadar table menu and return the number the user enters.

    The menu is numbered from 1 in list order. Non-numeric input is rejected
    and the prompt is repeated; range checking is left to the caller.

    Returns:
        int: The number typed by the user (not validated against the menu).
    """
    # Labels only: the numbering is added by enumerate() below, which avoids
    # duplicated or mismatched hand-written numbers.
    options = [
        "SHARADAR/TICKERS",
        "SHARADAR/ACTIONS",
        "SHARADAR/DAILY",
        "SHARADAR/SEP",
        "SHARADAR/SP500",
        "SHARADAR/SF1",
    ]

    print("Please select a table:")
    for i, option in enumerate(options, 1):
        print(f"{i}. {option}")

    while True:
        choice = input("Enter the number of your choice: ")
        try:
            return int(choice)
        except ValueError:
            # Re-prompt instead of crashing the cell on a typo.
            print("Please enter a whole number.")

def select_table():
    """Prompt until a valid menu choice is made and return the table name.

    Calls ``display_menu()`` repeatedly until it returns one of the known
    choice numbers.

    Returns:
        str: The Sharadar table name without the ``SHARADAR/`` prefix
        (``'TICKERS'``, ``'ACTIONS'``, ``'DAILY'``, ``'SEP'``, ``'SP500'``
        or ``'SF1'``).
    """
    # Choice number -> table name; every menu entry has a mapping.
    tables = {
        1: 'TICKERS',
        2: 'ACTIONS',
        3: 'DAILY',
        4: 'SEP',
        5: 'SP500',
        6: 'SF1',
    }

    # Loop rather than recurse so the selected value is always returned.
    while True:
        table = tables.get(display_menu())
        if table is not None:
            print(f"Fetching SHARADAR/{table}")
            return table
        print("Invalid choice, please try again.")

# Prompt for the Sharadar table to download.
table = select_table()

# Destination path where the retrieved zip file will be saved.
destFileRef = f'temp/{table}_download.csv.zip'
# Bulk-export endpoint for the chosen table; note that the API key is embedded
# in the query string. Optionally add parameters to the URL to filter the data
# retrieved, as described in the associated table's documentation, e.g.
# https://www.quandl.com/databases/SF1/documentation/getting-started
url = 'https://www.quandl.com/api/v3/datatables/SHARADAR/%s.json?qopts.export=true&api_key=%s' % (table, api_key)

def bulk_fetch(url=url, destFileRef=destFileRef):
    """Wait for a bulk-export file to be ready, then download it.

    Polls ``url`` once a minute until the reported file status is ``'fresh'``
    or ``'regenerating'``, then downloads the file at the reported link and
    writes it to ``destFileRef``.

    Args:
        url: Endpoint returning a JSON document with the keys
            ``datatable_bulk_download -> file -> status`` and ``-> link``.
        destFileRef: Path of the file to write; its parent directory is
            created if it does not exist.

    Raises:
        urllib.error.URLError: If either HTTP request fails.
        KeyError: If the JSON response lacks the expected keys.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` is loaded.
    import urllib.request

    ready_statuses = ('fresh', 'regenerating')
    poll_interval_seconds = 60

    while True:
        with urllib.request.urlopen(url) as response:
            payload = json.loads(response.read())
        file_info = payload['datatable_bulk_download']['file']
        status = file_info['status']
        link = file_info['link']
        print(status)
        if status in ready_statuses:
            break
        time.sleep(poll_interval_seconds)

    print('fetching from %s' % link)
    # Make sure the destination directory exists before opening the file.
    Path(destFileRef).parent.mkdir(parents=True, exist_ok=True)
    # Context managers close both handles even if the transfer fails midway.
    with urllib.request.urlopen(link) as response, open(destFileRef, 'wb') as f:
        f.write(response.read())
    print('fetched')




In [None]:
# Run the bulk download using the default url and destFileRef arguments.
bulk_fetch()

In [None]:
# Sanity check that the key was loaded: show only its last three characters,
# and report a missing key instead of raising TypeError on None.
print(api_key[-3:] if api_key else "NASDAQ_DATA_LINK_API_KEY is not set")

In [89]:
# Relative path of the HDF5 file that the notebook opens with pd.HDFStore.
DATA_STORE = Path('store/assets.h5')

In [91]:
# Alternative ticker source (disabled):
# sp500 = nasdaqdatalink.get_table('SHARADAR/SP500', action='current')
# tickers = sp500['ticker'].tolist()
df = pd.read_csv('STOCK_META_DATA.csv')
df.info()

# Re-index by ticker and order the rows by that index.
df = df.set_index('ticker').sort_index()

tickers = list(df.index)
# Length of the longest ticker symbol.
max_length = max(map(len, tickers))
print(max_length)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5494 entries, 0 to 5493
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ticker     5494 non-null   object 
 1   name       5494 non-null   object 
 2   last_sale  5494 non-null   float64
 3   marketcap  5494 non-null   float64
 4   ipoyear    5494 non-null   float64
 5   sector     5490 non-null   object 
 6   industry   5490 non-null   object 
dtypes: float64(3), object(4)
memory usage: 300.6+ KB
5


In [None]:
# Show a short preview; printing every ticker floods the cell output.
print(f"{len(tickers)} tickers, first 10: {tickers[:10]}")

In [90]:
prices_key = 'yf/minute/us_equity/prices'
with pd.HDFStore(DATA_STORE) as store:
    # Delete the node only when it is present in the store.
    if prices_key in store:
        store.remove(prices_key)
        print("Removed existing 'yf/minute/us_equity/prices' from the store")

Removed existing 'yf/minute/us_equity/prices' from the store


In [92]:
# Collect the store summary while the file is open, then print it.
with pd.HDFStore(DATA_STORE) as store:
    store_summary = store.info()
print(store_summary)

<class 'pandas.io.pytables.HDFStore'>
File path: store/assets.h5
/yf/minute/sp500/prices            frame_table  (typ->appendable_multi,nrows->703229,ncols->9,indexers->[index],dc->[datetime,ticker])


In [93]:
def fetch_and_store_history_data(symbols=None, start="2024-05-02", end="2024-05-04",
                                 store_key='yf/minute/us_equity/prices'):
    """Download 1-minute price history per ticker and append it to the HDF store.

    For each symbol, requests 1-minute bars (pre/post market included) from
    yfinance, re-indexes the frame by (ticker, datetime) and appends it to
    ``store_key`` in ``DATA_STORE``. A pause follows every request, whether it
    returned data, returned nothing, or failed.

    Args:
        symbols: Iterable of ticker symbols. Defaults to the module-level
            ``tickers`` list.
        start: Start date passed to ``history()``.
        end: End date passed to ``history()``.
        store_key: HDF store key that receives the rows.

    Errors for a single ticker are printed and the loop moves on to the next
    ticker; failed tickers are not retried.
    """
    symbols = list(tickers if symbols is None else symbols)
    # Column width for the 'ticker' index level; never narrower than the
    # module-level max_length, and wide enough for any custom symbol.
    ticker_width = max([max_length] + [len(s) for s in symbols])

    sleep_time = 2.5
    batch_start_time = time.time()
    with pd.HDFStore(DATA_STORE) as store:
        for t in symbols:
            try:
                df = yf.Ticker(t).history(
                   interval="1m",
                   start=start,
                   end=end,
                   prepost=True,
                   proxy=PROXY_SERVER,
                   keepna=True
                )
                if df.empty:
                    # No `continue` here: the pause at the bottom of the loop
                    # must still run, since an empty result is a request too.
                    print(f"No data found for {t}")
                else:
                    # Drop timezone info from the datetime index.
                    df.index = df.index.tz_localize(None)
                    # Add the ticker column
                    df['ticker'] = t

                    # Set the multi-index
                    df.set_index(['ticker', df.index], inplace=True)
                    df.index.names = ['ticker', 'datetime']

                    # Store the dataframe; the first write creates the table
                    # from an empty slice so the column layout is fixed up front.
                    if store_key not in store:
                        store.put(store_key, df[:0], format='table', min_itemsize={'ticker': ticker_width})
                    store.append(store_key, df, format='table', min_itemsize={'ticker': ticker_width})
                    print(f'added {t} to store')
            except Exception as e:
                # Best-effort batch: report the failure and carry on.
                print(f"Error fetching data for {t}: {str(e)}")
                if "429" in str(e):
                    print("429 error detected, delaying to respect rate limit...")
                    # Raise the per-request delay, capped at 3.0 seconds.
                    if sleep_time<3.0:
                        sleep_time += 0.5
                    time.sleep(10.0)
            time.sleep(sleep_time)
    batch_end_time = time.time()
    batch_total_time = batch_end_time - batch_start_time
    print(f"Batch ran for {batch_total_time:.2f} seconds")

In [None]:
# Time the full download run using wall-clock timestamps.
start_time = time.time()
fetch_and_store_history_data()
end_time = time.time()
total_time = end_time - start_time
# Report the elapsed time in seconds, to two decimal places.
print(f"Program ran for {total_time:.2f} seconds")


# Stooq Hour Data Run