# Sharadar Daily Data Run

# yfinance Minute Data Run

In [87]:
import requests
import json
import time
import datetime

from pathlib import Path
from dotenv import load_dotenv
import os
import yfinance as yf
import pandas as pd
import requests_cache
import nasdaqdatalink
from requests import Session
from requests_cache import CacheMixin, SQLiteCache
from requests_ratelimiter import LimiterMixin, MemoryQueueBucket
from pyrate_limiter import Duration, RequestRate, Limiter
class CachedLimiterSession(CacheMixin, LimiterMixin, Session):
    """A ``requests`` ``Session`` combined with the cache and rate-limiter mixins.

    The class adds nothing of its own; all behaviour comes from the bases and
    from the ``limiter`` / ``backend`` arguments given at construction time.
    """

# Shared HTTP session: responses are cached in the "yfinance.cache" SQLite
# file and outgoing requests are throttled by the limiter configured here.
session = CachedLimiterSession(
    limiter=Limiter(RequestRate(2, Duration.SECOND*5)),  # max 2 requests per 5 seconds
    bucket_class=MemoryQueueBucket,
    backend=SQLiteCache("yfinance.cache"),
)
# Set the User-agent on this session instead of rebinding `session` to a plain
# requests_cache.CachedSession, which would silently drop the rate limiter.
session.headers['User-agent'] = 'moneybot/1.0'
# NOTE(review): nothing in this part of the notebook passes `session` to
# yfinance; confirm where it is meant to be used.

# Pull settings from a local .env file into the process environment.
load_dotenv()

PROXY_SERVER = os.getenv("PROXY_SERVER")
api_key = os.getenv("NASDAQ_DATA_LINK_API_KEY")
nasdaqdatalink.ApiConfig.api_key = api_key


In [88]:
# Confirm the key was picked up without echoing the secret into the notebook output.
print("NASDAQ_DATA_LINK_API_KEY loaded" if api_key else "NASDAQ_DATA_LINK_API_KEY is not set")

[API key redacted]


In [89]:
# Location of the HDF5 file that the cells below open with pd.HDFStore.
DATA_STORE = Path('store/assets.h5')

In [91]:
# Alternative ticker source (Sharadar SP500 table), kept for reference:
# sp500 = nasdaqdatalink.get_table('SHARADAR/SP500', action='current')
# tickers = sp500['ticker'].tolist()

# Load the stock metadata and show its schema.
df = pd.read_csv('STOCK_META_DATA.csv')
df.info()

# Re-key the frame by ticker symbol, ordered by that index.
df = df.set_index('ticker').sort_index()

# Ticker universe, plus the longest symbol length (used further down as the
# minimum string width of the 'ticker' column in the HDF5 table).
tickers = list(df.index)
max_length = max(map(len, tickers))
print(max_length)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5494 entries, 0 to 5493
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ticker     5494 non-null   object 
 1   name       5494 non-null   object 
 2   last_sale  5494 non-null   float64
 3   marketcap  5494 non-null   float64
 4   ipoyear    5494 non-null   float64
 5   sector     5490 non-null   object 
 6   industry   5490 non-null   object 
dtypes: float64(3), object(4)
memory usage: 300.6+ KB
5


In [None]:
# Dump the full ticker list (one entry per row of the metadata frame) for a visual check.
print(tickers)

In [90]:
# Drop any previously stored minute-price table so a later download run
# writes into a fresh key instead of appending to old rows.
with pd.HDFStore(DATA_STORE) as store:
    # Remove the specific key if it exists
    if 'yf/minute/us_equity/prices' in store:
        store.remove('yf/minute/us_equity/prices')
        print("Removed existing 'yf/minute/us_equity/prices' from the store")

Removed existing 'yf/minute/us_equity/prices' from the store


In [92]:
# Print a summary of the keys currently held in the HDF5 store.
with pd.HDFStore(DATA_STORE) as store:
    print(store.info())

<class 'pandas.io.pytables.HDFStore'>
File path: store/assets.h5
/yf/minute/sp500/prices            frame_table  (typ->appendable_multi,nrows->703229,ncols->9,indexers->[index],dc->[datetime,ticker])


In [93]:
def fetch_and_store_history_data(symbols=None, start="2024-05-02", end="2024-05-04",
                                 key='yf/minute/us_equity/prices'):
    """Download 1-minute bars from yfinance and append them to the HDF5 store.

    Each symbol is requested on its own (pre/post-market rows included),
    tagged with its ticker, re-indexed as ``(ticker, datetime)`` and appended
    to ``key`` inside ``DATA_STORE``.  Errors are printed and the symbol is
    skipped, so one bad symbol does not abort the batch.  The function pauses
    after every symbol, whether it returned data, returned nothing or failed.

    Args:
        symbols: Iterable of ticker strings to fetch.  Defaults to the
            notebook-level ``tickers`` list.
        start: Value passed to yfinance as ``start`` (``YYYY-MM-DD``).
        end: Value passed to yfinance as ``end`` (``YYYY-MM-DD``).
        key: HDFStore key the rows are appended to.

    Returns:
        None.  Progress and the total run time are printed.
    """
    if symbols is None:
        symbols = tickers

    sleep_time = 2.5
    batch_start_time = time.time()
    with pd.HDFStore(DATA_STORE) as store:
        for t in symbols:
            try:
                df = yf.Ticker(t).history(
                    interval="1m",
                    start=start,
                    end=end,
                    prepost=True,
                    proxy=PROXY_SERVER,
                    keepna=True,
                )
                if df.empty:
                    # No `continue` here: falling through keeps the pause at
                    # the bottom of the loop, so symbols without data are
                    # throttled like every other request.
                    print(f"No data found for {t}")
                else:
                    # Drop the timezone but keep the wall-clock timestamps.
                    df.index = df.index.tz_localize(None)

                    # Build the (ticker, datetime) multi-index.
                    df['ticker'] = t
                    df.set_index(['ticker', df.index], inplace=True)
                    df.index.names = ['ticker', 'datetime']

                    # `append` creates the table on first use, so no separate
                    # `put` of an empty frame is needed.  The same
                    # `max_length` is used on every call to keep the ticker
                    # column width consistent across runs.
                    store.append(key, df, format='table',
                                 min_itemsize={'ticker': max_length})
                    print(f'added {t} to store')
            except Exception as e:
                # Best effort per symbol: report the failure and move on.
                print(f"Error fetching data for {t}: {e}")
                if "429" in str(e):
                    print("429 error detected, delaying to respect rate limit...")
                    # Slow the steady-state pace a little, capped at 3 seconds.
                    if sleep_time < 3.0:
                        sleep_time += 0.5
                    time.sleep(10.0)
            time.sleep(sleep_time)
    batch_end_time = time.time()
    batch_total_time = batch_end_time - batch_start_time
    print(f"Batch ran for {batch_total_time:.2f} seconds")

In [None]:
# Run the download and report the wall-clock time of the whole call.
start_time = time.time()
fetch_and_store_history_data()
end_time = time.time()
total_time = end_time - start_time
print(f"Program ran for {total_time:.2f} seconds")


# Stooq Hour Data Run