# Update Stock Historical Price/Volume Database

## 資料檔案設定

In [1]:
# Location of the Feather file used to store the price/volume table;
# later cells read it with pd.read_feather and write it with DataFrame.to_feather.
data_filepath = 'data/data.feather'  # stored in .feather format

## 程式執行

### 套件設定

In [2]:
import os
import pandas as pd
import yfinance as yf
from getStockList import get_stock_symbols
from requests import Session
from requests_cache import CacheMixin, SQLiteCache
from requests_ratelimiter import LimiterSession, LimiterMixin, MemoryQueueBucket
from pyrate_limiter import Duration, RequestRate, Limiter
import datetime
import pytz

In [3]:
# Session type that is both rate-limited and cached
class CachedLimiterSession(CacheMixin, LimiterMixin, Session):
    """A requests ``Session`` combined with ``CacheMixin`` and ``LimiterMixin``.

    The class adds no behavior of its own; everything comes from the mixins
    and the base ``Session``.
    """

# Build the shared HTTP session used for the initial bulk download.
# Rate: RequestRate(10, Duration.SECOND*2), i.e. 10 requests per 2-second interval.
rate_limiter = Limiter(RequestRate(10, Duration.SECOND*2))
# Responses are cached through an SQLite backend named "requests.cache".
response_cache = SQLiteCache("requests.cache")

session = CachedLimiterSession(
    limiter=rate_limiter,
    bucket_class=MemoryQueueBucket,
    backend=response_cache,
)

### 載入資料

In [4]:
# Fetch the stock list via get_stock_symbols (imported from the local getStockList module).
# The displayed output shows the columns: symbol, name, market, industry, yf_symbol.
# Later cells use the 'yf_symbol' column as the ticker list passed to yfinance.
stock_symbol_df = get_stock_symbols()

stock_symbol_df  # Display the stock symbol table for a quick visual check

Unnamed: 0,symbol,name,market,industry,yf_symbol
0,1101,台泥,上市,水泥工業,1101.TW
1,1102,亞泥,上市,水泥工業,1102.TW
2,1103,嘉泥,上市,水泥工業,1103.TW
3,1104,環泥,上市,水泥工業,1104.TW
4,1108,幸福,上市,水泥工業,1108.TW
...,...,...,...,...,...
1794,9949,琉園,上櫃,文化創意業,9949.TWO
1795,9950,萬國通,上櫃,塑膠工業,9950.TWO
1796,9951,皇田,上櫃,電機機械,9951.TWO
1797,9960,邁達康,上櫃,運動休閒,9960.TWO


In [5]:
# LOAD DATA ##################
# Load the stored price/volume table from `data_filepath`. When the file does
# not exist yet, download 10 years of history for every symbol in
# `stock_symbol_df` and save it so later runs can read it from disk.
directory, filename = os.path.split(data_filepath)

# Make sure the target directory exists. `directory` is an empty string when
# `data_filepath` has no directory component, and os.makedirs('') raises
# FileNotFoundError, so only create a directory when there is one to create.
if directory and not os.path.exists(directory):
    # exist_ok guards against the directory appearing between check and creation
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created.")

# Load data
if os.path.exists(data_filepath):
    data_df = pd.read_feather(data_filepath)

else:  # Data file does not exist, download it from Yahoo Finance
    print('No current data file found, start to download')

    # Ticker list in Yahoo Finance notation (e.g. 1101.TW, 9949.TWO)
    symbols = stock_symbol_df['yf_symbol'].to_list()

    # Download through the rate-limited, cached session
    data_df = yf.download(symbols, session=session, period="10y", progress=True)

    # Persist the freshly downloaded table for subsequent runs
    data_df.to_feather(data_filepath)

In [6]:
# Display the last rows of the loaded table as a sanity check on the most recent dates.
data_df.tail()  # Inspect data df

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Open,Open,Volume,Volume,Adj Close,Close,High,Low,Open,Volume
Unnamed: 0_level_1,1101.TW,1102.TW,1103.TW,1104.TW,1108.TW,1109.TW,1110.TW,1201.TW,1203.TW,1210.TW,...,6856.TWO,8476.TW,6856.TWO,8476.TW,6658.TW,6658.TW,6658.TW,6658.TW,6658.TW,6658.TW
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-11-01,32.099998,40.0,17.9,25.85,13.6,17.450001,18.15,18.450001,42.400002,51.900002,...,75.800003,63.5,137336.0,42085.0,,,,,,
2023-11-02,32.349998,40.0,17.950001,26.049999,13.95,17.65,18.35,18.5,42.25,52.099998,...,74.900002,62.400002,92515.0,62695.0,40.0,40.0,41.0,38.150002,39.200001,1102324.0
2023-11-03,32.900002,40.25,18.15,26.15,13.9,17.700001,18.35,18.6,42.099998,52.5,...,74.800003,62.400002,87691.0,23654.0,,,,,,
2023-11-06,32.799999,40.400002,18.4,26.35,13.9,17.75,18.4,18.65,43.25,52.799999,...,72.0,61.599998,65948.0,41410.0,39.0,39.0,39.5,38.599998,39.150002,269875.0
2023-11-07,32.549999,40.400002,18.299999,26.25,13.9,17.75,18.450001,18.799999,42.599998,52.799999,...,72.5,61.299999,52159.0,59738.0,39.049999,39.049999,39.299999,38.650002,39.0,143680.0


### 更新資料

In [7]:
# Date bookkeeping for the update step:
#   now           - current timezone-aware datetime in Asia/Taipei
#   date_today    - calendar date of `now`
#   last_day_data - calendar date of the last row currently in data_df
taipei_tz = pytz.timezone('Asia/Taipei')
now = datetime.datetime.now(tz=taipei_tz)
date_today = now.date()

latest_timestamp = data_df.index[-1]
last_day_data = latest_timestamp.date()

In [8]:
# Scratch check: displays tomorrow's date. The value is not assigned,
# so nothing in the later cells depends on this expression.
date_today + datetime.timedelta(days=1)

datetime.date(2023, 11, 9)

In [9]:
# Append newly available trading days to data_df and save the result.
# Nothing is attempted unless the stored data ends before today (Taipei time).
if date_today > last_day_data:
    # The target day for data update: today once it is past 15:00 Taipei time,
    # otherwise yesterday.
    target_day = date_today if now.time() > datetime.time(15, 00) else date_today + datetime.timedelta(days=-1)
    # The range deliberately starts ON the last stored day, so that day is
    # downloaded again; the resulting duplicate date is removed by the
    # de-duplication step that follows this cell.
    start_day = last_day_data #+ datetime.timedelta(days=1)
    # yfinance treats `end` as exclusive, so go one day past the target day.
    end_day = target_day + datetime.timedelta(days=1)

    if start_day <= target_day:
        start_str = start_day.strftime('%Y-%m-%d')
        end_str = end_day.strftime('%Y-%m-%d')

        # Cheap single-ticker probe before downloading every symbol.
        update_data_df = yf.download(['2330.TW'], start=start_str, end=end_str, progress=False)  # For test run

        # Because the range starts on the last stored day, the probe normally
        # returns at least that already-known row; a plain "non-empty" test
        # would therefore always pass. Require a date strictly newer than
        # what is already stored.
        has_new_data = (
            len(update_data_df.index) > 0
            and update_data_df.index.max().date() > last_day_data
        )

        # Update the data df and save to file
        if has_new_data:
            symbols = stock_symbol_df['yf_symbol'].to_list()
            update_data_df = yf.download(symbols, start=start_str, end=end_str, progress=True)
            data_df = pd.concat([data_df, update_data_df])
            data_df.to_feather(data_filepath)
        else:
            print('New data is not available. Maybe try later.')
    else:
        print('No update')

print(f'Last three days of the data: {data_df.tail(3).index.to_list()}')

[*********************100%%**********************]  1799 of 1799 completed


4 Failed downloads:
['8423.TWO', '2740.TWO', '3089.TWO', '8080.TWO']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2023-11-07 -> 2023-11-09)')



Last three days of the data: [Timestamp('2023-11-06 00:00:00'), Timestamp('2023-11-07 00:00:00'), Timestamp('2023-11-08 00:00:00')]


In [10]:
# Remove duplicate dates just in case.
# The update step re-downloads the last stored day, so that date can appear
# twice after concatenation. Filtering on the index directly keeps the first
# occurrence of each date without depending on the index being named 'Date'
# or on the number of column levels.
# NOTE(review): keep='first' retains the previously stored row for an
# overlapping date rather than the re-downloaded one - confirm this is intended.
data_df = data_df[~data_df.index.duplicated(keep='first')]
data_df.to_feather(data_filepath)