In [None]:
import pandas as pd
import datetime as dt
import calendar
import numpy as np
import itertools
from zipfile import ZipFile

In [None]:
def readDayCsv(folderPath, date):
    """Load one day's ORATS strikes CSV straight out of its zip archive.

    Parameters
    ----------
    folderPath : str
        Root directory that holds one sub-folder per year. It is joined to
        the year by plain string concatenation, so it must end with a path
        separator.
    date : str
        Day to load, formatted 'YYYYMMDD' (e.g. '20160104'). The first four
        characters select the year sub-folder.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV, indexed by 'trade_date', with 'trade_date' and
        'expirDate' parsed as dates.
    """
    year = date[:4]
    baseName = f'ORATS_SMV_Strikes_{date}'
    archivePath = folderPath + year + '/' + baseName + '.zip'
    # The archive contains a CSV with the same base name as the zip itself.
    with ZipFile(archivePath) as archive, archive.open(baseName + '.csv') as member:
        return pd.read_csv(
            member,
            index_col='trade_date',
            parse_dates=['trade_date', 'expirDate'],
        )

In [None]:
def readMultiDay(folderPath, startDate, endDate):
    """Read every available daily file in a date range into one DataFrame.

    Parameters
    ----------
    folderPath : str
        Root data directory, passed unchanged to ``readDayCsv``.
    startDate, endDate : str
        Range bounds understood by ``pd.date_range`` (e.g. '20160104',
        '20160531'). Both ends are included.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all daily frames that could be read, in
        calendar order.

    Raises
    ------
    ValueError
        If no daily file in the range could be read.
    """
    dayFrames = []
    for day in pd.date_range(startDate, endDate):
        dayStr = day.strftime('%Y%m%d')
        try:
            dayFrames.append(readDayCsv(folderPath, dayStr))
        except FileNotFoundError:
            # No archive for this calendar day (presumably a non-trading
            # day); skipping these quietly is the intended best-effort.
            continue
        except Exception as err:
            # Anything else (corrupt zip, missing CSV member, parse error)
            # is still skipped, but reported instead of silently hidden.
            # KeyboardInterrupt is deliberately NOT caught here.
            print(f'Skipping {dayStr}: {type(err).__name__}: {err}')

    if not dayFrames:
        # pd.concat([]) would raise a ValueError as well; raise the same
        # type with a message that says what actually went wrong.
        raise ValueError(
            f'No daily files could be read between {startDate} and {endDate}'
        )
    return pd.concat(dayFrames)

In [None]:
def datetime_range(start, end, delta):
    """Yield values from ``start`` up to, but not including, ``end``.

    Parameters
    ----------
    start, end
        Comparable values (e.g. ``datetime`` objects) bounding the range.
    delta
        Step added to the current value after each yield
        (e.g. a ``timedelta``).

    Yields
    ------
    ``start``, ``start + delta``, ... while the value is strictly below
    ``end``. Nothing is yielded when ``start >= end``.

    Raises
    ------
    ValueError
        If ``start < end`` but adding ``delta`` does not move forward
        (zero or negative step), which would otherwise loop forever.
        Because this is a generator, the error surfaces on first iteration.
    """
    if start < end and not (start + delta > start):
        raise ValueError('delta must be positive when start < end')

    current = start
    while current < end:
        yield current
        current += delta

In [None]:
def get_date_list(year_start, year_end):
    """Build the first and last calendar day of every month in a year span.

    Parameters
    ----------
    year_start, year_end : int
        First and last year to cover; both are included.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(first_day, last_day)``: two parallel lists of 'YYYYMMDD' strings,
        one entry per month, ordered by year and then month.
    """
    # One (year, month, days-in-month) triple per calendar month.
    months = [
        (yr, mo, calendar.monthrange(yr, mo)[1])
        for yr in range(year_start, year_end + 1)
        for mo in range(1, 13)
    ]
    first_day = [dt.date(yr, mo, 1).strftime('%Y%m%d') for yr, mo, _ in months]
    last_day = [dt.date(yr, mo, n).strftime('%Y%m%d') for yr, mo, n in months]
    return (first_day, last_day)

In [None]:
# Root folder of the ORATS data. readDayCsv appends the year sub-folder to
# this string by plain concatenation, so the trailing '/' is required.
folderPath = 'D:/Database/Option data/Orats/'
# First and last calendar year to process (both inclusive).
year_start = 2020
year_end = 2021

In [None]:
# Parallel lists of 'YYYYMMDD' strings: the first and last day of every
# month in the configured year span.
first_day, last_day = get_date_list(year_start, year_end)

In [None]:
%%time
print('Save as Parquet')
# Convert the daily zip archives into one Parquet file per calendar month,
# written to <folderPath>Parquet/<year>/. This cell does not create the
# output folders.
# NOTE(review): 'Stries' in the file name looks like a typo for 'Strikes';
# it is kept unchanged because existing files/readers may rely on it.
for start, end in zip(first_day, last_day):
    print(f'{start} {end} start.')
    try:
        df = readMultiDay(folderPath, start, end)
        outPath = folderPath + 'Parquet/' + start[:4] + '/ORATS_SMV_Stries_' + start[:-2] + '.parquet'
        df.to_parquet(outPath)
        del df  # release the month's frame before loading the next one
    except Exception as err:
        # Best effort: a month without data or a failed write must not stop
        # the remaining months, but the reason is reported rather than
        # hidden. KeyboardInterrupt is not caught, so the loop can be aborted.
        print(f'{start} {end} failed: {type(err).__name__}: {err}')
    else:
        print(f'{start} {end} is done!')

In [None]:
%%time
print('Save as HDF')
# Convert the daily zip archives into one HDF5 file per calendar month,
# written to <folderPath>HDF/<year>/ under the key 'df'. This cell does not
# create the output folders.
# NOTE(review): 'Stries' in the file name looks like a typo for 'Strikes';
# it is kept unchanged because existing files/readers may rely on it.
for start, end in zip(first_day, last_day):
    print(f'{start} {end} start.')
    try:
        df = readMultiDay(folderPath, start, end)
        outPath = folderPath + 'HDF/' + start[:4] + '/ORATS_SMV_Stries_' + start[:-2] + '.h5'
        # mode='w' overwrites any existing file for this month.
        df.to_hdf(outPath, key='df', mode='w')
        del df  # release the month's frame before loading the next one
    except Exception as err:
        # Best effort: a month without data or a failed write must not stop
        # the remaining months, but the reason is reported rather than
        # hidden. KeyboardInterrupt is not caught, so the loop can be aborted.
        print(f'{start} {end} failed: {type(err).__name__}: {err}')
    else:
        print(f'{start} {end} is done!')