This notebook extracts the gzipped csv files and splits each of them into one file per day. The resulting files are stored in the data/data_per_day folder (in a subfolder named after the ticker), and each file name has the form ``SPY_20200203.csv``

In [1]:
# Required libraries
from pathlib import Path
import sys
import os 
import pandas as pd
import numpy as np
from itertools import chain
import matplotlib.pyplot as plt
import datetime
import gzip
import shutil
from timeit import default_timer as timer
# Paths: the parent of the working directory is made importable and
# holds the 'data' folder used throughout the notebook
project_root = str(Path.cwd().parent)
sys.path.append(project_root)
data_path = os.path.join(project_root, 'data')

In [2]:
# get data location
# Keep only regular files whose name ends with '.gz'; a plain substring test
# would also pick up names such as 'foo.gz.bak'.
# NOTE: the order of the list follows os.listdir, which is arbitrary, so do not
# rely on the position of a given archive in zip_files.
zip_files = [
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith('.gz') and os.path.isfile(os.path.join(data_path, name))
]
zip_files

['/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/ezu_may2019.gz',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/iwm_may2019.gz',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/eem_may2019.gz',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/vea_may2019.gz',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/spy_may2019.gz']

In [3]:
# Select the archive by file name instead of by position: zip_files is built from
# os.listdir, whose order is not guaranteed, so a fixed index can silently pick
# a different ticker on another machine.
archive_name = 'spy_may2019.gz'
zip_file = next((path for path in zip_files if os.path.basename(path) == archive_name), None)
if zip_file is None:
    raise FileNotFoundError(f'{archive_name} was not found in {data_path}')

# get the csv file now: the archive is decompressed into a temporary csv
csv_file = os.path.join(data_path, 'test.csv')
with gzip.open(zip_file, 'rb') as f_in, open(csv_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [4]:
%%time
data_parser = pd.read_csv(csv_file, chunksize=5*10**5)
start=timer()
for df in data_parser:
    #print(1)
    dates = np.sort(df.DATE.unique())
    if 'current_date' not in locals():
        current_date = dates[0]
    if 'ticker' not in locals():
        ticker = df.SYM_ROOT.values[0]
    
    if len(dates)==1 and current_date==dates[0]:  # only one date which is the current one
        if 'df_date' not in locals():
            df_date = df 
        else:
            df_date = pd.concat([df_date, df])
    elif len(dates)==1 and current_date!=dates[0]:  # only one date but it is not the current date
        df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
        df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
        df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
        df_date.to_csv(os.path.join(data_path, 'data_per_day',f'{ticker}', f'{ticker}_{current_date}.csv'), index=False)
        end=timer()
        print(f'Finished {current_date} in {end-start} seconds')
        start=timer()
        df_date = df
        current_date = dates[0]
    elif len(dates)==2:
        df_date = pd.concat([df_date, df[df.DATE == current_date]])
        df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
        df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
        df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
        df_date.to_csv(os.path.join(data_path, 'data_per_day',f'{ticker}', f'{ticker}_{current_date}.csv'), index=False)
        end=timer()
        print(f'Finished {current_date} in {end-start} seconds')
        start=timer()
        current_date = dates[1]
        df_date = df[df.DATE == current_date]
    else:
        print('More than two dates are present. Chuncksize is too big!')
        break
df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
df_date.to_csv(os.path.join(data_path, 'data_per_day',f'{ticker}', f'{ticker}_{current_date}.csv'), index=False) # as the last chuck is processed but never saved
end=timer()
print(f'Finished {current_date} in {end-start} seconds')

Finished 20190501 in 26.97730266200233 seconds
Finished 20190502 in 44.063456748001045 seconds
Finished 20190503 in 23.704169062999426 seconds
Finished 20190506 in 34.76921290500104 seconds
Finished 20190507 in 55.29991199000142 seconds
Finished 20190508 in 30.11882609500026 seconds
Finished 20190509 in 47.95164199999999 seconds
Finished 20190510 in 44.879307227001846 seconds
Finished 20190513 in 59.501354406002065 seconds
Finished 20190514 in 30.685433705999458 seconds
Finished 20190515 in 29.253609355000663 seconds
Finished 20190516 in 26.471355304998724 seconds
Finished 20190517 in 43.02775007799937 seconds
Finished 20190520 in 35.85772383699805 seconds
Finished 20190521 in 27.664010668999254 seconds
Finished 20190522 in 25.47703297100088 seconds
Finished 20190523 in 47.60961755600147 seconds
Finished 20190524 in 26.81840456899954 seconds
Finished 20190528 in 28.921080643001915 seconds
Finished 20190529 in 37.74888269099756 seconds
Finished 20190530 in 28.333298380999622 seconds
Fin

Several cases can occur for a chunk:
    - only one date, which is the current one
        - either create df_date or append the chunk to it
    - only one date, but it is not the current date
        - save df_date to csv and redefine df_date = df
    - two dates:
        - truncate, append & save, then redefine df_date = df (rows of the new date only) and the current date
    - more than two dates:
        - the chunk is too big 
        
   Remaining issue: the last chunk is not saved inside the loop, so it has to be saved separately once the loop has finished.