This notebook extracts the gzipped csv files and splits them into one file per trading day. The resulting files are stored in the ``data/data_per_day/<ticker>`` folder, and each file name has the form ``SPY_20200203.csv``

In [1]:
# Required libraries
from pathlib import Path
import sys
import os 
import pandas as pd
import numpy as np
from itertools import chain
import matplotlib.pyplot as plt
import datetime
import gzip
import shutil
from timeit import default_timer as timer
# Paths: the project root is the parent of the notebook's working directory.
# It is put on sys.path so project modules can be imported, and the data
# folder is expected directly underneath it.
sys.path.append(str(Path.cwd().parent))
data_path = str(Path.cwd().parent / 'data')

In [2]:
# get data location: full paths of every gzip archive directly inside data_path.
# The suffix test (rather than a substring test) keeps out names that merely
# contain ".gz" somewhere in the middle, e.g. "x.gz.bak".
zip_files = np.sort([
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith('.gz') and os.path.isfile(os.path.join(data_path, name))
])
# Fail with an explicit message instead of an IndexError on the lookup below.
if len(zip_files) == 0:
    raise FileNotFoundError(f'No .gz files found in {data_path}')
# Show the archive that sorts last
zip_files[-1]

'/data/Dropbox/Projects/financial_volatility/financial_volatility/data/ezu_sep2019.gz'

In [3]:
# Decompress the archive that sorts last into data/test.csv
zip_file = zip_files[-1]
csv_file = os.path.join(data_path, 'test.csv')
with gzip.open(zip_file, 'rb') as packed, open(csv_file, 'wb') as unpacked:
    # stream the bytes across without loading the whole file into memory
    shutil.copyfileobj(packed, unpacked)

In [4]:
%%time
data_parser = pd.read_csv(csv_file, chunksize=5*10**4)
start=timer()
for df in data_parser:
    #print(1)
    dates = np.sort(df.DATE.unique())
    if 'current_date' not in locals():
        current_date = dates[0]
    if 'ticker' not in locals():
        ticker = df.SYM_ROOT.values[0]
    
    if len(dates)==1 and current_date==dates[0]:  # only one date which is the current one
        if 'df_date' not in locals():
            df_date = df 
        else:
            df_date = pd.concat([df_date, df])
    elif len(dates)==1 and current_date!=dates[0]:  # only one date but it is not the current date
        df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
        df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
        df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
        df_date.to_csv(os.path.join(data_path, 'data_per_day',f'{ticker}', f'{ticker}_{current_date}.csv'), index=False)
        end=timer()
        print(f'Finished {current_date} in {end-start} seconds')
        start=timer()
        df_date = df
        current_date = dates[0]
    elif len(dates)==2:
        df_date = pd.concat([df_date, df[df.DATE == current_date]])
        df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
        df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
        df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
        df_date.to_csv(os.path.join(data_path, 'data_per_day',f'{ticker}', f'{ticker}_{current_date}.csv'), index=False)
        end=timer()
        print(f'Finished {current_date} in {end-start} seconds')
        start=timer()
        current_date = dates[1]
        df_date = df[df.DATE == current_date]
    else:
        print('More than two dates are present. Chuncksize is too big!')
        break
df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
df_date.to_csv(os.path.join(data_path, 'data_per_day',f'{ticker}', f'{ticker}_{current_date}.csv'), index=False) # as the last chuck is processed but never saved
end=timer()
print(f'Finished {current_date} in {end-start} seconds')

Finished 20190903 in 1.9755029500011005 seconds
Finished 20190904 in 1.0672661569988122 seconds
Finished 20190905 in 1.57257854299678 seconds
Finished 20190906 in 1.1002267040021252 seconds
Finished 20190909 in 0.9160565749989473 seconds
Finished 20190910 in 1.368200662000163 seconds
Finished 20190911 in 0.8929501570019056 seconds
Finished 20190912 in 2.421476937000989 seconds
Finished 20190913 in 1.16597218500101 seconds
Finished 20190916 in 1.0788180349991308 seconds
Finished 20190917 in 1.0906696740057669 seconds
Finished 20190918 in 1.1572848639989388 seconds
Finished 20190919 in 0.8219099639973138 seconds
Finished 20190920 in 1.1497841509990394 seconds
Finished 20190923 in 0.8277127469991683 seconds
Finished 20190924 in 2.146892201999435 seconds
Finished 20190925 in 1.5521733180066803 seconds
Finished 20190926 in 2.0990242449988727 seconds
Finished 20190927 in 2.217232970004261 seconds
Finished 20190930 in 1.1520770290007931 seconds
CPU times: user 26 s, sys: 1.72 s, total: 27.8 s

Several cases:
    - only one date, which is the current one
        - either create df_date or append to it
    - only one date, but it is not the current date
        - save df_date to csv and redefine df_date = df
    - two dates:
        - truncate, append & save, then redefine df_date = df and the current date
    - more than two dates:
        - chunk is too big 
        
   The only remaining issue is the last chunk, which is never saved inside the loop (it has to be written once the loop ends).