This notebook extracts the CSV file from the downloaded zip archive and splits it into one file per trading day. The resulting files are stored in the `data/data_per_day` folder, and each file name has the form ``SPY_20200203.csv``.

In [1]:
# Required libraries
from pathlib import Path
import sys
import os 
import pandas as pd
import numpy as np
from itertools import chain
import matplotlib.pyplot as plt
import datetime
import zipfile 
from timeit import default_timer as timer
# Paths
# The parent of the current working directory is added to the import path,
# and the data folder is resolved inside that same parent directory.
_parent_dir = os.fspath(Path(os.getcwd()).parent)
sys.path.append(_parent_dir)
data_path = os.path.join(_parent_dir, 'data')

In [2]:
# get data location
# Collect the full paths of the zip archives that sit directly in data_path.
# - endswith('.zip') matches the extension only; a substring test would also
#   pick up names such as 'archive.zip.bak'.
# - os.listdir returns entries in arbitrary order, so the result is sorted to
#   make zip_files[0] the same file on every run.
zip_files = sorted(
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith('.zip') and os.path.isfile(os.path.join(data_path, name))
)
zip_files

['/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/kj5k45pfwowrssuj_csv.zip']

In [3]:
# Unpack the first archive into the data folder.
if not zip_files:
    # Fail with a clear message instead of a bare IndexError on zip_files[0].
    raise FileNotFoundError(f'No .zip archive found in {data_path}')
zip_file = zip_files[0]
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(data_path)

# get the csv file now
# Only files directly inside data_path are considered. The extension is matched
# with endswith (not a substring test) and the list is sorted so that the
# selected file does not depend on the arbitrary order of os.listdir.
# NOTE(review): if several CSV files live in data_path, the alphabetically
# first one is used -- confirm this is the extracted file.
csv_files = sorted(
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith('.csv') and os.path.isfile(os.path.join(data_path, name))
)
if not csv_files:
    raise FileNotFoundError(f'No .csv file found in {data_path} after extracting {zip_file}')
csv_file = csv_files[0]

In [4]:
%%time
data_parser = pd.read_csv(csv_file, chunksize=5 * 10**5)
start=timer()
for df in data_parser:
    dates = np.sort(df.DATE.unique())
    if 'current_date' not in locals():
        current_date = dates[0]
    if 'ticker' not in locals():
        ticker = df.SYM_ROOT.values[0]
    
    if len(dates)==1 and current_date==dates[0]:  # only one date which is the current one
        if 'df_date' not in locals():
            df_date = df 
        else:
            df_date = pd.concat([df_date, df])
    elif len(dates)==1 and current_date!=dates[0]:  # only one date but it is not the current date
        df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
        df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
        df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
        df_date.to_csv(os.path.join(data_path, 'data_per_day', f'{ticker}_{current_date}.csv'), index=False)
        end=timer()
        print(f'Finished {current_date} in {end-start} seconds')
        start=timer()
        df_date = df
        current_date = dates[0]
    elif len(dates)==2:
        df_date = pd.concat([df_date, df[df.DATE == current_date]])
        df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
        df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
        df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
        df_date.to_csv(os.path.join(data_path, 'data_per_day', f'{ticker}_{current_date}.csv'), index=False)
        end=timer()
        print(f'Finished {current_date} in {end-start} seconds')
        start=timer()
        current_date = dates[1]
        df_date = df[df.DATE == current_date]
    else:
        print('More than two dates are present. Chuncksize is too big!')
        break
df_date.DATE = df_date.DATE.astype(str).apply(lambda x: x[:4]+'-'+x[4:6]+'-'+x[6:])
df_date['DT'] = df_date.DATE + ' ' +df_date.TIME_M
df_date.drop(columns=['DATE', 'TIME_M'], inplace=True)
df_date.to_csv(os.path.join(data_path, 'data_per_day', f'{ticker}_{current_date}.csv'), index=False) # as the last chuck is processed but never saved
end=timer()
print(f'Finished {current_date} in {end-start} seconds')

Finished 20200203 in 53.98439746399981 seconds
CPU times: user 48.4 s, sys: 5.53 s, total: 54 s
Wall time: 54 s


Several cases:
    - only one date, which is the current one
        - either create or append to df_date
    - only one date, but it is not the current date
        - save df_date to csv and redefine df_date = df
    - two dates:
        - truncate, append & save, then redefine df_date = df and the current date
    - more than two dates:
        - chunk is too big
        
   Note: the data of the last chunk is never saved inside the loop, so it has to be written out once the loop has finished.