In [1]:
# Required libraries
from pathlib import Path
import sys
import os 
import pandas as pd
import numpy as np
from itertools import chain
import matplotlib.pyplot as plt
import datetime
import zipfile 
from timeit import default_timer as timer
import sqlalchemy as db
# Paths: everything is resolved relative to the parent of the current working directory.
project_root = os.path.join(Path(os.getcwd()).parent)
sys.path.append(project_root)  # make the project root importable from this notebook
data_path = os.path.join(project_root, 'data')
data_per_day_path = os.path.join(data_path, 'data_per_day')

In [2]:
# Create the SQLite database file: opening a connection creates the file
# if it does not exist yet, after which the connection is closed again.
import sqlite3
database_file = os.path.join(data_path, 'database.db')
con = sqlite3.connect(database_file)
con.close()

In [3]:
# SQLAlchemy engine pointing at the SQLite database file inside the data folder.
db_path = os.path.join(data_path, 'database.db')
db_engine = db.create_engine(f'sqlite:///{db_path}')

In [4]:
# Full paths of the sub-folders of the per-day data directory.
# Regular files and any entry whose name contains a dot are skipped.
data_folders = [
    os.path.join(data_per_day_path, entry)
    for entry in os.listdir(data_per_day_path)
    if not os.path.isfile(os.path.join(data_per_day_path, entry)) and '.' not in entry
]
data_folders

['/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/SPY',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/IWM',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/EEM',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/EZU',
 '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA']

In [14]:
# Pick one data folder and list its csv files, sorted by full path.
data_folder = data_folders[4]
table_name = data_folder[-3:]
csv_files = np.sort([
    os.path.join(data_folder, entry)
    for entry in os.listdir(data_folder)
    if os.path.isfile(os.path.join(data_folder, entry)) and '.csv' in entry
])
csv_files

array(['/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190501.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190502.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190503.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190506.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190507.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190508.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data_per_day/VEA/VEA_20190509.csv',
       '/home/patricklucescu/Dropbox/Projects/financial_volatility/financial_volatility/data/data

In [15]:
# Display the last three characters of the selected folder path
# (the same slice that table_name is derived from).
data_folder[-3:]

'VEA'

In [16]:
%%time
# Half-width of the centred rolling window used by the outlier filter:
# each observation is compared with `window` neighbours on either side.
window = 25


def compute_diff(x):
    """Return the centre value of a window minus the median of the other values.

    Parameters
    ----------
    x : array-like
        One rolling window with an odd number of observations
        (a NumPy array when called through ``rolling(...).apply(raw=True)``).

    Returns
    -------
    float
        ``x[centre] - median(x without centre)``.
    """
    values = np.asarray(x)
    centre = len(values) // 2
    return values[centre] - np.median(np.delete(values, centre))


def clean_quotes(quotes, half_window=window, max_spread_multiple=50, max_deviation_multiple=10):
    """Apply the quote-cleaning rules to one day of raw quotes.

    Steps, in order: parse and sort timestamps, keep strictly positive
    quotes/sizes, keep the exchange with the largest total quoted size, drop
    non-positive spreads, merge quotes sharing a timestamp by their median,
    drop spreads of at least ``max_spread_multiple`` times the daily median
    spread, and drop spread outliers relative to a centred rolling median.

    Parameters
    ----------
    quotes : pandas.DataFrame
        Raw quotes with at least the columns DT, EX, SYM_ROOT, SYM_SUFFIX,
        BID, BIDSIZ, ASK and ASKSIZ.
    half_window : int
        Number of observations on each side of the rolling window.
    max_spread_multiple : float
        Spread threshold as a multiple of the daily median spread.
    max_deviation_multiple : float
        Outlier threshold as a multiple of the mean absolute deviation.

    Returns
    -------
    pandas.DataFrame
        Cleaned quotes with a fresh integer index; empty when no quote
        survives the filters.
    """
    quotes = quotes.assign(DT=pd.to_datetime(quotes.DT)).sort_values(by=['DT'])

    # non zero quotes
    quotes = quotes.loc[(quotes.BID > 0) & (quotes.BIDSIZ > 0) & (quotes.ASK > 0) & (quotes.ASKSIZ > 0)]
    if quotes.empty:
        return quotes.reset_index(drop=True)

    # autoselect exchange: only total_size is summed, so non-numeric columns
    # (timestamps, symbols) never reach the aggregation
    quotes = quotes.assign(total_size=quotes.BIDSIZ + quotes.ASKSIZ)
    main_exchange = quotes.groupby('EX')['total_size'].sum().idxmax()
    quotes = quotes.loc[quotes.EX == main_exchange]

    # delete negative spreads
    quotes = quotes.loc[quotes.ASK > quotes.BID]
    if quotes.empty:
        return quotes.reset_index(drop=True)

    # mergeQuotesSameTimestamp: the median is restricted to numeric columns,
    # the constant EX / SYM_ROOT labels are re-attached afterwards
    sym_root = quotes.SYM_ROOT.values[0]
    quotes = (
        quotes.drop(columns=['SYM_SUFFIX', 'total_size'])
        .groupby('DT')
        .median(numeric_only=True)
        .assign(EX=main_exchange, SYM_ROOT=sym_root)
        .reset_index(drop=False)
    )

    # remove entries with spread > max_spread_multiple * daily median spread
    quotes = quotes.assign(SPREAD=quotes.ASK - quotes.BID)
    quotes = quotes.loc[quotes['SPREAD'] < max_spread_multiple * quotes['SPREAD'].median()]

    # remove outliers using the centered rolling window approach
    span = 2 * half_window + 1
    quotes = quotes.sort_values(by=['DT'])
    quotes = quotes.assign(
        SPREAD_DIFF=quotes.SPREAD.rolling(span, min_periods=span, center=True).apply(compute_diff, raw=True)
    )
    quotes = quotes.dropna()  # rows without a complete window have no deviation
    # The deviation is signed, so the comparison uses its absolute value;
    # comparing signed values against a signed mean would discard almost every
    # row whenever that mean is negative. `<=` keeps a day on which every
    # deviation is exactly zero.
    abs_deviation = quotes['SPREAD_DIFF'].abs()
    quotes = quotes.loc[abs_deviation <= max_deviation_multiple * abs_deviation.mean()]
    return quotes.reset_index(drop=True)


def to_five_minute_mid(quotes):
    """Resample cleaned quotes to 5-minute bars of the size-weighted mid price.

    Parameters
    ----------
    quotes : pandas.DataFrame
        Output of ``clean_quotes`` (needs DT, ASK, ASKSIZ, BID, BIDSIZ, SYM_ROOT).

    Returns
    -------
    pandas.DataFrame
        Columns DT, MID, SYM_ROOT. Each bar holds the last observation of its
        5-minute bin and is stamped with the END of that bin.
    """
    bars = quotes.set_index('DT')
    bars = bars.assign(
        MID=(bars.ASK * bars.ASKSIZ + bars.BID * bars.BIDSIZ) / (bars.ASKSIZ + bars.BIDSIZ)
    )
    bars = bars[['MID', 'SYM_ROOT']].resample('5min').last().reset_index(drop=False)
    # Prices are aligned with the time they appear in the market (end of the
    # bin) rather than with the start of the 5-minute group. On the regular
    # resampled grid this equals shifting the labels up by one row, and it
    # also works when the day has a single bar.
    bars['DT'] = bars['DT'] + pd.Timedelta(minutes=5)
    return bars


# NOTE: if_exists='append' below means that re-running this cell for the same
# folder appends the same days to the table a second time.
for csv_file in csv_files:
    start = timer()
    day_label = f"{csv_file[-12:-8]}-{csv_file[-8:-6]}-{csv_file[-6:-4]}"
    print(f"Start with day-----{day_label}")

    data_df = clean_quotes(pd.read_csv(csv_file))
    if data_df.empty:
        print(f"   Skipped {day_label}-----no quotes left after cleaning")
        continue

    # resample data to 5 minute level and store it in the ticker's table
    # (the table is named after the data folder)
    data_df = to_five_minute_mid(data_df)
    data_df.to_sql(os.path.basename(data_folder), db_engine, index=False, if_exists='append')
    end = timer()
    print(f"   Finished with batch {day_label}-----{end - start} s")

Start with day-----2019-05-01
   Finished with batch 2019-05-01-----8.735660936996283 s
Start with day-----2019-05-02
   Finished with batch 2019-05-02-----9.791167069997755 s
Start with day-----2019-05-03
   Finished with batch 2019-05-03-----6.387965563000762 s
Start with day-----2019-05-06
   Finished with batch 2019-05-06-----12.704491977005091 s
Start with day-----2019-05-07
   Finished with batch 2019-05-07-----24.91407797500142 s
Start with day-----2019-05-08
   Finished with batch 2019-05-08-----25.221295771996665 s
Start with day-----2019-05-09
   Finished with batch 2019-05-09-----15.799142822994327 s
Start with day-----2019-05-10
   Finished with batch 2019-05-10-----21.266363536000426 s
Start with day-----2019-05-13
   Finished with batch 2019-05-13-----17.271776433000923 s
Start with day-----2019-05-14
   Finished with batch 2019-05-14-----8.547524739005894 s
Start with day-----2019-05-15
   Finished with batch 2019-05-15-----10.484908733000339 s
Start with day-----2019-05

# Create returns tables

In [21]:
# List the tables currently in the database. Engine.table_names() is
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector is the
# supported way to query table names.
table_names = db.inspect(db_engine).get_table_names()
table_names

['EEM', 'EZU', 'IWM', 'SPY', 'VEA']

In [65]:
# Collect one mid-price series per ticker table, indexed by timestamp.
# The list is rebuilt from scratch on every run, so the result does not depend
# on variables left over in the kernel from an earlier execution.
price_series = []
for table in table_names:
    if table == "returns":
        # written by this cell on a previous run; it is not a price table
        continue
    price_data_ticker = pd.read_sql(f'select * from "{table}"', db_engine)
    if price_data_ticker.empty:
        continue
    symbols = price_data_ticker.SYM_ROOT.dropna()
    ticker = symbols.iloc[0] if not symbols.empty else table
    mid = price_data_ticker.set_index('DT')['MID'].rename(f"{ticker}")
    # a timestamp stored more than once keeps its last stored value
    mid = mid[~mid.index.duplicated(keep='last')]
    price_series.append(mid)

if not price_series:
    raise ValueError("no price tables found in the database")

# Align the tickers on their timestamps (not on row position), one column each.
price_data = pd.concat(price_series, axis=1).sort_index()
price_data.index.name = 'DT'

# compute log returns and upload to database
return_data = np.log1p(price_data.pct_change().iloc[1:, :])  # log(1 + simple return)
return_data = return_data.reset_index(drop=False)
# NOTE(review): to_sql also writes the integer index as an "index" column here,
# unlike the price tables (index=False) — confirm whether that is wanted.
return_data.to_sql("returns", db_engine, if_exists='replace')