In [62]:
from IPython.core.debugger import set_trace

In [1]:
import os
import pandas as pd
import numpy as np
import warnings
import math

from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import silhouette_score
from hmmlearn.hmm import GaussianHMM, GMMHMM, MultinomialHMM

from dateutil.relativedelta import relativedelta
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
DATE_TIME = 'date_time'
DATE = 'date'
TIME = 'time'
OPEN_PRICE = 'open'
HIGH_PRICE = 'high'
LOW_PRICE = 'low'
CLOSE_PRICE = 'close'
VOLUME = 'volume'
TURNOVER = 'turnover'
VWAP = 'vwap'
FEATURES = ['high_low_spread', "open_close_rets", "log_total_traded_vol", "daily_log_return", "short_term_vol", "long_term_vol", "money_flow_index"]

In [3]:
warnings.filterwarnings("ignore")
DATAPATH = os.getcwd()+"\\Data\\"

files_list = []
for root, dirs, files in os.walk(DATAPATH):
    for file in files:
        files_list.append(file)
        
raw = pd.read_csv("file:///" + os.path.join(DATAPATH, files_list[3]), parse_dates=[['<DTYYYYMMDD>', '<TIME>']])

## Inputs

### Input vieja

In [4]:
def formatData(df):
    returned_df = df.copy()
    returned_df.drop(labels=["<TICKER>", "<PER>", "<OPENINT>"], axis="columns", inplace=True)
    returned_df.columns = ['date_time', 'open', 'high', 'low', 'close', 'volume']
    returned_df.set_index('date_time', drop=True, inplace=True)
    returned_df = addDateAndTime(returned_df)

    return returned_df

def prepareDataframe(df):
    returned_df = df.copy()
    returned_df['volume*price'] = returned_df['volume']*returned_df['close']
    returned_df = returned_df.resample('5T').sum()
    returned_df = returned_df.between_time('9:00', '17:34')
    returned_df = returned_df[returned_df.index.weekday != 5]
    returned_df = returned_df[returned_df.index.weekday != 6]
    returned_df['vwap'] = returned_df['volume*price']/returned_df['volume']
    returned_df.dropna(inplace=True)

    return returned_df

def addDateAndTime(df):
    returned_df = df.copy()
    returned_df['date'] = pd.to_datetime(returned_df.index.date)
    returned_df['time'] = pd.to_datetime(returned_df.index, format = "%m-%d-%Y %H:%M:%S")
    returned_df['time'] = returned_df['time'].apply(lambda x: x.strftime('%H:%M:%S'))

    return returned_df

formatted_df = formatData(raw)
algo_df = prepareDataframe(formatted_df)

### Input nueva

In [5]:
def format_data(df):
    returned_df = df.copy()
    returned_df.drop(labels=["<PER>", "<OPENINT>"], axis="columns", inplace=True)
    returned_df.columns = ['date_time', 'ticker', 'open', 'high', 'low', 'close', 'volume']
    returned_df.set_index('date_time', drop=True, inplace=True)
    returned_df = add_datetime(returned_df)

    return returned_df

def get_open_close_time(df):
    open_time = df.time.min()
    close_time = df.time.max()
    
    return open_time, close_time

def get_open_close_auction_time(df):
    open_time = df.time.min()
    auction_time = df.time.max()
    close_time = df[df[TIME] != df.time.max()].time.max()
    
    return open_time, close_time, auction_time

def get_intraday_data(df, start_time, end_time):
    df = df.between_time(open_time, end_time)
    df[TURNOVER] = df[VOLUME]*df[CLOSE_PRICE]
    df_resampled = df.resample('5T').agg({OPEN_PRICE: "first", 
                                          LOW_PRICE: "min", 
                                          HIGH_PRICE: "max", 
                                          CLOSE_PRICE: "last", 
                                          VOLUME: "sum", 
                                          TURNOVER: "sum"})
    df_resampled[VWAP] = df_resampled[TURNOVER]/df_resampled[VOLUME]
    df_resampled = df_resampled[(df_resampled.index.weekday != 5) | (df_resampled.index.weekday != 6)]
    df_resampled.dropna(inplace=True)

    return df_resampled

def get_daily_data(df):
    daily_data = df.resample('B').agg({OPEN_PRICE: "first", 
                                                LOW_PRICE: "min",
                                                HIGH_PRICE: "max", 
                                                CLOSE_PRICE: "last", 
                                                VOLUME: "sum", 
                                                TURNOVER: "sum"})
    daily_data[VWAP] = daily_data.turnover/daily_data.volume
    daily_data = daily_data[daily_data[VOLUME] > 0.1]
    daily_data = daily_data[(daily_data.index != 6) & (daily_data.index != 7)]
    daily_data.index = pd.to_datetime(daily_data.index)
    daily_data = daily_data.resample('B').first()
    daily_data.dropna(inplace=True)
    
    
    return daily_data

def add_datetime(df):
    returned_df = df.copy()
    returned_df['date'] = pd.to_datetime(returned_df.index.date)
    returned_df['time'] = pd.to_datetime(returned_df.index, format = "%m-%d-%Y %H:%M:%S")
    returned_df['time'] = returned_df['time'].apply(lambda x: x.strftime('%H:%M:%S'))

    return returned_df

input_new = format_data(raw)
open_time, close_time, auction_time = get_open_close_auction_time(input_new)
intraday_data = get_intraday_data(input_new, open_time, close_time)
daily_data = get_daily_data(intraday_data)

## Features

### Features df old

In [6]:
def get_total_traded_vol(df):
    total_traded_vol = df.resample('B').sum()[['volume']]
    total_traded_vol.drop(labels=total_traded_vol.index.get_values()[0], axis='index', inplace=True)

    return total_traded_vol

def get_log_open_close_returns_(df):
    # open_close_returns = df[(df['time'] == '09:01:00') | (df['time'] == '17:35:00')]
    open_close_returns = df[(df['time'] == '09:00:00') | (df['time'] == '17:35:00')]
    open_close_returns['return'] = (open_close_returns['open']/open_close_returns['close'].shift(-1))
    open_close_returns['log_return'] = np.log(open_close_returns['return'])
    # open_close_returns = open_close_returns[open_close_returns['time'] == '09:01:00']
    open_close_returns = open_close_returns[open_close_returns['time'] == '09:00:00']
    open_close_returns = open_close_returns[['log_return']].resample('B').sum()

    return open_close_returns

def get_log_open_close_returns(df):
    open_price = df[OPEN_PRICE].resample('B').first()
    close_price = df[CLOSE_PRICE].resample('B').last()
    open_close_returns = np.log(open_price/close_price.shift(-1)).dropna()

    return open_close_returns

def get_log_returns(df):
    close = df[CLOSE_PRICE].resample('B').last()
    log_daily_returns = np.log(close/close.shift(-1))

    return log_daily_returns

def get_high_low_spread(df):
    daily_high = df.resample("B").max()[['high']]
    daily_low = df.resample("B").min()[['low']]
    high_low_spread = (daily_high['high']-daily_low['low'])

    return high_low_spread

def get_log(df):
    return np.log(df)

def get_money_flow_index(df):
    close = df[CLOSE_PRICE].resample('B').last()
    high = df[HIGH_PRICE].resample('B').max()
    low = df[LOW_PRICE].resample('B').min()
    typical_price = (close+high+low)/3
    volume = df[VOLUME].resample('B').sum()
    money_flow_index = typical_price/volume

    return money_flow_index

features_to_use = ["log_total_traded_vol", "money_flow_index"]

high_low = get_log(np.abs(get_high_low_spread(formatted_df))).rename("high_low_spread")
open_close_log_rets = get_log_open_close_returns(formatted_df).rename("open_close_rets")
total_traded_vol = get_log(get_total_traded_vol(formatted_df))[VOLUME].rename("log_total_traded_vol")
daily_log_rets = get_log_returns(formatted_df).rename("daily_log_return").dropna()
short_term_vol = daily_log_rets.rolling(21).std().dropna().rename("short_term_vol")
#     short_term_vol = daily_log_rets.rolling(252).std().dropna().apply(lambda x: x/np.sqrt(21)).rename("short_term_vol")
long_term_vol = daily_log_rets.dropna().rolling(252).std(ddof=0).dropna().rename("long_term_vol")
implied_daily_vol = (daily_log_rets.dropna().rolling(252).std(ddof=0).dropna()/np.sqrt(252)).rename("implied_daily_vol")
mfi = get_money_flow_index(formatted_df).rename("money_flow_index")

features_df = pd.concat([high_low, open_close_log_rets, total_traded_vol, daily_log_rets, short_term_vol, long_term_vol, mfi, implied_daily_vol], axis=1).dropna()
features_df = features_df.replace([np.inf, -np.inf], np.nan)
features_df = features_df.dropna()
# features_df = features_df[features_df['log_return'] != 0.0]
# features_df = features_df[features_df['hl_spread'] != 0.0]
features_df = features_df[features_to_use]

data_to_predict = features_df.shift(1).dropna()

In [7]:
data_to_predict

Unnamed: 0_level_0,log_total_traded_vol,money_flow_index
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-06-24,14.379688,0.000020
2004-06-25,14.710144,0.000015
2004-06-28,14.210634,0.000024
2004-06-29,14.781439,0.000014
2004-06-30,15.000944,0.000011
...,...,...
2019-10-04,14.192709,0.000044
2019-10-07,14.541612,0.000030
2019-10-08,13.731285,0.000068
2019-10-09,14.051481,0.000049


### features new

In [8]:
def get_features_series_list(daily_data):
    log_returns = get_log_returns(daily_data)
    adv_antilog = get_antilog_adv_median(daily_data)
    mfi = get_money_flow_index(daily_data)
    betas_mkt_impact = get_beta_market_impact(daily_data)
    log_traded_vol = get_log_total_trade_vol(daily_data)
    
    features = [log_returns, adv_antilog, mfi, betas_mkt_impact, log_traded_vol]
    features_df.rename(columns={VOLUME: "log_total_traded_vol"}, inplace=True)
    
    return features

def get_features_df(list_of_features):
    features_df = pd.concat(list_of_features, axis=1).dropna()
    features_df = features_df.replace([np.inf, -np.inf], np.nan)
    features_df = features_df.dropna()
    features_df = features_df[features_df != 0.0].shift(1).dropna()

    return features_df

def get_log_total_trade_vol(df):
    log_total_traded_vol = get_log(get_total_traded_vol(df))
    log_total_traded_vol.name = "log_total_traded_vol"
    
    return log_total_traded_vol 

def get_total_traded_vol(df):
    total_traded_vol = df.resample('B').first()[VOLUME]
    total_traded_vol.name = "total_traded_vol"

    return total_traded_vol

def get_log_open_close_returns(df):
    open_close_returns = np.log(df[OPEN_PRICE]/df[CLOSE_PRICE].shift(-1)).dropna()
    open_close_returns.name = "log_overnight_returns"

    return open_close_returns

def get_log_returns(df):
    close = df[CLOSE_PRICE].resample('B').last()
    log_daily_returns = np.log(close/close.shift(-1))
    log_daily_returns.name = "log_returns"

    return log_daily_returns

def get_high_low_spread(df):
    high_low_spread = (df[HIGH_PRICE]-df[LOW_PRICE])
    high_low_spread.name = "high_low_spread"

    return high_low_spread

def get_log(df):
    return np.log(df)

def get_antilog_adv_median(df):
    antilog_adv = np.exp(get_log(daily_data[VOLUME]).rolling(20, min_periods=5).median().dropna())
    antilog_adv.name = "antilog_adv"
    
    return antilog_adv

def get_money_flow_index(df):
    typical_price = (df[CLOSE_PRICE]+df[HIGH_PRICE]+df[LOW_PRICE])/3
    volume = df[VOLUME]
    money_flow_index = typical_price/volume
    money_flow_index.name = "money_flow_index"

    return money_flow_index

def get_beta_market_impact(df):
    adv_antilog = get_antilog_adv_median(df)
    X = df[VOLUME]/adv_antilog
    Y = ((df[VWAP]-df[OPEN_PRICE])/df[OPEN_PRICE])*1e4
    betas = X/Y
    betas.name = "betas_market_impact"
    
    return betas

features_list = get_features_series_list(daily_data)
features_df_new = get_features_df(features_list)

In [9]:
features_to_use = ["log_total_traded_vol", "money_flow_index"]
features_df_new[features_to_use]

Unnamed: 0_level_0,log_total_traded_vol,money_flow_index
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2003-06-27,14.716221,0.000013
2003-07-03,14.231743,0.000022
2003-07-04,13.325325,0.000055
2003-07-07,13.853255,0.000033
2003-07-08,14.521313,0.000017
...,...,...
2019-10-04,13.817550,0.000065
2019-10-07,14.139078,0.000045
2019-10-08,13.271175,0.000107
2019-10-09,13.615681,0.000076


## test MFI

### mfi old

In [10]:
df_mfi_old = formatted_df
close = df_mfi_old[CLOSE_PRICE].resample('B').last()
high = df_mfi_old[HIGH_PRICE].resample('B').max()
low = df_mfi_old[LOW_PRICE].resample('B').min()
typical_price = (close+high+low)/3
volume = df_mfi_old[VOLUME].resample('B').sum()
money_flow_index = typical_price/volume

In [11]:
df_mfi_new[CLOSE_PRICE]

NameError: name 'df_mfi_new' is not defined

In [15]:
df_mfi_old[CLOSE_PRICE]

date_time
2003-06-19 17:35:00    31.96
2003-06-20 09:01:00    31.90
2003-06-20 09:02:00    31.96
2003-06-20 09:03:00    31.80
2003-06-20 09:04:00    31.68
                       ...  
2019-10-11 17:27:00    64.80
2019-10-11 17:28:00    64.86
2019-10-11 17:29:00    64.83
2019-10-11 17:30:00    64.80
2019-10-11 17:35:00    64.78
Name: close, Length: 1970723, dtype: float64

In [16]:
close

date_time
2003-06-19    31.96
2003-06-20    32.35
2003-06-23    31.84
2003-06-24    32.41
2003-06-25    32.30
              ...  
2019-10-07    62.68
2019-10-08    61.86
2019-10-09    62.35
2019-10-10    63.23
2019-10-11    64.78
Freq: B, Name: close, Length: 4257, dtype: float64

In [17]:
typical_price

date_time
2003-06-20    32.440000
2003-06-23    31.890000
2003-06-24    32.296667
2003-06-25    32.480000
2003-06-26    32.873333
                ...    
2019-10-07    62.190000
2019-10-08    62.080000
2019-10-09    62.073333
2019-10-10    63.093333
2019-10-11    64.296667
Length: 4143, dtype: float64

### mfi new

In [12]:
df_mfi_new = daily_data
typical_price = (df_mfi_new[CLOSE_PRICE]+df_mfi_new[HIGH_PRICE]+df_mfi_new[LOW_PRICE])/3
volume = df_mfi_new[VOLUME]
money_flow_index = typical_price/volume
money_flow_index.name = "money_flow_index"

In [13]:
df_mfi_new[CLOSE_PRICE]

date_time
2003-06-20    32.75
2003-06-23    31.85
2003-06-24    32.39
2003-06-25    32.49
2003-06-26    33.28
              ...  
2019-10-07    62.54
2019-10-08    61.89
2019-10-09    62.14
2019-10-10    63.44
2019-10-11    64.80
Name: close, Length: 4143, dtype: float64

In [14]:
typical_price

date_time
2003-06-20    32.440000
2003-06-23    31.890000
2003-06-24    32.296667
2003-06-25    32.480000
2003-06-26    32.873333
                ...    
2019-10-07    62.190000
2019-10-08    62.080000
2019-10-09    62.073333
2019-10-10    63.093333
2019-10-11    64.296667
Length: 4143, dtype: float64

## Test models

In [18]:
def split_train_test_data(df, size_in_years):
    training_data = df[df.index[0]:df.index[-1] - relativedelta(years=size_in_years, hours=-9, minutes = -5)]
    test_data = df[df.index[-1] - relativedelta(years=size_in_years, hours=-9):]

    return training_data, test_data

In [19]:
gmm = GaussianMixture(n_components=3, covariance_type='full', max_iter=1000, n_init=100, random_state=3)

In [133]:
hmm = GaussianHMM(n_components=3, covariance_type='full', n_iter=1000, random_state=100)

### OLD

In [20]:
data_to_train, data_to_test = split_train_test_data(data_to_predict, 2)

In [134]:
def fit_model(model, full_data, train_data, list_of_features):
    X = train_data.values
    scaler = MinMaxScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    if len(list_of_features) > 2:
        pca = PCA(n_components=.95)
        pca.fit(X_scaled)
        X_pca = pca.transform(X_scaled)
        fitted_model = model.fit(X_pca)
        X_full = full_data.values
        X_full_scaled = scaler.transform(X_full)
        X_full_pca = pca.transform(X_full_scaled)
        prediction = fitted_model.predict(X_full_pca)
    else:
        fitted_model = model.fit(X_scaled)
        X_full = full_data.values
        X_full_scaled = scaler.transform(X_full)
        prediction = fitted_model.predict(X_full_scaled)

    return prediction

model_old = hmm
X = data_to_train.values
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
pca = PCA(n_components=.95)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
fitted_model = model_old.fit(X_pca)
X_full = data_to_predict.values
X_full_scaled = scaler.transform(X_full)
X_full_pca = pca.transform(X_full_scaled)
prediction_old = fitted_model.predict(X_full_pca)
regime_old = pd.Series(data = prediction_old, index = data_to_predict.index).rename("regime")
regime_old = regime_old.resample("5T").asfreq().fillna(method="ffill")

new_df = pd.merge(algo_df, regime_old, left_index=True, right_index=True)
new_df = new_df.astype({"regime":"int32"})

### NEW

In [23]:
train_new, test_new = split_train_test_data(features_df_new[features_to_use], 2)

In [24]:
new_model = gmm
X = train_new.values
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
pca = PCA(n_components=.95)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
fitted_model_new = new_model.fit(X_pca)
X_full = features_df_new[features_to_use].values
X_full_scaled = scaler.transform(X_full)
X_full_pca = pca.transform(X_full_scaled)
prediction_new = fitted_model_new.predict(X_full_pca)

regime = pd.Series(data = prediction_new, index = features_df_new[features_to_use].index).rename("regime")

regime_intraday = pd.merge(intraday_data, regime.resample("5T").asfreq().fillna(method="ffill"), left_index=True, right_index=True)
regime_intraday = regime_intraday.astype({"regime":"int32"})

regime_daily = pd.merge(daily_data, regime, left_index=True, right_index=True)
regime_daily = regime_daily.astype({"regime":"int32"})

## Algoritmo Dinamico

### old

In [135]:
train_old, test_old = split_train_test_data(new_df, 2)

In [136]:
def getWeekdaysDataDict(df):
    weekdaysDataDict = {}
    weekdaysDataDict[0] = df[df.index.weekday == 0]
    weekdaysDataDict[1] = df[df.index.weekday == 1]
    weekdaysDataDict[2] = df[df.index.weekday == 2]
    weekdaysDataDict[3] = df[df.index.weekday == 3]
    weekdaysDataDict[4] = df[df.index.weekday == 4]
    weekdaysDataDict['else'] = df

    return weekdaysDataDict

def getStaticVolPredictorByWeekday(data, weekdaysDataDict):
    staticVolPredictor = {}
    staticVolPredictor[0] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(0))
    staticVolPredictor[1] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(1))
    staticVolPredictor[2] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(2))
    staticVolPredictor[3] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(3))
    staticVolPredictor[4] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(4))
    staticVolPredictor['else'] = getNormalizedStaticVolPredictor(data)

    return staticVolPredictor

def getNormalizedStaticVolPredictor(df):
    df = addDateAndTime(df)

    static_volume_predictor = df.groupby(by='time')['volume'].median()
    norm_static_volume_predictor = static_volume_predictor/sum(static_volume_predictor)
    norm_static_volume_predictor.index = norm_static_volume_predictor.index.map(lambda x: datetime.strptime(x, '%X').time())

    return norm_static_volume_predictor

def getADVMedian(df):
    return df['volume'].groupby(df.index.date).sum().median()

def getADVMedianByWeekday(data, weekdaysDataDict):
    advMedian = {}
    advMedian[0] = getADVMedian(weekdaysDataDict.get(0))
    advMedian[1] = getADVMedian(weekdaysDataDict.get(1))
    advMedian[2] = getADVMedian(weekdaysDataDict.get(2))
    advMedian[3] = getADVMedian(weekdaysDataDict.get(3))
    advMedian[4] = getADVMedian(weekdaysDataDict.get(4))
    advMedian['else'] = getADVMedian(data)

    return advMedian

regimes_old = np.unique(new_df['regime'])
regime_daily_vwap = {}
regime_weekdaysDataDict = {}
regime_staticVolPredictor = {}
regime_advMedian = {}
# daily_vwap = getDailyVWAP(test_data)
daily_vwap_old = test_old['volume*price'].groupby(test_old.index.date).sum()/test_old['volume'].groupby(test_old.index.date).sum()
for regime in regimes_old:
    this_regime_test_data_old = test_old[test_old['regime']==regime]
    this_regime_train_data_old = train_old[train_old['regime']==regime]
#         regime_daily_vwap[regime] = getDailyVWAP(test_data[test_data['regime'] == regime])
    regime_daily_vwap = this_regime_test_data_old['volume*price'].groupby(this_regime_test_data_old.index.date).sum()/this_regime_test_data_old['volume'].groupby(this_regime_test_data_old.index.date).sum()
#     regime_weekdaysDataDict[regime] = getWeekdaysDataDict(training_data[training_data['regime'] == regime])
    regime_weekdaysDataDict[regime] = getWeekdaysDataDict(this_regime_train_data_old)
    regime_staticVolPredictor[regime] = getStaticVolPredictorByWeekday(this_regime_train_data_old, regime_weekdaysDataDict[regime])
    regime_advMedian[regime] = getADVMedianByWeekday(this_regime_train_data_old, regime_weekdaysDataDict[regime])

In [137]:
## BacktestAlgoDynamicByRegime

def dataToMultiIndex(data):
    multi_data = data.copy()
    multi_data.index = pd.MultiIndex.from_arrays([multi_data.index.date, multi_data.index.time], names=['Date','Time'])

    return multi_data

def getReversedCumVol(multi_data, data):
    reversed_cumvol = []
    for day in multi_data.index.get_level_values('Date').unique():
        reversed_cumvol.append(multi_data.xs(day, level='Date')['volume'].cumsum().values[::-1])

    reversed_cumvol = pd.Series(data = np.concatenate(reversed_cumvol), index = data.index)

    return reversed_cumvol

def getVolPredictorNextBin(test_data, staticVolPredictor, advMedian):
    volume_predictor_next_interval = test_data.groupby(level=0)['volume'].shift(1)
    volume_predictor_next_interval.fillna(int (advMedian.get('else')*staticVolPredictor.get('else').iloc[0]), inplace=True)

    return volume_predictor_next_interval

training_data = train_old
test_data = test_old
advMedian = regime_advMedian
staticVolPredictor = regime_staticVolPredictor

new_training_data = dataToMultiIndex(training_data)
new_test_data = dataToMultiIndex(test_data)
regime_reversedCumVol = {}
regime_volPredictorNextBin = {}

for regime in regimes_old:
    regime_reversedCumVol[regime] = getReversedCumVol(new_training_data[new_training_data['regime']==regime], 
                                                 training_data[training_data['regime']==regime])
    regime_volPredictorNextBin[regime] = getVolPredictorNextBin(new_test_data[new_test_data['regime']==regime], 
                                                                        staticVolPredictor.get(regime),
                                                                        advMedian.get(regime))

In [142]:
def vwap_dynamic_execution_algo(data, reversed_cumvol, staticVolPredictor, volume_predictor_next_interval, amount_shares, order_side, start_time, end_time, day):
    shares_per_interval = []
    if day.weekday() == 0:
        shares_per_interval.append(staticVolPredictor.get(0).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 0].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 0].index.time).median()
    elif day.weekday() == 1:
        shares_per_interval.append(staticVolPredictor.get(1).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 1].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 1].index.time).median()
    elif day.weekday() == 2:
        shares_per_interval.append(staticVolPredictor.get(2).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 2].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 2].index.time).median()
    elif day.weekday() == 3:
        shares_per_interval.append(staticVolPredictor.get(3).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 3].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 3].index.time).median()
    elif day.weekday() == 4:
        shares_per_interval.append(staticVolPredictor.get(4).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 4].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 4].index.time).median()
    else:
        shares_per_interval.append(staticVolPredictor.get('else').iloc[0]*amount_shares)
        volPredictor = reversed_cumvol.groupby(reversed_cumvol.index.time).median()

    for i in range(1, len(data)):
        num = data['volume'].iloc[:i].sum()+volume_predictor_next_interval.xs(day, level='Date').iloc[i]
        denom = data['volume'].iloc[:i].sum()+volPredictor[i]
        op = amount_shares*(num/denom)
        shares_next_interval = op - sum(shares_per_interval)
        #     shares_next_interval = (amount_shares*((data['volume'].iloc[0:i].sum()+volume_predictor_next_interval.xs(day, level='Date').iloc[i])/(data['volume'].iloc[0:i].sum()+volPredictor[i])))-sum(shares_per_interval)
        shares_per_interval.append(shares_next_interval)

    vwap_this_exec_this_day = sum(shares_per_interval*data['vwap'])/sum(shares_per_interval)

    return vwap_this_exec_this_day

backtest_sell_vwap_dynamic = []
backtest_buy_vwap_dynamic = []
for day in new_test_data.index.get_level_values('Date').unique():
    data = new_test_data.xs(day, level='Date')
    regime = data['regime'].iloc[0]
    if day.weekday() == 0:
        amount_shares = advMedian.get(regime).get(0)*0.1
    elif day.weekday() == 1:
        amount_shares = advMedian.get(regime).get(1)*0.1
    elif day.weekday() == 2:
        amount_shares = advMedian.get(regime).get(2)*0.1
    elif day.weekday() == 3:
        amount_shares = advMedian.get(regime).get(3)*0.1
    elif day.weekday() == 4:
        amount_shares = advMedian.get(regime).get(4)*0.1
    else:
        amount_shares = advMedian.get(regime).get('else')*0.1

    backtest_sell_vwap_dynamic.append(vwap_dynamic_execution_algo(data, 
                                                                  regime_reversedCumVol.get(regime),
                                                                  regime_staticVolPredictor.get(regime),
                                                                  regime_volPredictorNextBin.get(regime),
                                                                  amount_shares, 
                                                                  'sell',
                                                                  data.index[0],
                                                                  data.index[-1], day))
    backtest_buy_vwap_dynamic.append(vwap_dynamic_execution_algo(data,
                                                                 regime_reversedCumVol.get(regime),
                                                                 regime_staticVolPredictor.get(regime),
                                                                 regime_volPredictorNextBin.get(regime),
                                                                 amount_shares,
                                                                 'buy',
                                                                 data.index[0],
                                                                 data.index[-1],
                                                                 day))

In [139]:
static_vwap_comparison = pd.DataFrame(data=daily_vwap_old.values.tolist(), index=daily_vwap_old.index, columns=['market_vwap'])
static_vwap_comparison['backtest_buy_vwap'] = backtest_buy_vwap_dynamic
static_vwap_comparison['backtest_sell_vwap'] = backtest_sell_vwap_dynamic

static_vwap_comparison['diff_vwap_bps_buy'] = 1000*(static_vwap_comparison['backtest_buy_vwap']-static_vwap_comparison['market_vwap'])/static_vwap_comparison['market_vwap']
static_vwap_comparison['diff_vwap_bps_sell'] = -1000*(static_vwap_comparison['backtest_sell_vwap']-static_vwap_comparison['market_vwap'])/static_vwap_comparison['market_vwap']

mean_bps_diff_buys = static_vwap_comparison['diff_vwap_bps_buy'].mean()
sd_bps_diff_buys = static_vwap_comparison['diff_vwap_bps_buy'].std()
mean_bps_diff_sells = static_vwap_comparison['diff_vwap_bps_sell'].mean()
sd_bps_diff_sells = static_vwap_comparison['diff_vwap_bps_sell'].std()

percentiles_diff_vwap_sells = {'1': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.01),
                           '5': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.05),
                           '95': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.95),
                           '99': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.99)}
percentiles_diff_vwap_buys = {'1': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.01),
                           '5': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.05),
                           '95': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.95),
                           '99': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.99)}

In [140]:
ticker="BMW"

In [141]:
print("The performance of the algorithm using static predictor with segmented data on " + ticker + " is")
print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.049160
Standard Dev: ±1.117328


### new

In [27]:
intraday_train, intraday_test = split_train_test_data(regime_intraday, 2)
daily_train, daily_test = split_train_test_data(regime_daily, 2)

In [28]:
def get_adv_median(df):
    return df['volume'].groupby(df.index.date).sum().median()

def get_weekdays_data_dict(df):
    weekdays_data_dict = {}
    weekdays_data_dict[0] = df[df.index.weekday == 0]
    weekdays_data_dict[1] = df[df.index.weekday == 1]
    weekdays_data_dict[2] = df[df.index.weekday == 2]
    weekdays_data_dict[3] = df[df.index.weekday == 3]
    weekdays_data_dict[4] = df[df.index.weekday == 4]
    weekdays_data_dict['else'] = df

    return weekdays_data_dict

def get_static_vol_predictor_by_weekday(data, weekdays_data_dict):
    static_vol_predictor = {}
    static_vol_predictor[0] = get_norm_static_vol_predictor(weekdays_data_dict.get(0))
    static_vol_predictor[1] = get_norm_static_vol_predictor(weekdays_data_dict.get(1))
    static_vol_predictor[2] = get_norm_static_vol_predictor(weekdays_data_dict.get(2))
    static_vol_predictor[3] = get_norm_static_vol_predictor(weekdays_data_dict.get(3))
    static_vol_predictor[4] = get_norm_static_vol_predictor(weekdays_data_dict.get(4))
    static_vol_predictor['else'] = get_norm_static_vol_predictor(data)

    return static_vol_predictor

def get_adv_median_by_weekday(data, weekdays_data_dict):
    adv_median = {}
    adv_median[0] = get_adv_median(weekdays_data_dict.get(0))
    adv_median[1] = get_adv_median(weekdays_data_dict.get(1))
    adv_median[2] = get_adv_median(weekdays_data_dict.get(2))
    adv_median[3] = get_adv_median(weekdays_data_dict.get(3))
    adv_median[4] = get_adv_median(weekdays_data_dict.get(4))
    adv_median['else'] = get_adv_median(data)

    return adv_median
  
def get_data_by_weekday(df, weekday):
    df_weekday = df[df.index.weekday == weekday]

    return df_weekday

def add_datetime(df):
    returned_df = df.copy()
    returned_df['date'] = pd.to_datetime(returned_df.index.date)
    returned_df['time'] = pd.to_datetime(returned_df.index, format = "%m-%d-%Y %H:%M:%S")
    returned_df['time'] = returned_df['time'].apply(lambda x: x.strftime('%H:%M:%S'))

    return returned_df

def get_norm_static_vol_predictor(df):
    static_volume_predictor = df.groupby(by=df.index.time)[VOLUME].median()
    norm_static_volume_predictor = static_volume_predictor/sum(static_volume_predictor)

    return norm_static_volume_predictor

vwap_and_regime = daily_test[[VWAP, "regime"]]
regime_daily_vwap = {}
regime_weekdays_data_dict = {}
regime_static_vol_predictor = {}
regime_adv_median = {}
daily_vwap_new = vwap_and_regime[VWAP]
for regime in regimes:
    regime_daily_vwap[regime] = vwap_and_regime[vwap_and_regime['regime'] == regime][VWAP]
    regime_weekdays_data_dict[regime] = get_weekdays_data_dict(intraday_train[intraday_train['regime'] == regime])
    regime_static_vol_predictor[regime] = get_static_vol_predictor_by_weekday(intraday_train[intraday_train['regime'] == regime],
                                                                              regime_weekdays_data_dict[regime])
    regime_adv_median[regime] = get_adv_median_by_weekday(intraday_train[intraday_train['regime'] == regime], 
                                                              regime_weekdays_data_dict[regime])