# Backtest VWAP performance

## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import warnings
import math

from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.mixture import GaussianMixture
from hmmlearn.hmm import GaussianHMM, MultinomialHMM, GMMHMM

from dateutil.relativedelta import relativedelta
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
## GLOBAL VARS
DATE_TIME = 'date_time'
OPEN_PRICE = 'open'
HIGH_PRICE = 'high'
LOW_PRICE = 'low'
CLOSE_PRICE = 'close'
VOLUME = 'volume'
FEATURES = ['high_low_spread', "open_close_rets", "log_total_traded_vol", "daily_log_return", "short_term_vol", "long_term_vol", "money_flow_index"]

## Functions

In [3]:
def loadData(file):
    filepath = os.path.join(DATAPATH, file)
    df = pd.read_csv("file:///" + filepath, parse_dates=[['<DTYYYYMMDD>', '<TIME>']])

    return df

def formatData(df):
    returned_df = df.copy()
    returned_df.drop(labels=["<TICKER>", "<PER>", "<OPENINT>"], axis="columns", inplace=True)
    returned_df.columns = ['date_time', 'open', 'high', 'low', 'close', 'volume']
    returned_df.set_index('date_time', drop=True, inplace=True)
    returned_df = addDateAndTime(returned_df)

    return returned_df

def prepareDataframe(df):
    returned_df = df.copy()
    returned_df['volume*price'] = returned_df['volume']*returned_df['close']
    returned_df = returned_df.resample('5T').sum()
    returned_df = returned_df.between_time('9:00', '17:25')
    returned_df = returned_df[returned_df.index.weekday != 5]
    returned_df = returned_df[returned_df.index.weekday != 6]
    returned_df['vwap'] = returned_df['volume*price']/returned_df['volume']

    return returned_df

def splitTrainTestData(df, size_in_years):
    training_data = df[df.index[0]:df.index[0] + relativedelta(years=size_in_years, hours=-9, minutes = -5)]
    test_data = df[df.index[0] + relativedelta(years=size_in_years, hours=-9):]

    return training_data, test_data

def split_train_test_data(df, size_in_years):
    training_data = df[df.index[0]:df.index[-1] - relativedelta(years=size_in_years, hours=-9, minutes = -5)]
    test_data = df[df.index[-1] - relativedelta(years=size_in_years, hours=-9):]

    return training_data, test_data

def getWeekdaysData(df):
    df_mondays = df[df.index.weekday == 0]
    df_tuesdays = df[df.index.weekday == 1]
    df_wednesdays = df[df.index.weekday == 2]
    df_thursdays = df[df.index.weekday == 3]
    df_fridays = df[df.index.weekday == 4]

    return df_mondays, df_tuesdays, df_wednesdays, df_thursdays, df_fridays

def getWeekdaysDataDict(df):
    weekdaysDataDict = {}
    weekdaysDataDict[0] = df[df.index.weekday == 0]
    weekdaysDataDict[1] = df[df.index.weekday == 1]
    weekdaysDataDict[2] = df[df.index.weekday == 2]
    weekdaysDataDict[3] = df[df.index.weekday == 3]
    weekdaysDataDict[4] = df[df.index.weekday == 4]
    weekdaysDataDict['else'] = df

    return weekdaysDataDict

def getStaticVolPredictorByWeekday(data, weekdaysDataDict):
    staticVolPredictor = {}
    staticVolPredictor[0] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(0))
    staticVolPredictor[1] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(1))
    staticVolPredictor[2] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(2))
    staticVolPredictor[3] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(3))
    staticVolPredictor[4] = getNormalizedStaticVolPredictor(weekdaysDataDict.get(4))
    staticVolPredictor['else'] = getNormalizedStaticVolPredictor(data)

    return staticVolPredictor

def getADVMedianByWeekday(data, weekdaysDataDict):
    advMedian = {}
    advMedian[0] = getADVMedian(weekdaysDataDict.get(0))
    advMedian[1] = getADVMedian(weekdaysDataDict.get(1))
    advMedian[2] = getADVMedian(weekdaysDataDict.get(2))
    advMedian[3] = getADVMedian(weekdaysDataDict.get(3))
    advMedian[4] = getADVMedian(weekdaysDataDict.get(4))
    advMedian['else'] = getADVMedian(data)

    return advMedian
  
def getDataByWeekDay(df, weekday):
    df_weekday = df[df.index.weekday == weekday]

    return df_weekday

def addDateAndTime(df):
    returned_df = df.copy()
    returned_df['date'] = pd.to_datetime(returned_df.index.date)
    returned_df['time'] = pd.to_datetime(returned_df.index, format = "%m-%d-%Y %H:%M:%S")
    returned_df['time'] = returned_df['time'].apply(lambda x: x.strftime('%H:%M:%S'))

    return returned_df

def getNormalizedStaticVolPredictor(df):
    df = addDateAndTime(df)

    static_volume_predictor = df.groupby(by='time')['volume'].median()
    norm_static_volume_predictor = static_volume_predictor/sum(static_volume_predictor)
    norm_static_volume_predictor.index = norm_static_volume_predictor.index.map(lambda x: datetime.strptime(x, '%X').time())

    return norm_static_volume_predictor

def getReversedCumVol(multi_data, data):
    reversed_cumvol = []
    for day in multi_data.index.get_level_values('Date').unique():
        reversed_cumvol.append(multi_data.xs(day, level='Date')['volume'].cumsum().values[::-1])

    reversed_cumvol = pd.Series(data = np.array(reversed_cumvol).flatten(), index = data.index)

    return reversed_cumvol

def getADVMedian(df):
    return df['volume'].groupby(df.index.date).sum().median()

def getADVMean(df):
    return df['volume'].groupby(df.index.date).sum().median()

def getDailyVWAP(df):
    return df['volume*price'].groupby(df.index.date).sum()/df['volume'].groupby(df.index.date).sum()
  
def getVolPredictorNextBin(test_data, staticVolPredictor, advMedian):
    volume_predictor_next_interval = test_data.groupby(level=0)['volume'].shift(1)
    volume_predictor_next_interval.fillna(int (advMedian.get('else')*staticVolPredictor.get('else').iloc[0]), inplace=True)

    return volume_predictor_next_interval
  
def vwap_static_execution_algo(data, staticVolPredictor, amount_shares, order_side, start_time, end_time, day):
    if day.weekday() == 0:
        volPredictor = staticVolPredictor.get(0)[start_time:end_time]
    elif day.weekday() == 1:
        volPredictor = staticVolPredictor.get(1)[start_time:end_time]
    elif day.weekday() == 2:
        volPredictor = staticVolPredictor.get(2)[start_time:end_time]
    elif day.weekday() == 3:
        volPredictor = staticVolPredictor.get(3)[start_time:end_time]
    elif day.weekday() == 4:
        volPredictor = staticVolPredictor.get(4)[start_time:end_time]
    else:
        volPredictor = staticVolPredictor.get('else')[start_time:end_time]

    shares_per_interval = volPredictor*amount_shares
    vwap_this_exec_this_day = sum(shares_per_interval*data['vwap'])/sum(shares_per_interval)

    return vwap_this_exec_this_day

def vwap_dynamic_execution_algo(data, reversed_cumvol, staticVolPredictor, volume_predictor_next_interval, amount_shares, order_side, start_time, end_time, day):
    shares_per_interval = []
    if day.weekday() == 0:
        shares_per_interval.append(staticVolPredictor.get(0).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 0].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 0].index.time).median()
    elif day.weekday() == 1:
        shares_per_interval.append(staticVolPredictor.get(1).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 1].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 1].index.time).median()
    elif day.weekday() == 2:
        shares_per_interval.append(staticVolPredictor.get(2).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 2].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 2].index.time).median()
    elif day.weekday() == 3:
        shares_per_interval.append(staticVolPredictor.get(3).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 3].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 3].index.time).median()
    elif day.weekday() == 4:
        shares_per_interval.append(staticVolPredictor.get(4).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 4].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 4].index.time).median()
    else:
        shares_per_interval.append(staticVolPredictor.get('else').iloc[0]*amount_shares)
        volPredictor = reversed_cumvol.groupby(reversed_cumvol.index.time).median()

    for i in range(1, len(data)):
        num = data['volume'].iloc[:i].sum()+volume_predictor_next_interval.xs(day, level='Date').iloc[i]
        denom = data['volume'].iloc[:i].sum()+volPredictor[i]
        op = amount_shares*(num/denom)
        shares_next_interval = op - sum(shares_per_interval)
        #     shares_next_interval = (amount_shares*((data['volume'].iloc[0:i].sum()+volume_predictor_next_interval.xs(day, level='Date').iloc[i])/(data['volume'].iloc[0:i].sum()+volPredictor[i])))-sum(shares_per_interval)
        shares_per_interval.append(shares_next_interval)

    vwap_this_exec_this_day = sum(shares_per_interval*data['vwap'])/sum(shares_per_interval)

    return vwap_this_exec_this_day

def dataToMultiIndex(data):
    multi_data = data.copy()
    multi_data.index = pd.MultiIndex.from_arrays([multi_data.index.date, multi_data.index.time], names=['Date','Time'])

    return multi_data

def backtestAlgoStatic(training_data, test_data, advMedian, staticVolPredictor):
    new_test_data = test_data.copy()
    new_test_data.index = pd.MultiIndex.from_arrays([new_test_data.index.date, new_test_data.index.time], names=['Date','Time'])

    backtest_sell_vwap = []
    backtest_buy_vwap = []
    
    for day in new_test_data.index.get_level_values('Date').unique():
        data = new_test_data.xs(day, level='Date')

        if day.weekday() == 0:
            amount_shares = advMedian.get(0)*0.1
        elif day.weekday() == 1:
            amount_shares = advMedian.get(1)*0.1
        elif day.weekday() == 2:
            amount_shares = advMedian.get(2)*0.1
        elif day.weekday() == 3:
            amount_shares = advMedian.get(3)*0.1
        elif day.weekday() == 4:
            amount_shares = advMedian.get(4)*0.1
        else:
            amount_shares = advMedian.get('else')*0.1

        backtest_sell_vwap.append(vwap_static_execution_algo(data, staticVolPredictor, amount_shares, 'sell' ,data.index[0], data.index[-1], day))
        backtest_buy_vwap.append(vwap_static_execution_algo(data, staticVolPredictor, amount_shares, 'buy', data.index[0], data.index[-1], day))

    return backtest_buy_vwap, backtest_sell_vwap

def backtestAlgoDynamic(training_data, test_data, advMedian, staticVolPredictor):
    new_training_data = dataToMultiIndex(training_data)
    new_test_data = dataToMultiIndex(test_data)
    reversedCumVol = getReversedCumVol(new_training_data, training_data)
    volPredictorNextBin = getVolPredictorNextBin(new_test_data, staticVolPredictor, advMedian)


    backtest_sell_vwap_dynamic = []
    backtest_buy_vwap_dynamic = []
    for day in new_test_data.index.get_level_values('Date').unique():
        data = new_test_data.xs(day, level='Date')
        if day.weekday() == 0:
            amount_shares = advMedian.get(0)*0.1
        elif day.weekday() == 1:
            amount_shares = advMedian.get(1)*0.1
        elif day.weekday() == 2:
            amount_shares = advMedian.get(2)*0.1
        elif day.weekday() == 3:
            amount_shares = advMedian.get(3)*0.1
        elif day.weekday() == 4:
            amount_shares = advMedian.get(4)*0.1
        else:
            amount_shares = advMedian.get('else')*0.1
        backtest_sell_vwap_dynamic.append(vwap_dynamic_execution_algo(data, reversedCumVol, staticVolPredictor, volPredictorNextBin, amount_shares, 'sell', data.index[0], data.index[-1], day))
        backtest_buy_vwap_dynamic.append(vwap_dynamic_execution_algo(data, reversedCumVol, staticVolPredictor, volPredictorNextBin, amount_shares, 'buy', data.index[0], data.index[-1], day))

    return backtest_buy_vwap_dynamic, backtest_sell_vwap_dynamic
  
def getAlgoPerformance(training_data, test_data, dynamic_flag):
    daily_vwap = getDailyVWAP(test_data)
    weekdaysDataDict = getWeekdaysDataDict(training_data)
    staticVolPredictor = getStaticVolPredictorByWeekday(training_data, weekdaysDataDict)
    advMedian = getADVMedianByWeekday(training_data, weekdaysDataDict)

    if dynamic_flag:
        backtest_buy_vwap, backtest_sell_vwap = backtestAlgoDynamic(training_data, test_data, advMedian, staticVolPredictor)
    else:
        backtest_buy_vwap, backtest_sell_vwap = backtestAlgoStatic(training_data, test_data, advMedian, staticVolPredictor)

    static_vwap_comparison = pd.DataFrame(data=daily_vwap.values.tolist(), index=daily_vwap.index, columns=['market_vwap'])
    static_vwap_comparison['backtest_buy_vwap'] = backtest_buy_vwap
    static_vwap_comparison['backtest_sell_vwap'] = backtest_sell_vwap

    static_vwap_comparison['diff_vwap_bps_buy'] = 1000*(static_vwap_comparison['backtest_buy_vwap']-static_vwap_comparison['market_vwap'])/static_vwap_comparison['market_vwap']
    static_vwap_comparison['diff_vwap_bps_sell'] = -1000*(static_vwap_comparison['backtest_sell_vwap']-static_vwap_comparison['market_vwap'])/static_vwap_comparison['market_vwap']

    mean_bps_diff_buys = static_vwap_comparison['diff_vwap_bps_buy'].mean()
    sd_bps_diff_buys = static_vwap_comparison['diff_vwap_bps_buy'].std()
    mean_bps_diff_sells = static_vwap_comparison['diff_vwap_bps_sell'].mean()
    sd_bps_diff_sells = static_vwap_comparison['diff_vwap_bps_sell'].std()

    percentiles_diff_vwap_sells = {'1': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.01),
                               '5': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.05),
                               '95': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.95),
                               '99': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.99)}
    percentiles_diff_vwap_buys = {'1': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.01),
                               '5': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.05),
                               '95': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.95),
                               '99': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.99)}

    return mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells

def printAlgoPerformance(file, ticker):
    filepath = os.path.join(DATAPATH, file)

    df = pd.read_csv("file:///" + filepath, parse_dates=[['<DTYYYYMMDD>', '<TIME>']])
    formatted_df = formatData(df)
    formatted_df = formatted_df[formatted_df.index.year >= 2009]
    last_year = formatted_df.index.year.unique()[-2]
    algo_df = prepareDataframe(formatted_df)
    train = algo_df[algo_df.index.year < last_year]
    test = algo_df[algo_df.index.year >= last_year]

    print("Backtesting performance with static predictor...\n")
    mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformance(train, test, dynamic_flag=False)
    print("The performance of the algorithm using static predictor on " + ticker + " is")
    print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

    print("\n\n")

    print("Backtesting performance with dynamic predictor...\n")
    mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformance(train, test, dynamic_flag=True)
    print("The performance of the algorithm using dynamic predictor on " + ticker + " is")
    print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

    print("\n")
    print("Fitting Gaussian Mixture Model...")

    high_low = get_log(np.abs(get_high_low_spread(formatted_df)))
    open_close_log_rets = get_log_open_close_returns(formatted_df)
    total_traded_vol = get_log(get_total_traded_vol(formatted_df))

    features_df = pd.concat([high_low, open_close_log_rets, total_traded_vol], axis=1).dropna()
    features_df.columns = ['hl_spread', "log_return", "traded_vol"]
    features_df = features_df.replace([np.inf, -np.inf], np.nan)
    features_df = features_df.dropna()
    features_df = features_df[features_df['log_return'] != 0.0]
    features_df = features_df[features_df['hl_spread'] != 0.0]
    features_df = features_df[["traded_vol"]]
    # features_df = features_df[['hl_spread', "log_return", "traded_vol"]]

    features_df = features_df.shift(1).dropna()

    features_train = features_df[features_df.index.year < last_year]
    features_test = features_df[features_df.index.year >= last_year]

    X = features_train.values
    scaler = MinMaxScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    # pca = PCA(n_components=.95)
    # pca.fit(X_scaled)
    # X_pca = pca.transform(X_scaled)

    gmm = GaussianMixture(n_components=3, covariance_type='full', max_iter=1000, n_init=100)
    model = gmm.fit(X_scaled)

    X = features_df.values
    X_scaled = scaler.transform(X)
    # X_pca = pca.transform(X_scaled)

    prediction_gmm = model.predict(X_scaled)
    # prediction_gmm = hmm_model.predict(X_pca)

    print("Model fitted\n\n")
    features_df['regime'] = prediction_gmm
    regime = features_df[['regime']]
    regime = regime.resample("5T").asfreq().fillna(method="ffill")

    new_df = pd.merge(algo_df, regime, left_index=True, right_index=True)
    new_df = new_df.astype({"regime":"int32"})

    train = new_df[new_df.index.year < last_year]
    test = new_df[new_df.index.year >= last_year]

    print("Backtesting performance of static predictor using segmented data...\n")
    mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=False, regimes=new_df['regime'].unique())
    print("The performance of the algorithm using static predictor with segmented data on " + ticker + " is")
    print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

    print("\n\n")

    print("Backtesting performance of dynamic predictor using segmented data...\n")
    mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=True, regimes=new_df['regime'].unique())
    print("The performance of the algorithm using static predictor with segmented data on " + ticker + " is")
    print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

def vwap_static_execution_algo(data, staticVolPredictor, amount_shares, order_side, start_time, end_time, day):
    if day.weekday() == 0:
        volPredictor = staticVolPredictor.get(0)[start_time:end_time]
    elif day.weekday() == 1:
        volPredictor = staticVolPredictor.get(1)[start_time:end_time]
    elif day.weekday() == 2:
        volPredictor = staticVolPredictor.get(2)[start_time:end_time]
    elif day.weekday() == 3:
        volPredictor = staticVolPredictor.get(3)[start_time:end_time]
    elif day.weekday() == 4:
        volPredictor = staticVolPredictor.get(4)[start_time:end_time]
    else:
        volPredictor = staticVolPredictor.get('else')[start_time:end_time]

    shares_per_interval = volPredictor*amount_shares
    vwap_this_exec_this_day = sum(shares_per_interval*data['vwap'])/sum(shares_per_interval)

    return vwap_this_exec_this_day

def vwap_dynamic_execution_algo(data, reversed_cumvol, staticVolPredictor, volume_predictor_next_interval, amount_shares, order_side, start_time, end_time, day):
    shares_per_interval = []
    if day.weekday() == 0:
        shares_per_interval.append(staticVolPredictor.get(0).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 0].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 0].index.time).median()
    elif day.weekday() == 1:
        shares_per_interval.append(staticVolPredictor.get(1).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 1].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 1].index.time).median()
    elif day.weekday() == 2:
        shares_per_interval.append(staticVolPredictor.get(2).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 2].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 2].index.time).median()
    elif day.weekday() == 3:
        shares_per_interval.append(staticVolPredictor.get(3).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 3].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 3].index.time).median()
    elif day.weekday() == 4:
        shares_per_interval.append(staticVolPredictor.get(4).iloc[0]*amount_shares)
        volPredictor = reversed_cumvol[reversed_cumvol.index.weekday == 4].groupby(reversed_cumvol[reversed_cumvol.index.weekday == 4].index.time).median()
    else:
        shares_per_interval.append(staticVolPredictor.get('else').iloc[0]*amount_shares)
        volPredictor = reversed_cumvol.groupby(reversed_cumvol.index.time).median()

    for i in range(1, len(data)):
        num = data['volume'].iloc[:i].sum()+volume_predictor_next_interval.xs(day, level='Date').iloc[i]
        denom = data['volume'].iloc[:i].sum()+volPredictor[i]
        op = amount_shares*(num/denom)
        shares_next_interval = op - sum(shares_per_interval)
        shares_per_interval.append(shares_next_interval)

    vwap_this_exec_this_day = sum(shares_per_interval*data['vwap'])/sum(shares_per_interval)

    return vwap_this_exec_this_day

def backtestAlgoStaticByRegime(training_data, test_data, advMedian, staticVolPredictor):
    new_test_data = test_data.copy()
    new_test_data.index = pd.MultiIndex.from_arrays([new_test_data.index.date,
                                                   new_test_data.index.time],
                                                   names=['Date','Time'])

    backtest_sell_vwap = []
    backtest_buy_vwap = []
    
    for day in new_test_data.index.get_level_values('Date').unique():
        data = new_test_data.xs(day, level='Date')
        regime = data['regime'].iloc[0]
        if day.weekday() == 0:
            amount_shares = advMedian.get(regime).get(0)*0.1
        elif day.weekday() == 1:
            amount_shares = advMedian.get(regime).get(1)*0.1
        elif day.weekday() == 2:
            amount_shares = advMedian.get(regime).get(2)*0.1
        elif day.weekday() == 3:
            amount_shares = advMedian.get(regime).get(3)*0.1
        elif day.weekday() == 4:
            amount_shares = advMedian.get(regime).get(4)*0.1
        else:
            amount_shares = advMedian.get(regime).get('else')*0.1
        
    backtest_sell_vwap.append(vwap_static_execution_algo(data,
                                                         staticVolPredictor.get(regime),
                                                         amount_shares,
                                                         'sell',
                                                         data.index[0],
                                                         data.index[-1],
                                                         day))
    backtest_buy_vwap.append(vwap_static_execution_algo(data,
                                                        staticVolPredictor.get(regime),
                                                        amount_shares,
                                                        'buy',
                                                        data.index[0],
                                                        data.index[-1],
                                                        day))

    return backtest_buy_vwap, backtest_sell_vwap

def backtestAlgoDynamicByRegime(training_data, test_data, advMedian, staticVolPredictor, regimes):
    new_training_data = dataToMultiIndex(training_data)
    new_test_data = dataToMultiIndex(test_data)
    regime_reversedCumVol = {}
    regime_volPredictorNextBin = {}
    for regime in regimes:
        regime_reversedCumVol[regime] = getReversedCumVol(new_training_data[new_training_data['regime']==regime], 
                                                          training_data[training_data['regime']==regime])
        regime_volPredictorNextBin[regime] = getVolPredictorNextBin(new_test_data[new_test_data['regime']==regime], 
                                                                    staticVolPredictor.get(regime),
                                                                    advMedian.get(regime))

    backtest_sell_vwap_dynamic = []
    backtest_buy_vwap_dynamic = []
    for day in new_test_data.index.get_level_values('Date').unique():
        data = new_test_data.xs(day, level='Date')
        regime = data['regime'].iloc[0]
        if day.weekday() == 0:
            amount_shares = advMedian.get(regime).get(0)*0.1
        elif day.weekday() == 1:
            amount_shares = advMedian.get(regime).get(1)*0.1
        elif day.weekday() == 2:
            amount_shares = advMedian.get(regime).get(2)*0.1
        elif day.weekday() == 3:
            amount_shares = advMedian.get(regime).get(3)*0.1
        elif day.weekday() == 4:
            amount_shares = advMedian.get(regime).get(4)*0.1
        else:
            amount_shares = advMedian.get(regime).get('else')*0.1
            
        backtest_sell_vwap_dynamic.append(vwap_dynamic_execution_algo(data, 
                                                                      regime_reversedCumVol.get(regime),
                                                                      staticVolPredictor.get(regime),
                                                                      regime_volPredictorNextBin.get(regime),
                                                                      amount_shares, 
                                                                      'sell',
                                                                      data.index[0],
                                                                      data.index[-1], day))
        backtest_buy_vwap_dynamic.append(vwap_dynamic_execution_algo(data,
                                                                     regime_reversedCumVol.get(regime),
                                                                     staticVolPredictor.get(regime),
                                                                     regime_volPredictorNextBin.get(regime),
                                                                     amount_shares,
                                                                     'buy',
                                                                     data.index[0],
                                                                     data.index[-1],
                                                                     day))

    return backtest_buy_vwap_dynamic, backtest_sell_vwap_dynamic

def getAlgoPerformanceByRegime(training_data, test_data, dynamic_flag, regimes):
    regime_daily_vwap = {}
    regime_weekdaysDataDict = {}
    regime_staticVolPredictor = {}
    regime_advMedian = {}
    daily_vwap = getDailyVWAP(test_data)
    for regime in regimes:
        regime_daily_vwap[regime] = getDailyVWAP(test_data[test_data['regime'] == regime])
        regime_weekdaysDataDict[regime] = getWeekdaysDataDict(training_data[training_data['regime'] == regime])
        regime_staticVolPredictor[regime] = getStaticVolPredictorByWeekday(training_data[training_data['regime'] == regime], regime_weekdaysDataDict[regime])
        regime_advMedian[regime] = getADVMedianByWeekday(training_data[training_data['regime'] == regime], regime_weekdaysDataDict[regime])

    if dynamic_flag:
        backtest_buy_vwap, backtest_sell_vwap = backtestAlgoDynamicByRegime(training_data, test_data, regime_advMedian, regime_staticVolPredictor, regimes)
    else:
        backtest_buy_vwap, backtest_sell_vwap = backtestAlgoStaticByRegime(training_data, test_data, regime_advMedian, regime_staticVolPredictor)

    static_vwap_comparison = pd.DataFrame(data=daily_vwap.values.tolist(), index=daily_vwap.index, columns=['market_vwap'])
    static_vwap_comparison['backtest_buy_vwap'] = backtest_buy_vwap
    static_vwap_comparison['backtest_sell_vwap'] = backtest_sell_vwap

    static_vwap_comparison['diff_vwap_bps_buy'] = 1000*(static_vwap_comparison['backtest_buy_vwap']-static_vwap_comparison['market_vwap'])/static_vwap_comparison['market_vwap']
    static_vwap_comparison['diff_vwap_bps_sell'] = -1000*(static_vwap_comparison['backtest_sell_vwap']-static_vwap_comparison['market_vwap'])/static_vwap_comparison['market_vwap']

    mean_bps_diff_buys = static_vwap_comparison['diff_vwap_bps_buy'].mean()
    sd_bps_diff_buys = static_vwap_comparison['diff_vwap_bps_buy'].std()
    mean_bps_diff_sells = static_vwap_comparison['diff_vwap_bps_sell'].mean()
    sd_bps_diff_sells = static_vwap_comparison['diff_vwap_bps_sell'].std()

    percentiles_diff_vwap_sells = {'1': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.01),
                               '5': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.05),
                               '95': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.95),
                               '99': static_vwap_comparison['diff_vwap_bps_sell'].quantile(0.99)}
    percentiles_diff_vwap_buys = {'1': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.01),
                               '5': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.05),
                               '95': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.95),
                               '99': static_vwap_comparison['diff_vwap_bps_buy'].quantile(0.99)}

    return mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells

def get_total_traded_vol(df):
    total_traded_vol = df.resample('B').sum()[['volume']]
    total_traded_vol.drop(labels=total_traded_vol.index.get_values()[0], axis='index', inplace=True)

    return total_traded_vol

def get_log_open_close_returns_(df):
    # open_close_returns = df[(df['time'] == '09:01:00') | (df['time'] == '17:35:00')]
    open_close_returns = df[(df['time'] == '09:00:00') | (df['time'] == '17:35:00')]
    open_close_returns['return'] = (open_close_returns['open']/open_close_returns['close'].shift(-1))
    open_close_returns['log_return'] = np.log(open_close_returns['return'])
    # open_close_returns = open_close_returns[open_close_returns['time'] == '09:01:00']
    open_close_returns = open_close_returns[open_close_returns['time'] == '09:00:00']
    open_close_returns = open_close_returns[['log_return']].resample('B').sum()

    return open_close_returns

def get_log_open_close_returns(df):
    open_price = df[OPEN_PRICE].resample('B').first()
    close_price = df[CLOSE_PRICE].resample('B').last()
    open_close_returns = np.log(open_price/close_price.shift(-1)).dropna()

    return open_close_returns

def get_log_returns(df):
    close = df[CLOSE_PRICE].resample('B').last()
    log_daily_returns = np.log(close/close.shift(-1))

    return log_daily_returns

def get_high_low_spread(df):
    daily_high = df.resample("B").max()[['high']]
    daily_low = df.resample("B").min()[['low']]
    high_low_spread = (daily_high['high']-daily_low['low'])

    return high_low_spread

def get_log(df):
    return np.log(df)

def getFeaturesDf(df):
    high_low = get_log(np.abs(get_high_low_spread(df)))
    open_close_log_rets = get_log_open_close_returns(df)
    total_traded_vol = get_log(get_total_traded_vol(df))

    features_df = pd.concat([high_low, open_close_log_rets, total_traded_vol], axis=1).dropna()
    features_df.columns = ['hl_spread', "log_return", "traded_vol"]
    features_df = features_df.replace([np.inf, -np.inf], np.nan)
    features_df = features_df.dropna()
    features_df = features_df[features_df['log_return'] != 0.0]
    features_df = features_df[features_df['hl_spread'] != 0.0]

    return features_df

def processDataToFit(features_df):
    X = features_df.values
    scaler = MinMaxScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    pca = PCA(n_components=.95)
    pca.fit(X)
    X_pca = pca.transform(X)

    return X_pca

def fit_model(model, full_data, train_data, list_of_features):
    X = train_data.values
    scaler = MinMaxScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    if len(list_of_features) > 2:
        pca = PCA(n_components=.95)
        pca.fit(X_scaled)
        X_pca = pca.transform(X_scaled)
        fitted_model = model.fit(X_pca)
        X_full = full_data.values
        X_full_scaled = scaler.transform(X_full)
        X_full_pca = pca.transform(X_full_scaled)
        prediction = fitted_model.predict(X_full_pca)
    else:
        fitted_model = model.fit(X_scaled)
        X_full = full_data.values
        X_full_scaled = scaler.transform(X_full)
        prediction = fitted_model.predict(X_full_scaled)

    return prediction

def get_money_flow_index(df):
    close = df[CLOSE_PRICE].resample('B').last()
    high = df[HIGH_PRICE].resample('B').max()
    low = df[LOW_PRICE].resample('B').min()
    typical_price = (close+high+low)/3
    volume = df[VOLUME].resample('B').sum()
    money_flow_index = typical_price/volume

    return money_flow_index

def fit_model_by_weekday(model, full_data, train_data, list_of_features):
    regime = []
    for weekday in [0,1,2,3,4]:
        model_weekday = model
        X = train_data[train_data.index.weekday == weekday].values
        scaler = MinMaxScaler()
        scaler.fit(X)
        X_scaled = scaler.transform(X)

        if len(list_of_features) > 2:
            pca = PCA(n_components=.95)
            pca.fit(X_scaled)
            X_pca = pca.transform(X_scaled)
            fitted_model = model_weekday.fit(X_pca)
            X_full = full_data[full_data.index.weekday == weekday].values
            X_full_scaled = scaler.transform(X_full)
            X_full_pca = pca.transform(X_full_scaled)
            prediction = fitted_model.predict(X_full_pca)
        else:
            fitted_model = model_weekday.fit(X_scaled)
            X_full = full_data.values
            X_full_scaled = scaler.transform(X_full)
            prediction = fitted_model.predict(X_full_scaled)

        regime_serie = pd.Series(data = prediction, index = full_data[full_data.index.weekday == weekday].index).rename("regime")
        regime_serie = regime_serie.resample("5T").asfreq().fillna(method="ffill")
        regime.append(regime_serie)

    regime_df = regime[0]
    for ii in range(1,5):
        pd.concat(regime_df, regime[ii])

    return regime_df

In [4]:
warnings.filterwarnings("ignore")
DATAPATH = os.getcwd()[:-4]+"Data\\"
filepath_anon = os.path.join(DATAPATH, "volume_price_2014_18.csv")

In [5]:
def printFullAlgoPerformance(file, ticker, features_to_use, normal_vwap=True, machine_learning=True, from_year=2008, calibrate_by_weekday=False, static_predictor=True, dynamic_predictor=True):
    filepath = os.path.join(DATAPATH, file)
    df = pd.read_csv("file:///" + filepath, parse_dates=[['<DTYYYYMMDD>', '<TIME>']])
    formatted_df = formatData(df)

    first_year = formatted_df.index[0].year
    if from_year >= first_year:
        formatted_df = formatted_df[formatted_df.index.year >= from_year]

    print("Backtesting {}. Using data from {}\n".format(ticker, formatted_df.index[0].year))

    enough_data = True

    last_year = formatted_df.index.year.unique()[-1]
    algo_df = prepareDataframe(formatted_df)
#     train = algo_df[algo_df.index.year < last_year]
#     test = algo_df[algo_df.index.year >= last_year]

#     ## If test data has less than 150 business days, use previous year
#     if len(test) < 150*102:
#         train = algo_df[algo_df.index.year < (last_year - 1)]
#         test = algo_df[algo_df.index.year >= (last_year - 1)]
#         enough_data = False

    train, test = split_train_test_data(algo_df, 2)


    if normal_vwap:

        if static_predictor:
            print("Backtesting performance with static predictor...\n")
            mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformance(train, test, dynamic_flag=False)
            print("The performance of the algorithm using static predictor on " + ticker + " is")
            print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

            print("-"*80)

        if dynamic_predictor:
            print("Backtesting performance with dynamic predictor...\n")
            mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformance(train, test, dynamic_flag=True)
            print("The performance of the algorithm using dynamic predictor on " + ticker + " is")
            print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

            print("-"*80)

    if machine_learning:
        high_low = get_log(np.abs(get_high_low_spread(formatted_df))).rename("high_low_spread")
        open_close_log_rets = get_log_open_close_returns(formatted_df).rename("open_close_rets")
        total_traded_vol = np.exp(get_log(get_total_traded_vol(formatted_df))[VOLUME]).rename("log_total_traded_vol")
        daily_log_rets = get_log_returns(formatted_df).rename("daily_log_return").dropna()
        short_term_vol = daily_log_rets.rolling(21).std().dropna().rename("short_term_vol")
        #     short_term_vol = daily_log_rets.rolling(252).std().dropna().apply(lambda x: x/np.sqrt(21)).rename("short_term_vol")
        long_term_vol = daily_log_rets.dropna().rolling(252).std(ddof=0).dropna().rename("long_term_vol")
        implied_daily_vol = (daily_log_rets.dropna().rolling(252).std(ddof=0).dropna()/np.sqrt(252)).rename("implied_daily_vol")
        mfi = get_log(get_money_flow_index(formatted_df).rename("money_flow_index"))

        features_df = pd.concat([high_low, open_close_log_rets, total_traded_vol, daily_log_rets, short_term_vol, long_term_vol, mfi, implied_daily_vol], axis=1).dropna()
        features_df = features_df.replace([np.inf, -np.inf], np.nan)
        features_df = features_df.dropna()
        # features_df = features_df[features_df['log_return'] != 0.0]
        # features_df = features_df[features_df['hl_spread'] != 0.0]
        features_df = features_df[features_to_use]

        data_to_predict = features_df.shift(1).dropna()

        if enough_data:
            data_to_train = data_to_predict[data_to_predict.index.year < last_year]
            data_to_test = data_to_predict[data_to_predict.index.year >= last_year]
        else:
            data_to_train = data_to_predict[data_to_predict.index.year < (last_year - 1)]
            data_to_test = data_to_predict[data_to_predict.index.year >= (last_year - 1)]

        gmm = GaussianMixture(n_components=3, covariance_type='full', max_iter=1000, n_init=100)
        if calibrate_by_weekday:
            regime = fit_model_by_weekday(gmm, data_to_predict, data_to_train, features_to_use)
        else:
            model_prediction = fit_model(gmm, data_to_predict, data_to_train, features_to_use)
            regime = pd.Series(data = model_prediction, index = data_to_predict.index).rename("regime")
            regime = regime.resample("5T").asfreq().fillna(method="ffill")

        new_df = pd.merge(algo_df, regime, left_index=True, right_index=True)
        new_df = new_df.astype({"regime":"int32"})

        if enough_data:
            train = new_df[new_df.index.year < last_year]
            test = new_df[new_df.index.year >= last_year]
        else:
            train = new_df[new_df.index.year < (last_year - 1)]
            test = new_df[new_df.index.year >= (last_year - 1)]

        print("Model used: GMM, features used: [{}]\n".format(", ".join(features_to_use)))
        if static_predictor:
            print("Backtesting performance of static predictor using segmented data...\n")
            mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=False, regimes=new_df['regime'].unique())
            print("The performance of the algorithm using static predictor with segmented data on " + ticker + " is")
            print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))
            print("-"*80)

        if dynamic_predictor:
            print("Backtesting performance of dynamic predictor using segmented data ...\n")
            mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=True, regimes=new_df['regime'].unique())
            print("The performance of the algorithm using dynamic predictor with segmented data on " + ticker + " is")
            print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))
            print("-"*80)

#     hmm = GMMHMM(n_components=3, n_mix=3, covariance_type='full', n_iter=1000, random_state=100)
        hmm = GaussianHMM(n_components=3, covariance_type='full', n_iter=1000, random_state=100)
        if calibrate_by_weekday:
            regime = fit_model_by_weekday(hmm, data_to_predict, data_to_train, features_to_use)
        else:
            model_prediction = fit_model(hmm, data_to_predict, data_to_train, features_to_use)
            regime = pd.Series(data = model_prediction, index = data_to_predict.index).rename("regime")
            regime = regime.resample("5T").asfreq().fillna(method="ffill")

        regime = pd.Series(data = model_prediction, index = data_to_predict.index).rename("regime")
        regime = regime.resample("5T").asfreq().fillna(method="ffill")

        new_df = pd.merge(algo_df, regime, left_index=True, right_index=True)
        new_df = new_df.astype({"regime":"int32"})

        if enough_data:
            train = new_df[new_df.index.year < last_year]
            test = new_df[new_df.index.year >= last_year]
        else:
            train = new_df[new_df.index.year < (last_year - 1)]
            test = new_df[new_df.index.year >= (last_year - 1)]

        print("Model used: Gaussian HMM, features used: [{}]\n".format(", ".join(features_to_use)))
        if static_predictor:
            print("Backtesting performance of static predictor using segmented data...\n")
            mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=False, regimes=new_df['regime'].unique())
            print("The performance of the algorithm using static predictor with segmented data on " + ticker + " is")
            print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))
            print("-"*80)

        if dynamic_predictor:
            print("Backtesting performance of dynamic predictor using segmented data...\n")
            mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=True, regimes=new_df['regime'].unique())
            print("The performance of the algorithm using dynamic predictor with segmented data on " + ticker + " is")
            print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))
            print("-"*80)

    # multi_hmm = MultinomialHMM(n_components=3, n_iter=1000, random_state=100)
    # model_prediction = fit_model(multi_hmm, data_to_predict, data_to_train, features_to_use)

    # regime = pd.Series(data = model_prediction, index = data_to_predict.index).rename("regime")
    # regime = regime.resample("5T").asfreq().fillna(method="ffill")

    # new_df = pd.merge(algo_df, regime, left_index=True, right_index=True)
    # new_df = new_df.astype({"regime":"int32"})

    # if enough_data:
    #   train = new_df[new_df.index.year < last_year]
    #   test = new_df[new_df.index.year >= last_year]
    # else:
    #   train = new_df[new_df.index.year < (last_year - 1)]
    #   test = new_df[new_df.index.year >= (last_year - 1)]

    # print("Model used: Multinomial HMM, features used: [{}]\n".format(", ".join(features_to_use)))
    # print("Backtesting performance of static predictor using segmented data...\n")
    # mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=False, regimes=new_df['regime'].unique())
    # print("The performance of the algorithm using static predictor with segmented data on " + ticker + " is")
    # print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))

    # print("-"*80)

    # print("Backtesting performance of dynamic predictor using segmented data...\n")
    # mean_bps_diff_buys, sd_bps_diff_buys, mean_bps_diff_sells, sd_bps_diff_sells, percentiles_diff_vwap_buys, percentiles_diff_vwap_sells = getAlgoPerformanceByRegime(train, test, dynamic_flag=True, regimes=new_df['regime'].unique())
    # print("The performance of the algorithm using dynamic predictor with segmented data on " + ticker + " is")
    # print("Mean: ±%f\nStandard Dev: ±%f" % (np.abs(mean_bps_diff_buys), sd_bps_diff_buys))
    # print("-"*80)

## Backtest

#### BMW

### prueba con función otro notebook

In [6]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["log_total_traded_vol", "money_flow_index"], from_year=1900, static_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BMW is
Mean: ±0.055855
Standard Dev: ±1.041366
--------------------------------------------------------------------------------
Model used: GMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.005959
Standard Dev: ±1.333628
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.056958
Standard Dev: ±1.206740
--------------------------------------------------------------------------------


### prueba con función otro notebook 

In [6]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["log_total_traded_vol", "money_flow_index"], from_year=1900, static_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BMW is
Mean: ±0.062248
Standard Dev: ±1.065458
--------------------------------------------------------------------------------
Model used: GMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.002653
Standard Dev: ±1.336090
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.056958
Standard Dev: ±1.206740
--------------------------------------------------------------------------------


In [37]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["log_total_traded_vol", "money_flow_index"], from_year=1900, static_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BMW is
Mean: ±0.062248
Standard Dev: ±1.065458
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.002653
Standard Dev: ±1.336090
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.056958
Standard Dev: ±1.206740
--------------------------------------------------------------------------------


In [35]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["log_total_traded_vol", "money_flow_index"], from_year=1900, static_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BMW is
Mean: ±0.062248
Standard Dev: ±1.065458
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.005959
Standard Dev: ±1.333628
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.002712
Standard Dev: ±1.070776
--------------------------------------------------------------------------------


In [21]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["log_total_traded_vol", "money_flow_index"], from_year=1900, static_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BMW is
Mean: ±0.062248
Standard Dev: ±1.065458
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.014594
Standard Dev: ±1.317188
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [log_total_traded_vol, money_flow_index]

Backtesting performance of dynamic predictor using segmented data...

The performance of the algorithm using dynamic predictor with segmented data on BMW is
Mean: ±0.014305
Standard Dev: ±1.270934
--------------------------------------------------------------------------------


In [19]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["daily_log_return", "short_term_vol"], from_year=1900, dynamic_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on BMW is
Mean: ±0.095037
Standard Dev: ±1.237775
--------------------------------------------------------------------------------
Model used: GMM, features used: [daily_log_return, short_term_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.101379
Standard Dev: ±1.278587
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [daily_log_return, short_term_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.100298
Standard Dev: ±1.284063
--------------------------------------------------------------------------------


In [18]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["implied_daily_vol"], from_year=1900, dynamic_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on BMW is
Mean: ±0.095037
Standard Dev: ±1.237775
--------------------------------------------------------------------------------
Model used: GMM, features used: [implied_daily_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.095188
Standard Dev: ±1.195121
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [implied_daily_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.095152
Standard Dev: ±1.205371
--------------------------------------------------------------------------------


In [17]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["daily_log_return", "implied_daily_vol"], from_year=1900, dynamic_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on BMW is
Mean: ±0.095037
Standard Dev: ±1.237775
--------------------------------------------------------------------------------
Model used: GMM, features used: [daily_log_return, implied_daily_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.105122
Standard Dev: ±1.281672
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [daily_log_return, implied_daily_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.095179
Standard Dev: ±1.205436
--------------------------------------------------------------------------------


In [16]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["daily_log_return"], from_year=1900, dynamic_predictor=False)

Backtesting BMW. Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on BMW is
Mean: ±0.095037
Standard Dev: ±1.237775
--------------------------------------------------------------------------------
Model used: GMM, features used: [daily_log_return]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.086808
Standard Dev: ±1.233850
--------------------------------------------------------------------------------
Model used: Gaussian HMM, features used: [daily_log_return]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.111172
Standard Dev: ±1.286845
--------------------------------------------------------------------------------


In [14]:
printFullAlgoPerformance("BMW_BMW.txt", "BMW", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Backtesting BMW. Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on BMW is
Mean: ±0.095037
Standard Dev: ±1.237775
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BMW is
Mean: ±0.062248
Standard Dev: ±1.065458
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BMW is
Mean: ±0.087754
Standard Dev: ±1.310582
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with se

### BBVA

In [0]:
printFullAlgoPerformance("BBVA_MC.txt", "BBVA", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2000
Backtesting performance with static predictor...

The performance of the algorithm using static predictor on BBVA is
Mean: ±0.092277
Standard Dev: ±1.361710
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on BBVA is
Mean: ±0.058314
Standard Dev: ±0.949860
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on BBVA is
Mean: ±0.088251
Standard Dev: ±1.358629
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on

ValueError: ignored

### Vivendi

In [0]:
printFullAlgoPerformance("VIVENDI.txt", "Vivendi", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Vivendi is
Mean: ±0.037385
Standard Dev: ±0.918880
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Vivendi is
Mean: ±0.073474
Standard Dev: ±0.784894
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Vivendi is
Mean: ±0.036503
Standard Dev: ±0.923523
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segment

### TESCO

In [0]:
printFullAlgoPerformance("TESCO.txt", "Tesco", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2008

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Tesco is
Mean: ±0.045996
Standard Dev: ±0.828027
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Tesco is
Mean: ±0.148551
Standard Dev: ±1.071701
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Tesco is
Mean: ±0.049749
Standard Dev: ±0.839903
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented dat

### TELEFONICA

In [0]:
printFullAlgoPerformance("TEF.txt", "Telefonica", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2001

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Telefonica is
Mean: ±0.002406
Standard Dev: ±0.797528
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Telefonica is
Mean: ±0.078622
Standard Dev: ±1.528175
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Telefonica is
Mean: ±0.018784
Standard Dev: ±0.799085
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor wit

### NOKIA

In [0]:
printFullAlgoPerformance("NOKIA.txt", "Nokia", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Nokia is
Mean: ±0.053665
Standard Dev: ±1.141943
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Nokia is
Mean: ±0.119720
Standard Dev: ±1.618934
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Nokia is
Mean: ±0.060769
Standard Dev: ±1.172547
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented dat

### GSK

In [0]:
printFullAlgoPerformance("GSK.txt", "GSK", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2008

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on GSK is
Mean: ±0.015963
Standard Dev: ±0.689863
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on GSK is
Mean: ±0.058979
Standard Dev: ±0.770256
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on GSK is
Mean: ±0.003239
Standard Dev: ±0.689950
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented data on G

### DIAGEO

In [0]:
printFullAlgoPerformance("DIAGEO.txt", "Diageo", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2008

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Diageo is
Mean: ±0.143363
Standard Dev: ±0.557072
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Diageo is
Mean: ±0.092379
Standard Dev: ±0.596173
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Diageo is
Mean: ±0.140762
Standard Dev: ±0.566523
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented 

### COMMERZBANK

In [0]:
printFullAlgoPerformance("COMMERZBANK.txt", "Commerzbank", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Commerzbank is
Mean: ±0.256454
Standard Dev: ±2.549025
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Commerzbank is
Mean: ±0.133939
Standard Dev: ±5.018480
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Commerzbank is
Mean: ±0.247826
Standard Dev: ±2.701519
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor 

### AIRBUS

In [0]:
printFullAlgoPerformance("AIR_AIRBUS.txt", "Airbus", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2003

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Airbus is
Mean: ±0.024169
Standard Dev: ±1.003630
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...

The performance of the algorithm using dynamic predictor on Airbus is
Mean: ±0.008608
Standard Dev: ±1.051960
--------------------------------------------------------------------------------
Model used: GMM, features used: [money_flow_index, log_total_traded_vol]

Backtesting performance of static predictor using segmented data...

The performance of the algorithm using static predictor with segmented data on Airbus is
Mean: ±0.029627
Standard Dev: ±1.120599
--------------------------------------------------------------------------------
Backtesting performance of dynamic predictor using segmented data ...

The performance of the algorithm using dynamic predictor with segmented 

### NOVARTIS

In [0]:
printFullAlgoPerformance("NOVARTIS.txt", "Novartis", ["money_flow_index", "log_total_traded_vol"], from_year=1900)

Using data from 2008

Backtesting performance with static predictor...

The performance of the algorithm using static predictor on Novartis is
Mean: ±0.064624
Standard Dev: ±0.604136
--------------------------------------------------------------------------------
Backtesting performance with dynamic predictor...



ValueError: ignored

## Test all files with daily returns

In [0]:
for r, d, f in os.walk("/"+DATAPATH+"/"):
  for file in f:
    if ".txt" in file:
      printFullAlgoPerformance(file, file.split(".")[0], ["daily_log_return"], from_year=1900, dynamic_predictor=False)

In [12]:
filepath

'.\\Data\\E-MINI-SP500.txt'

In [11]:
filepath = os.path.join(DATAPATH, "E-MINI-SP500.txt")

In [15]:
emini = pd.read_csv("file:///" + filepath, parse_dates=[['<DTYYYYMMDD>', '<TIME>']])

URLError: <urlopen error [WinError 3] The system cannot find the path specified: '\\.\\Data\\E-MINI-SP500.txt'>

In [24]:
df = pd.read_csv("file:///" + os.path.join(DATAPATH, "AAPL.txt"), parse_dates=[['<DTYYYYMMDD>', '<TIME>']])

In [26]:
apple = formatData(df)

In [27]:
apple

Unnamed: 0_level_0,open,high,low,close,volume,date,time
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-01-02 09:32:00,1.06,1.07,1.06,1.07,989800,2001-01-02,09:32:00
2001-01-02 09:33:00,1.07,1.07,1.06,1.07,205800,2001-01-02,09:33:00
2001-01-02 09:34:00,1.07,1.07,1.06,1.07,644000,2001-01-02,09:34:00
2001-01-02 09:35:00,1.07,1.07,1.06,1.07,404600,2001-01-02,09:35:00
2001-01-02 09:36:00,1.07,1.07,1.06,1.06,771400,2001-01-02,09:36:00
...,...,...,...,...,...,...,...
2020-03-06 15:56:00,289.04,289.64,288.79,288.81,209965,2020-03-06,15:56:00
2020-03-06 15:57:00,288.82,289.67,288.82,289.63,231190,2020-03-06,15:57:00
2020-03-06 15:58:00,289.61,290.82,289.56,289.83,420556,2020-03-06,15:58:00
2020-03-06 15:59:00,289.81,290.12,289.31,289.46,335329,2020-03-06,15:59:00


In [28]:
total_traded_vol = np.exp(get_log(get_total_traded_vol(apple))[VOLUME]).rename("log_total_traded_vol")

In [31]:
apple.groupby("date").sum()[VOLUME]

date
2001-01-02    108355800
2001-01-03    195965000
2001-01-04    174483400
2001-01-05     96156200
2001-01-08     88020800
                ...    
2020-03-02     59736664
2020-03-03     52706848
2020-03-04     39823037
2020-03-05     35286759
2020-03-06     38827169
Name: volume, Length: 4821, dtype: int64

In [29]:
total_traded_vol

date_time
2001-01-03    195965000.0
2001-01-04    174483400.0
2001-01-05     96156200.0
2001-01-08     88020800.0
2001-01-09    139378400.0
                 ...     
2020-03-02     59736664.0
2020-03-03     52706848.0
2020-03-04     39823037.0
2020-03-05     35286759.0
2020-03-06     38827169.0
Freq: B, Name: log_total_traded_vol, Length: 5003, dtype: float64

# COMPARACIÓN

In [8]:
ticker = "BMW"
from_year = 1900
filepath = os.path.join(DATAPATH, "BMW_BMW.txt")
df = pd.read_csv("file:///" + filepath, parse_dates=[['<DTYYYYMMDD>', '<TIME>']])
formatted_df = formatData(df)

first_year = formatted_df.index[0].year
if from_year >= first_year:
    formatted_df = formatted_df[formatted_df.index.year >= from_year]

print("Backtesting {}. Using data from {}\n".format(ticker, formatted_df.index[0].year))

enough_data = True

last_year = formatted_df.index.year.unique()[-1]
algo_df = prepareDataframe(formatted_df)
train = algo_df[algo_df.index.year < last_year]
test = algo_df[algo_df.index.year >= last_year]

## If test data has less than 150 business days, use previous year
if len(test) < 150*102:
    train = algo_df[algo_df.index.year < (last_year - 1)]
    test = algo_df[algo_df.index.year >= (last_year - 1)]
    enough_data = False

Backtesting BMW. Using data from 2003



In [10]:
algo_df

Unnamed: 0_level_0,open,high,low,close,volume,volume*price,vwap
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-06-20 09:00:00,127.45,127.51,127.05,127.34,19577,623660.90,31.856817
2003-06-20 09:05:00,126.82,126.93,126.73,126.88,19388,614962.85,31.718736
2003-06-20 09:10:00,158.48,158.54,158.45,158.53,12301,390105.50,31.713316
2003-06-20 09:15:00,158.53,158.63,158.48,158.63,10060,319272.22,31.736801
2003-06-20 09:20:00,127.50,127.62,127.42,127.46,25388,809844.84,31.898725
...,...,...,...,...,...,...,...
2019-10-11 17:05:00,323.53,323.64,323.46,323.60,24024,1555564.96,64.750456
2019-10-11 17:10:00,323.91,323.96,323.81,323.87,15016,972629.02,64.772844
2019-10-11 17:15:00,324.06,324.18,323.98,324.03,33398,2164312.07,64.803643
2019-10-11 17:20:00,324.00,324.15,323.92,324.04,55348,3585546.75,64.781867
