In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_json('data/ETH_USDT-1h.json')
df.columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
# convert timestamp to datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
# set timestamp as index
df.set_index('timestamp', inplace=True)

print(df.shape)

df.head()

(48797, 5)


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-17 04:00:00,301.13,302.57,298.0,301.61,125.66877
2017-08-17 05:00:00,301.61,303.28,300.0,303.1,377.67246
2017-08-17 06:00:00,302.4,304.44,301.9,302.68,303.86672
2017-08-17 07:00:00,302.68,307.96,302.6,307.96,754.7451
2017-08-17 08:00:00,307.95,309.97,307.0,308.62,150.75029


In [3]:
# based on https://towardsdatascience.com/advanced-candlesticks-for-machine-learning-i-tick-bars-a8b93728b4c5
def get_tick_bars(prices: np.ndarray, vols: np.ndarray,
                  times: np.ndarray, freq: int) -> np.ndarray:
    bars = np.zeros(shape=(len(range(freq, len(prices), freq)), 6), dtype=object)
    ind = 0
    for i in range(freq, len(prices), freq):
        bars[ind][0] = pd.Timestamp(times[i - 1])          # time
        bars[ind][1] = prices[i - freq]                    # open
        bars[ind][2] = np.max(prices[i - freq: i])         # high
        bars[ind][3] = np.min(prices[i - freq: i])         # low
        bars[ind][4] = prices[i - 1]                       # close
        bars[ind][5] = np.sum(vols[i - freq: i])           # volume
        ind += 1
    return bars


def get_volume_bars(prices: np.ndarray, vols: np.ndarray,
                    times: np.ndarray, bar_vol: int) -> np.ndarray:
    bars = np.zeros(shape=(len(prices), 6), dtype=object)
    ind = 0
    last_tick = 0
    cur_volume = 0
    for i in range(len(prices)):
        cur_volume += vols[i]
        if cur_volume >= bar_vol:
            bars[ind][0] = pd.Timestamp(times[i - 1])            # time
            bars[ind][1] = prices[last_tick]                     # open
            bars[ind][2] = np.max(prices[last_tick: i + 1])      # high
            bars[ind][3] = np.min(prices[last_tick: i + 1])      # low
            bars[ind][4] = prices[i]                             # close
            bars[ind][5] = np.sum(vols[last_tick: i + 1])        # volume
            cur_volume = 0
            last_tick = i + 1
            ind += 1
    return bars[:ind]

def get_dollar_bars(prices: np.ndarray, vols: np.ndarray,
                    times: np.ndarray, bar_sum: int) -> np.ndarray:
    bars = np.zeros(shape=(len(prices), 6), dtype=object)
    ind = 0
    last_tick = 0
    cur_sum = 0
    for i in range(len(prices)):
        cur_sum += vols[i] * prices[i]
        if cur_sum >= bar_sum:
            bars[ind][0] = pd.Timestamp(times[i - 1])            # time
            bars[ind][1] = prices[last_tick]                     # open
            bars[ind][2] = np.max(prices[last_tick: i + 1])      # high
            bars[ind][3] = np.min(prices[last_tick: i + 1])      # low
            bars[ind][4] = prices[i]                             # close
            bars[ind][5] = np.sum(vols[last_tick: i + 1])        # volume
            cur_sum = 0
            last_tick = i + 1
            ind += 1
    return bars[:ind]

# symmetrical CUSUM filter
def getTEvents(gRaw: pd.Series, h: float) -> np.ndarray:
    gRaw = gRaw[~gRaw.index.duplicated(keep='first')]
    tEvents, sPos, sNeg = [], 0, 0
    diff = gRaw.diff()
    for i in diff.index[1:]:
        sPos, sNeg = max(0, sPos + diff.loc[i]), min(0, sNeg + diff.loc[i])
        if sNeg < -h:
            sNeg = 0
            tEvents.append(i)
        elif sPos > h:
            sPos = 0
            tEvents.append(i)
    return pd.DatetimeIndex(tEvents)

def pcaWeights(cov: np.ndarray, riskDist: np.ndarray = None,
               riskTarget: float = 1.) -> np.ndarray:
    eVal, eVec = np.linalg.eigh(cov)
    indices = eVal.argsort()[::-1]
    eVal, eVec = eVal[indices], eVec[:, indices]    # sorting by decreasing eVal (i.e. decreasing variance)
    if riskDist is None:
        riskDist = np.zeros(cov.shape[0])
        riskdist[-1] = 1.
    loads = riskTarget * (riskDist / eVal) ** 0.5
    weights = np.dot(eVec, np.reshape(loads, (-1, 1)))
    return weights

def custom_bars(bar_data, type_bar='volume_bar'):
    timestamps = []
    opens = []
    highs = []
    lows = []
    closes = []
    volumes = []

    for item in bar_data:
        timestamps.append(item[0]) # timestamp
        opens.append(item[1]) # open
        highs.append(item[2]) # high
        lows.append(item[3]) # low
        closes.append(item[4]) # close 
        volumes.append(item[5]) # volume

    data = {
        'timestamp': timestamps, 
        f'{type_bar}_open': opens,
        f'{type_bar}_high': highs,
        f'{type_bar}_low': lows,
        f'{type_bar}_close': closes,
        f'{type_bar}_volume': volumes
        }    

    d = pd.DataFrame(data)
    d['timestamp'] = pd.to_datetime(d['timestamp'], unit='ms')
    # set timestamp as index
    d.set_index('timestamp', inplace=True)
    print(d.shape)

    return d

In [4]:
# 10 trades per bar
## Tick bar
tick_bar = get_tick_bars(df['close'].values, df['volume'].values, df.index, 10)
df_tick_bar = custom_bars(tick_bar, type_bar='tick_bar')   
df_tick_bar.tail()

(4879, 5)


Unnamed: 0_level_0,tick_bar_open,tick_bar_high,tick_bar_low,tick_bar_close,tick_bar_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-03-15 16:00:00,1705.13,1705.13,1616.01,1616.01,547269.3044
2023-03-16 02:00:00,1629.52,1653.8,1629.52,1649.76,318522.1321
2023-03-16 12:00:00,1646.31,1664.56,1640.44,1650.42,263076.1065
2023-03-16 22:00:00,1657.16,1679.64,1656.25,1672.69,275783.7935
2023-03-17 08:00:00,1673.73,1724.84,1665.13,1711.98,326100.8736


In [5]:
# 30 contracts per bar
## Volume bar
volume_bar = get_volume_bars(df['close'].values, df['volume'].values, df.index, 30)
df_volume_bar = custom_bars(volume_bar, type_bar='volume_bar')   
df_volume_bar.tail()

(48721, 5)


Unnamed: 0_level_0,volume_bar_open,volume_bar_high,volume_bar_low,volume_bar_close,volume_bar_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-03-17 10:00:00,1761.12,1761.12,1761.12,1761.12,53441.6766
2023-03-17 11:00:00,1737.12,1737.12,1737.12,1737.12,43166.3812
2023-03-17 12:00:00,1736.0,1736.0,1736.0,1736.0,33845.654
2023-03-17 13:00:00,1711.13,1711.13,1711.13,1711.13,56422.2281
2023-03-17 14:00:00,1717.41,1717.41,1717.41,1717.41,32321.0631


In [6]:
# $30,000 per bar
## Dollar bar
dollar_bar = get_dollar_bars(df['close'].values, df['volume'].values, df.index, 30000)
df_dollar_bar = custom_bars(dollar_bar, type_bar='dollar_bar')   
df_dollar_bar.tail()

(48521, 5)


Unnamed: 0_level_0,dollar_bar_open,dollar_bar_high,dollar_bar_low,dollar_bar_close,dollar_bar_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-03-17 10:00:00,1761.12,1761.12,1761.12,1761.12,53441.6766
2023-03-17 11:00:00,1737.12,1737.12,1737.12,1737.12,43166.3812
2023-03-17 12:00:00,1736.0,1736.0,1736.0,1736.0,33845.654
2023-03-17 13:00:00,1711.13,1711.13,1711.13,1711.13,56422.2281
2023-03-17 14:00:00,1717.41,1717.41,1717.41,1717.41,32321.0631


In [15]:
dataframes = [
    df, 
#     df_tick_bar,
    df_volume_bar, 
    df_dollar_bar
]

result = pd.concat(dataframes, axis=1).sort_values(by='timestamp')#.reset_index(drop=True)
# result['timestamp'] = pd.to_datetime(result['timestamp'], unit='ms')

result.tail(50)

Unnamed: 0_level_0,open,high,low,close,volume,volume_bar_open,volume_bar_high,volume_bar_low,volume_bar_close,volume_bar_volume,dollar_bar_open,dollar_bar_high,dollar_bar_low,dollar_bar_close,dollar_bar_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-03-15 14:00:00,1666.82,1677.64,1628.94,1654.97,98646.4632,1634.04,1634.04,1634.04,1634.04,77394.4955,1634.04,1634.04,1634.04,1634.04,77394.4955
2023-03-15 15:00:00,1654.97,1656.49,1625.56,1634.04,77394.4955,1616.01,1616.01,1616.01,1616.01,76729.8382,1616.01,1616.01,1616.01,1616.01,76729.8382
2023-03-15 16:00:00,1634.03,1644.09,1612.25,1616.01,76729.8382,1629.52,1629.52,1629.52,1629.52,50539.3933,1629.52,1629.52,1629.52,1629.52,50539.3933
2023-03-15 17:00:00,1616.01,1632.3,1611.0,1629.52,50539.3933,1647.11,1647.11,1647.11,1647.11,59254.9983,1647.11,1647.11,1647.11,1647.11,59254.9983
2023-03-15 18:00:00,1629.52,1648.53,1620.58,1647.11,59254.9983,1647.99,1647.99,1647.99,1647.99,55779.9301,1647.99,1647.99,1647.99,1647.99,55779.9301
2023-03-15 19:00:00,1647.1,1655.41,1634.89,1647.99,55779.9301,1647.04,1647.04,1647.04,1647.04,23227.7264,1647.04,1647.04,1647.04,1647.04,23227.7264
2023-03-15 20:00:00,1648.0,1653.17,1637.72,1647.04,23227.7264,1653.8,1653.8,1653.8,1653.8,14007.0757,1653.8,1653.8,1653.8,1653.8,14007.0757
2023-03-15 21:00:00,1647.04,1654.4,1646.0,1653.8,14007.0757,1648.29,1648.29,1648.29,1648.29,22417.3934,1648.29,1648.29,1648.29,1648.29,22417.3934
2023-03-15 22:00:00,1653.79,1664.64,1641.12,1648.29,22417.3934,1649.96,1649.96,1649.96,1649.96,18177.7325,1649.96,1649.96,1649.96,1649.96,18177.7325
2023-03-15 23:00:00,1648.28,1655.12,1644.66,1649.96,18177.7325,1640.18,1640.18,1640.18,1640.18,32158.7723,1640.18,1640.18,1640.18,1640.18,32158.7723
