In [1]:
%%capture
%cd '../../src'

import os
import glob
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_dir = '../data/raw'

In [3]:
ohlc_dict = {
    'Open'  :'first',
    'High'  :'max',
    'Low'   :'min',
    'Close' :'last',
    'Volume':'sum'
    }

In [4]:
file = 'XRP_1m.pqt'
df = pd.read_parquet(f'{data_dir}/{file}')
df['Datetime'] = pd.to_datetime(df['Datetime'])
df.sort_values(by = 'Datetime', ascending = True, inplace = True)
df.head(5)

Unnamed: 0,Datetime,Open,High,Low,Close,Volume
0,2020-01-06 08:21:00,0.197,0.197,0.197,0.197,6111.6
1,2020-01-06 08:22:00,0.197,0.197,0.197,0.197,0.0
2,2020-01-06 08:23:00,0.197,0.197,0.197,0.197,0.0
3,2020-01-06 08:24:00,0.197,0.197,0.197,0.197,0.0
4,2020-01-06 08:25:00,0.197,0.197,0.197,0.197,0.0


In [5]:
# sns.lineplot(data = df, x = 'Datetime', y = 'Close')
# plt.show()

In [6]:
df['Datetime'].min(), df['Datetime'].max()

(Timestamp('2020-01-06 08:21:00'), Timestamp('2023-10-13 23:59:00'))

In [7]:
horizon_def = {
    'in_freq'       : 120,           # minutes
    'pct_thresh'    : 2/100.,
    'lookahead'     : 60 * 24,       # minutes
    }

in_freq = horizon_def['in_freq']
pct_thresh = horizon_def['pct_thresh']
lookahead = math.ceil(horizon_def['lookahead'] / in_freq)
print(f'look ahead time periods: {lookahead}')

look ahead time periods: 12


In [8]:
in_freqs = [1, 3, 5, 10] + list(range(15, 121, 15))
lookaheads = [60 * h for h in range(1, 26)]
#in_freqs, lookaheads

In [9]:
# Resample to desired time interval
df = df.resample(f'{in_freq}T', on = 'Datetime').agg(ohlc_dict)
df.reset_index(inplace = True, drop = False)

# Long labeling
df['tgt_long'] = df['Close'] * (1 + pct_thresh)
df['rolling'] = df['Close'].rolling(lookahead).max().shift(-lookahead)
df['label_long'] = (df['rolling'] >= df['tgt_long']) * 1

# Short labeling
df['tgt_short'] = df['Close'] * (1 - pct_thresh)
df['rolling'] = df['Close'].rolling(lookahead).min().shift(-lookahead)
df['label_short'] = (df['rolling'] <= df['tgt_short']) * 1

df.drop(columns = ['tgt_long', 'tgt_short', 'rolling'], inplace = True)

df.head(10)

Unnamed: 0,Datetime,Open,High,Low,Close,Volume,label_long,label_short
0,2020-01-06 08:00:00,0.197,0.2222,0.197,0.2127,6682635.7,1,0
1,2020-01-06 10:00:00,0.2127,0.2198,0.2105,0.213,7313724.3,1,0
2,2020-01-06 12:00:00,0.213,0.218,0.2108,0.212,8844217.7,1,0
3,2020-01-06 14:00:00,0.212,0.2152,0.2109,0.2124,9118517.3,1,1
4,2020-01-06 16:00:00,0.2124,0.2209,0.2115,0.2186,20605594.5,0,1
5,2020-01-06 18:00:00,0.2186,0.2206,0.2172,0.2194,6783720.5,0,1
6,2020-01-06 20:00:00,0.2195,0.2257,0.2175,0.2179,17405364.1,1,1
7,2020-01-06 22:00:00,0.2179,0.2259,0.2169,0.2226,15268444.8,0,1
8,2020-01-07 00:00:00,0.2227,0.2297,0.2191,0.2224,12309719.5,0,1
9,2020-01-07 02:00:00,0.2224,0.2238,0.2183,0.2196,6355407.5,0,1


In [10]:
def label_consolidator(row):
    lng = row['label_long']
    sht = row['label_short']
    
    combined = lng + sht
    if combined == 2:
        # anything can possible
        label = 'either'
    elif combined == 1:
        label = 'long' if lng > sht else 'short'
    elif combined == 0:
        label = 'neither'
    else: 
        label = None
        
    return label

In [11]:
df['label'] = df.apply(lambda r: label_consolidator(r), axis = 1)

In [12]:
print('Long  :', (df['label_long'].value_counts() / len(df) * 100).to_dict())
print('Short :', (df['label_short'].value_counts() / len(df) * 100).to_dict())

Long  : {0: 58.67433414043583, 1: 41.32566585956417}
Short : {0: 58.19007263922518, 1: 41.809927360774815}


In [13]:
df['label'].value_counts()

short      5511
long       5431
neither    4182
either     1396
Name: label, dtype: int64

In [14]:
df['label'].value_counts() / len(df)

short      0.333596
long       0.328753
neither    0.253148
either     0.084504
Name: label, dtype: float64