In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import plotly_express as px
import scipy.signal
from sklearn.preprocessing import MinMaxScaler
from func_tools import plot_labels_line, get_strategy_pnl,label_insights

## Data Import

In [2]:
experiments_folder = 'Experiments'
cache_folder = f'{experiments_folder}/cache'

pair = 'USDT_BTC'
lob_depth = 10
frequency = timedelta(seconds=10)
frequency_seconds = int(frequency.total_seconds())
date_start = '2020_04_04'
date_end = '2021_01_03'
norm_type = 'dyn_z_score'
k_plus = 15
k_minus = 15
alpha = 0 # Zero to allow only for 1 and -1 labels
trading_fee=0.000712
roll = 7200 * 6

In [3]:
# Data import - needs to be adjusted importing from several files using Dask
input_file_name = f'{pair}--{lob_depth}lev--{frequency_seconds}sec--{date_start}--{date_end}.csv.gz'

normalized_train_file = f'{cache_folder}/{pair}/TRAIN--{norm_type}-{roll}--{input_file_name}'
normalized_test_file = f'{cache_folder}/{pair}/TEST--{norm_type}-{roll}--{input_file_name}'

top_ob_train_file = f'{cache_folder}/{pair}/TRAIN_TOP--{input_file_name}'
top_ob_test_file = f'{cache_folder}/{pair}/TEST_TOP--{input_file_name}'

#if os.path.isfile(normalized_test_file): # testing for one of cache files, assuming all were saved 
print(f'Reading cached {normalized_train_file}')
train_dyn_df = pd.read_csv(normalized_train_file)
# print(f'Reading cached {normalized_test_file}')
# test_dyn_df = pd.read_csv(normalized_test_file)

print(f'Reading cached {top_ob_train_file}')
top_ob_train = pd.read_csv(top_ob_train_file)
# print(f'Reading cached {top_ob_test_file}')
# top_ob_test = pd.read_csv(top_ob_test_file)


# get dynamic mid price for get label
top_ob_train_dyn = train_dyn_df[train_dyn_df['Level'] == 0]
top_ob_train_dyn['Mid_Price'] = (top_ob_train_dyn['Ask_Price'] + top_ob_train_dyn['Bid_Price']) / 2
mid_px_train_dyn = top_ob_train_dyn.reset_index()['Mid_Price']

# get actual mid price
mid_px_train = top_ob_train['Mid_Price']

Reading cached Experiments/cache/USDT_BTC/TRAIN--dyn_z_score-43200--USDT_BTC--10lev--10sec--2020_04_04--2021_01_03.csv.gz
Reading cached Experiments/cache/USDT_BTC/TRAIN_TOP--USDT_BTC--10lev--10sec--2020_04_04--2021_01_03.csv.gz


### Savitzky–Golay filter (from Wiki)
A Savitzky–Golay filter is a digital filter that can be applied to a set of digital data points for the purpose of smoothing the data, that is, to increase the precision of the data without distorting the signal tendency. This is achieved, in a process known as convolution, by fitting successive sub-sets of adjacent data points with a low-degree polynomial by the method of linear least squares. When the data points are equally spaced, an analytical solution to the least-squares equations can be found, in the form of a single set of "convolution coefficients" that can be applied to all data sub-sets, to give estimates of the smoothed signal, (or derivatives of the smoothed signal) at the central point of each sub-set. 

### Local regression (from Wiki)
Local regression or local polynomial regression,[1] also known as moving regression,[2] is a generalization of moving average and polynomial regression.[3] Its most common methods, initially developed for scatterplot smoothing, are LOESS (locally estimated scatterplot smoothing) and LOWESS (locally weighted scatterplot smoothing), both pronounced /ˈloʊɛs/. They are two strongly related non-parametric regression methods that combine multiple regression models in a k-nearest-neighbor-based meta-model. Outside econometrics, LOESS is known and commonly referred to as Savitzky–Golay filter [4][5] (proposed 15 years before LOESS). 

In [None]:
# old approach with averages

# span_plus = 30
# span_minus = 60
# labels, minus_smooth, plus_smooth = get_labels(savgol_mid_px_3_41, span_plus, span_minus, a, technique='ema', long_only=False, return_smooth=True)
#labels = labels.fillna(0)

## Labelling

### Step 1 - labels based on smoothed signal direction

In [73]:
class Labels_Generator:
    def __init__(self, mid_px):
        self.mid_px = mid_px

    def get_smooth_px(self):
        print('step1')
        # smooth prices - Savitzky–Golay filter
        smooth_px = pd.Series(scipy.signal.savgol_filter(self.mid_px, 31, 1))
        return smooth_px

    def get_norm_smooth_px(self):
        # scale smoothed time series (squash between 0 and 1 with min max scaler)
        #smooth_px = self.get_smooth_px()
        values = self.get_smooth_px().values.reshape(self.mid_px.shape[0],1)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler = scaler.fit(values)
        norm_smooth_px = pd.Series(scaler.transform(values).reshape(values.shape[0]))
        return norm_smooth_px

    def get_raw_labels(self):
        ''' Engine to generate basic labels '''
        # first level difference - smoothed time series direction
        a = 0
        d = np.diff(self.get_norm_smooth_px())

        # label based on the direction
        self.labels = pd.Series(np.where(d>a, 1, np.where(d<a, -1, 0)),  name='labels')
        return self.labels

    def get_cleaned_labels(self, **kwargs):
        ''' First step cleaning '''
        # recap dataframe
        df_trades = get_strategy_pnl(self.mid_px, self.labels)[['trade_grouper', 'labels', 'trade_len', 'gross_returns']].dropna(subset=['gross_returns'])

        # locate short unprofitable labels, replace them with NAs and fill them with prev label values
        df_trades['cleaned_labels'] = df_trades['labels']
        query = ' & '.join([f'`{k}`<={v}' for k, v in kwargs.items()])
        print(df_trades.query(query))
        df_trades.loc[df_trades.query(query)['cleaned_labels']] = pd.NA
        df_trades['cleaned_labels'].fillna(method='ffill', inplace=True)


        # expand table trades - one row per trade -  back into a full labels timeseries
        cleaned_labels = np.empty(self.labels.shape[0])
        cleaned_labels[:] = np.nan
        cleaned_labels = pd.Series(cleaned_labels, name='cleaned_labels')
        cleaned_labels.loc[df_trades['trade_grouper']] = df_trades['cleaned_labels']
        self.labels = cleaned_labels.fillna(method='ffill')

In [74]:
labels_generator = Labels_Generator(mid_px_train)
labels_generator.get_raw_labels()
label_insights(labels_generator.labels)


step1
Labels shape: (1626046,)
Labels: [-1  0  1] 
Count: [791924  32486 801636] 
Pctg: [0.48702435 0.01997852 0.49299712]
Number of trades: 100618


100618

In [75]:
labels_generator.get_cleaned_labels(gross_returns=0.002, trade_len=20)

Total non zero trades: 97847, sum of returns: 51.95, average return: 0.000531
         trade_grouper  labels  trade_len  gross_returns  cleaned_labels
81                81.0      -1          1      -0.000419              -1
225              225.0       1          1      -0.000020               1
226              226.0      -1         20      -0.000574              -1
246              246.0       1          3      -0.000504               1
298              298.0       1          3       0.000513               1
...                ...     ...        ...            ...             ...
1625799      1625799.0       0         16       0.000000               0
1625847      1625847.0      -1          1       0.000000              -1
1625848      1625848.0       1          1       0.000000               1
1625849      1625849.0       0         20       0.000000               0
1625942      1625942.0       0         17       0.000000               0

[70940 rows x 5 columns]


KeyError: '[1, -1] not in index'

In [52]:
my_dict = {'gross_returns':0.002, 'trade_len':20}

In [59]:
query2 = ' & '.join([f'`{k}`>={v}' for k, v in my_dict.items()])

In [62]:
my_dict.items()

dict_items([('gross_returns', 0.002), ('trade_len', 20)])

In [64]:
df_trades.query(query2)['cleaned_labels']

Unnamed: 0,trade_grouper,labels,trade_len,gross_returns
0,0.0,-1,50,0.002893
82,82.0,1,35,0.002137
117,117.0,-1,50,0.002276
167,167.0,1,35,0.002019
249,249.0,-1,49,0.005751
...,...,...,...,...
1624429,1624429.0,1,31,0.002825
1624519,1624519.0,-1,85,0.004590
1625191,1625191.0,-1,31,0.002055
1625222,1625222.0,1,102,0.002171


In [57]:
df_trades.query(query)

Unnamed: 0,trade_grouper,labels,trade_len,gross_returns
0,0.0,-1,50,0.002893
82,82.0,1,35,0.002137
117,117.0,-1,50,0.002276
167,167.0,1,35,0.002019
249,249.0,-1,49,0.005751
...,...,...,...,...
1624429,1624429.0,1,31,0.002825
1624519,1624519.0,-1,85,0.004590
1625191,1625191.0,-1,31,0.002055
1625222,1625222.0,1,102,0.002171


In [58]:
df_trades

Unnamed: 0,trade_grouper,labels,trade_len,gross_returns
0,0.0,-1,50,0.002893
50,50.0,1,31,0.001049
81,81.0,-1,1,-0.000419
82,82.0,1,35,0.002137
117,117.0,-1,50,0.002276
...,...,...,...,...
1625869,1625869.0,-1,42,0.001901
1625911,1625911.0,1,31,0.000507
1625942,1625942.0,0,17,0.000000
1625959,1625959.0,1,31,0.001156


In [56]:
query = ' & '.join([f'{k}>{v}' for k, v in my_dict.items()])

In [None]:
# replace remaining unprofitable trades with zeros
df_trades2['cleaned_labels'] = df_trades2['labels']
df_trades2.loc[df_trades2['gross_returns'] <= 0.0020, 'cleaned_labels'] = pd.NA
df_trades2['cleaned_labels'].fillna(0, inplace=True)

# transform table trades back into a full labels timeseries
cleaned_labels2 = np.empty(labels.shape[0])
cleaned_labels2[:] = np.nan
cleaned_labels2 = pd.Series(cleaned_labels2, name='cleaned_labels')
cleaned_labels2.loc[df_trades2['trade_grouper']] = df_trades2['cleaned_labels']
cleaned_labels2 = cleaned_labels2.fillna(method='ffill')

# print step 3 label insights
label_insights(cleaned_labels2)
df_strategy_3 = get_strategy_pnl(mid_px_train, cleaned_labels2)
df_trades3 = df_strategy_3[['trade_grouper', 'labels', 'trade_len', 'gross_returns']].dropna()

In [46]:
labels_generator.get_raw_labels()

step1
step1


0         -1
1         -1
2         -1
3         -1
4         -1
          ..
1626041   -1
1626042   -1
1626043   -1
1626044   -1
1626045   -1
Name: labels, Length: 1626046, dtype: int64

In [49]:
label_insights(labels_generator.labels)

Labels shape: (1626046,)
Labels: [-1.  0.  1.] 
Count: [796708  21557 807781] 
Pctg: [0.48996646 0.01325731 0.49677623]
Number of trades: 20553


20553

In [48]:
labels_generator.get_cleaned_labels()

Total non zero trades: 97847, sum of returns: 51.95, average return: 0.000531


In [43]:
a = 0.0 # threshold

# smooth prices - Savitzky–Golay filter
savgol_mid_px = pd.Series(scipy.signal.savgol_filter(mid_px_train, 31, 1)) 

# scale smoothed time series (squash between 0 and 1 with min max scaler)
values = savgol_mid_px.values.reshape(savgol_mid_px.shape[0],1)
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(values)
normalized_savgol = scaler.transform(values).reshape(values.shape[0])

# first level difference - smoothed time series direction
d = np.diff(normalized_savgol)

# label based on the direction
labels = pd.Series(np.where(d>a, 1, np.where(d<a, -1, 0)),  name='labels')
label_insights(labels.dropna())

# recap dataframe
df_strategy = get_strategy_pnl(mid_px_train, labels)

#only keep one raw per trade (gross return NAs)
df_trades = df_strategy[['trade_grouper', 'labels', 'trade_len', 'gross_returns']].dropna(subset=['gross_returns'])

# plot trades 1 histogram
histo_trades = px.histogram(x=df_trades['gross_returns'], title='All trades')
histo_trades.show()

Labels shape: (1626046,)
Labels: [-1  0  1] 
Count: [791924  32486 801636] 
Pctg: [0.48702435 0.01997852 0.49299712]
Number of trades: 100618
Total non zero trades: 97847, sum of returns: 51.95, average return: 0.000531


### Step 2 - replace short and unprofitable labels with previous values
#### short labels seem to interrupt longer trends with no reason

In [None]:
# locate short unprofitable labels, replace them with NAs and fill them with prev label values
df_trades['cleaned_labels'] = df_trades['labels']
df_trades.loc[(df_trades['trade_len'] <= 20) & (df_trades['gross_returns'] <= 0.002), 'cleaned_labels'] = pd.NA
df_trades['cleaned_labels'].fillna(method='ffill', inplace=True)

#df_trades.groupby('cleaned_labels')['trade_len'].sum()

# transform table trades - one row per trade -  back into a full labels timeseries
cleaned_labels = np.empty(labels.shape[0])
cleaned_labels[:] = np.nan
cleaned_labels = pd.Series(cleaned_labels, name='cleaned_labels')
cleaned_labels.loc[df_trades['trade_grouper']] = df_trades['cleaned_labels']
cleaned_labels = cleaned_labels.fillna(method='ffill')

# print step 2 label insights
label_insights(cleaned_labels)
df_strategy_2 = get_strategy_pnl(mid_px_train, cleaned_labels)
df_trades2 = df_strategy_2[['trade_grouper', 'labels', 'trade_len', 'gross_returns']].dropna()

In [None]:
# plot trades 2 histogram
histo_trades2 = px.histogram(x=df_trades2[df_trades2['gross_returns']!=0]['gross_returns'], title='All trades', histfunc='count')
histo_trades2.show()

# plot a portion of the timeseries at step 2
end= 30000
start= 0
plot_labels_line(mid_px_train[start:end], cleaned_labels[start:end], title='Testing2', 
savgol_mid_px=savgol_mid_px[start:end]
)

### Step 3 - take all remaining trades with small return and fill those with zero labels
#### The shorter trades would have already been replaced at step 2. The remaining ones are
#### too long to just be "attached" to the previous labels and should be defined neither as buy nor as sell

In [None]:
# replace remaining unprofitable trades with zeros
df_trades2['cleaned_labels'] = df_trades2['labels']
df_trades2.loc[df_trades2['gross_returns'] <= 0.0020, 'cleaned_labels'] = pd.NA
df_trades2['cleaned_labels'].fillna(0, inplace=True)

# transform table trades back into a full labels timeseries
cleaned_labels2 = np.empty(labels.shape[0])
cleaned_labels2[:] = np.nan
cleaned_labels2 = pd.Series(cleaned_labels2, name='cleaned_labels')
cleaned_labels2.loc[df_trades2['trade_grouper']] = df_trades2['cleaned_labels']
cleaned_labels2 = cleaned_labels2.fillna(method='ffill')

# print step 3 label insights
label_insights(cleaned_labels2)
df_strategy_3 = get_strategy_pnl(mid_px_train, cleaned_labels2)
df_trades3 = df_strategy_3[['trade_grouper', 'labels', 'trade_len', 'gross_returns']].dropna()

In [None]:
# plot trades 3 histogram
histo_trades3 = px.histogram(x=df_trades3[df_trades3['gross_returns']!=0]['gross_returns'], title='All trades', histfunc='count')
histo_trades3.show()

# plot a portion of the timeseries at step 3
end=30000
start=0
plot_labels_line(mid_px_train[start:end], cleaned_labels2[start:end], title='Testing3', 
savgol_mid_px=savgol_mid_px[start:end]
)

### Improvements:
#### explore ho looking at 2nd, 3d 5th difference rather than first would impact labels
#### handle labels where step 2 rather than linking same sign labels, are just delaying it, deserving a zero instead
#### different parameters (min profitability and length) for day and night. Nights tend to be less volatile than days

## Exploratory data analysis - experiments

In [None]:
from fbprophet import Prophet

In [None]:
label_pred_df = top_ob_train[['Datetime', 'Mid_Price']].copy() # create a copy of the df
label_pred_df['Datetime'] = pd.to_datetime(label_pred_df['Datetime'])
#label_pred_df = label_pred_df.set_index('Datetime')
label_pred_df.columns = ['ds', 'y']

In [None]:
label_pred_df.head()

In [None]:
m = Prophet(changepoint_prior_scale=0.01).fit(label_pred_df[:20000])
future = m.make_future_dataframe(periods=60, freq='10s')
fcst = m.predict(future)
fig = m.plot(fcst)

In [None]:
fig = m.plot_components(fcst)

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# prepare data
data = mid_px_train.values
# create class
model = ExponentialSmoothing(data, ...)
# fit model
model_fit = model.fit(...)
# make predictions
yhat = model_fit.predict(...)

In [None]:
# try smoothing for labels
# try to smooth pct probg pred

## Datashader - visualize very large datasets

In [None]:
import datashader as ds
from collections import OrderedDict
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [None]:
# Default plot ranges:
x_range = (0, df_profit_opt.shape[0])
y_range = (0.8*df_profit_opt['px'].min(), 1.2*df_profit_opt['px'].max())

In [None]:
%%time
cvs = ds.Canvas(x_range=x_range, y_range=y_range, plot_height=900, plot_width=2400)
cols = ['px', 'cleaned_labels']
aggs= OrderedDict((c, cvs.line(df_profit_opt.reset_index(), 'index', c)) for c in cols)
img = ds.transfer_functions.shade(aggs['px'])


In [None]:
img

In [None]:
arr = np.array(img)
z = arr.tolist()
dims = len(z[0]), len(z)

In [None]:
data = [ dict( 
        z = z, 
        x = np.linspace(x_range[0], x_range[1], dims[0]),
        y = np.linspace(y_range[0], y_range[1], dims[1]),
        colorscale = [ [0, 'rgba(255,255,255,0)'], [1, 'rgba(0,0,255,1)'] ],
        showscale = False,
        # reversescale = True,
        type = 'heatmap' ) ]

layout = dict(  
    margin = dict( t=0, b=0 ),
    yaxis = dict( 
        fixedrange=False,
        ),
    xaxis=dict(
        rangeslider=dict(
            visible=True
            ),
        )   
    )

fig = dict( data=data, layout=layout )

pio.show(fig)

In [None]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(y=px_ts, x=px_ts.index, name='Price'), secondary_y=False)
fig.add_trace(go.Scatter(y=labels, x=labels.index, name='Labels', marker=dict(color='rgba(240, 52, 52, 0.3)')), 
    secondary_y=True)


fig.update_layout(title=f'<b>{title}</b>')
fig.update_yaxes(title_text='ccy', fixedrange= False, secondary_y=False)
fig.update_yaxes(title_text='label', secondary_y=True)

In [None]:
mid_px_train