CLASS 1 = BUY OKX 
CLASS 2 = BUY BINANCE

# **Imports**

In [None]:
!pip install sktime
!pip install tabulate
from tabulate import tabulate
import pandas as pd 
import numpy as np 
import json 
import websocket
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime as dt
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, CuDNNLSTM
from keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.backend import clear_session
from sklearn.model_selection import cross_val_score
import joblib
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics import hamming_loss
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score
from tensorflow.keras.optimizers import Adam
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
import ccxt
import os


# **Data**

## Fetch data

In [None]:
# Fetch OKX data: download 5-minute BTC/USDT candles in batches and cache
# them as a CSV indexed by candle timestamp.
exchange = ccxt.okex({
    'enableRateLimit': True,
    'rateLimit': 10,
})

# pair and timeframe
symbol = 'BTC/USDT'
timeframe = '5m'

# start and end date (Unix milliseconds)
end_date = exchange.milliseconds()
start_date_str = '2019-01-01 00:00:00'

# Convert the start string to Unix milliseconds. The datetime module is
# imported as `dt` in this file, and the string is read as UTC (the frame is
# localized to UTC further down the notebook) instead of the machine's local
# timezone.
start_date = int(
    dt.datetime.strptime(start_date_str, '%Y-%m-%d %H:%M:%S')
    .replace(tzinfo=dt.timezone.utc)
    .timestamp()
) * 1000

# rows to fetch per request
limit = 100

# accumulated candles
ohlcv_list = []

# Page forward through the history until a short (final) batch is returned.
# NOTE(review): 'to' and 'limit' are passed through `params` exactly as in the
# original call — confirm the exchange honours them in this form.
while True:
    ohlcv = exchange.fetch_ohlcv(symbol, timeframe, start_date, params={'to': end_date, 'limit': limit})
    ohlcv_list.extend(ohlcv)

    if len(ohlcv) < limit:
        break
    # next request starts one 5-minute candle after the last one received
    start_date = ohlcv[-1][0] + 300000

# Build the frame directly from the rows (no JSON round-trip needed).
df = pd.DataFrame(ohlcv_list, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])

# convert timestamp to datetime and use it as the index
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df.set_index('timestamp', inplace=True)

# save to csv
df.to_csv('OKX_historical.csv')

# print the DataFrame
print(df)


## Binance Historical

In [None]:
# Request parameters for the Binance download in the next cell. `symbol` and
# `timeframe` overwrite the values set in the OKX cell above.
symbol = "BTCUSDT"
timeframe = "5m"
start = "2019-01-01"
end = "2023-03-01"

In [None]:
# Download the Binance candles for the parameters defined above.
# NOTE(review): `bh` is not imported anywhere in the visible file, so this
# cell raises NameError as written — presumably a Binance history client
# (e.g. `import binance_history as bh`); confirm and add the import.
klines = bh.fetch_klines(symbol=symbol, timeframe=timeframe, start=start, end=end)

In [None]:
# Tag the Binance open price so it does not collide with OKX's `open` on merge.
klines = klines.rename(columns = {'open':'open_bnb'})

In [None]:
# Cache the Binance candles on disk; the loading cell reads this file back.
klines.to_csv('klines_binance.csv')

## loading and Subsetting 

In [None]:
# Reload both cached datasets, indexed by their timestamp column
# ('open_datetime' for Binance, 'timestamp' for OKX). on_bad_lines='warn'
# asks pandas to warn about malformed lines instead of raising.
klines = pd.read_csv('klines_binance.csv', index_col = 'open_datetime', on_bad_lines='warn')
print(klines)
okx = pd.read_csv('OKX_historical.csv', index_col='timestamp', on_bad_lines='warn')
print(okx)

In [None]:
# Study window as UTC-aware timestamps. The subsetting cells below keep rows
# with start_date <= index < end_date. These names replace the millisecond
# integers used by the OKX download cell.
start_date = pd.Timestamp('2020-10-01', tz='UTC')
end_date = pd.Timestamp('2021-03-01', tz='UTC')

In [None]:
# Parse the OKX index as datetimes and mark them as UTC (tz_localize attaches
# the zone to naive timestamps without shifting them).
okx.index = pd.to_datetime(okx.index).tz_localize('UTC')
# Tag the OKX open price so it stays distinct from Binance's after the merge.
okx = okx.rename(columns={'open':'open_okx'})
# Keep rows in the half-open window [start_date, end_date); the last
# expression displays the result in the notebook.
okx_subset = okx[(okx.index >= start_date) & (okx.index < end_date)]
okx_subset

In [None]:
# Parse the Binance index as datetimes and convert to UTC.
# NOTE(review): tz_convert requires tz-aware values, so this assumes the CSV
# timestamps carry a UTC offset — confirm against the saved file.
klines.index = pd.to_datetime(klines.index).tz_convert('UTC')

# Keep rows in the half-open window [start_date, end_date); the last
# expression displays the result in the notebook.
klines_subset = klines[(klines.index >= start_date) & (klines.index < end_date)]
klines_subset

## Data Cleaning

In [None]:
# Expected number of 5-minute candles in the study window. The subsets keep
# rows with `start_date <= index < end_date` (half-open), so the number of
# rows equals the number of whole intervals — the end timestamp itself is not
# a row and no "+ 1" is needed.
num_intervals = (end_date - start_date) // pd.Timedelta(minutes=5)
expected_num_rows = num_intervals

print(f"Expected number of rows: {expected_num_rows}")

In [None]:
# Count the 5-minute timestamps of the study window that are absent from each
# subset. pd.date_range includes its end point, but the subsets exclude
# `end_date`, so that timestamp is dropped here to avoid always reporting it
# as missing.
date_range = pd.date_range(start_date, end_date, freq='5min')
date_range = date_range[date_range < end_date]

missing_rows_1 = date_range[~date_range.isin(okx_subset.index)]
missing_rows_2 = date_range[~date_range.isin(klines_subset.index)]

print(f"Number of missing rows in dataframe 1: {len(missing_rows_1)}")
print(f"Number of missing rows in dataframe 2: {len(missing_rows_2)}")

In [None]:
# Timestamps present in the OKX subset but absent from the Binance subset.
missing_rows = okx_subset[~okx_subset.index.isin(klines_subset.index)].index

# Remove those rows from the OKX subset. Only this direction is handled here;
# Binance rows without an OKX counterpart are left in klines_subset.
okx_subset = okx_subset.drop(missing_rows)
okx_subset

In [None]:
# From here on `klines` and `okx` refer to the windowed subsets, not the full
# history. The two lengths are printed for a quick comparison.
klines = klines_subset
okx = okx_subset 
print(len(klines))
print(len(okx))

## Merging 

In [None]:
# Join the two exchanges on their timestamp index. merge() defaults to an
# inner join, so only timestamps present in both frames are kept; columns
# sharing a name receive the default _x (Binance) / _y (OKX) suffixes.
merged_df = klines.merge(okx, left_index=True, right_index=True)
# close_datetime is not used downstream.
merged_df = merged_df.drop('close_datetime', axis = 1)
merged_df

In [None]:
# Replace the merge suffixes with exchange tags: _x columns come from the left
# frame (Binance -> _bnb), _y columns from the right frame (OKX -> _okx).
merged_df.rename(columns={'high_x':'high_bnb', 'low_x':'low_bnb', 'close_x':'close_bnb', 'volume_x':'volume_bnb', 'trades':'trades_bnb', 'high_y':'high_okx', 
                  'low_y':'low_okx', 'close_y':'close_okx', 'volume_y':'volume_okx'}, inplace=True)

## Target variable 

In [None]:
# Target variable.
# buy_okx is the spread from buying on OKX and selling on Binance after a
# 0.01% adjustment on each leg: (open_bnb - 0.01%) - (open_okx + 0.01%).
# buy_bnb is the mirror trade (buy Binance, sell OKX).
# class_variable: 1 when buy_okx > 0.01, 2 when buy_bnb > 0.01, otherwise 0.
# Class 2 is assigned last, so it would win if both conditions ever held.
merged_df['spread'] = merged_df.open_bnb - merged_df.open_okx
merged_df['buy_okx'] = (merged_df.open_bnb - (merged_df.open_bnb*0.0001)) - (merged_df.open_okx + (merged_df.open_okx*0.0001))
merged_df['buy_bnb'] = (merged_df.open_okx - (merged_df.open_okx*0.0001)) - (merged_df.open_bnb + (merged_df.open_bnb*0.0001))
merged_df['class_variable'] = 0 
merged_df.loc[merged_df['buy_okx'] > 0.01, 'class_variable'] = 1
merged_df.loc[merged_df['buy_bnb'] > 0.01, 'class_variable'] = 2

In [None]:
# Display the merged frame with the new target columns.
merged_df

# **Pre-Processing**

## Timezones 

In [None]:
merged_df.index = pd.to_datetime(merged_df.index)

# Trading-zone label from the hour of day of the index:
#   01:00-08:59 -> 'eu', 09:00-16:59 -> 'na', every other hour -> 'as'.
# The labels are built in one step as strings, instead of initialising the
# column with the integer 0 and then overwriting it with strings.
hours = merged_df.index.hour
merged_df['trading_zone'] = np.select(
    [(hours >= 1) & (hours < 9), (hours >= 9) & (hours < 17)],
    ['eu', 'na'],
    default='as',
)

# one-hot encode (produces trading_zone_as / trading_zone_eu / trading_zone_na)
merged_df = pd.get_dummies(merged_df, columns=['trading_zone'])


## Days until/since halving

In [None]:
# Days until the next halving.
# Drop the timezone so the index can be subtracted from the naive datetime
# below.
merged_df.index = merged_df.index.tz_localize(None)
# NOTE(review): the next-halving date is hard-coded — confirm it is the
# intended reference date.
next_halving = dt.datetime(2024, 5, 11)
merged_df['days_until_halving'] = (next_halving - merged_df.index).days

In [None]:
# Days since the most recent halving: rows after the 2020-05-11 date are
# measured from it, all other rows from the 2016-07-09 date.
merged_df.index = merged_df.index.tz_localize(None)
halving_dates = [dt.datetime(2016, 7, 9), dt.datetime(2020, 5, 11)]

days_since = []
for stamp in merged_df.index:
    reference = halving_dates[1] if stamp > halving_dates[1] else halving_dates[0]
    days_since.append((stamp - reference).days)
merged_df['days_since_halving'] = days_since


## Moving averages

In [None]:
def get_sma(df, window, var):
    """
    simple moving avg
    expects df, window size and, variable (column name in df)
    returns a Series aligned with df; the first window-1 values are NaN
    """
    # Use the frame that was passed in rather than the notebook-global
    # merged_df, so the function works on any DataFrame.
    return df[var].rolling(window=window).mean()

def get_ema(df, window, var):
    """
    exponential moving average
    expects df, window size (used as the ewm span) and, variable (column name
    in df)
    returns a Series aligned with df, computed with adjust=False
    """
    # Use the frame that was passed in rather than the notebook-global
    # merged_df, so the function works on any DataFrame.
    return df[var].ewm(span=window, adjust=False).mean()

In [None]:
# Moving-average features on the open price of each exchange. The window
# sizes are chosen to capture short, mid and long term relationships.
# Columns are created in the order SMA20, SMA50, SMA200, EMA12, EMA26 for OKX
# (suffix _o) and then for Binance (suffix _b).
for suffix, price_col in (('o', 'open_okx'), ('b', 'open_bnb')):
    for sma_window in (20, 50, 200):
        merged_df[f'SMA{sma_window}_{suffix}'] = get_sma(merged_df, sma_window, price_col)
    for ema_window in (12, 26):
        merged_df[f'EMA{ema_window}_{suffix}'] = get_ema(merged_df, ema_window, price_col)

In [None]:
# Display the frame with the moving-average columns.
merged_df

In [None]:
# Drop the first 199 rows, where the 200-period moving averages are still NaN.
# .copy() makes the result an independent frame, so the columns added in the
# following cells are not written to a slice of the previous one.
merged_df = merged_df.iloc[199:].copy()

## Relative Strength Index (RSI)

In [None]:
# Per-column flag: True where a column still contains at least one NaN.
merged_df.isna().any()

In [None]:
def get_rsi(prices, n=14):
    """
    relative strength index
    expects variable (sequence of prices) and window size n

    The first n outputs all hold the RSI of the seed window (the first n+1
    price changes, averaged over n). Each later value updates the average gain
    and loss recursively as (previous * (n - 1) + current) / n.
    Returns a float numpy array with the same length as prices.
    """
    # Work on a float array so integer input is not truncated in the output.
    prices = np.asarray(prices, dtype=float)
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed >= 0].sum()/n
    down = -seed[seed < 0].sum()/n

    def _rsi_value(avg_gain, avg_loss):
        # With no losses the ratio is unbounded: RSI is 100 when there are
        # gains, and undefined (NaN) when there was no movement at all.
        if avg_loss == 0:
            return 100. if avg_gain > 0 else np.nan
        return 100. - 100./(1. + avg_gain/avg_loss)

    rsi = np.zeros(len(prices), dtype=float)
    rsi[:n] = _rsi_value(up, down)

    for i in range(n, len(prices)):
        delta = deltas[i-1]
        if delta > 0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n

        rsi[i] = _rsi_value(up, down)

    return rsi

# RSI of each exchange's open price, using get_rsi's default window.
merged_df['rsi_bnb'] = get_rsi(merged_df['open_bnb'])
merged_df['rsi_okx'] = get_rsi(merged_df['open_okx'])


## On Balance Volume (OBV)

In [None]:
def get_obv(df, col, vol):
    """
    On-Balance Volume
    expects df, price variable and volume variable (both column names)
    returns the running total of volume, added when the price rose versus the
    previous row, subtracted when it fell and ignored otherwise
    """
    price_change = df[col].diff()

    # +1 / -1 / 0 per row; the first row has no previous price and stays 0
    direction = pd.Series(0, index=df.index)
    direction[price_change > 0] = 1
    direction[price_change < 0] = -1

    return (direction * df[vol]).cumsum()

# OBV per exchange, driven by the open price rather than the close.
merged_df['obv_bnb'] = get_obv(merged_df, 'open_bnb', 'volume_bnb')
merged_df['obv_okx'] = get_obv(merged_df, 'open_okx', 'volume_okx')

In [None]:
# Display the frame with the RSI and OBV columns.
merged_df

# **X & y**

In [None]:
# Shorter alias. `df` is the same object as merged_df (no copy), so changes
# made through either name affect both.
df = merged_df
df.columns

In [None]:
# Helper for the "use many variables" experiments: build the list of candidate
# feature columns by excluding the ones listed below.

# Columns to leave out of the candidate feature set.
column_to_exclude = ['class_variable', 'spread','open_okx', 'high_okx', 'low_okx', 'close_okx', 'volume_okx','trading_zone_as', 'trading_zone_eu',
       'trading_zone_na', 'days_until_halving', 'days_since_halving', 'trades_bnb', 'quote_volume', 'buy_bnb']

# Names of the remaining columns (df itself is not modified).
selected_columns = df.drop(column_to_exclude, axis=1).columns


In [None]:
# Convert the selected columns to float32 to reduce memory use.
print(selected_columns)
# Columns to convert.
variables_to_convert = selected_columns

# Cast in place. `df` aliases merged_df, so merged_df is affected as well.
df[variables_to_convert] = df[variables_to_convert].astype('float32')


In [None]:
# Define the sequence size and features for the rolling windows
SEQUENCE_SIZE = 10
# Number of rows between the end of a window and the row that supplies its
# label. 0 reproduces the original labelling (label = class of the window's
# own last row); 1 would label each window with the class of the next row.
# NOTE(review): class_variable is derived from buy_okx / buy_bnb of the same
# row, and buy_okx is a feature, so with a horizon of 0 the label can be read
# off the last row of the window — confirm whether a horizon of 1 was meant.
PREDICTION_HORIZON = 0
features = ['open_bnb', 'buy_okx'] 
#features = ['open_bnb', 'spread_percentage']
#features = ['open_bnb', 'high_bnb', 'low_bnb', 'close_bnb', 'volume_bnb',
#       'quote_volume', 'trades_bnb', 'spread']
#features = ['open_bnb', 'high_bnb', 'low_bnb', 'close_bnb', 'volume_bnb',
#       'quote_volume', 'trades_bnb', 'open_okx', 'high_okx', 'low_okx',
#      'close_okx', 'volume_okx', 'spread']
#features = selected_columns
#features = ['open_bnb', 'open_okx', 'obv_bnb', 'obv_okx', 'trading_zone', 'spread']
#features = ['open_bnb', 'open_okx', 'spread']
#features = ['open_bnb', 'spread']

# Pull the data out of pandas once; slicing the DataFrame inside the loop is
# far slower than slicing a numpy array.
feature_values = df[features].to_numpy(dtype=np.float64)
labels = df['class_variable'].to_numpy()
n_windows = len(df) - SEQUENCE_SIZE + 1 - PREDICTION_HORIZON

# Array with feature sequence and labels:
# X is (n_windows, SEQUENCE_SIZE, n_features), y is (n_windows,)
X = np.zeros((n_windows, SEQUENCE_SIZE, len(features)))
y = np.zeros(n_windows)

# Fill arrays
for i in range(n_windows):
    X[i, :, :] = feature_values[i:i+SEQUENCE_SIZE]
    y[i] = labels[i+SEQUENCE_SIZE-1+PREDICTION_HORIZON]

# one-hot encode the three classes
y = to_categorical(y, num_classes=3)

# shapes
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


In [None]:
# Class balance: number of rows per target class.
merged_df.class_variable.value_counts()

# **models** 

## LSTM

In [None]:
%%time
# model parameters (each one is wired into the model below exactly once)
input_shape = (SEQUENCE_SIZE, len(features))
activation = "relu"
loss = 'categorical_crossentropy'
learning_rate = 0.001
dense_activation = 'softmax'

# number of splits for rolling validation
n_splits = 5

tscv = TimeSeriesSplit(n_splits=n_splits)

# performance metrics, one entry per fold
classification_reports = []
kappa_scores = []

# model architecture: two stacked LSTM layers with dropout, 3-way output
model = Sequential()
model.add(LSTM(units=64, activation=activation, return_sequences=True, input_shape=input_shape, unroll=True))
model.add(Dropout(0.2))
model.add(LSTM(units=32))
model.add(Dropout(0.2))
model.add(Dense(units=3, activation=dense_activation))
model.summary()

# Compile (a single optimizer instance; the earlier duplicate was discarded)
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=loss, metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# scaler
scaler = MinMaxScaler((-1,1))

# scale at each split and not before to avoid data leakage
# NOTE(review): the model is built once, outside the loop, so every fold
# continues training from the previous fold's weights — confirm this
# warm-start is intended rather than a fresh model per fold.
for train_index, test_index in tscv.split(X):
    # Train and Test for current split
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_split, y_test_split = y[train_index], y[test_index]

    # Flatten each window to 2-D, because the scaler works on 2-D input
    X_train_fold_reshaped = X_train_fold.reshape(X_train_fold.shape[0], -1)
    X_test_fold_reshaped = X_test_fold.reshape(X_test_fold.shape[0], -1)

    # fit_transform only on train, transform on test to avoid leakage
    X_train_scaled = scaler.fit_transform(X_train_fold_reshaped)
    X_test_scaled = scaler.transform(X_test_fold_reshaped)
    # reshape back to (samples, steps, features) for the LSTM
    X_train_scaled = X_train_scaled.reshape(X_train_fold.shape)
    X_test_scaled = X_test_scaled.reshape(X_test_fold.shape)

    # early stopping to limit training time and avoid overfitting
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

    # Fit on training data
    history = model.fit(X_train_scaled, y_train_split, epochs=50, batch_size=32, callbacks=[es], validation_split=0.2, verbose=1)

    # predictions on test
    y_pred = model.predict(X_test_scaled)

    # Convert to labels for profitability and metrics
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_test_labels = np.argmax(y_test_split, axis=1)

    # Classification report and Kappa for split
    report = classification_report(y_test_labels, y_pred_labels, output_dict=True)
    kappa = cohen_kappa_score(y_test_labels, y_pred_labels)

    # store metrics
    classification_reports.append(report)
    kappa_scores.append(kappa)

    # Print classification report and kappa for split
    print("------")
    print("Classification Report:")
    print(report)
    print("Cohen's Kappa:", kappa)


In [None]:
# NOTE(review): the LSTM cell above defines y_train_split, not y_train; this
# name is only assigned in later cells, so on a top-to-bottom run it is
# undefined here — confirm which variable was meant.
y_train

In [None]:
# Per-fold metric tables followed by the across-fold average per class and the
# mean Cohen's Kappa. Only the per-class rows of each report (numeric keys)
# are used.

# per-class metric lists, one value per fold
class_metrics = {}

for i, report in enumerate(classification_reports):
    print(f"Fold {i+1}:")
    table = [['Class', 'Precision', 'Recall', 'F1-Score', 'Support']]
    for class_label, metrics in report.items():
        if not class_label.isnumeric():
            continue
        table.append([class_label, metrics['precision'], metrics['recall'], metrics['f1-score'], metrics['support']])
        per_class = class_metrics.setdefault(
            class_label,
            {'precisions': [], 'recalls': [], 'f1_scores': [], 'supports': []},
        )
        per_class['precisions'].append(metrics['precision'])
        per_class['recalls'].append(metrics['recall'])
        per_class['f1_scores'].append(metrics['f1-score'])
        per_class['supports'].append(metrics['support'])
    print(tabulate(table, headers='firstrow'))
    print("")

    # Kappa of the same fold
    kappa_score = kappa_scores[i]
    print(f"Cohen's Kappa Score: {kappa_score}")
    print("------")

# Across folds: mean precision / recall / F1, and the summed support
avg_report = {
    class_label: {
        'precision': np.mean(collected['precisions']),
        'recall': np.mean(collected['recalls']),
        'f1-score': np.mean(collected['f1_scores']),
        'support': np.sum(collected['supports']),
    }
    for class_label, collected in class_metrics.items()
}
avg_table = [['Class', 'Precision', 'Recall', 'F1-Score', 'Support']]
for class_label, summary in avg_report.items():
    avg_table.append([class_label, summary['precision'], summary['recall'], summary['f1-score'], summary['support']])

print("Average Report:")
print(tabulate(avg_table, headers='firstrow'))

# mean Cohen's Kappa over the folds
avg_kappa_score = np.mean(kappa_scores)
print(f"Average Cohen's Kappa Score: {avg_kappa_score}")


In [None]:
# Kappa of the most recently evaluated fold.
kappa

In [None]:
# Save the trained LSTM.
# NOTE(review): this pickles a Keras model through joblib; confirm the file
# loads back correctly, or prefer Keras' own model.save().
joblib.dump(model, 'Chopp_Lstm_ALLv.joblib')

## MiniRocketMultivariate + LGBM

In [None]:
# Class balance: number of rows per target class.
merged_df.class_variable.value_counts()


In [None]:
# Share of each class, printed in value_counts order (most frequent first).
total_rows = len(merged_df.class_variable)
for class_count in merged_df.class_variable.value_counts():
    print(class_count / total_rows)


In [None]:
%%time
# Collapse the one-hot labels back to a 1-D vector of class ids for the
# classifier.
y_1d = np.argmax(y, axis=1)

# Swap the sequence and feature axes: X is (samples, steps, features) and
# becomes (samples, features, steps), the layout MiniRocket is given here.
X_swapped = np.swapaxes(X, 1, 2)


# Pipeline: MiniRocketMultivariate transform followed by an LGBM classifier.
MiniLgbm = make_pipeline(
    MiniRocketMultivariate(num_kernels=10000, max_dilations_per_kernel = 32, n_jobs = -1, random_state = 7),
    LGBMClassifier(objective='multiclass', n_estimators = 250, random_state = 7)
)

# number of splits for rolling validation
n_splits = 5

# time-ordered splitter
tscv = TimeSeriesSplit(n_splits=n_splits)

# metrics, one entry per fold
classification_reports = []
cohen_kappa_scores = []

# Rolling validation. X_train / X_test / y_train / y_test / y_pred stay bound
# to the last fold after the loop and are read by later cells.
for train_index, test_index in tscv.split(X_swapped):
    # train and test for current split
    X_train, X_test = X_swapped[train_index], X_swapped[test_index]
    y_train, y_test = y_1d[train_index], y_1d[test_index]

    # a fresh scaler per split, fitted on that split's training data only
    scaler = MinMaxScaler((-1,1))

    # flatten each window to 2-D, because the scaler works on 2-D input
    X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
    X_test_reshaped = X_test.reshape(X_test.shape[0], -1)

    # fit_transform on train only, transform on test, to avoid data leakage
    X_train_scaled = scaler.fit_transform(X_train_reshaped)
    X_test_scaled = scaler.transform(X_test_reshaped)

    # back to the 3-D layout for MiniRocket
    X_train_scaled = X_train_scaled.reshape(X_train.shape)
    X_test_scaled = X_test_scaled.reshape(X_test.shape)

    # Fit pipeline on train 
    MiniLgbm.fit(X_train_scaled, y_train)

    # predictions on test
    y_pred = MiniLgbm.predict(X_test_scaled)

    # store reports 
    report = classification_report(y_test, y_pred, output_dict=True)
    classification_reports.append(report)

    # kappa score for current split 
    kappa = cohen_kappa_score(y_test, y_pred)
    cohen_kappa_scores.append(kappa)

    print("------")
    print("Classification Report:")
    print(report)
    print(f"Cohen's Kappa Score: {kappa}")

In [None]:
# All per-fold reports, then the across-fold average per class and the mean
# Cohen's Kappa. Only the per-class rows of each report (numeric keys) are
# used.

# per-class metric lists, one value per fold
class_metrics = {}

for i, report in enumerate(classification_reports):
    print(f"Fold {i+1}:")
    table = [['Class', 'Precision', 'Recall', 'F1-Score', 'Support']]
    for class_label, metrics in report.items():
        if not class_label.isnumeric():
            continue
        table.append([class_label, metrics['precision'], metrics['recall'], metrics['f1-score'], metrics['support']])
        per_class = class_metrics.setdefault(
            class_label,
            {'precisions': [], 'recalls': [], 'f1_scores': [], 'supports': []},
        )
        per_class['precisions'].append(metrics['precision'])
        per_class['recalls'].append(metrics['recall'])
        per_class['f1_scores'].append(metrics['f1-score'])
        per_class['supports'].append(metrics['support'])
    print(tabulate(table, headers='firstrow'))
    print("")

    # Kappa of the same fold
    kappa_score = cohen_kappa_scores[i]
    print(f"Cohen's Kappa Score: {kappa_score}")
    print("------")

# Across folds: mean precision / recall / F1, and the summed support
avg_report = {
    class_label: {
        'precision': np.mean(collected['precisions']),
        'recall': np.mean(collected['recalls']),
        'f1-score': np.mean(collected['f1_scores']),
        'support': np.sum(collected['supports']),
    }
    for class_label, collected in class_metrics.items()
}
avg_table = [['Class', 'Precision', 'Recall', 'F1-Score', 'Support']]
for class_label, summary in avg_report.items():
    avg_table.append([class_label, summary['precision'], summary['recall'], summary['f1-score'], summary['support']])

print("Average Report:")
print(tabulate(avg_table, headers='firstrow'))

# mean Cohen's Kappa over the folds
avg_kappa_score = np.mean(cohen_kappa_scores)
print(f"Average Cohen's Kappa Score: {avg_kappa_score}")


In [None]:
# Persist the pipeline as fitted on the last validation fold.
joblib.dump(MiniLgbm, 'Chopp_Mini_ALLv.joblib')

In [None]:
# Load a previously saved pipeline.
# NOTE(review): this file name differs from the one written by the cell
# above — confirm it exists and is the intended model.
loaded_model = joblib.load('mini_4m_2V_chop.joblib')

In [None]:
# Sanity check: labels and predictions of the last fold should match in length.
y_test.shape, y_pred.shape

### MiniRocketMultivariate + GradientBoostingClassifier

In [None]:
# NOTE(review): this cell depends on names that no visible cell defines
# (y_val, X_train_scaled_swapped, X_val_scaled_swapped, X_test_scaled_swapped)
# and expects y_train / y_test to be one-hot, whereas the MiniRocket + LGBM
# cell leaves them 1-D. It presumably relies on an earlier train/val/test
# split that is no longer in the notebook — confirm before running.
y_train_1d = np.argmax(y_train, axis=1)
y_val_1d = np.argmax(y_val, axis=1)
y_test_1d = np.argmax(y_test, axis=1)

# pipeline minirocket + gradientboostingclassifier
MiniGrad = make_pipeline(
    MiniRocketMultivariate(),
    GradientBoostingClassifier(random_state=42)
)
MiniGrad.fit(X_train_scaled_swapped, y_train_1d)
# The two score() results are computed but not stored or printed.
MiniGrad.score(X_val_scaled_swapped, y_val_1d)
MiniGrad.score(X_test_scaled_swapped, y_test_1d)

y_pred = MiniGrad.predict(X_test_scaled_swapped)

### GridSearch MiniGrad

In [None]:
# Grid search for the GradientBoostingClassifier hyperparameters.
# NOTE(review): y_train is expected to be one-hot here and
# X_train_scaled_swapped is not defined in any visible cell — confirm the
# inputs before running.
y_train_1d = np.argmax(y_train, axis=1)


# pipeline
minirocket = make_pipeline(
    MiniRocketMultivariate(),
    GradientBoostingClassifier(random_state=42)
)

# hyperparameter grid for GradientBoostingClassifier
param_grid = {
    'gradientboostingclassifier__learning_rate': [0.1, 0.01, 0.001],
    'gradientboostingclassifier__n_estimators': [100, 200, 300],
    'gradientboostingclassifier__max_depth': [3, 4, 5]
}

# The samples are time-ordered windows, so validate on later data only — the
# same TimeSeriesSplit scheme the other models use — instead of plain 5-fold
# CV, which also trains on data from after each validation block.
grid_search = GridSearchCV(minirocket, param_grid, cv=TimeSeriesSplit(n_splits=5))
grid_search.fit(X_train_scaled_swapped, y_train_1d)

# Print the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)


### MiniRocketMultivariate + RidgeClassifierCV

In [None]:
# jaccard_score is used below but is not part of the notebook's import cell.
from sklearn.metrics import jaccard_score

# MiniRocket pipeline with a ridge classifier.
# NOTE(review): y_val and the *_scaled_swapped arrays are not defined in any
# visible cell — confirm the inputs before running.
MiniRidge = make_pipeline(
    MiniRocketMultivariate(),
    RidgeClassifierCV()
)
MiniRidge.fit(X_train_scaled_swapped, y_train)
# The two score() results are computed but not stored or printed.
MiniRidge.score(X_val_scaled_swapped, y_val)
MiniRidge.score(X_test_scaled_swapped, y_test)
# make predictions on test data
y_pred = MiniRidge.predict(X_test_scaled_swapped)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1-score:', f1_score(y_test, y_pred, average='weighted'))

hamming_loss_value = hamming_loss(y_test, y_pred)
print("Hamming Loss:", hamming_loss_value)

# one Jaccard score per class (average=None)
jaccard_scores = jaccard_score(y_test, y_pred, average=None)
print("Jaccard Similarity Scores:", jaccard_scores)

In [None]:
# Make predictions on test data with the tuned model. GridSearchCV fits clones
# of the pipeline, so `minirocket` itself is never fitted; predicting through
# grid_search uses the refitted best estimator.
y_pred = grid_search.predict(X_test_scaled_swapped)

print('Accuracy:', accuracy_score(y_test_1d, y_pred))
print('Precision:', precision_score(y_test_1d, y_pred, average='weighted'))
print('Recall:', recall_score(y_test_1d, y_pred, average='weighted'))
print('F1-score:', f1_score(y_test_1d, y_pred, average='weighted'))

## LOADING

In [None]:
# Load a saved model for the profitability analysis.
# NOTE(review): no cell in this notebook writes this file name — confirm it
# is the intended model.
loaded_model = joblib.load('Chop_Mini_2v.joblib')

In [None]:
# Same input preparation as in the MiniRocket training cell: 1-D class ids and
# (samples, features, steps) windows.
y_1d = np.argmax(y, axis=1)
X_swapped = np.swapaxes(X, 1, 2)

In [None]:
# Chronological hold-out: shuffle=False keeps the original order, so the last
# 20% of the windows form the test set.
size = 0.2 
X_train, X_test, y_train, y_test = train_test_split(X_swapped, y_1d, test_size=size, shuffle = False)

In [None]:
# Flatten -> scale -> restore shape. The scaler is fitted on the training part
# only and then applied to the test part.
scaler = MinMaxScaler((-1,1))

# flatten each window to 2-D, because the scaler works on 2-D input
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
X_test_reshaped = X_test.reshape(X_test.shape[0], -1)

X_train_scaled = scaler.fit_transform(X_train_reshaped)
X_test_scaled = scaler.transform(X_test_reshaped)

# back to the 3-D layout expected by the model
X_train_scaled = X_train_scaled.reshape(X_train.shape)
X_test_scaled = X_test_scaled.reshape(X_test.shape)

In [None]:
# Predict on the scaled hold-out windows with the loaded model.
y_pred = loaded_model.predict(X_test_scaled)

In [None]:
# Cohen's Kappa on the hold-out set (shown as the cell output).
cohen_kappa_score(y_test, y_pred)

In [None]:
# Classification report on the hold-out set, returned as a dict.
classification_report(y_test, y_pred, output_dict=True)

# **Analysis**

## chains

In [None]:
#find the longest chain of 1
def longest_chain_of_ones(class_variable):
    """
    length of the longest run of consecutive 1s
    expects variable (any iterable of class labels)
    returns 0 when there is no 1 at all
    """
    current_chain = 0
    max_chain = 0

    for value in class_variable:
        # extend the current run on a 1, restart it on anything else
        current_chain = current_chain + 1 if value == 1 else 0
        max_chain = max(max_chain, current_chain)

    return max_chain
# Longest run of consecutive class-1 rows in the labelled data.
longest_chain = longest_chain_of_ones(merged_df.class_variable)
print("Longest chain of 1s:", longest_chain)


In [None]:
#subsets the longest chain of 1
def subset_dataframe(df, column_name, target=1):
    """
    rows of the longest run of consecutive `target` values
    expects df, column name and (optionally) the class value to look for
    returns the matching slice of df; the first run wins on ties and an empty
    frame is returned when `target` never occurs
    """
    longest_chain = 0
    current_chain = 0
    end_index = 0

    for i, value in enumerate(df[column_name]):
        if value == target:
            current_chain += 1
            # strict ">" keeps the earliest run among equally long ones
            if current_chain > longest_chain:
                longest_chain = current_chain
                end_index = i + 1
        else:
            current_chain = 0

    return df.iloc[end_index - longest_chain:end_index]


# Rows of the longest class chain; show at most the first 50.
subset = subset_dataframe(merged_df, 'class_variable')
subset = pd.DataFrame(subset)
subset.head(50)

In [None]:
#subset longest chain of 2
def subset_dataframe(df, column_name, target=2):
    """
    rows of the longest run of consecutive `target` values (class 2 by default)
    expects df, column name and (optionally) the class value to look for
    returns the matching slice of df; the first run wins on ties and an empty
    frame is returned when `target` never occurs
    """
    longest_chain = 0
    current_chain = 0
    end_index = 0

    for i, value in enumerate(df[column_name]):
        if value == target:
            current_chain += 1
            # strict ">" keeps the earliest run among equally long ones
            if current_chain > longest_chain:
                longest_chain = current_chain
                end_index = i + 1
        else:
            current_chain = 0

    return df.iloc[end_index - longest_chain:end_index]


# Rows of the longest class chain; show at most the last 50.
subset = subset_dataframe(merged_df, 'class_variable')
subset = pd.DataFrame(subset)
subset.tail(50)


In [None]:
#number of chains longer than min_length
def count_long_chains(df, column_name, min_length, target=1):
    """
    number of runs of consecutive `target` values longer than min_length
    expects df, column name, min length and (optionally) the class value
    each run is counted once, however far it exceeds min_length
    """
    num_long_chains = 0
    current_chain = 0

    def _is_long(length):
        # a run must exist and be strictly longer than min_length
        return length > 0 and length > min_length

    for value in df[column_name]:
        if value == target:
            current_chain += 1
        else:
            # the run just ended: count it once if it was long enough
            if _is_long(current_chain):
                num_long_chains += 1
            current_chain = 0

    # a run that reaches the end of the data has not been counted yet
    if _is_long(current_chain):
        num_long_chains += 1

    return num_long_chains


# Count the class chains in the labelled data that are longer than min_length.
min_length = 5
num_long_chains = count_long_chains(merged_df, 'class_variable', min_length)
print("Number of chains longer than", min_length, ":", num_long_chains)


In [None]:
#plot chains class 1
def plot_chain_length_distribution(class_variable, target=1, tick_step=1):
    """
    histogram of the lengths of runs of consecutive `target` values
    expects variable (iterable of class labels); optionally the class value to
    look for and the spacing of the x ticks
    """
    chain_lengths = []
    current_chain = 0

    for value in class_variable:
        if value == target:
            current_chain += 1
        elif current_chain > 0:
            # the run just ended: record its length
            chain_lengths.append(current_chain)
            current_chain = 0

    # a run that reaches the end of the data
    if current_chain > 0:
        chain_lengths.append(current_chain)

    # One bin per integer length, centred on the integer. No artificial
    # zero-length chain is added, so every bar counts real runs only.
    max_chain_length = max(chain_lengths, default=0)
    bin_edges = np.arange(-0.5, max_chain_length + 1.5)

    plt.hist(chain_lengths, bins=bin_edges)
    plt.xlabel('Chain Length')
    plt.ylabel('Number of Occurrences')
    plt.xticks(range(0, max_chain_length + 1, tick_step))
    plt.title('Chain Length Distribution')
    plt.show()


# Plot the chain-length distribution of the labelled data.
plot_chain_length_distribution(merged_df.class_variable)


In [None]:
#plot chains class 2
def plot_chain_length_distribution(class_variable, target=2, tick_step=3):
    """
    histogram of the lengths of runs of consecutive `target` values (class 2
    by default)
    expects variable (iterable of class labels); optionally the class value to
    look for and the spacing of the x ticks
    """
    chain_lengths = []
    current_chain = 0

    for value in class_variable:
        if value == target:
            current_chain += 1
        elif current_chain > 0:
            # the run just ended: record its length
            chain_lengths.append(current_chain)
            current_chain = 0

    # a run that reaches the end of the data
    if current_chain > 0:
        chain_lengths.append(current_chain)

    # One bin per integer length, centred on the integer. No artificial
    # zero-length chain is added, so every bar counts real runs only.
    max_chain_length = max(chain_lengths, default=0)
    bin_edges = np.arange(-0.5, max_chain_length + 1.5)

    plt.hist(chain_lengths, bins=bin_edges)
    plt.xlabel('Chain Length')
    plt.ylabel('Number of Occurrences')
    plt.xticks(range(0, max_chain_length + 1, tick_step))
    plt.title('Chain Length Distribution')
    plt.show()


# Plot the chain-length distribution of the labelled data.
plot_chain_length_distribution(merged_df.class_variable)


In [None]:
# Class balance: number of rows per target class.
merged_df.class_variable.value_counts()

## transitions

In [None]:
#average number of class 0 instances between class 1 and class 2
def calculate_average_zeros_between(df, column_name):
    """
    expects df and column_name

    Walks the column and counts 0s in a running counter. The counter is
    recorded and reset at every 1 except the first, and at every 2 that follows
    at least one 0. The result is the sum of the recorded counts divided by
    the number of 1s, or 0 when the column contains no 1.

    NOTE(review): 0s seen before the first 1 are not reset at that first 1, so
    they are carried into the next recorded count, and the divisor counts 1s
    rather than recorded gaps — confirm this is the intended definition of the
    average.
    """
    count_zeros = 0
    count_transitions = 0
    between_zeros = []

    for value in df[column_name]:
        if value == 1:
            if count_transitions > 0:
                between_zeros.append(count_zeros)
                count_zeros = 0
            count_transitions += 1
        elif value == 0:
            count_zeros += 1
        elif value == 2:
            if count_zeros > 0:
                between_zeros.append(count_zeros)
                count_zeros = 0

    if count_transitions > 0:
        average_zeros_between = sum(between_zeros) / count_transitions
    else:
        average_zeros_between = 0

    return average_zeros_between


# Apply to the labelled data and print the resulting average.
average_zeros_between = calculate_average_zeros_between(merged_df, 'class_variable')
print("Average number of class 0 between instances of 1 and 2:", average_zeros_between)


## Profitability
Although there are more opportunities in class 2, the class 1 opportunities are more profitable, given how the profitability code below works.

In [None]:
# Turn predictions into class ids. Per-class scores (2-D, as returned by the
# LSTM) are reduced with argmax; predictions that are already 1-D labels (as
# returned by the MiniRocket pipelines) are left unchanged instead of raising
# on axis=1.
y_pred = np.asarray(y_pred)
if y_pred.ndim > 1:
    y_pred = np.argmax(y_pred, axis=1)

In [None]:
# Create the profitability df: the last len(y_test) rows of the data, paired
# with the true and predicted classes. .copy() makes it an independent frame,
# so the new columns are not written to a slice of merged_df.
prof_df = merged_df.tail(len(y_test)).copy()
prof_df['y_test'] = y_test
prof_df['y_pred'] = y_pred
prof_df = prof_df[['buy_okx', 'buy_bnb', 'y_test', 'y_pred']]
prof_df = prof_df.reset_index()

In [None]:
# Quick overview: how often each class is predicted versus how often it
# actually occurs in the test rows.
print(prof_df.y_pred.value_counts())
print(prof_df.y_test.value_counts())

### class1

In [None]:
# Display the profitability frame.
prof_df

In [None]:
#convert to array
buy_bnb = np.array(prof_df.buy_bnb)
buy_okx = np.array(prof_df.buy_okx)
y_pred = np.array(prof_df.y_pred)

# Exit spread for every row predicted as class 1 (NaN for all other rows):
#  - entry spread negative: exit 20 rows later, or at the next row when fewer
#    than 20 rows remain;
#  - otherwise: exit at the first later row whose |buy_bnb| is below the entry
#    spread, or at the next row when there is none.
# "Next row" is clamped to the last row, so a prediction on the final row no
# longer indexes past the end of the array.
closest_values1 = []
last_index = len(buy_bnb) - 1

for i in range(len(buy_okx)):
    if y_pred[i] != 1:
        closest_values1.append(np.nan)
        continue

    current_okx = buy_okx[i]
    next_index = min(i + 1, last_index)

    if current_okx < 0:
        exit_index = i + 20 if i + 20 < len(buy_bnb) else next_index
        closest_value = abs(buy_bnb[exit_index])
    else:
        bnb_values_after_okx = abs(buy_bnb[i + 1:])
        valid_values = bnb_values_after_okx[bnb_values_after_okx < current_okx]
        if len(valid_values) > 0:
            closest_value = valid_values[0]
        else:
            closest_value = abs(buy_bnb[next_index])

    closest_values1.append(closest_value)

print(closest_values1)


In [None]:
# Exit spreads of the class-1 trades (the NaN placeholders of other rows are
# dropped).
sells_bnb = [value for value in closest_values1 if not np.isnan(value)]
print(len(sells_bnb))
# Entry spreads: buy_okx on the rows predicted as class 1.
buys_okx = prof_df[prof_df.y_pred == 1]['buy_okx']
print(len(buys_okx))
# Profit per trade = entry spread - exit spread, paired in row order; the
# total is printed.
profit_class1 = [x - y for x, y in zip(buys_okx, sells_bnb)]
print(sum(profit_class1))

In [None]:
# Average of the losing class-1 trades, to investigate profitability.
neg = [trade_profit for trade_profit in profit_class1 if trade_profit < 0]

# With no losing trade there is nothing to average; show 0.0 instead of
# dividing by zero.
sum(neg) / len(neg) if neg else 0.0

### class2

In [None]:
# arrays
buy_bnb = np.array(prof_df.buy_bnb)
buy_okx = np.array(prof_df.buy_okx)
y_pred = np.array(prof_df.y_pred)

# Exit spread for every row predicted as class 2 (NaN for all other rows):
#  - entry spread negative: exit 20 rows later, or at the next row when fewer
#    than 20 rows remain;
#  - otherwise: exit at the first later row whose |buy_okx| is below the entry
#    spread, or at the next row when there is none.
# "Next row" is clamped to the last row, so a prediction on the final row no
# longer indexes past the end of the array.
closest_values2 = []
last_index = len(buy_okx) - 1

for i in range(len(buy_bnb)):
    if y_pred[i] != 2:
        closest_values2.append(np.nan)
        continue

    current_bnb = buy_bnb[i]
    next_index = min(i + 1, last_index)

    if current_bnb < 0:
        exit_index = i + 20 if i + 20 < len(buy_okx) else next_index
        closest_value = abs(buy_okx[exit_index])
    else:
        okx_values_after_bnb = abs(buy_okx[i + 1:])
        valid_values = okx_values_after_bnb[okx_values_after_bnb < current_bnb]
        if len(valid_values) > 0:
            closest_value = valid_values[0]
        else:
            closest_value = abs(buy_okx[next_index])

    closest_values2.append(closest_value)
print(closest_values2)


In [None]:
# Exit spreads of the class-2 trades (the NaN placeholders of other rows are
# dropped).
sells_okx = [value for value in closest_values2 if not np.isnan(value)]
print(len(sells_okx))
# Entry spreads: buy_bnb on the rows predicted as class 2.
buys_bnb = prof_df[prof_df.y_pred == 2]['buy_bnb']
print(len(buys_bnb))
# Profit per trade = entry spread - exit spread, paired in row order; the
# total is printed.
profit_class2 = [x - y for x, y in zip(buys_bnb, sells_okx)]
print(sum(profit_class2))

In [None]:
# Average of the losing class-2 trades, to understand profitability better.
buys_bnb = np.array(buys_bnb)
sells_okx = list(sells_okx)

# element-wise entry spread - exit spread
result = buys_bnb - np.asarray(sells_okx)

neg = [trade_profit for trade_profit in result if trade_profit < 0]

# With no losing trade there is nothing to average; show 0.0 instead of
# dividing by zero.
sum(neg) / len(neg) if neg else 0.0

## Visualize model performance

In [None]:
# Cohen's Kappa for each model at each of the five folds, per period (bear,
# side, bull). These are hard-coded values, not computed in this notebook run.
LSTM_bear = [0.95, 0, 0.89, 0.96, 0.93]
Mini_bear = [0.57, 0.69, 0.68, 0.84, 0.85]
LSTM_side = [0.945, 0.948, 0.964, 0.982, 0.954]
Mini_side = [0.603, 0.576, 0.587, 0.673, 0.51]
LSTM_bull = [0.066, 0, 0.178, 0.848, 0.916]
Mini_bull = [0.008, 0.753, 0.655, 0.766, 0.839]

In [None]:
# One line per (model, period) series of fold-wise Kappa scores, drawn in a
# fixed order and colour.
kappa_series = [
    (LSTM_bear, 'skyblue', 'LSTM_bear'),
    (Mini_bear, 'red', 'Mini_bear'),
    (LSTM_side, 'blue', 'LSTM_side'),
    (Mini_side, 'salmon', 'Mini_side'),
    (LSTM_bull, 'royalblue', 'LSTM_bull'),
    (Mini_bull, 'tomato', 'Mini_bull'),
]

fig, ax = plt.subplots()
for fold_scores, line_colour, series_label in kappa_series:
    ax.plot(fold_scores, color=line_colour, label=series_label)

# title and legend
ax.set_title('Cohen Kappa Scores For Each Model')
ax.legend()

# plot
plt.show()