In [1]:
import datetime
import os
import random
import numpy as np
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
import tensorflow as tf
import mlflow.keras
import importlib
import joblib
from sklearn.metrics import roc_auc_score, f1_score

import ml_investing_wne.config as config
from ml_investing_wne.data_engineering.load_data import get_hist_data
from ml_investing_wne.data_engineering.prepare_dataset import prepare_processed_dataset
from ml_investing_wne.train_test_val_split import train_test_val_split
from ml_investing_wne.helper import confusion_matrix_plot, compute_profitability_classes
from ml_investing_wne.utils import get_logger

seed = 12345
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
                   
logger = get_logger()

2022-10-22 11:15:46.311946: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-22 11:15:46.311987: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
df = get_hist_data(currency=config.currency)
df = prepare_processed_dataset(df=df)
X, y, X_val, y_val, X_test, y_test, y_cat, y_val_cat, y_test_cat, train = train_test_val_split(df)


2022-10-22 11:16:18,861 - ml_investing_wne.data_engineering.prepare_dataset - exported to /home/jupyter/ml_investing_wne/src/ml_investing_wne/data/processed/EURCHF/EURCHF_processed_720min.csv
2022-10-22 11:16:18,919 - ml_investing_wne.train_test_val_split - first sequence begins: 2010-01-24 12:00:00
2022-10-22 11:16:18,920 - ml_investing_wne.train_test_val_split - first sequence ends: 2010-03-25 00:00:00
2022-10-22 11:16:18,945 - ml_investing_wne.train_test_val_split - last sequence begins: 2019-10-30 12:00:00
2022-10-22 11:16:18,947 - ml_investing_wne.train_test_val_split - last sequence ends: 2019-12-30 12:00:00
2022-10-22 11:16:19,001 - ml_investing_wne.train_test_val_split - first sequence begins: 2019-11-13 12:00:00
2022-10-22 11:16:19,002 - ml_investing_wne.train_test_val_split - first sequence ends: 2020-01-14 00:00:00
2022-10-22 11:16:19,006 - ml_investing_wne.train_test_val_split - last sequence begins: 2020-10-29 12:00:00
2022-10-22 11:16:19,006 - ml_investing_wne.train_test_

In [3]:
model = load_model(os.path.join(config.package_directory, 'models', 'production',
                        '{}_{}_{}_{}_{}_{}'.format(config.model, 'hist_data',
                                                   config.currency, config.freq,
                                                   str(config.steps_ahead),
                                                   config.seq_len)))

2022-10-22 11:16:23.484972: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-22 11:16:23.485027: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-22 11:16:23.485066: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (python-20220811): /proc/driver/nvidia/version does not exist
2022-10-22 11:16:23.485416: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
model.evaluate(X_val, y_val_cat)

2022-10-22 11:16:49.532274: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)




[0.6796161532402039, 0.5960144996643066]

In [5]:
model.evaluate(X_test, y_test_cat)



[0.6759060621261597, 0.6000000238418579]

In [6]:
df.head(2)

Unnamed: 0,datetime,open,high,low,close,y_pred,SMA_3,EMA_3,VAR_3,SMA_5,...,BBU_5_2.0,BBB_5_2.0,BBP_5_2.0,roc_1,hour,weekday,hour_sin,hour_cos,weekday_sin,weekday_cos
0,2010-01-24 12:00:00,1.4724,1.474,1.4714,1.4715,0,1.472133,1.471879,7.233333e-07,1.47142,...,1.473481,0.280094,0.519411,0.998914,12,6,-2.449294e-16,1.0,-2.449294e-16,1.0
1,2010-01-25 00:00:00,1.4716,1.4733,1.4712,1.4715,0,1.472033,1.471689,8.533333e-07,1.47162,...,1.473468,0.251126,0.467529,1.0,0,0,0.0,1.0,0.0,1.0


In [7]:
# this part is for backtesting

In [8]:
start_date = joblib.load(os.path.join(config.package_directory, 'models',
                                      'first_sequence_ends_{}_{}_{}.save'.format('test',
                                                                                 config.currency,
                                                                                 config.freq)))
end_date = joblib.load(os.path.join(config.package_directory, 'models',
                                    'last_sequence_ends_{}_{}_{}.save'.format('test',
                                                                              config.currency,
                                                                              config.freq)))
print(start_date, end_date)

2021-01-15 12:00:00 2021-12-24 12:00:00


In [9]:
df['change'] = df['close'].shift(-config.steps_ahead) / df['close'] - 1
prediction = df.loc[(df.datetime >= start_date) & (df.datetime <= end_date), ['datetime', 'open', 'high', 'low', 'close', 'y_pred', 'change']]
prediction.shape

(540, 7)

In [10]:
y_pred = model.predict(X_test)
y_pred_class = y_pred.argmax(axis=-1)

In [11]:
prediction['prediction_1'] = y_pred[:,1]

In [24]:
prediction.to_csv(os.path.join(config.processed_data_path, config.currency,
                               'Backtest_{}_{}.csv'.format(config.freq, config.model)), index=False)

In [12]:
import matplotlib.pyplot as plt
def compute_profitability_classes(df, y_pred, date_start, date_end, lower_bound, upper_bound, time_waw_list=None):
    prediction = df.copy()
    prediction.reset_index(inplace=True)
    df['y_pred'] = df['close'].shift(-config.steps_ahead) / df['close'] - 1
    # new_start = config.val_end + config.seq_len * datetime.timedelta(minutes=int(''.join(filter(str.isdigit, config.freq))))
    prediction = df.loc[(df.datetime >= date_start) & (df.datetime <= date_end)]
    prediction['datetime_waw'] = prediction['datetime'].dt.tz_localize('US/Eastern').dt.tz_convert(
        'Europe/Warsaw').dt.tz_localize(None)
    prediction['hour_waw'] = prediction['datetime_waw'].dt.time
    # prediction['trade'] = y_pred.argmax(axis=1)
    prediction['prediction'] = y_pred[:, 1]
    conditions = [
        (prediction['prediction'] <= lower_bound),
        (prediction['prediction'] > lower_bound) & (prediction['prediction'] <= upper_bound),
        (prediction['prediction'] > upper_bound)
    ]
    values = [0, 0.5, 1]
    prediction['trade'] = np.select(conditions, values)
    if time_waw_list:
        prediction.loc[~prediction['hour_waw'].isin(time_waw_list), 'trade'] = 0.5
    prediction.reset_index(inplace=True)
    budget = 100
    transaction = None
    i = 0
    # drop last row for which we don't have a label
    prediction.drop(prediction.tail(1).index, inplace=True)
    while i < prediction.shape[0]:
        # for i in range(prediction.shape[0]):
        if prediction.loc[i, 'trade'] == config.nb_classes - 1:
            # add transaction cost if position changes
            if transaction != 'buy':
                # initally I assumed that cost can be beared twice, but with spread as only cost it should count only once
                if not transaction:
                    budget = budget * (1 - prediction.loc[i, 'cost'])
                else:
                    budget = budget * (1 - prediction.loc[i, 'cost'])
            transaction = 'buy'
            budget = budget + budget * prediction.loc[i, 'y_pred']
            prediction.loc[i, 'budget'] = budget
            prediction.loc[i, 'transaction'] = transaction
            i = i + config.steps_ahead
        elif prediction.loc[i, 'trade'] == 0:
            # add transaction cost if position changes
            if transaction != 'sell':
                if not transaction:
                    budget = budget * (1 - prediction.loc[i, 'cost'])
                else:
                    budget = budget * (1 - (1 * prediction.loc[i, 'cost']))
            transaction = 'sell'
            budget = budget + budget * (-prediction.loc[i, 'y_pred'])
            prediction.loc[i, 'budget'] = budget
            prediction.loc[i, 'transaction'] = transaction
            i = i + config.steps_ahead
        elif prediction.loc[i, 'trade'] == 0.5:
            if transaction in ['buy', 'sell']:
                # budget = budget * (1 - prediction.loc[i, 'cost'])
                transaction = None
            prediction.loc[i, 'budget'] = budget
            prediction.loc[i, 'transaction'] = transaction
            i = i + 1

    hits = prediction.loc[((prediction['transaction'] == 'buy') & (prediction['y_pred'] > 0)) |
                          ((prediction['transaction'] == 'sell') & (prediction['y_pred'] < 0))].shape[0]
    transactions = prediction.loc[prediction['transaction'].isin(['buy', 'sell'])].shape[0]
    try:
        hits_ratio = hits / transactions
    except ZeroDivisionError:
        hits_ratio = 0
    share_of_time_active = round(prediction.loc[prediction['transaction'].isin(['buy', 'sell'])].shape[0] * \
                                 config.steps_ahead / prediction.shape[0], 2)
    logger.info('share_of_time_active for bounds {}-{} is {} and hit ratio is {}'.format(lower_bound, upper_bound,
                                                                                         share_of_time_active,
                                                                                         hits_ratio))
    plt.figure(2)
    plt.plot(prediction['datetime'], prediction['budget'])
    plt.axhline(y=100, color='r', linestyle='-')
    plt.savefig(os.path.join(config.package_directory, 'models', 'portfolio_evolution_{}_{}_{}_{}_{}.png'.
                             format(config.model, config.currency, config.nb_classes, lower_bound, upper_bound)))
    plt.close()

    logger.info('Portfolio result:  {}'.format(budget))

    return prediction, budget, hits_ratio, share_of_time_active

In [17]:


if 'JPY' in config.currency:
    df['cost'] = (config.pips / 100) / df['close']
else:
    df['cost'] = (config.pips / 10000) / df['close']

start_date = joblib.load(os.path.join(config.package_directory, 'models',
                                      'first_sequence_ends_{}_{}_{}.save'.format('test',
                                                                                 config.currency,
                                                                                 config.freq)))
end_date = joblib.load(os.path.join(config.package_directory, 'models',
                                    'last_sequence_ends_{}_{}_{}.save'.format('test',
                                                                              config.currency,
                                                                              config.freq)))
lower_bounds = [0.35]
upper_bounds = [1 - lower for lower in lower_bounds]

for lower_bound, upper_bound in zip(lower_bounds, upper_bounds):
    prediction, portfolio_result, hit_ratio, time_active = compute_profitability_classes(df, y_pred, start_date,
                                                                             end_date, lower_bound,
                                                                             upper_bound)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [16]:
import pandas as pd
pd.set_option('display.max_rows', 550)
prediction

Unnamed: 0,index,datetime,open,high,low,close,y_pred,SMA_3,EMA_3,VAR_3,...,weekday_sin,weekday_cos,change,cost,datetime_waw,hour_waw,prediction,trade,budget,transaction
0,6274,2021-01-15 12:00:00,1.0761,1.07658,1.0757,1.0757,0.000781,1.07704,1.076641,3.9711e-06,...,-0.8660254,-0.5,0.000781,0.000186,2021-01-15 18:00:00,18:00:00,0.519121,1.0,100.059482,buy
1,6275,2021-01-17 12:00:00,1.07576,1.07701,1.07531,1.07654,-0.000873,1.07611,1.07659,1.767e-07,...,-2.449294e-16,1.0,-0.000873,0.000186,2021-01-17 18:00:00,18:00:00,0.471611,0.0,100.128245,sell
2,6276,2021-01-18 00:00:00,1.07655,1.07694,1.07388,1.0756,0.001748,1.075947,1.076095,2.665333e-07,...,0.0,1.0,0.001748,0.000186,2021-01-18 06:00:00,06:00:00,0.540796,1.0,100.284605,buy
3,6277,2021-01-18 12:00:00,1.0756,1.07759,1.0754,1.07748,9.3e-05,1.07654,1.076788,8.836e-07,...,0.0,1.0,9.3e-05,0.000186,2021-01-18 18:00:00,18:00:00,0.561834,1.0,100.293912,buy
4,6278,2021-01-19 00:00:00,1.07748,1.07809,1.07561,1.07758,0.000399,1.076887,1.077184,1.244133e-06,...,0.8660254,0.5,0.000399,0.000186,2021-01-19 06:00:00,06:00:00,0.511994,1.0,100.333934,buy
5,6279,2021-01-19 12:00:00,1.07758,1.0782,1.07684,1.07801,-0.000427,1.07769,1.077597,7.93e-08,...,0.8660254,0.5,-0.000427,0.000186,2021-01-19 18:00:00,18:00:00,0.504471,1.0,100.29112,buy
6,6280,2021-01-20 00:00:00,1.07798,1.07923,1.07669,1.07755,0.000436,1.077713,1.077573,6.623333e-08,...,0.8660254,-0.5,0.000436,0.000186,2021-01-20 06:00:00,06:00:00,0.545362,1.0,100.334864,buy
7,6281,2021-01-20 12:00:00,1.07758,1.07812,1.07653,1.07802,-0.000955,1.07786,1.077797,7.21e-08,...,0.8660254,-0.5,-0.000955,0.000186,2021-01-20 18:00:00,18:00:00,0.477736,0.0,100.412097,sell
8,6282,2021-01-21 00:00:00,1.07803,1.07863,1.07564,1.07699,0.000121,1.07752,1.077393,2.659e-07,...,1.224647e-16,-1.0,0.000121,0.000186,2021-01-21 06:00:00,06:00:00,0.474786,0.0,100.399977,sell
9,6283,2021-01-21 12:00:00,1.07699,1.07753,1.07642,1.07712,0.000975,1.077377,1.077257,3.146333e-07,...,1.224647e-16,-1.0,0.000975,0.000186,2021-01-21 18:00:00,18:00:00,0.489363,0.0,100.302105,sell


In [5]:
y_pred

array([[0.53736967, 0.4626303 ],
       [0.5254014 , 0.47459865],
       [0.6033795 , 0.39662042],
       [0.41517782, 0.5848222 ],
       [0.5191155 , 0.4808844 ],
       [0.4964055 , 0.50359446],
       [0.51125485, 0.48874515],
       [0.48603597, 0.51396406],
       [0.5272705 , 0.4727295 ],
       [0.46900377, 0.5309962 ],
       [0.5297231 , 0.47027695],
       [0.5110963 , 0.48890376],
       [0.5205414 , 0.4794586 ],
       [0.5906773 , 0.40932265],
       [0.3970464 , 0.60295355],
       [0.51154065, 0.4884593 ],
       [0.49251166, 0.50748837],
       [0.5182365 , 0.48176348],
       [0.47378096, 0.526219  ],
       [0.5271798 , 0.47282022],
       [0.48044112, 0.5195589 ],
       [0.5506127 , 0.4493873 ],
       [0.5450352 , 0.45496476],
       [0.5097074 , 0.49029264],
       [0.6326073 , 0.36739272],
       [0.3779776 , 0.6220224 ],
       [0.52690315, 0.47309682],
       [0.50355774, 0.49644223],
       [0.55926293, 0.44073713],
       [0.49928543, 0.50071454],
       [0.

In [17]:
model.save(os.path.join(config.package_directory, 'models', 'production',
                        '{}_{}_{}_{}_{}_{}'.format(config.model, 'hist_data_retrain',
                                                   config.currency, config.freq,
                                                   str(config.steps_ahead),
                                                   config.seq_len)))

2022-09-18 16:19:00.342979: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-09-18 16:19:03,896 - absl - Found untraced functions such as embedding_layer_call_fn, embedding_layer_call_and_return_conditional_losses, query_layer_call_fn, query_layer_call_and_return_conditional_losses, key_layer_call_fn while saving (showing 5 of 125). These functions will not be directly callable after loading.


INFO:tensorflow:Assets written to: /home/jupyter/ml_investing_wne/src/ml_investing_wne/models/production/transformer_learnable_encoding_hist_data_retrain_EURCHF_720min_1_96/assets


2022-09-18 16:19:05,150 - tensorflow - Assets written to: /home/jupyter/ml_investing_wne/src/ml_investing_wne/models/production/transformer_learnable_encoding_hist_data_retrain_EURCHF_720min_1_96/assets


In [15]:
df.loc[(df['datetime']>=datetime.datetime(2021,11,14,12,0,0)) & (df['datetime']<datetime.datetime(2022,9,1,0,0,0))].shape

(446, 43)

In [21]:
import pandas as pd
pd.set_option('display.max_rows', 500)

In [22]:
df.loc[(df['datetime']>=datetime.datetime(2021,11,14,12,0,0)) & (df['datetime']<datetime.datetime(2022,9,1,0,0,0))]['datetime'].reset_index()

Unnamed: 0,index,datetime
0,6748,2021-11-14 12:00:00
1,6749,2021-11-15 00:00:00
2,6750,2021-11-15 12:00:00
3,6751,2021-11-16 00:00:00
4,6752,2021-11-16 12:00:00
5,6753,2021-11-17 00:00:00
6,6754,2021-11-17 12:00:00
7,6755,2021-11-18 00:00:00
8,6756,2021-11-18 12:00:00
9,6757,2021-11-19 00:00:00


In [23]:
check = np.concatenate([y_pred, y_test_cat], axis=1)

In [32]:
import pandas as pd
check = pd.DataFrame(check)

In [36]:
check

Unnamed: 0,0,1,2,3
0,0.537370,0.462630,1.0,0.0
1,0.525401,0.474599,1.0,0.0
2,0.603379,0.396620,1.0,0.0
3,0.415178,0.584822,0.0,1.0
4,0.519116,0.480884,1.0,0.0
...,...,...,...,...
345,0.530520,0.469480,0.0,1.0
346,0.507777,0.492223,1.0,0.0
347,0.524888,0.475112,0.0,1.0
348,0.501059,0.498941,1.0,0.0


In [37]:
check.columns = ['p_down', 'p_up','down','up']

In [38]:
check

Unnamed: 0,p_down,p_up,down,up
0,0.537370,0.462630,1.0,0.0
1,0.525401,0.474599,1.0,0.0
2,0.603379,0.396620,1.0,0.0
3,0.415178,0.584822,0.0,1.0
4,0.519116,0.480884,1.0,0.0
...,...,...,...,...
345,0.530520,0.469480,0.0,1.0
346,0.507777,0.492223,1.0,0.0
347,0.524888,0.475112,0.0,1.0
348,0.501059,0.498941,1.0,0.0


In [46]:
check['correct'] = 0
check.loc[((check['p_up']>0.5) & (check['up']==1.0)) | ((check['p_down']>0.5) & (check['down']==1.0)), 'correct']=1

In [50]:
check[0:20]

Unnamed: 0,p_down,p_up,down,up,correct
0,0.53737,0.46263,1.0,0.0,1
1,0.525401,0.474599,1.0,0.0,1
2,0.603379,0.39662,1.0,0.0,1
3,0.415178,0.584822,0.0,1.0,1
4,0.519116,0.480884,1.0,0.0,1
5,0.496406,0.503594,0.0,1.0,1
6,0.511255,0.488745,0.0,1.0,0
7,0.486036,0.513964,1.0,0.0,0
8,0.52727,0.47273,0.0,1.0,0
9,0.469004,0.530996,1.0,0.0,0


In [51]:
df.loc[(df['datetime']>=datetime.datetime(2021,11,14,12,0,0)) & (df['datetime']<datetime.datetime(2022,9,1,0,0,0))].shape

(446, 43)