In [1]:
import datetime
import os
import random
import numpy as np
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
import tensorflow as tf
import mlflow.keras
import importlib
import joblib
from sklearn.metrics import roc_auc_score, f1_score

import ml_investing_wne.config as config
from ml_investing_wne.data_engineering.load_data import get_hist_data
from ml_investing_wne.data_engineering.prepare_dataset import prepare_processed_dataset
from ml_investing_wne.train_test_val_split import train_test_val_split
from ml_investing_wne.helper import confusion_matrix_plot, compute_profitability_classes
from ml_investing_wne.utils import get_logger

seed = 12345
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
                   
logger = get_logger()

2022-09-25 08:36:59.206981: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-25 08:36:59.207026: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
df = get_hist_data(currency=config.currency)
df = prepare_processed_dataset(df=df)
X, y, X_val, y_val, X_test, y_test, y_cat, y_val_cat, y_test_cat, train = train_test_val_split(df)


2022-09-25 08:37:40,852 - ml_investing_wne.data_engineering.prepare_dataset - exported to /home/jupyter/ml_investing_wne/src/ml_investing_wne/data/processed/EURCHF/EURCHF_processed_720min.csv
2022-09-25 08:37:40,928 - ml_investing_wne.train_test_val_split - first sequence begins: 2010-01-24 12:00:00
2022-09-25 08:37:40,929 - ml_investing_wne.train_test_val_split - first sequence ends: 2010-03-25 00:00:00
2022-09-25 08:37:40,955 - ml_investing_wne.train_test_val_split - last sequence begins: 2019-10-30 12:00:00
2022-09-25 08:37:40,956 - ml_investing_wne.train_test_val_split - last sequence ends: 2019-12-30 12:00:00
2022-09-25 08:37:41,013 - ml_investing_wne.train_test_val_split - first sequence begins: 2019-11-13 12:00:00
2022-09-25 08:37:41,014 - ml_investing_wne.train_test_val_split - first sequence ends: 2020-01-14 00:00:00
2022-09-25 08:37:41,020 - ml_investing_wne.train_test_val_split - last sequence begins: 2021-10-26 00:00:00
2022-09-25 08:37:41,021 - ml_investing_wne.train_test_

In [3]:
early_stop = EarlyStopping(monitor='val_accuracy', patience=config.patience, restore_best_weights=True)
model_path_final = os.path.join(config.package_directory, 'models',
                                '{}_{}_{}_{}_{}.h5'.format(config.model, 'hist_data',
                                                           config.currency, config.freq,
                                                           config.steps_ahead))
model_checkpoint = ModelCheckpoint(filepath=model_path_final, monitor='val_accuracy', verbose=1,
                                   save_best_only=True)
csv_logger = CSVLogger(os.path.join(config.package_directory, 'logs', 'keras_log.csv'), append=True,
                       separator=';')
callbacks = [early_stop, model_checkpoint, csv_logger]


In [9]:
model = load_model(os.path.join(config.package_directory, 'models', 'production',
                        '{}_{}_{}_{}_{}_{}'.format(config.model, 'hist_data_retrain',
                                                   config.currency, config.freq,
                                                   str(config.steps_ahead),
                                                   config.seq_len)))

In [12]:
history = model.fit(X, y_cat, batch_size=64, epochs=config.epochs, verbose=2,
                    validation_data=(X_val, y_val_cat), callbacks=callbacks)

Epoch 1/100
88/88 - 34s - loss: 0.6791 - accuracy: 0.5574 - val_loss: 0.6884 - val_accuracy: 0.5318

Epoch 00001: val_accuracy improved from -inf to 0.53184, saving model to /home/jupyter/ml_investing_wne/src/ml_investing_wne/models/transformer_learnable_encoding_hist_data_EURCHF_720min_1.h5
Epoch 2/100
88/88 - 30s - loss: 0.6765 - accuracy: 0.5737 - val_loss: 0.6721 - val_accuracy: 0.5901

Epoch 00002: val_accuracy improved from 0.53184 to 0.59013, saving model to /home/jupyter/ml_investing_wne/src/ml_investing_wne/models/transformer_learnable_encoding_hist_data_EURCHF_720min_1.h5
Epoch 3/100
88/88 - 30s - loss: 0.6708 - accuracy: 0.5726 - val_loss: 0.6720 - val_accuracy: 0.5857

Epoch 00003: val_accuracy did not improve from 0.59013
Epoch 4/100
88/88 - 31s - loss: 0.6711 - accuracy: 0.5751 - val_loss: 0.6777 - val_accuracy: 0.5596

Epoch 00004: val_accuracy did not improve from 0.59013
Epoch 5/100
88/88 - 30s - loss: 0.6691 - accuracy: 0.5812 - val_loss: 0.6811 - val_accuracy: 0.5587

In [10]:
model.evaluate(X_val, y_val_cat)



[0.6745448112487793, 0.5937219858169556]

In [11]:
model.evaluate(X_test, y_test_cat)



[0.6918887495994568, 0.5199999809265137]

In [12]:
df.head(2)

Unnamed: 0,datetime,open,high,low,close,y_pred,SMA_3,EMA_3,VAR_3,SMA_5,...,BBB_5_2.0,BBP_5_2.0,roc_1,hour,weekday,hour_sin,hour_cos,weekday_sin,weekday_cos,cost
0,2010-01-24 12:00:00,1.4724,1.474,1.4714,1.4715,0.0,1.472133,1.471879,7.233333e-07,1.47142,...,0.280094,0.519411,0.998914,12,6,-2.449294e-16,1.0,-2.449294e-16,1.0,0.000136
1,2010-01-25 00:00:00,1.4716,1.4733,1.4712,1.4715,-0.000204,1.472033,1.471689,8.533333e-07,1.47162,...,0.251126,0.467529,1.0,0,0,0.0,1.0,0.0,1.0,0.000136


In [13]:
df.loc[df['datetime']>=datetime.datetime(2020,10,7,12,0,0)].head(10)

Unnamed: 0,datetime,open,high,low,close,y_pred,SMA_3,EMA_3,VAR_3,SMA_5,...,BBB_5_2.0,BBP_5_2.0,roc_1,hour,weekday,hour_sin,hour_cos,weekday_sin,weekday_cos,cost
6120,2020-10-07 12:00:00,1.07873,1.07925,1.07837,1.0791,-0.000575,1.078277,1.078621,1.291633e-06,1.07829,...,0.278381,0.769842,1.000324,12,2,-2.449294e-16,1.0,0.8660254,-0.5,0.000185
6121,2020-10-08 00:00:00,1.0791,1.08003,1.07806,1.07848,-0.000278,1.078777,1.07855,9.663333e-08,1.078256,...,0.273438,0.575974,0.999425,0,3,0.0,1.0,1.224647e-16,-1.0,0.000185
6122,2020-10-08 12:00:00,1.07848,1.07893,1.078,1.07818,-0.002161,1.078587,1.078365,2.201333e-07,1.078298,...,0.269124,0.459338,0.999722,12,3,-2.449294e-16,1.0,1.224647e-16,-1.0,0.000185
6123,2020-10-09 00:00:00,1.07818,1.07858,1.07579,1.07585,6.5e-05,1.077503,1.077108,2.072633e-06,1.078072,...,0.427312,0.017663,0.997839,0,4,0.0,1.0,-0.8660254,-0.5,0.000186
6124,2020-10-09 12:00:00,1.07583,1.07635,1.07544,1.07592,0.000214,1.07665,1.076514,1.7569e-06,1.077506,...,0.503599,0.20772,1.000065,12,4,-2.449294e-16,1.0,-0.8660254,-0.5,0.000186
6125,2020-10-11 12:00:00,1.07542,1.07624,1.07535,1.07615,-0.002109,1.075973,1.076332,2.463333e-08,1.076916,...,0.431849,0.335292,1.000214,12,6,-2.449294e-16,1.0,-2.449294e-16,1.0,0.000186
6126,2020-10-12 00:00:00,1.07616,1.07739,1.07342,1.07388,-0.000754,1.075317,1.075106,1.561233e-06,1.075996,...,0.506946,0.112079,0.997891,0,0,0.0,1.0,0.0,1.0,0.000186
6127,2020-10-12 12:00:00,1.07392,1.07443,1.0729,1.07307,0.000503,1.074367,1.074088,2.549233e-06,1.074974,...,0.466757,0.120529,0.999246,12,0,-2.449294e-16,1.0,0.0,1.0,0.000186
6128,2020-10-13 00:00:00,1.07306,1.07433,1.07255,1.07361,0.000913,1.07352,1.073849,1.701e-07,1.074526,...,0.469603,0.31847,1.000503,0,1,0.0,1.0,0.8660254,0.5,0.000186
6129,2020-10-13 12:00:00,1.07361,1.07494,1.07342,1.07459,-0.001601,1.073757,1.074219,5.937333e-07,1.07426,...,0.396303,0.577513,1.000913,12,1,-2.449294e-16,1.0,0.8660254,0.5,0.000186


In [14]:
y_pred = model.predict(X_test)
y_pred_class = y_pred.argmax(axis=-1)

if 'JPY' in config.currency:
    df['cost'] = (config.pips / 100) / df['close']
else:
    df['cost'] = (config.pips / 10000) / df['close']

start_date = joblib.load(os.path.join(config.package_directory, 'models',
                                      'first_sequence_ends_{}_{}_{}.save'.format('test',
                                                                                 config.currency,
                                                                                 config.freq)))
end_date = joblib.load(os.path.join(config.package_directory, 'models',
                                    'last_sequence_ends_{}_{}_{}.save'.format('test',
                                                                              config.currency,
                                                                              config.freq)))
lower_bounds = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
upper_bounds = [1 - lower for lower in lower_bounds]

for lower_bound, upper_bound in zip(lower_bounds, upper_bounds):
    portfolio_result, hit_ratio, time_active = compute_profitability_classes(df, y_pred, start_date,
                                                                             end_date, lower_bound,
                                                                             upper_bound)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  'Europe/Warsaw').dt.tz_localize(None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['hour_waw'] = prediction['datetime_waw'].dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prediction'] = y_pred[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.


In [5]:
y_pred

array([[0.53736967, 0.4626303 ],
       [0.5254014 , 0.47459865],
       [0.6033795 , 0.39662042],
       [0.41517782, 0.5848222 ],
       [0.5191155 , 0.4808844 ],
       [0.4964055 , 0.50359446],
       [0.51125485, 0.48874515],
       [0.48603597, 0.51396406],
       [0.5272705 , 0.4727295 ],
       [0.46900377, 0.5309962 ],
       [0.5297231 , 0.47027695],
       [0.5110963 , 0.48890376],
       [0.5205414 , 0.4794586 ],
       [0.5906773 , 0.40932265],
       [0.3970464 , 0.60295355],
       [0.51154065, 0.4884593 ],
       [0.49251166, 0.50748837],
       [0.5182365 , 0.48176348],
       [0.47378096, 0.526219  ],
       [0.5271798 , 0.47282022],
       [0.48044112, 0.5195589 ],
       [0.5506127 , 0.4493873 ],
       [0.5450352 , 0.45496476],
       [0.5097074 , 0.49029264],
       [0.6326073 , 0.36739272],
       [0.3779776 , 0.6220224 ],
       [0.52690315, 0.47309682],
       [0.50355774, 0.49644223],
       [0.55926293, 0.44073713],
       [0.49928543, 0.50071454],
       [0.

In [17]:
model.save(os.path.join(config.package_directory, 'models', 'production',
                        '{}_{}_{}_{}_{}_{}'.format(config.model, 'hist_data_retrain',
                                                   config.currency, config.freq,
                                                   str(config.steps_ahead),
                                                   config.seq_len)))

2022-09-18 16:19:00.342979: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-09-18 16:19:03,896 - absl - Found untraced functions such as embedding_layer_call_fn, embedding_layer_call_and_return_conditional_losses, query_layer_call_fn, query_layer_call_and_return_conditional_losses, key_layer_call_fn while saving (showing 5 of 125). These functions will not be directly callable after loading.


INFO:tensorflow:Assets written to: /home/jupyter/ml_investing_wne/src/ml_investing_wne/models/production/transformer_learnable_encoding_hist_data_retrain_EURCHF_720min_1_96/assets


2022-09-18 16:19:05,150 - tensorflow - Assets written to: /home/jupyter/ml_investing_wne/src/ml_investing_wne/models/production/transformer_learnable_encoding_hist_data_retrain_EURCHF_720min_1_96/assets


In [15]:
df.loc[(df['datetime']>=datetime.datetime(2021,11,14,12,0,0)) & (df['datetime']<datetime.datetime(2022,9,1,0,0,0))].shape

(446, 43)

In [21]:
import pandas as pd
pd.set_option('display.max_rows', 500)

In [22]:
df.loc[(df['datetime']>=datetime.datetime(2021,11,14,12,0,0)) & (df['datetime']<datetime.datetime(2022,9,1,0,0,0))]['datetime'].reset_index()

Unnamed: 0,index,datetime
0,6748,2021-11-14 12:00:00
1,6749,2021-11-15 00:00:00
2,6750,2021-11-15 12:00:00
3,6751,2021-11-16 00:00:00
4,6752,2021-11-16 12:00:00
5,6753,2021-11-17 00:00:00
6,6754,2021-11-17 12:00:00
7,6755,2021-11-18 00:00:00
8,6756,2021-11-18 12:00:00
9,6757,2021-11-19 00:00:00


In [23]:
check = np.concatenate([y_pred, y_test_cat], axis=1)

In [32]:
import pandas as pd
check = pd.DataFrame(check)

In [36]:
check

Unnamed: 0,0,1,2,3
0,0.537370,0.462630,1.0,0.0
1,0.525401,0.474599,1.0,0.0
2,0.603379,0.396620,1.0,0.0
3,0.415178,0.584822,0.0,1.0
4,0.519116,0.480884,1.0,0.0
...,...,...,...,...
345,0.530520,0.469480,0.0,1.0
346,0.507777,0.492223,1.0,0.0
347,0.524888,0.475112,0.0,1.0
348,0.501059,0.498941,1.0,0.0


In [37]:
check.columns = ['p_down', 'p_up','down','up']

In [38]:
check

Unnamed: 0,p_down,p_up,down,up
0,0.537370,0.462630,1.0,0.0
1,0.525401,0.474599,1.0,0.0
2,0.603379,0.396620,1.0,0.0
3,0.415178,0.584822,0.0,1.0
4,0.519116,0.480884,1.0,0.0
...,...,...,...,...
345,0.530520,0.469480,0.0,1.0
346,0.507777,0.492223,1.0,0.0
347,0.524888,0.475112,0.0,1.0
348,0.501059,0.498941,1.0,0.0


In [46]:
check['correct'] = 0
check.loc[((check['p_up']>0.5) & (check['up']==1.0)) | ((check['p_down']>0.5) & (check['down']==1.0)), 'correct']=1

In [50]:
check[0:20]

Unnamed: 0,p_down,p_up,down,up,correct
0,0.53737,0.46263,1.0,0.0,1
1,0.525401,0.474599,1.0,0.0,1
2,0.603379,0.39662,1.0,0.0,1
3,0.415178,0.584822,0.0,1.0,1
4,0.519116,0.480884,1.0,0.0,1
5,0.496406,0.503594,0.0,1.0,1
6,0.511255,0.488745,0.0,1.0,0
7,0.486036,0.513964,1.0,0.0,0
8,0.52727,0.47273,0.0,1.0,0
9,0.469004,0.530996,1.0,0.0,0


In [51]:
df.loc[(df['datetime']>=datetime.datetime(2021,11,14,12,0,0)) & (df['datetime']<datetime.datetime(2022,9,1,0,0,0))].shape

(446, 43)