### Dataset

In [75]:
import pandas as pd
from auto_ts import auto_timeseries
import dill
import talib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization
from keras.regularizers import l2
from keras.optimizers import Adam, RMSprop, SGD
from scikeras.wrappers import KerasRegressor
from keras.callbacks import EarlyStopping
from skopt import BayesSearchCV
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.backend as K
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from keras.losses import categorical_crossentropy
from sklearn.metrics import log_loss
from keras.utils import to_categorical
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OStackedEnsembleEstimator

In [76]:
# Show every column of wide frames, and render floats with two fixed decimals
# so prediction columns are not displayed in scientific notation.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

In [77]:
# Feature columns selected from the final dataset for the classifier.
# Group labels below are inferred from the column names. 'Close' stays
# excluded, exactly as it was commented out in the original list.
columns = [
    # Timestamp and candle data
    'Open_time', 'Open', 'High', 'Low',
    # 'Close',
    'Number of trades',
    # Reference pairs: close, volume and trade count
    'Close_BTCUSDT', 'Volume_BTCUSDT', 'Number_of_trades_BTCUSDT',
    'Close_ETHUSDT', 'Volume_ETHUSDT', 'Number_of_trades_ETHUSDT',
    'Close_BNBUSDT', 'Volume_BNBUSDT', 'Number_of_trades_BNBUSDT',
    # Technical indicators
    'SMA_20', 'EMA_20', 'Upper_Band', 'Middle_Band', 'Lower_Band', 'RSI',
    # Coinbase trade activity
    'buy_1000x_high_coinbase', 'sell_1000x_high_coinbase', 'total_trades_coinbase',
    # Tweet counts
    'Tweets_Utilizados', 'Tweets_Utilizados_coin',
    'Tweets_Utilizados_referentes', 'Tweets_Utilizados_whale_alert',
    # Binance trade activity
    'Buy_1000x_high', 'sell_1000x_high', 'total_trades_binance',
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

In [78]:
# Load the full dataset from disk.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
complete_dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv')

# Keep only the selected feature columns. The explicit .copy() makes this an
# independent frame, so the column assignments below are not chained
# assignments on a selection of complete_dataset (SettingWithCopyWarning).
classifier_dataset = complete_dataset[columns].copy()
classifier_dataset['Open_time'] = pd.to_datetime(classifier_dataset['Open_time'])
# Attach the target label the classifier is trained to predict.
classifier_dataset['Tendencia'] = complete_dataset['Tendencia']

# Hold out the last 10 rows for a final check; train on everything before them.
clasifier_validation = classifier_dataset[-10:]
classifier_dataset = classifier_dataset[:-10]

In [79]:
# Sanity check: show the last five rows of the training frame (hold-out rows already removed).
display(classifier_dataset.tail())

Unnamed: 0,Open_time,Open,High,Low,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance,Tendencia
941,2024-04-25,6.93,7.0,6.7,71088.0,64498.34,31341.46,1375324.0,3155.8,352288.55,861077.0,613.2,453745.52,353114.0,7.43,7.45,9.08,7.43,5.77,38.83,21.0,26.0,33468.0,151,114,0.0,22.0,242.0,219.0,48000.0,Lateral
942,2024-04-26,6.86,6.95,6.71,67383.0,63770.01,27085.19,1025561.0,3131.3,252522.65,628635.0,598.0,302119.88,269508.0,7.34,7.38,8.94,7.34,5.74,37.81,29.0,24.0,26619.0,117,106,0.0,14.0,292.0,324.0,42000.0,Lateral
943,2024-04-27,6.76,6.87,6.51,64779.0,63461.98,20933.06,912422.0,3255.56,323811.19,734026.0,596.2,268783.91,233820.0,7.24,7.33,8.73,7.24,5.76,38.57,17.0,17.0,25565.0,101,138,0.0,7.0,248.0,179.0,41000.0,Lateral
944,2024-04-28,6.81,6.95,6.69,43208.0,63118.62,16949.2,790652.0,3263.45,304766.01,753239.0,600.2,258059.43,206703.0,7.13,7.27,8.38,7.13,5.88,37.66,16.0,20.0,20954.0,82,106,0.0,13.0,173.0,165.0,26000.0,Lateral
945,2024-04-29,6.73,6.83,6.47,63006.0,63866.0,28150.23,1152296.0,3216.73,421831.29,943719.0,592.8,330474.01,271926.0,7.03,7.2,8.08,7.03,5.97,36.02,69.0,37.0,33959.0,115,125,0.0,24.0,260.0,188.0,41000.0,Bajista


In [80]:
# (rows, columns) of the training frame: the selected feature columns plus 'Tendencia'.
classifier_dataset.shape

(946, 31)

In [81]:
# Disabled experiment, kept for reference: builds a feature matrix without the
# label and timestamp columns and one-hot encodes the label by hand. No active
# code in the visible notebook uses X, y or y_one_hot; the AutoML cell below
# trains on the 'Tendencia' column directly.
# X = classifier_dataset.drop(columns=["Tendencia", "Open_time"])
# y = classifier_dataset["Tendencia"]

# y = y.to_numpy().reshape(-1, 1)
# onehot_encoder = OneHotEncoder(sparse=False)
# y_one_hot = onehot_encoder.fit_transform(y)


In [82]:
# display(y_one_hot)

In [83]:
# Connect to the local H2O cluster (h2o.init() starts one if none is reachable).
# Every H2OFrame and model used below lives in this cluster.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 hours 48 mins
H2O_cluster_timezone:,America/Argentina/Buenos_Aires
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,2 months and 3 days
H2O_cluster_name:,H2O_from_python_mmarchetta_ssxbpb
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.137 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [84]:
# Upload the training frame to the H2O cluster.
h2o_df = h2o.H2OFrame(classifier_dataset)
# Make the target explicitly categorical so AutoML always treats this as a
# classification problem, regardless of how the column type was inferred on upload.
h2o_df['Tendencia'] = h2o_df['Tendencia'].asfactor()

# NOTE(review): no `x` is passed, so every column other than the target,
# including Open_time, is used as a predictor. Confirm this is intended.
# NOTE(review): neither max_models nor max_runtime_secs is set; per the H2O
# AutoML docs a seed alone may not make time-limited runs reproducible.
aml = H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, seed=1234)
# Last expression of the cell: the returned leader model summary is displayed.
aml.train(y='Tendencia', training_frame=h2o_df)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),124/124
# GBM base models (used / total),18/18
# XGBoost base models (used / total),18/18
# GLM base models (used / total),1/1
# DeepLearning base models (used / total),85/85
# DRF base models (used / total),2/2
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
253.0,20.0,40.0,0.1916933,60 / 313
14.0,226.0,80.0,0.29375,94 / 320
73.0,68.0,172.0,0.4504792,141 / 313
340.0,314.0,292.0,0.3118393,295 / 946

k,hit_ratio
1,0.6881607
2,0.9175476
3,1.0

Alcista,Bajista,Lateral,Error,Rate
240.0,22.0,51.0,0.2332268,73 / 313
13.0,240.0,67.0,0.25,80 / 320
69.0,58.0,186.0,0.4057508,127 / 313
322.0,320.0,304.0,0.2959831,280 / 946

k,hit_ratio
1,0.7040169
2,0.9186047
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.7047472,0.0396158,0.6717949,0.7089947,0.7219251,0.7595629,0.6614583
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.2952528,0.0396158,0.3282051,0.2910053,0.2780749,0.2404372,0.3385417
err_count,56.0,8.746428,64.0,55.0,52.0,44.0,65.0
loglikelihood,,0.0,,,,,
logloss,0.7480022,0.0881941,0.8333361,0.7110236,0.7213065,0.6334509,0.8408937
max_per_class_error,0.4086826,0.0744401,0.3492064,0.4411765,0.4363636,0.3166667,0.5
mean_per_class_accuracy,0.7065459,0.0376498,0.6715608,0.7179061,0.7135581,0.7602938,0.6694107
mean_per_class_error,0.2934541,0.0376498,0.3284392,0.282094,0.2864419,0.2397062,0.3305893


In [85]:
# Leaderboard of every model trained by the AutoML run; printing an H2OFrame
# shows only its first rows as a plain-text table.
leaderboard = aml.leaderboard
print(leaderboard)

model_id                                                   mean_per_class_error    logloss      rmse       mse
StackedEnsemble_AllModels_5_AutoML_3_20240516_170316                   0.296326   0.749621  0.486736  0.236912
StackedEnsemble_BestOfFamily_5_AutoML_3_20240516_170316                0.296396   0.713311  0.487962  0.238107
StackedEnsemble_BestOfFamily_8_AutoML_3_20240516_170316                0.297414   0.72368   0.489411  0.239523
StackedEnsemble_BestOfFamily_7_AutoML_3_20240516_170316                0.298386   0.71921   0.494807  0.244834
StackedEnsemble_BestOfFamily_2_AutoML_3_20240516_170316                0.298456   0.712292  0.489722  0.239828
StackedEnsemble_BestOfFamily_4_AutoML_3_20240516_170316                0.298502   0.699221  0.483674  0.23394
StackedEnsemble_BestOfFamily_6_AutoML_3_20240516_170316                0.301721   0.74299   0.492506  0.242562
GLM_1_AutoML_3_20240516_170316                                         0.30232    0.728736  0.499583  0.249583
St

### Guardado de los mejores 5 modelos

In [87]:
# Persist the five best-ranked leaderboard models as
# automl_classifier/modelo_0.h2o ... modelo_4.h2o (index = leaderboard rank).
top_models = aml.leaderboard.as_data_frame()['model_id'].tolist()[:5]
for i, model_id in enumerate(top_models):
    model = h2o.get_model(model_id)
    file_name = f"modelo_{i}.h2o"
    # force=True overwrites files left by a previous run; without it this cell
    # raises on re-execution because the target files already exist.
    h2o.save_model(model=model, path="automl_classifier", filename=file_name, force=True)


### Cargo los mejores 5 modelos

In [88]:
# Reload the five saved models from disk, in rank order, so that
# stack_models[i] is the model stored as modelo_{i}.h2o.
# (Setting model.nfolds = 5 on the loaded models was tried and left disabled.)
stack_models = [
    h2o.load_model(f"automl_classifier/modelo_{i}.h2o")
    for i in range(5)
]

### Armo un ensamble con los mejores 5 modelos (stacks)

In [90]:
# Disabled: training a stacked ensemble on top of the five reloaded models
# failed with "Base model does not use cross-validation" (error output kept
# below), so the reloaded models are used individually for predictions instead.
# stacked_ensemble = H2OStackedEnsembleEstimator(base_models=stack_models)
# stacked_ensemble.train(y='Tendencia', training_frame=h2o_df)
# h2o.save_model(model=stacked_ensemble, path="automl_classifier", filename="stacked_ensemble_model.h2o")


stackedensemble Model Build progress: |

 (failed)


OSError: Job with key $03017f00000132d4ffffffff$_841400cfa644b2c7229767114d774454 failed with an exception: water.exceptions.H2OIllegalArgumentException: Base model does not use cross-validation: 0
stacktrace: 
water.exceptions.H2OIllegalArgumentException: Base model does not use cross-validation: 0
	at hex.ensemble.StackedEnsemble.checkAndInheritModelProperties(StackedEnsemble.java:483)
	at hex.ensemble.StackedEnsemble$StackedEnsembleDriver.computeImpl(StackedEnsemble.java:767)
	at hex.ModelBuilder$Driver.compute2(ModelBuilder.java:253)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1704)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:976)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


In [None]:
# stacked_ensemble.model_performance()

In [98]:
# Score the 10 held-out rows with the first reloaded model (modelo_0.h2o).
# The stacked-ensemble variant stays disabled because that ensemble was never built.
# predictions = stacked_ensemble.predict(clasifier_validation)
# The hold-out frame still carries the 'Tendencia' label column when uploaded.
h2o_validation_df = h2o.H2OFrame(clasifier_validation)
predictions = stack_models[0].predict(h2o_validation_df)
# Output: predicted class plus one probability column per class.
display(predictions)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.0118122,0.933522,0.0546654
Alcista,0.633489,0.0131357,0.353375
Alcista,0.94753,0.0264663,0.0260036
Alcista,0.928234,0.016586,0.0551799
Alcista,0.444594,0.155572,0.399834
Alcista,0.897622,0.0602897,0.0420883
Lateral,0.187251,0.399099,0.41365
Bajista,0.122592,0.676654,0.200754
Lateral,0.24774,0.364582,0.387678
Alcista,0.767195,0.102922,0.129883


##### Datos generados por auto ml con prophet:

In [99]:
# Classify the rows stored in the Prophet predictions file with the first
# reloaded model (modelo_0.h2o).
# NOTE(review): presumably these rows are forecasted feature values and share
# the training column names; confirm against the notebook that writes the CSV.
auto_ml_prophet_df = pd.read_csv('auto_timeseries_models_prophet/predicciones.csv')
h2o_prophet_df = h2o.H2OFrame(auto_ml_prophet_df)
auto_mp_prophet_predictions = stack_models[0].predict(h2o_prophet_df)

display(auto_mp_prophet_predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.123923,0.608896,0.267181
Bajista,0.153719,0.52286,0.32342
Bajista,0.105159,0.567341,0.3275
Bajista,0.0943615,0.616379,0.28926
Bajista,0.156415,0.655672,0.187913
Bajista,0.115665,0.652456,0.231879
Bajista,0.0883514,0.575225,0.336424
Bajista,0.0968877,0.630801,0.272311
Bajista,0.0646327,0.690246,0.245122
Bajista,0.0686611,0.733267,0.198072


##### Datos generados por auto ml con stats:

In [100]:
# Classify the rows stored in the auto_timeseries predictions file.
# NOTE(review): this cell scores with stack_models[1], whereas the validation
# and Prophet cells use stack_models[0]. Confirm the switch is intentional,
# since the outputs are otherwise not directly comparable.
auto_ml_stats_df = pd.read_csv('auto_timeseries_models/predicciones.csv')
h2o_stats_df = h2o.H2OFrame(auto_ml_stats_df)
auto_mp_stats_predictions = stack_models[1].predict(h2o_stats_df)

display(auto_mp_stats_predictions)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.126824,0.449691,0.423485
Bajista,0.0720404,0.660756,0.267204
Bajista,0.0896589,0.615369,0.294972
Bajista,0.117317,0.534516,0.348167
Bajista,0.0778152,0.599896,0.322289
Bajista,0.108362,0.575022,0.316617
Bajista,0.0201922,0.782867,0.196941
Bajista,0.0198606,0.803616,0.176524
Bajista,0.054257,0.701886,0.243857
Bajista,0.0484801,0.704673,0.246847


##### Datos generados por auto ml con modelos clásicos:

In [101]:
# Classify the rows stored in the h2o_models predictions file.
# NOTE(review): this cell scores with stack_models[1], whereas the validation
# and Prophet cells use stack_models[0]. Confirm the switch is intentional,
# since the outputs are otherwise not directly comparable.
auto_ml_df = pd.read_csv('h2o_models/predicciones.csv')
h2o_mp_df = h2o.H2OFrame(auto_ml_df)
auto_mp_predictions = stack_models[1].predict(h2o_mp_df)

display(auto_mp_predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.0256038,0.71345,0.260946
Bajista,0.0669484,0.524547,0.408504
Lateral,0.0615192,0.464968,0.473513
Bajista,0.0476053,0.562497,0.389897
Bajista,0.0464708,0.611921,0.341608
Bajista,0.0491577,0.652783,0.298059
Bajista,0.0607769,0.611844,0.327379
Bajista,0.0612728,0.611385,0.327342
Bajista,0.0405912,0.636013,0.323396
Bajista,0.038719,0.686051,0.27523
