### Dataset

In [248]:
import pandas as pd
import warnings
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

import h2o
from h2o.automl import H2OAutoML

In [249]:
# Render floats with two fixed decimals so prediction columns are never shown
# in scientific notation.
pd.options.display.float_format = lambda value: '%.2f' % value
# Show every column of a frame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [250]:
# Feature columns used as classifier inputs; the order here is the order used
# when selecting, scaling and rebuilding the frames below.
# Left out on purpose: 'Open_time', and 'MFI' (it is computed from volume).
columns = [
    'Open', 'High', 'Low',
    'SMA_20', 'EMA_20',
    'Upper_Band', 'Middle_Band', 'Lower_Band',
    'RSI', 'MACD', 'Signal', 'ADX',
    'SlowK', 'SlowD', 'CCI', 'ATR',
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

In [251]:
# Load the full dataset from disk.
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook can run elsewhere.
complete_dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv')

# Select the features and the target in one indexing step and take an explicit
# copy. The previous version added 'Tendencia' to a frame obtained by slicing
# `complete_dataset`, which is the pattern pandas reports with
# SettingWithCopyWarning (hidden here by the global warnings filter).
classifier_dataset = complete_dataset[columns + ['Tendencia']].copy()

# Hold out the last 10 rows for validation; every earlier row is training data.
clasifier_validation = classifier_dataset.iloc[-10:]
classifier_dataset = classifier_dataset.iloc[:-10]

In [252]:
# Normalization: the scaler is fitted on the training rows only; the same
# fitted scaler is reused later for every other frame.
scaler = MinMaxScaler()  # alternative tried: StandardScaler()
features = classifier_dataset[columns]
features_scaled = scaler.fit_transform(features)

# Wrap the scaled values back into a DataFrame with the feature names and
# re-attach the target positionally (.values ignores the original index).
classifier_dataset_scaled = pd.DataFrame(features_scaled, columns=columns).assign(
    Tendencia=classifier_dataset['Tendencia'].values
)

In [253]:
# Normalization of the held-out rows: transform only (no re-fit), using the
# scaler that was fitted on the training rows.
features_validation = clasifier_validation[columns]
features_validation_scaled = scaler.transform(features_validation)

# Rebuild a labeled frame; the target is attached positionally via .values.
clasifier_validation_scaled = pd.DataFrame(
    features_validation_scaled, columns=columns
).assign(Tendencia=clasifier_validation['Tendencia'].values)

In [254]:
# Sanity check: show the last rows of the scaled training frame.
display(classifier_dataset_scaled.tail())

Unnamed: 0,Open,High,Low,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,MACD,Signal,ADX,SlowK,SlowD,CCI,ATR,Tendencia
941,0.07,0.06,0.06,0.08,0.08,0.09,0.08,0.07,0.26,0.44,0.43,0.57,0.44,0.48,0.51,0.12,Lateral
942,0.06,0.06,0.06,0.08,0.08,0.09,0.08,0.06,0.24,0.44,0.43,0.57,0.46,0.47,0.51,0.11,Lateral
943,0.06,0.06,0.06,0.08,0.08,0.09,0.08,0.06,0.25,0.44,0.43,0.58,0.46,0.47,0.46,0.11,Lateral
944,0.06,0.06,0.06,0.07,0.08,0.08,0.07,0.07,0.24,0.44,0.43,0.59,0.44,0.47,0.48,0.11,Lateral
945,0.06,0.06,0.06,0.07,0.08,0.07,0.07,0.07,0.21,0.44,0.43,0.6,0.32,0.42,0.41,0.1,Bajista


In [255]:
# (rows, columns) of the unscaled training frame, after the last 10 rows were held out.
classifier_dataset.shape

(946, 17)

In [270]:
# Connect to a running H2O instance, or start one; every h2o.* call below needs it.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "18.0.2" 2022-07-19; OpenJDK Runtime Environment Corretto-18.0.2.9.1 (build 18.0.2+9-FR); OpenJDK 64-Bit Server VM Corretto-18.0.2.9.1 (build 18.0.2+9-FR, mixed mode, sharing)
  Starting server from /Users/mmarchetta/Desktop/Tesis-2024/myenv/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/ls/f33jscqd6sb6bd857yb2j2s00000gp/T/tmpzo9g6s5h
  JVM stdout: /var/folders/ls/f33jscqd6sb6bd857yb2j2s00000gp/T/tmpzo9g6s5h/h2o_mmarchetta_started_from_python.out
  JVM stderr: /var/folders/ls/f33jscqd6sb6bd857yb2j2s00000gp/T/tmpzo9g6s5h/h2o_mmarchetta_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/Argentina/Buenos_Aires
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,2 months and 18 days
H2O_cluster_name:,H2O_from_python_mmarchetta_36tjal
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [257]:
# Train on the normalized frame. Unscaled alternative, kept for reference:
# h2o_df = h2o.H2OFrame(classifier_dataset)
h2o_df = h2o.H2OFrame(classifier_dataset_scaled)

# Configurations used for earlier well-performing versions, kept for reference:
#   1st: H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, seed=1234)
#   2nd: H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, stopping_rounds=5, exploitation_ratio=0.1, seed=12345)
#   3rd: H2OAutoML(nfolds=10, keep_cross_validation_predictions=True, stopping_rounds=5, exploitation_ratio=0.1, seed=12345)
#   4th: H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, stopping_rounds=5, exploitation_ratio=0.4, seed=12345, max_runtime_secs= 60 * 60 * 4)
#   5th: H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, stopping_rounds=7, exploitation_ratio=0.5, seed=3579, max_runtime_secs= 60 * 60 * 4)

# Current run: same settings as the 5th configuration, with the time budget halved.
aml = H2OAutoML(
    nfolds=5,
    keep_cross_validation_predictions=True,
    stopping_rounds=7,
    exploitation_ratio=0.5,
    seed=3579,
    max_runtime_secs=2 * 60 * 60,  # 2 hours
)

# 'Tendencia' is the target; every other column of h2o_df is a predictor.
aml.train(y='Tendencia', training_frame=h2o_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |██████████████████████████████████████████████████████████████Failed polling AutoML progress log: Unexpected HTTP error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
█| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),22/193
# GBM base models (used / total),3/48
# XGBoost base models (used / total),1/42
# DeepLearning base models (used / total),16/100
# GLM base models (used / total),0/1
# DRF base models (used / total),2/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
301.0,0.0,12.0,0.0383387,12 / 313
0.0,304.0,16.0,0.05,16 / 320
18.0,16.0,279.0,0.1086262,34 / 313
319.0,320.0,307.0,0.0655391,62 / 946

k,hit_ratio
1,0.9344609
2,0.9989429
3,1.0

Alcista,Bajista,Lateral,Error,Rate
269.0,0.0,44.0,0.1405751,44 / 313
2.0,271.0,47.0,0.153125,49 / 320
23.0,30.0,260.0,0.1693291,53 / 313
294.0,301.0,351.0,0.154334,146 / 946

k,hit_ratio
1,0.845666
2,0.9904863
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8436298,0.018207,0.8440367,0.8148148,0.8418079,0.8629442,0.8545455
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.1563702,0.018207,0.1559633,0.1851852,0.1581921,0.1370558,0.1454545
err_count,29.6,4.7222877,34.0,35.0,28.0,27.0,24.0
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logloss,0.3856268,0.0328646,0.4003512,0.4358551,0.3659491,0.3538105,0.3721681
max_per_class_error,0.1888391,0.0093398,0.1805556,0.203125,0.1857143,0.1818182,0.1929824
mean_per_class_accuracy,0.8458271,0.0190316,0.8464459,0.8149508,0.8448038,0.8630148,0.8599202
mean_per_class_error,0.1541729,0.0190316,0.1535541,0.1850492,0.1551962,0.1369852,0.1400798


In [258]:
# Keep a handle on the AutoML leaderboard and print it as plain text.
leaderboard = aml.leaderboard
print(leaderboard)

model_id                                                  mean_per_class_error    logloss      rmse       mse
StackedEnsemble_AllModels_6_AutoML_6_20240531_90057                   0.154343   0.387349  0.345597  0.119437
StackedEnsemble_AllModels_5_AutoML_6_20240531_90057                   0.161914   0.432502  0.355491  0.126374
StackedEnsemble_BestOfFamily_5_AutoML_6_20240531_90057                0.165964   0.429218  0.36121   0.130473
StackedEnsemble_Best500_1_AutoML_6_20240531_90057                     0.166936   0.494226  0.408262  0.166678
StackedEnsemble_BestOfFamily_7_AutoML_6_20240531_90057                0.167053   0.414821  0.361183  0.130453
StackedEnsemble_AllModels_4_AutoML_6_20240531_90057                   0.172238   0.49341   0.408126  0.166567
StackedEnsemble_BestOfFamily_6_AutoML_6_20240531_90057                0.172587   0.470572  0.369243  0.13634
DeepLearning_grid_2_AutoML_6_20240531_90057_model_6                   0.174507   0.45733   0.362078  0.131101
StackedEnse

### Guardado de los mejores 5 modelos

In [259]:
# Persist the first five leaderboard entries as automl_classifier/modelo_<position>.h2o,
# where <position> is the 0-based leaderboard position.
top_models = aml.leaderboard.as_data_frame()['model_id'].head(5).tolist()
for position, model_id in enumerate(top_models):
    model = h2o.get_model(model_id)
    h2o.save_model(
        model=model,
        path="automl_classifier",
        filename=f"modelo_{position}.h2o",
    )


### Cargo los mejores 5 modelos

In [271]:
# Reload the five saved models from disk; list index matches the file's
# modelo_<index>.h2o suffix.
stack_models = [
    h2o.load_model(f"automl_classifier/modelo_{position}.h2o")
    for position in range(5)
]

### Hago las predicciones

In [272]:
# Display the summary of the first loaded model (the one used for all predictions below).
stack_models[0]

key,value
Stacking strategy,cross_validation
Number of base models (used / total),22/193
# GBM base models (used / total),3/48
# XGBoost base models (used / total),1/42
# DeepLearning base models (used / total),16/100
# GLM base models (used / total),0/1
# DRF base models (used / total),2/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
301.0,0.0,12.0,0.0383387,12 / 313
0.0,304.0,16.0,0.05,16 / 320
18.0,16.0,279.0,0.1086262,34 / 313
319.0,320.0,307.0,0.0655391,62 / 946

k,hit_ratio
1,0.9344609
2,0.9989429
3,1.0

Alcista,Bajista,Lateral,Error,Rate
269.0,0.0,44.0,0.1405751,44 / 313
2.0,271.0,47.0,0.153125,49 / 320
23.0,30.0,260.0,0.1693291,53 / 313
294.0,301.0,351.0,0.154334,146 / 946

k,hit_ratio
1,0.845666
2,0.9904863
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8436298,0.018207,0.8440367,0.8148148,0.8418079,0.8629442,0.8545455
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.1563702,0.018207,0.1559633,0.1851852,0.1581921,0.1370558,0.1454545
err_count,29.6,4.7222877,34.0,35.0,28.0,27.0,24.0
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logloss,0.3856268,0.0328646,0.4003512,0.4358551,0.3659491,0.3538105,0.3721681
max_per_class_error,0.1888391,0.0093398,0.1805556,0.203125,0.1857143,0.1818182,0.1929824
mean_per_class_accuracy,0.8458271,0.0190316,0.8464459,0.8149508,0.8448038,0.8630148,0.8599202
mean_per_class_error,0.1541729,0.0190316,0.1535541,0.1850492,0.1551962,0.1369852,0.1400798


In [273]:
# Score the held-out validation rows with the first loaded model.
h2o_validation_df = h2o.H2OFrame(clasifier_validation_scaled) # normalized features
# Unscaled alternative, kept for reference:
# h2o_validation_df = h2o.H2OFrame(clasifier_validation[columns])
predictions = stack_models[0].predict(h2o_validation_df)
display(predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.000357426,0.974979,0.0246634
Alcista,0.999249,1.35546e-05,0.000737651
Alcista,0.996999,5.57835e-05,0.00294507
Lateral,0.0506909,0.136195,0.813114
Bajista,0.00915654,0.715203,0.27564
Alcista,0.929271,0.00217856,0.0685509
Bajista,0.000498965,0.975984,0.0235173
Bajista,0.0154018,0.719416,0.265182
Lateral,0.0796881,0.190568,0.729744
Lateral,0.189357,0.0624044,0.748238


In [284]:
# Classify the features stored in auto_ml_feature_predictor/result.csv.
auto_ml_feature_predictor_df = pd.read_csv('auto_ml_feature_predictor/result.csv')
features = auto_ml_feature_predictor_df[columns]

# Scale with the scaler fitted on the training rows and restore the column names.
features_scaled = pd.DataFrame(scaler.transform(features), columns=columns)

h2o_df = h2o.H2OFrame(features_scaled)
predictions = stack_models[0].predict(h2o_df)
display(predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Lateral,0.0484864,0.0530321,0.898482
Lateral,0.045042,0.159067,0.795891
Lateral,0.0231921,0.476538,0.50027
Lateral,0.0289259,0.389557,0.581517
Lateral,0.0240347,0.442678,0.533287
Bajista,0.0149193,0.584752,0.400329
Bajista,0.0083545,0.736738,0.254907
Bajista,0.00466949,0.819665,0.175665
Bajista,0.00300791,0.867133,0.129859
Bajista,0.00282485,0.870924,0.126251


#### Datos generados con Auto-ts

In [311]:
# Classify the features stored in auto_ts_models/result.csv.
auto_ts_feature_predictor_df = pd.read_csv('auto_ts_models/result.csv')
features_auto_ts = auto_ts_feature_predictor_df[columns]

# Scale with the scaler fitted on the training rows and restore the column names.
features_scaled_ts = pd.DataFrame(scaler.transform(features_auto_ts), columns=columns)

h2o_df_ts = h2o.H2OFrame(features_scaled_ts)
predictions_ts = stack_models[0].predict(h2o_df_ts)
display(predictions_ts)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Lateral,0.156613,0.029374,0.814013
Alcista,0.998899,1.25782e-05,0.00108807
Alcista,0.997279,3.80001e-05,0.00268287
Lateral,0.146333,0.065293,0.788374
Lateral,0.0218992,0.350668,0.627433
Alcista,0.877139,0.00399404,0.118867
Bajista,0.00047517,0.948934,0.0505913
Bajista,0.0014128,0.889618,0.10897
Lateral,0.0290624,0.236606,0.734332
Lateral,0.27688,0.0545219,0.668598


#### Datos generados con AutoTS

In [310]:
# Classify the features stored in autoTS_models/result.csv.
auto_ts_feature_predictor_df = pd.read_csv('autoTS_models/result.csv')
features_auto_ts = auto_ts_feature_predictor_df[columns]

# Scale with the scaler fitted on the training rows and restore the column names.
features_scaled_ts = pd.DataFrame(scaler.transform(features_auto_ts), columns=columns)

h2o_df_ts = h2o.H2OFrame(features_scaled_ts)
predictions_ts = stack_models[0].predict(h2o_df_ts)
display(predictions_ts)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Lateral,0.0293324,0.336671,0.633996
Bajista,0.00771399,0.79338,0.198906
Bajista,0.00010656,0.990306,0.00958753
Bajista,9.93812e-06,0.998305,0.00168464
Bajista,5.69738e-06,0.998954,0.00104072
Bajista,6.36987e-07,0.999869,0.000130276
Bajista,5.14835e-07,0.99985,0.000149802
Bajista,5.29412e-09,0.999996,3.63384e-06
Bajista,5.87096e-09,0.999997,2.9144e-06
Bajista,4.96973e-09,0.999998,2.44697e-06


##### Datos generados por auto ml con prophet:

In [None]:
# NOTE(review): this cell is fully commented out; any output stored under it is
# presumably from an earlier run — confirm, then re-enable the cell or clear the output.

# Variant without normalization (uses stack_models[2]):
# auto_ml_prophet_df = pd.read_csv('auto_timeseries_models_prophet/predicciones.csv')
# h2o_prophet_df = h2o.H2OFrame(auto_ml_prophet_df[columns])
# auto_mp_prophet_predictions = stack_models[2].predict(h2o_prophet_df)

# display(auto_mp_prophet_predictions)

# Variant with normalization (uses stack_models[0]):
# auto_ml_prophet_df = pd.read_csv('auto_timeseries_models_prophet/predicciones.csv')
# features_prophet = auto_ml_prophet_df[columns]
# features_prophet_scaled = scaler.transform(features_prophet)
# auto_ml_prophet_df_scaled = pd.DataFrame(features_prophet_scaled, columns=columns)
# h2o_prophet_df = h2o.H2OFrame(auto_ml_prophet_df_scaled)
# auto_mp_prophet_predictions = stack_models[0].predict(h2o_prophet_df)

# display(auto_mp_prophet_predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.0401515,0.893763,0.0660852
Bajista,0.0701171,0.875681,0.0542021
Bajista,0.0592252,0.860789,0.0799853
Bajista,0.111972,0.789698,0.09833
Bajista,0.204723,0.660427,0.13485
Bajista,0.261489,0.567229,0.171282
Bajista,0.28871,0.58417,0.12712
Bajista,0.274176,0.585812,0.140012
Bajista,0.256394,0.662884,0.0807219
Bajista,0.248306,0.609311,0.142383


##### Datos generados por auto ml con stats:

In [None]:
# NOTE(review): this cell is fully commented out; any output stored under it is
# presumably from an earlier run — confirm, then re-enable the cell or clear the output.

# Variant without normalization (uses stack_models[2]):
# auto_ml_stats_df = pd.read_csv('auto_timeseries_models/predicciones.csv')
# h2o_stats_df = h2o.H2OFrame(auto_ml_stats_df[columns])
# auto_mp_stats_predictions = stack_models[2].predict(h2o_stats_df)

# display(auto_mp_stats_predictions)

# Variant with normalization (uses stack_models[0]):
# auto_ml_stats_df = pd.read_csv('auto_timeseries_models/predicciones.csv')
# features_stats = auto_ml_stats_df[columns]
# features_stats_scaled = scaler.transform(features_stats)
# auto_ml_stats_df_scaled = pd.DataFrame(features_stats_scaled, columns=columns)
# h2o_stats_df = h2o.H2OFrame(auto_ml_stats_df_scaled)
# auto_mp_stats_predictions = stack_models[0].predict(h2o_stats_df)

# display(auto_mp_stats_predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Lateral,0.267739,0.0521153,0.680145
Lateral,0.268078,0.102013,0.629909
Lateral,0.143153,0.0360006,0.820847
Lateral,0.079663,0.0235718,0.896765
Lateral,0.220724,0.171886,0.60739
Lateral,0.254443,0.116495,0.629062
Bajista,0.262876,0.589797,0.147326
Bajista,0.157521,0.74479,0.0976893
Bajista,0.0890769,0.794788,0.116136
Bajista,0.093831,0.748432,0.157737


##### Datos generados por auto ml con modelos clasicos:

In [None]:
# NOTE(review): this cell is fully commented out; any output stored under it is
# presumably from an earlier run — confirm, then re-enable the cell or clear the output.

# Variant without normalization (uses stack_models[2]):
# # auto_ml_df = pd.read_csv('h2o_models/predicciones.csv')
# # h2o_mp_df = h2o.H2OFrame(auto_ml_df[columns])
# # auto_mp_predictions = stack_models[2].predict(h2o_mp_df)

# # display(auto_mp_predictions)

# Variant with normalization (uses stack_models[0]):
# auto_ml_df = pd.read_csv('h2o_models/predicciones.csv')
# features_ml = auto_ml_df[columns]
# # Validation values for the first bullish day: 6.42	6.94	6.13	6.90	175570.00	327.00	340.00
# # NOTE(review): several overrides below name columns that are not in `columns`
# # ('Number of trades', 'Buy_1000x_high', 'sell_1000x_high').
# # features_ml['Open'] = 6.42
# # features_ml['High'] = 6.94
# # features_ml['Low'] = 6.13
# # features_ml['Number of trades'] = 175570
# # features_ml['EMA_20'] = 6.85
# # features_ml['Upper_Band'] = 7.41
# # features_ml['Middle_Band'] = 7.85
# # features_ml['Buy_1000x_high'] = 327
# # features_ml['sell_1000x_high'] = 340
# # features_ml['RSI'] = 43.30

# features_ml_scaled = scaler.transform(features_ml)
# auto_ml_df_scaled = pd.DataFrame(features_ml_scaled, columns=columns)
# h2o_mp_df = h2o.H2OFrame(auto_ml_df_scaled)
# auto_mp_predictions = stack_models[0].predict(h2o_mp_df)

# display(auto_mp_predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Bajista,0.430944,0.497427,0.071629
Bajista,0.348946,0.557738,0.0933162
Bajista,0.343622,0.557817,0.0985615
Alcista,0.493165,0.416127,0.0907088
Alcista,0.501951,0.381661,0.116389
Alcista,0.627487,0.27633,0.0961836
Alcista,0.562607,0.324686,0.112707
Alcista,0.534127,0.342642,0.123232
Alcista,0.625069,0.259448,0.115483
Alcista,0.674359,0.212157,0.113484


##### Datos generados con skforecast:

In [None]:
# NOTE(review): this cell is fully commented out; any output stored under it is
# presumably from an earlier run — confirm, then re-enable the cell or clear the output.

# Variant without normalization (uses stack_models[2]):
# # skforecast_df = pd.read_csv('skforecast/predicciones.csv')
# # skforecast_df = h2o.H2OFrame(skforecast_df)
# # skforecast_predictions = stack_models[2].predict(skforecast_df)

# # display(skforecast_predictions)

# Variant with normalization (uses stack_models[0]):
# skforecast_df = pd.read_csv('skforecast/predicciones.csv')
# features_skforecast = skforecast_df[columns]
# features_skforecast_scaled = scaler.transform(features_skforecast)
# skforecast_df_scaled = pd.DataFrame(features_skforecast_scaled, columns=columns)
# h2o_skforecast_df = h2o.H2OFrame(skforecast_df_scaled)
# skforecast_predictions = stack_models[0].predict(h2o_skforecast_df)

# display(skforecast_predictions)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Lateral,0.306651,0.159797,0.533552
Bajista,0.169592,0.659044,0.171364
Bajista,0.102157,0.738161,0.159681
Bajista,0.085701,0.747509,0.16679
Bajista,0.061623,0.773839,0.164538
Bajista,0.0462372,0.510455,0.443308
Lateral,0.0445384,0.405089,0.550373
Lateral,0.0275597,0.0284618,0.943978
Lateral,0.0166629,0.0245155,0.958822
Lateral,0.0137068,0.010096,0.976197


In [None]:
# stack_models[0].explain(h2o_validation_df)

### Feature importance

In [None]:
# NOTE(review): this cell is fully commented out. If it is re-enabled, move the
# import to the imports cell and replace the bare `except:` with a specific
# exception so unrelated errors are not hidden.
# from matplotlib import pyplot as plt


# ensemble_model = stack_models[0]

# # Get the base models of the ensemble
# base_models_ids = ensemble_model.base_models

# # Plot the feature importance of each base model
# for model_id in base_models_ids:
#     base_model = h2o.get_model(model_id)
#     try:
#         # Build and show the feature-importance plot
#         base_model.varimp_plot()
#         plt.show()
#     except:
#         print(f"No se puede generar la importancia de características para el modelo {model_id} de tipo {base_model.algo}")
