### Dataset

In [10]:
# Data handling and feature scaling.
import pandas as pd
import warnings
from sklearn.preprocessing import MinMaxScaler

# Silence every Python warning for the rest of the session. The filter is
# installed before h2o is imported, so warnings raised during that import are
# suppressed as well.
# NOTE(review): this also hides pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

# H2O client and its AutoML front end.
import h2o
from h2o.automl import H2OAutoML

In [11]:
# Show floats with two fixed decimals so that columns (e.g. in the predictions)
# are not rendered in scientific notation.
pd.options.display.float_format = '{:.2f}'.format
# Never hide columns when displaying wide frames.
pd.options.display.max_columns = None

In [12]:
# Feature columns used by the classifier, in the order they are selected from
# the dataset. Two candidates are intentionally disabled:
#   - 'Open_time'
#   - 'MFI' (it is computed from the volume)
columns = [
    'Open', 'High', 'Low',
    'SMA_20', 'EMA_20',
    'Upper_Band', 'Middle_Band', 'Lower_Band',
    'RSI',
    'MACD', 'Signal',
    'ADX',
    'SlowK', 'SlowD',
    'CCI',
    'ATR',
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

In [13]:
# Load the full dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
complete_dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/automation/final_dataset.csv')

# Keep the feature columns plus the 'Tendencia' label. Selecting both in one
# step and taking an explicit copy gives an independent frame, instead of
# assigning a new column onto a slice of `complete_dataset` (chained
# assignment, which pandas reports as SettingWithCopyWarning).
classifier_dataset = complete_dataset[columns + ['Tendencia']].copy()

In [14]:
# NORMALISATION
# Fit a min-max scaler on the feature columns. The fitted `scaler` stays at
# module level because a later cell reuses it to transform new data.
scaler = MinMaxScaler()  # alternative: StandardScaler()
features = classifier_dataset[columns]
features_scaled = scaler.fit_transform(features)

# Rebuild a labelled frame: the scaled features plus the untouched label.
# The label is attached as a plain array (positionally), since the new frame
# is created with a fresh default index.
classifier_dataset_scaled = pd.DataFrame(features_scaled, columns=columns).assign(
    Tendencia=classifier_dataset['Tendencia'].to_numpy()
)

In [15]:
# Preview the last five rows of the scaled frame (features + label).
display(classifier_dataset_scaled.tail(n=5))

Unnamed: 0,Open,High,Low,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,MACD,Signal,ADX,SlowK,SlowD,CCI,ATR,Tendencia
984,0.07,0.07,0.05,0.08,0.07,0.07,0.08,0.09,0.26,0.49,0.5,0.18,0.37,0.37,0.25,0.07,Bajista
985,0.06,0.06,0.06,0.08,0.07,0.07,0.08,0.09,0.18,0.49,0.5,0.22,0.28,0.35,0.22,0.07,Bajista
986,0.05,0.06,0.06,0.07,0.07,0.07,0.07,0.08,0.25,0.48,0.49,0.26,0.25,0.3,0.27,0.06,Alcista
987,0.06,0.06,0.06,0.07,0.07,0.07,0.07,0.08,0.27,0.49,0.5,0.22,0.22,0.25,0.32,0.06,Lateral
988,0.06,0.06,0.06,0.07,0.07,0.07,0.07,0.08,0.28,0.49,0.5,0.33,0.22,0.25,0.32,0.06,Lateral


In [16]:
# Sanity check: (rows, columns) of the unscaled classifier frame.
classifier_dataset.shape

(989, 17)

In [17]:
# Connect to the H2O cluster with the default settings.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 mins 20 secs
H2O_cluster_timezone:,America/Argentina/Buenos_Aires
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,"28 days, 17 hours and 37 minutes"
H2O_cluster_name:,H2O_from_python_mmarchetta_t3i2mq
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [18]:
# Training uses the NORMALISED frame; the unscaled variant is kept for reference:
# h2o_df = h2o.H2OFrame(classifier_dataset)
h2o_df = h2o.H2OFrame(classifier_dataset_scaled)

# Previous AutoML configurations, kept as a record of the well-performing versions:
# 1st: aml = H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, seed=1234)
# 2nd: aml = H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, stopping_rounds=5, exploitation_ratio=0.1, seed=12345)
# 3rd: aml = H2OAutoML(nfolds=10, keep_cross_validation_predictions=True, stopping_rounds=5, exploitation_ratio=0.1, seed=12345)
# 4th: aml = H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, stopping_rounds=5, exploitation_ratio=0.4, seed=12345, max_runtime_secs= 60 * 60 * 4)
# 5th: aml = H2OAutoML(nfolds=5, keep_cross_validation_predictions=True, stopping_rounds=7, exploitation_ratio=0.5, seed=3579, max_runtime_secs= 60 * 60 * 4)

# Current configuration: same as the 5th one, with the time budget halved to two hours.
aml = H2OAutoML(
    nfolds=5,
    keep_cross_validation_predictions=True,
    stopping_rounds=7,
    exploitation_ratio=0.5,
    seed=3579,
    max_runtime_secs=60 * 60 * 2,
)

# 'Tendencia' is the target; every other column of the frame is a predictor.
aml.train(y='Tendencia', training_frame=h2o_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |██████████████████████████████████████████████████████████████Failed polling AutoML progress log: Unexpected HTTP error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
█| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),18/182
# GBM base models (used / total),2/47
# XGBoost base models (used / total),1/42
# DeepLearning base models (used / total),14/90
# GLM base models (used / total),0/1
# DRF base models (used / total),1/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
300.0,0.0,23.0,0.0712074,23 / 323
0.0,319.0,15.0,0.0449102,15 / 334
37.0,26.0,269.0,0.189759,63 / 332
337.0,345.0,307.0,0.1021234,101 / 989

k,hit_ratio
1,0.8978766
2,0.9989889
3,1.0

Alcista,Bajista,Lateral,Error,Rate
272.0,0.0,51.0,0.1578947,51 / 323
2.0,286.0,46.0,0.1437126,48 / 334
32.0,33.0,267.0,0.1957831,65 / 332
306.0,319.0,364.0,0.1658241,164 / 989

k,hit_ratio
1,0.8341759
2,0.9848332
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8347514,0.0141611,0.8398268,0.814433,0.8404255,0.8514851,0.8275862
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.1652487,0.0141611,0.1601732,0.185567,0.1595745,0.1485149,0.1724138
err_count,32.6,3.5777087,37.0,36.0,30.0,30.0,30.0
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logloss,0.4022592,0.0494142,0.4036882,0.4859816,0.3640401,0.3689556,0.3886306
max_per_class_error,0.202879,0.030523,0.1794872,0.234375,0.1833333,0.1791045,0.2380952
mean_per_class_accuracy,0.8368457,0.0143598,0.8411313,0.8139204,0.8432933,0.8520244,0.833859
mean_per_class_error,0.1631543,0.0143598,0.1588687,0.1860796,0.1567067,0.1479755,0.166141


In [19]:
# Leaderboard of the models produced by the AutoML run, printed as plain text.
leaderboard = aml.leaderboard
print(leaderboard)

model_id                                                  mean_per_class_error    logloss      rmse       mse
StackedEnsemble_AllModels_6_AutoML_1_20240611_90644                   0.165797   0.404234  0.355176  0.12615
StackedEnsemble_BestOfFamily_7_AutoML_1_20240611_90644                0.172819   0.421146  0.364376  0.13277
StackedEnsemble_AllModels_5_AutoML_1_20240611_90644                   0.173747   0.446354  0.362891  0.13169
DeepLearning_grid_2_AutoML_1_20240611_90644_model_6                   0.174845   0.445724  0.359713  0.129394
StackedEnsemble_AllModels_4_AutoML_1_20240611_90644                   0.174953   0.509724  0.416052  0.173099
StackedEnsemble_BestOfFamily_6_AutoML_1_20240611_90644                0.175709   0.458469  0.367978  0.135408
StackedEnsemble_BestOfFamily_5_AutoML_1_20240611_90644                0.175841   0.423882  0.361767  0.130875
StackedEnsemble_Best500_1_AutoML_1_20240611_90644                     0.181941   0.5116    0.417078  0.173954
DeepLearning_

### Guardado de los mejores 5 modelos

In [23]:
# Save the first five leaderboard models as automl_classifier/modelo_<i>.h2o,
# where <i> is the model's position in the leaderboard (0 = leader).
# Only the first five ids are converted to a list, rather than the whole column.
top_models = aml.leaderboard.as_data_frame()['model_id'].head(5).tolist()
for i, model_id in enumerate(top_models):
    model = h2o.get_model(model_id)
    file_name = f"modelo_{i}.h2o"
    # Plain string for the directory: the original f-string had no placeholders.
    h2o.save_model(model=model, path="automl_classifier", filename=file_name)


### Cargo los mejores 5 modelos

In [24]:
# Reload the five saved models in index order (modelo_0 ... modelo_4), so that
# stack_models[k] is the model stored as automl_classifier/modelo_k.h2o.
stack_models = [
    h2o.load_model(f"automl_classifier/modelo_{position}.h2o")
    for position in range(5)
]

#### Datos generados con Auto-ts

In [25]:
# Show the summary of the first reloaded model (index 0).
stack_models[0]

key,value
Stacking strategy,cross_validation
Number of base models (used / total),18/182
# GBM base models (used / total),2/47
# XGBoost base models (used / total),1/42
# DeepLearning base models (used / total),14/90
# GLM base models (used / total),0/1
# DRF base models (used / total),1/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
300.0,0.0,23.0,0.0712074,23 / 323
0.0,319.0,15.0,0.0449102,15 / 334
37.0,26.0,269.0,0.189759,63 / 332
337.0,345.0,307.0,0.1021234,101 / 989

k,hit_ratio
1,0.8978766
2,0.9989889
3,1.0

Alcista,Bajista,Lateral,Error,Rate
272.0,0.0,51.0,0.1578947,51 / 323
2.0,286.0,46.0,0.1437126,48 / 334
32.0,33.0,267.0,0.1957831,65 / 332
306.0,319.0,364.0,0.1658241,164 / 989

k,hit_ratio
1,0.8341759
2,0.9848332
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8347514,0.0141611,0.8398268,0.814433,0.8404255,0.8514851,0.8275862
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.1652487,0.0141611,0.1601732,0.185567,0.1595745,0.1485149,0.1724138
err_count,32.6,3.5777087,37.0,36.0,30.0,30.0,30.0
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logloss,0.4022592,0.0494142,0.4036882,0.4859816,0.3640401,0.3689556,0.3886306
max_per_class_error,0.202879,0.030523,0.1794872,0.234375,0.1833333,0.1791045,0.2380952
mean_per_class_accuracy,0.8368457,0.0143598,0.8411313,0.8139204,0.8432933,0.8520244,0.833859
mean_per_class_error,0.1631543,0.0143598,0.1588687,0.1860796,0.1567067,0.1479755,0.166141


In [None]:
# Read the feature values stored in auto_ts_models/result.csv and keep only
# the columns the classifier was trained on.
auto_ts_feature_predictor_df = pd.read_csv('auto_ts_models/result.csv')
features_auto_ts = auto_ts_feature_predictor_df[columns]

# Apply the scaler fitted on the training data (transform only, no re-fit)
# and wrap the result back into a frame with the same column names.
features_scaled_ts = pd.DataFrame(
    scaler.transform(features_auto_ts),
    columns=columns,
)

# Predict with the first reloaded model.
h2o_df_ts = h2o.H2OFrame(features_scaled_ts)
predictions_ts = stack_models[0].predict(h2o_df_ts)

display(predictions_ts)

### Feature importance

In [None]:
# NOTE(review): this entire cell is disabled (commented-out) code, kept for reference.
# from matplotlib import pyplot as plt


# ensemble_model = stack_models[0]

# # Get the base models of the ensemble
# base_models_ids = ensemble_model.base_models

# # Plot the feature importance of each base model
# for model_id in base_models_ids:
#     base_model = h2o.get_model(model_id)
#     try:
#         # Generate and show the feature-importance plot
#         base_model.varimp_plot()
#         plt.show()
#     # NOTE(review): if this cell is re-enabled, narrow the bare `except:` below.
#     except:
#         print(f"No se puede generar la importancia de características para el modelo {model_id} de tipo {base_model.algo}")
