### Dataset

In [30]:
import pandas as pd
import warnings
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): with no category given, this filter hides every warning raised
# from this point on (pandas chained-assignment warnings and library
# deprecation notices included), so problems will not show up in cell output.
warnings.filterwarnings('ignore')

import h2o
from h2o.automl import H2OAutoML

In [31]:
# Render floats with two fixed decimals so that prediction columns are not
# shown in scientific notation, and never truncate the column list of wide
# frames when they are displayed.
def _two_decimals(value):
    """Format a number with exactly two decimal places."""
    return '%.2f' % value


pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', _two_decimals)

In [32]:
# Feature columns selected from the dataset, in the order they are used.
# 'Open_time' is deliberately left out, and so is 'MFI' because it is
# computed from the volume.
columns = [
    'Open', 'High', 'Low',
    'SMA_20', 'EMA_20',
    'Upper_Band', 'Middle_Band', 'Lower_Band',
    'RSI', 'MACD', 'Signal', 'ADX',
    'SlowK', 'SlowD', 'CCI', 'ATR',
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

In [33]:
# Load the complete dataset and keep only the model features plus the target.
# NOTE(review): this is an absolute, machine-specific path; point DATASET_PATH
# at the local copy of final_dataset.csv before running on another machine.
DATASET_PATH = '/Users/mmarchetta/Desktop/Tesis-2024/automation/final_dataset.csv'
TARGET_COLUMN = 'Tendencia'

complete_dataset = pd.read_csv(DATASET_PATH)

# Select features and target in a single step and take an explicit copy, so
# `classifier_dataset` is an independent frame: adding the target to a
# column-subset afterwards is what triggers pandas' SettingWithCopyWarning.
classifier_dataset = complete_dataset[columns + [TARGET_COLUMN]].copy()

In [34]:
# Normalisation: rescale the feature columns with MinMaxScaler
# (StandardScaler was the alternative noted here). The fitted `scaler` is
# reused further down to transform the Auto-ts features.
scaler = MinMaxScaler()
features = classifier_dataset[columns]
features_scaled = scaler.fit_transform(features)

# Rebuild a labelled frame from the scaled array and attach the target values
# by position (`.values` drops the original index).
classifier_dataset_scaled = pd.DataFrame(features_scaled, columns=columns).assign(
    Tendencia=classifier_dataset['Tendencia'].values
)

In [35]:
# Show the last rows of the normalised dataset as a quick sanity check.
last_rows_scaled = classifier_dataset_scaled.tail()
display(last_rows_scaled)

Unnamed: 0,Open,High,Low,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,MACD,Signal,ADX,SlowK,SlowD,CCI,ATR,Tendencia
983,0.07,0.07,0.07,0.08,0.08,0.07,0.08,0.09,0.43,0.5,0.5,0.13,0.38,0.32,0.5,0.05,Lateral
984,0.07,0.07,0.05,0.08,0.07,0.07,0.08,0.09,0.26,0.49,0.5,0.18,0.37,0.37,0.25,0.07,Bajista
985,0.06,0.06,0.06,0.08,0.07,0.07,0.08,0.09,0.18,0.49,0.5,0.22,0.28,0.35,0.22,0.07,Bajista
986,0.05,0.06,0.06,0.07,0.07,0.07,0.07,0.08,0.25,0.48,0.49,0.26,0.25,0.3,0.27,0.06,Alcista
987,0.06,0.06,0.06,0.07,0.07,0.07,0.07,0.08,0.27,0.49,0.5,0.22,0.22,0.25,0.32,0.06,Lateral


In [36]:
# (rows, columns) of the unscaled dataset: the selected features plus the target.
classifier_dataset.shape

(988, 17)

In [37]:
# Attach this session to an H2O cluster; the recorded output shows it
# connecting to an instance already running at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 hours 39 mins
H2O_cluster_timezone:,America/Argentina/Buenos_Aires
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,29 days
H2O_cluster_name:,H2O_from_python_mmarchetta_2j4nwx
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.589 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [38]:
# Train on the normalised frame. (The unscaled alternative would be
# h2o.H2OFrame(classifier_dataset).)
h2o_df = h2o.H2OFrame(classifier_dataset_scaled)

# Settings of the earlier well-performing runs, kept for reference
# (all of them with keep_cross_validation_predictions=True):
#   v1: nfolds=5,  seed=1234
#   v2: nfolds=5,  stopping_rounds=5, exploitation_ratio=0.1, seed=12345
#   v3: nfolds=10, stopping_rounds=5, exploitation_ratio=0.1, seed=12345
#   v4: nfolds=5,  stopping_rounds=5, exploitation_ratio=0.4, seed=12345, max_runtime_secs=60 * 60 * 4
#   v5: nfolds=5,  stopping_rounds=7, exploitation_ratio=0.5, seed=3579,  max_runtime_secs=60 * 60 * 4
automl_settings = dict(
    nfolds=5,
    keep_cross_validation_predictions=True,
    stopping_rounds=7,
    exploitation_ratio=0.5,
    seed=3581,
    max_runtime_secs=2 * 60 * 60,  # two hours
)
aml = H2OAutoML(**automl_settings)

# Kept as the last expression so the notebook displays the leader model summary.
aml.train(y='Tendencia', training_frame=h2o_df)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),164/164
# GBM base models (used / total),58/58
# XGBoost base models (used / total),35/35
# DeepLearning base models (used / total),68/68
# GLM base models (used / total),1/1
# DRF base models (used / total),2/2
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
302.0,0.0,21.0,0.0650155,21 / 323
1.0,314.0,19.0,0.0598802,20 / 334
33.0,33.0,265.0,0.1993958,66 / 331
336.0,347.0,305.0,0.1082996,107 / 988

k,hit_ratio
1,0.8917004
2,0.9949393
3,1.0

Alcista,Bajista,Lateral,Error,Rate
270.0,2.0,51.0,0.1640867,53 / 323
2.0,291.0,41.0,0.1287425,43 / 334
47.0,39.0,245.0,0.2598187,86 / 331
319.0,332.0,337.0,0.1842105,182 / 988

k,hit_ratio
1,0.8157895
2,0.9767206
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.817322,0.0330456,0.7705628,0.8608248,0.8085107,0.8316832,0.8150289
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.182678,0.0330456,0.2294372,0.1391753,0.1914894,0.1683168,0.1849711
err_count,36.4,9.864077,53.0,27.0,36.0,34.0,32.0
loglikelihood,,0.0,,,,,
logloss,0.5022785,0.1044344,0.6760428,0.4068696,0.5086065,0.4411812,0.4786923
max_per_class_error,0.2521577,0.0560379,0.3333333,0.1904762,0.2181818,0.2380952,0.2807018
mean_per_class_accuracy,0.8180832,0.0267718,0.784567,0.8568493,0.8066378,0.8279104,0.8144514
mean_per_class_error,0.1819168,0.0267718,0.215433,0.1431507,0.1933622,0.1720896,0.1855486


In [45]:
# Leaderboard of the models produced by the AutoML run, printed as plain text.
leaderboard = aml.leaderboard
print(leaderboard)

model_id                                                   mean_per_class_error    logloss      rmse       mse
StackedEnsemble_AllModels_5_AutoML_2_20240611_183012                   0.184216   0.508754  0.387262  0.149972
StackedEnsemble_BestOfFamily_6_AutoML_2_20240611_183012                0.190286   0.479129  0.39041   0.15242
StackedEnsemble_BestOfFamily_4_AutoML_2_20240611_183012                0.193112   0.50438   0.38864   0.151041
DeepLearning_grid_1_AutoML_2_20240611_183012_model_18                  0.194391   0.495173  0.391684  0.153416
StackedEnsemble_BestOfFamily_5_AutoML_2_20240611_183012                0.195239   0.537432  0.400705  0.160564
StackedEnsemble_AllModels_4_AutoML_2_20240611_183012                   0.204749   0.595369  0.457733  0.20952
StackedEnsemble_AllModels_3_AutoML_2_20240611_183012                   0.210801   0.595262  0.457355  0.209174
DeepLearning_grid_1_AutoML_2_20240611_183012_model_17                  0.238742   0.710916  0.434513  0.188802
Sta

### Guardado de los mejores 5 modelos

In [40]:
# Persist the first five models of the leaderboard as
# automl_classifier/modelo_0.h2o ... automl_classifier/modelo_4.h2o.
top_models = aml.leaderboard.as_data_frame()['model_id'].tolist()[:5]
for i, model_id in enumerate(top_models):
    model = h2o.get_model(model_id)
    file_name = f"modelo_{i}.h2o"
    # force=True overwrites a file left behind by an earlier run; without it
    # h2o.save_model raises when the target already exists, so this cell could
    # not be executed a second time.
    h2o.save_model(
        model=model,
        path="automl_classifier",
        filename=file_name,
        force=True,
    )


### Carga de los mejores 5 modelos

In [41]:
# Reload the five saved models (modelo_0.h2o ... modelo_4.h2o), in that order.
stack_models = [
    h2o.load_model(f"automl_classifier/modelo_{idx}.h2o")
    for idx in range(5)
]

#### Datos generados con Auto-ts

In [42]:
# Display the summary of the first reloaded model (modelo_0.h2o).
stack_models[0]

key,value
Stacking strategy,cross_validation
Number of base models (used / total),164/164
# GBM base models (used / total),58/58
# XGBoost base models (used / total),35/35
# DeepLearning base models (used / total),68/68
# GLM base models (used / total),1/1
# DRF base models (used / total),2/2
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Alcista,Bajista,Lateral,Error,Rate
302.0,0.0,21.0,0.0650155,21 / 323
1.0,314.0,19.0,0.0598802,20 / 334
33.0,33.0,265.0,0.1993958,66 / 331
336.0,347.0,305.0,0.1082996,107 / 988

k,hit_ratio
1,0.8917004
2,0.9949393
3,1.0

Alcista,Bajista,Lateral,Error,Rate
270.0,2.0,51.0,0.1640867,53 / 323
2.0,291.0,41.0,0.1287425,43 / 334
47.0,39.0,245.0,0.2598187,86 / 331
319.0,332.0,337.0,0.1842105,182 / 988

k,hit_ratio
1,0.8157895
2,0.9767206
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.817322,0.0330456,0.7705628,0.8608248,0.8085107,0.8316832,0.8150289
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.182678,0.0330456,0.2294372,0.1391753,0.1914894,0.1683168,0.1849711
err_count,36.4,9.864077,53.0,27.0,36.0,34.0,32.0
loglikelihood,,0.0,,,,,
logloss,0.5022785,0.1044344,0.6760428,0.4068696,0.5086065,0.4411812,0.4786923
max_per_class_error,0.2521577,0.0560379,0.3333333,0.1904762,0.2181818,0.2380952,0.2807018
mean_per_class_accuracy,0.8180832,0.0267718,0.784567,0.8568493,0.8066378,0.8279104,0.8144514
mean_per_class_error,0.1819168,0.0267718,0.215433,0.1431507,0.1933622,0.1720896,0.1855486


In [46]:
# Score the features stored in auto_ts_models/result.csv with the first
# reloaded model.
auto_ts_feature_predictor_df = pd.read_csv('auto_ts_models/result.csv')
features_auto_ts = auto_ts_feature_predictor_df[columns]

# transform (not fit_transform): apply the scaling already fitted on the
# classifier dataset instead of refitting it on these rows.
features_scaled_ts = pd.DataFrame(
    scaler.transform(features_auto_ts),
    columns=columns,
)

h2o_df_ts = h2o.H2OFrame(features_scaled_ts)
predictions_ts = stack_models[0].predict(h2o_df_ts)

display(predictions_ts)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Alcista,Bajista,Lateral
Lateral,0.0338992,0.12949,0.83661
Bajista,0.0117434,0.777559,0.210697
Bajista,0.00642568,0.942643,0.0509311
Bajista,0.00657081,0.952551,0.0408785
Bajista,0.00753954,0.970638,0.0218225
Bajista,0.0070825,0.976577,0.0163405
Bajista,0.00700297,0.977514,0.0154833
Bajista,0.00607627,0.973401,0.0205228
Bajista,0.00513425,0.974937,0.0199289
Bajista,0.00514983,0.9637,0.0311498


### Feature importance

In [44]:
# NOTE(review): this whole cell is disabled (every line is commented out). It
# sketches how to plot the variable importance of each base model of an
# ensemble. If it is re-enabled, replace the bare `except:` with a specific
# exception type so unrelated errors are not swallowed.

# from matplotlib import pyplot as plt


# ensemble_model = stack_models[1]

# # Get the base models of the ensemble
# base_models_ids = ensemble_model.base_models

# # Plot the feature importance of every base model
# for model_id in base_models_ids:
#     base_model = h2o.get_model(model_id)
#     try:
#         # Build and show the feature-importance plot
#         base_model.varimp_plot()
#         plt.show()
#     except:
#         print(f"No se puede generar la importancia de características para el modelo {model_id} de tipo {base_model.algo}")
