### Dataset

In [1]:
import pandas as pd
from auto_ts import auto_timeseries
import dill
import talib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from skopt import BayesSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

Imported auto_timeseries version:0.0.90. Call by using:
model = auto_timeseries(score_type='rmse',
        time_interval='M', non_seasonal_pdq=None, seasonality=False,
        seasonal_period=12, model_type=['best'], verbose=2, dask_xgboost_flag=0)
model.fit(traindata, ts_column,target)
model.predict(testdata, model='best')



In [2]:
# Evito que ciertas columnas se transformen a notacion cientifica en las predicciones
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)

In [3]:
columns = [
    # 'Open_time',
    # 'Close',
    'Open',
    'High',
    'Low',
    'SMA_20',
    'EMA_20',
    'Upper_Band',
    'Middle_Band',
    'Lower_Band',
    'RSI',
    'MACD',
    'Signal',
    'ADX',
    'SlowK',
    'SlowD',
    'CCI',
    'ATR',
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

#### Modelo Light GBM

In [4]:
complete_dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv') 
classifier_dataset = complete_dataset[columns]
# classifier_dataset['Open_time'] = pd.to_datetime(classifier_dataset['Open_time'])
classifier_dataset['Tendencia'] = complete_dataset['Tendencia']

clasifier_validation = classifier_dataset[-10:]
classifier_dataset = classifier_dataset[:-10]

In [5]:
display(classifier_dataset.tail())

Unnamed: 0,Open,High,Low,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,MACD,Signal,ADX,SlowK,SlowD,CCI,ATR,Tendencia
941,6.93,7.0,6.7,7.43,7.45,9.08,7.43,5.77,38.83,-0.48,-0.53,33.89,45.65,47.62,-11.66,0.6,Lateral
942,6.86,6.95,6.71,7.34,7.38,8.94,7.34,5.74,37.81,-0.48,-0.52,34.06,47.38,47.3,-14.45,0.57,Lateral
943,6.76,6.87,6.51,7.24,7.33,8.73,7.24,5.76,38.57,-0.47,-0.51,34.51,47.54,46.86,-44.81,0.56,Lateral
944,6.81,6.95,6.69,7.13,7.27,8.38,7.13,5.88,37.66,-0.46,-0.5,34.64,45.17,46.7,-30.25,0.53,Lateral
945,6.73,6.83,6.47,7.03,7.2,8.08,7.03,5.97,36.02,-0.46,-0.49,35.1,34.26,42.32,-73.58,0.52,Bajista


In [6]:
classifier_dataset.shape

(946, 17)

In [7]:
X = classifier_dataset.drop(columns=["Tendencia"])
y = classifier_dataset["Tendencia"]

y = y.to_numpy().reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False)
y_one_hot = onehot_encoder.fit_transform(y)


In [8]:
scalers = {}
for column in X.columns:
    scaler = StandardScaler()
    X[[column]] = scaler.fit_transform(X[[column]])
    clasifier_validation[[column]] = scaler.transform(clasifier_validation[[column]])
    scalers[column] = scaler

In [9]:
display(y_one_hot)

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [10]:
classifier = LGBMClassifier(objective='multiclass', num_class=3, metric='multi_logloss', verbosity=-1)

# Definir el espacio de búsqueda de hiperparámetros
param_space = {
    'num_leaves': (10, 500),
    'max_depth': (3, 20),
    'learning_rate': (0.001, 0.5), 
    'n_estimators': (50, 2000),
    'min_child_samples': (5, 200),
    'subsample': (0.5, 1.0),  
    'colsample_bytree': (0.5, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'objective': ['multiclass'],
    'metric': ['multi_logloss', 'multi_error'],
    'importance_type': ['gain', 'split'],
    'boosting_type': ['gbdt', 'dart']#, 'rf'],
}
# Definir función de pérdida para la búsqueda bayesiana
def multi_log_loss(estimator, X_test, y_test):
    y_pred = estimator.predict_proba(X_test)
    loss = log_loss(y_test, y_pred)
    return loss

# Definir validación cruzada temporal
cv = TimeSeriesSplit(n_splits=5).split(X)
# stratified_cv = StratifiedKFold(n_splits=10, shuffle=False)

# Realizar la búsqueda bayesiana de hiperparámetros
bayes_search = BayesSearchCV(
    classifier, 
    param_space, 
    scoring='neg_log_loss', 
    cv=cv,
    # cv=TimeSeriesSplit(n_splits=10),
    # cv=stratified_cv,
    n_iter=500,
    verbose=0,
    n_jobs=-1
)

# Realizar la búsqueda bayesiana
bayes_result = bayes_search.fit(X, y)

In [None]:
# Show best results
print("Best score:", bayes_result.best_score_)
print("Best parameters:", bayes_result.best_params_)

# Entrenar el modelo con los mejores hiperparámetros
best_model = bayes_result.best_estimator_
best_model.fit(X, y)

Best score: 3.458719420188401
Best parameters: OrderedDict([('boosting_type', 'gbdt'), ('colsample_bytree', 1.0), ('importance_type', 'gain'), ('learning_rate', 0.5), ('metric', 'multi_logloss'), ('min_child_samples', 48), ('n_estimators', 340), ('n_iter', 500), ('num_leaves', 95), ('objective', 'multiclass'), ('reg_alpha', 0.0), ('reg_lambda', 0.0), ('subsample', 0.8933012014094083)])


In [None]:
import json

# Obtener los hiperparámetros y puntajes de los 5 mejores modelos
top_n_models = 5
best_params_list = []
best_scores_list = []

for i in range(min(top_n_models, len(bayes_search.cv_results_['params']))):
    best_params_list.append(bayes_search.cv_results_['params'][i])
    best_scores_list.append(bayes_search.cv_results_['mean_test_score'][i])

# Guardar los hiperparámetros de los 5 mejores modelos en un archivo JSON
with open('gbm_classifier/top_5_hyperparameters.json', 'w') as f:
    json.dump({'best_params': best_params_list, 'best_scores': best_scores_list}, f)

# O imprimir los hiperparámetros
print("Top 5 mejores modelos:")
for i in range(len(best_params_list)):
    print("Modelo", i+1)
    print("Hiperparámetros:", best_params_list[i])
    print("Puntaje:", best_scores_list[i])


Top 5 mejores modelos:
Modelo 1
Hiperparámetros: OrderedDict([('boosting_type', 'dart'), ('colsample_bytree', 0.8941882029766676), ('importance_type', 'gain'), ('learning_rate', 0.019882422470529012), ('metric', 'multi_error'), ('min_child_samples', 181), ('n_estimators', 418), ('n_iter', 295), ('num_leaves', 175), ('objective', 'multiclass'), ('reg_alpha', 0.6706462037686927), ('reg_lambda', 0.8272135833591839), ('subsample', 0.7914163215472552)])
Puntaje: 1.0693452836731776
Modelo 2
Hiperparámetros: OrderedDict([('boosting_type', 'gbdt'), ('colsample_bytree', 0.5589694270943428), ('importance_type', 'split'), ('learning_rate', 0.2174955782444395), ('metric', 'multi_error'), ('min_child_samples', 94), ('n_estimators', 125), ('n_iter', 94), ('num_leaves', 117), ('objective', 'multiclass'), ('reg_alpha', 0.21624249202880214), ('reg_lambda', 0.4104174649836443), ('subsample', 0.9810203155226598)])
Puntaje: 1.0427473702060328
Modelo 3
Hiperparámetros: OrderedDict([('boosting_type', 'dart'

#### Armado del ensamble

In [None]:
# Crear una lista para almacenar los modelos individuales
individual_models = []

# Entrenar los modelos individuales
for params_str in best_params_list:
    model = LGBMClassifier(verbosity=-1, **params_str)
    model.fit(X, y)
    individual_models.append(model)

voting_model = VotingClassifier(estimators=[('model_'+str(i), model) for i, model in enumerate(individual_models)], voting='hard', verbose=0)#voting='soft', verbose=0)
voting_model.fit(X, y)

with open('gbm_classifier/gbm_boosting_classifier.pkl', 'wb') as f:
    dill.dump(voting_model, f)

#### Clasificacion con el ensamble sobre las redicciones de los modelos generativos

In [None]:
with open('gbm_classifier/gbm_boosting_classifier.pkl', 'rb') as f:
    voting_model = dill.load(f)

In [None]:
voting_model.fit(X, y)

##### Datos originales:

In [None]:
display(clasifier_validation)

Unnamed: 0,Open,High,Low,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,MACD,Signal,ADX,SlowK,SlowD,CCI,ATR,Tendencia
946,-0.46,-0.48,-0.5,-0.45,-0.43,-0.45,-0.45,-0.44,-1.13,-0.3,-0.34,1.07,-0.7,-0.4,-1.13,-0.38,Bajista
947,-0.48,-0.45,-0.49,-0.46,-0.43,-0.48,-0.46,-0.41,-0.36,-0.26,-0.33,1.07,-0.43,-0.54,-0.46,-0.36,Alcista
948,-0.43,-0.41,-0.43,-0.46,-0.43,-0.48,-0.46,-0.41,0.14,-0.2,-0.31,0.93,0.29,-0.29,0.66,-0.35,Alcista
949,-0.4,-0.41,-0.4,-0.45,-0.43,-0.48,-0.45,-0.4,0.1,-0.14,-0.28,0.8,0.96,0.28,0.76,-0.37,Lateral
950,-0.4,-0.42,-0.39,-0.45,-0.43,-0.48,-0.45,-0.4,-0.05,-0.11,-0.25,0.68,1.15,0.82,0.62,-0.39,Bajista
951,-0.41,-0.41,-0.4,-0.45,-0.43,-0.48,-0.45,-0.4,0.2,-0.06,-0.21,0.53,1.17,1.13,0.75,-0.4,Alcista
952,-0.39,-0.4,-0.39,-0.45,-0.43,-0.47,-0.45,-0.4,-0.04,-0.04,-0.18,0.37,1.09,1.17,0.76,-0.41,Bajista
953,-0.41,-0.42,-0.4,-0.44,-0.43,-0.48,-0.44,-0.39,-0.21,-0.03,-0.15,0.24,1.03,1.13,0.51,-0.42,Bajista
954,-0.42,-0.43,-0.41,-0.44,-0.43,-0.47,-0.44,-0.39,-0.22,-0.02,-0.12,0.14,0.83,1.01,0.36,-0.44,Lateral
955,-0.43,-0.44,-0.42,-0.44,-0.43,-0.47,-0.44,-0.39,-0.11,-0.0,-0.1,0.08,0.81,0.91,0.21,-0.45,Lateral


In [None]:
n_clases = 3 

validation_predictions = voting_model.predict(clasifier_validation[columns])
display(validation_predictions)

array(['Bajista', 'Alcista', 'Alcista', 'Lateral', 'Bajista', 'Alcista',
       'Bajista', 'Bajista', 'Bajista', 'Bajista'], dtype=object)

##### Datos generados por auto ml con Auto-TS:

In [None]:
auto_ts_feature_predictor_df = pd.read_csv('auto_ts_models/result.csv')
features_auto_ts = auto_ts_feature_predictor_df[columns]

for column in features_auto_ts.columns:
    scaler = scalers[column]
    features_auto_ts[[column]] = scaler.transform(features_auto_ts[[column]])


predictions_ts = voting_model.predict(features_auto_ts)

display(predictions_ts)

array(['Bajista', 'Alcista', 'Alcista', 'Alcista', 'Alcista', 'Bajista',
       'Bajista', 'Bajista', 'Bajista', 'Alcista'], dtype=object)

##### Datos generados por auto ml con prophet:

In [None]:
auto_ml_prophet_df = pd.read_csv('auto_timeseries_models_prophet/predicciones.csv')
auto_mp_prophet_predictions = voting_model.predict(auto_ml_prophet_df[columns].drop(columns=["Open_time"]))
display(auto_mp_prophet_predictions)

KeyError: "['MACD', 'Signal', 'ADX', 'SlowK', 'SlowD', 'CCI', 'ATR'] not in index"

##### Datos generados por auto ml con stats:

In [None]:
auto_ml_stats_df = pd.read_csv('auto_timeseries_models/predicciones.csv')
auto_mp_stats_predictions = voting_model.predict(auto_ml_stats_df[columns].drop(columns=["Open_time"]))

display(auto_mp_stats_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista',
       'Bajista', 'Bajista', 'Bajista', 'Bajista'], dtype=object)

##### Datos generados por auto ml con modelos clasicos:

In [None]:
auto_ml_df = pd.read_csv('h2o_models/predicciones.csv')
auto_mp_predictions = voting_model.predict(auto_ml_df[columns].drop(columns=["Open_time"]))
display(auto_mp_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista',
       'Bajista', 'Bajista', 'Bajista', 'Bajista'], dtype=object)

##### Datos generados por auto ml con skforecast:

In [None]:
skforecast_df = pd.read_csv('skforecast/predicciones.csv')
skforecast_predictions = voting_model.predict(skforecast_df[columns[1:]])
display(skforecast_predictions)

array(['Bajista', 'Lateral', 'Alcista', 'Alcista', 'Alcista', 'Alcista',
       'Alcista', 'Alcista', 'Alcista', 'Alcista'], dtype=object)