### Dataset

In [3]:
import pandas as pd
from auto_ts import auto_timeseries
import dill
import talib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization
from keras.regularizers import l2
from keras.optimizers import Adam, RMSprop, SGD
from scikeras.wrappers import KerasRegressor
from keras.callbacks import EarlyStopping
from skopt import BayesSearchCV
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.backend as K
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from keras.losses import categorical_crossentropy
from sklearn.metrics import log_loss
from keras.utils import to_categorical
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold

2024-05-26 11:05:45.065345: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Evito que ciertas columnas se transformen a notacion cientifica en las predicciones
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)

In [5]:
columns = [
    'Open_time',
    'Open',
    'High',
    'Low',
    # 'Close',
    'Number of trades',
    'Close_BTCUSDT',
    'Volume_BTCUSDT',
    'Number_of_trades_BTCUSDT',
    'Close_ETHUSDT',
    'Volume_ETHUSDT',
    'Number_of_trades_ETHUSDT',
    'Close_BNBUSDT',
    'Volume_BNBUSDT',
    'Number_of_trades_BNBUSDT',
    'SMA_20',
    'EMA_20',
    'Upper_Band',
    'Middle_Band',
    'Lower_Band',
    'RSI',
    'buy_1000x_high_coinbase',
    'sell_1000x_high_coinbase',
    'total_trades_coinbase',	
    'Tweets_Utilizados',
    'Tweets_Utilizados_coin',
    'Tweets_Utilizados_referentes',
    'Tweets_Utilizados_whale_alert',
    'Buy_1000x_high',
    'sell_1000x_high',
    'total_trades_binance'
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

#### Modelo Light GBM

In [6]:
complete_dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv') 
classifier_dataset = complete_dataset[columns]
classifier_dataset['Open_time'] = pd.to_datetime(classifier_dataset['Open_time'])
classifier_dataset['Tendencia'] = complete_dataset['Tendencia']

clasifier_validation = classifier_dataset[-10:]
classifier_dataset = classifier_dataset[:-10]

In [7]:
display(classifier_dataset.tail())

Unnamed: 0,Open_time,Open,High,Low,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance,Tendencia
941,2024-04-25,6.93,7.0,6.7,71088.0,64498.34,31341.46,1375324.0,3155.8,352288.55,861077.0,613.2,453745.52,353114.0,7.43,7.45,9.08,7.43,5.77,38.83,21.0,26.0,33468.0,151,114,0.0,22.0,242.0,219.0,48000.0,Lateral
942,2024-04-26,6.86,6.95,6.71,67383.0,63770.01,27085.19,1025561.0,3131.3,252522.65,628635.0,598.0,302119.88,269508.0,7.34,7.38,8.94,7.34,5.74,37.81,29.0,24.0,26619.0,117,106,0.0,14.0,292.0,324.0,42000.0,Lateral
943,2024-04-27,6.76,6.87,6.51,64779.0,63461.98,20933.06,912422.0,3255.56,323811.19,734026.0,596.2,268783.91,233820.0,7.24,7.33,8.73,7.24,5.76,38.57,17.0,17.0,25565.0,101,138,0.0,7.0,248.0,179.0,41000.0,Lateral
944,2024-04-28,6.81,6.95,6.69,43208.0,63118.62,16949.2,790652.0,3263.45,304766.01,753239.0,600.2,258059.43,206703.0,7.13,7.27,8.38,7.13,5.88,37.66,16.0,20.0,20954.0,82,106,0.0,13.0,173.0,165.0,26000.0,Lateral
945,2024-04-29,6.73,6.83,6.47,63006.0,63866.0,28150.23,1152296.0,3216.73,421831.29,943719.0,592.8,330474.01,271926.0,7.03,7.2,8.08,7.03,5.97,36.02,69.0,37.0,33959.0,115,125,0.0,24.0,260.0,188.0,41000.0,Bajista


In [8]:
classifier_dataset.shape

(946, 31)

In [9]:
X = classifier_dataset.drop(columns=["Tendencia", "Open_time"])
y = classifier_dataset["Tendencia"]

y = y.to_numpy().reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False)
y_one_hot = onehot_encoder.fit_transform(y)


In [10]:
display(y_one_hot)

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [11]:
classifier = LGBMClassifier(objective='multiclass', num_class=3, metric='multi_logloss', verbosity=-1)

# Definir el espacio de búsqueda de hiperparámetros
param_space = {
    'num_leaves': (10, 500),
    'learning_rate': (0.001, 0.5), 
    'n_estimators': (50, 500),
    'min_child_samples': (5, 200),
    'subsample': (0.5, 1.0),  
    'colsample_bytree': (0.5, 1.0),
    'n_iter': (50, 500),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'objective': ['multiclass'],
    'metric': ['multi_logloss', 'multi_error'],
    'importance_type': ['gain', 'split'],
    'boosting_type': ['gbdt', 'dart', 'rf'],
}
# Definir función de pérdida para la búsqueda bayesiana
def multi_log_loss(estimator, X_test, y_test):
    y_pred = estimator.predict_proba(X_test)
    loss = log_loss(y_test, y_pred)
    return loss

# Definir validación cruzada temporal
cv = TimeSeriesSplit(n_splits=10).split(X)
# stratified_cv = StratifiedKFold(n_splits=10, shuffle=False)

# Realizar la búsqueda bayesiana de hiperparámetros
bayes_search = BayesSearchCV(
    classifier, 
    param_space, 
    scoring=multi_log_loss, 
    # cv=cv
    # cv=TimeSeriesSplit(n_splits=10),
    # cv=stratified_cv,
    # n_iter=100,
    verbose=0,
    n_jobs=-1
)

# Realizar la búsqueda bayesiana
bayes_result = bayes_search.fit(X, y)

In [179]:
# Show best results
print("Best score:", bayes_result.best_score_)
print("Best parameters:", bayes_result.best_params_)

# Entrenar el modelo con los mejores hiperparámetros
best_model = bayes_result.best_estimator_
best_model.fit(X, y)

Best score: 6.885260735808174
Best parameters: OrderedDict([('boosting_type', 'gbdt'), ('colsample_bytree', 0.5), ('importance_type', 'gain'), ('learning_rate', 0.21563682141704843), ('metric', 'multi_logloss'), ('min_child_samples', 36), ('n_estimators', 329), ('n_iter', 500), ('num_leaves', 464), ('objective', 'multiclass'), ('reg_alpha', 0.0), ('reg_lambda', 0.0), ('subsample', 0.7400565273683408)])


In [180]:
import json

# Obtener los hiperparámetros y puntajes de los 5 mejores modelos
top_n_models = 5
best_params_list = []
best_scores_list = []

for i in range(min(top_n_models, len(bayes_search.cv_results_['params']))):
    best_params_list.append(bayes_search.cv_results_['params'][i])
    best_scores_list.append(bayes_search.cv_results_['mean_test_score'][i])

# Guardar los hiperparámetros de los 5 mejores modelos en un archivo JSON
with open('gbm_classifier/top_5_hyperparameters.json', 'w') as f:
    json.dump({'best_params': best_params_list, 'best_scores': best_scores_list}, f)

# O imprimir los hiperparámetros
print("Top 5 mejores modelos:")
for i in range(len(best_params_list)):
    print("Modelo", i+1)
    print("Hiperparámetros:", best_params_list[i])
    print("Puntaje:", best_scores_list[i])


Top 5 mejores modelos:
Modelo 1
Hiperparámetros: OrderedDict([('boosting_type', 'rf'), ('colsample_bytree', 0.6640058726815146), ('importance_type', 'gain'), ('learning_rate', 0.4323494224455862), ('metric', 'multi_logloss'), ('min_child_samples', 23), ('n_estimators', 316), ('n_iter', 104), ('num_leaves', 365), ('objective', 'multiclass'), ('reg_alpha', 0.58120052568765), ('reg_lambda', 0.26916188313538064), ('subsample', 0.6368865124311849)])
Puntaje: 1.2861887738563345
Modelo 2
Hiperparámetros: OrderedDict([('boosting_type', 'gbdt'), ('colsample_bytree', 0.6698998613002191), ('importance_type', 'gain'), ('learning_rate', 0.3285155778390284), ('metric', 'multi_error'), ('min_child_samples', 122), ('n_estimators', 408), ('n_iter', 173), ('num_leaves', 285), ('objective', 'multiclass'), ('reg_alpha', 0.22689605408891694), ('reg_lambda', 0.7042148383733584), ('subsample', 0.5357531123324053)])
Puntaje: 2.163352437711942
Modelo 3
Hiperparámetros: OrderedDict([('boosting_type', 'rf'), ('c

#### Armado del ensamble

In [181]:
# Crear una lista para almacenar los modelos individuales
individual_models = []

# Entrenar los modelos individuales
for params_str in best_params_list:
    model = LGBMClassifier(verbosity=-1, **params_str)
    model.fit(X, y)
    individual_models.append(model)

voting_model = VotingClassifier(estimators=[('model_'+str(i), model) for i, model in enumerate(individual_models)], voting='hard', verbose=0)#voting='soft', verbose=0)
voting_model.fit(X, y)

with open('gbm_classifier/gbm_boosting_classifier.pkl', 'wb') as f:
    dill.dump(voting_model, f)

#### Clasificacion con el ensamble sobre las redicciones de los modelos generativos

In [12]:
with open('gbm_classifier/gbm_boosting_classifier.pkl', 'rb') as f:
    voting_model = dill.load(f)

In [13]:
voting_model.fit(X, y)

##### Datos originales:

In [14]:
display(clasifier_validation)

Unnamed: 0,Open_time,Open,High,Low,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance,Tendencia
946,2024-04-30,6.59,6.67,6.04,115512.0,60672.0,54947.66,1985671.0,3014.05,561717.49,1292873.0,578.4,766513.45,486465.0,6.93,7.13,7.8,6.93,6.05,34.18,51.0,55.0,48709.0,142,187,1.0,23.0,379.0,377.0,70000.0,Bajista
947,2024-05-01,6.42,6.93,6.13,175570.0,58364.97,81166.47,2401089.0,2972.46,624963.78,1365039.0,561.8,669027.32,427425.0,6.85,7.11,7.41,6.85,6.29,43.3,42.0,50.0,83718.0,130,177,0.0,36.0,327.0,340.0,107000.0,Alcista
948,2024-05-02,6.9,7.41,6.69,109002.0,59060.61,47583.82,1572898.0,2986.19,365939.72,880167.0,560.5,359794.32,250921.0,6.85,7.12,7.42,6.85,6.29,49.27,87.0,57.0,61208.0,461,374,1.0,25.0,270.0,282.0,71000.0,Alcista
949,2024-05-03,7.27,7.39,7.0,109634.0,62882.01,43628.4,1558661.0,3102.61,355825.84,859542.0,587.0,342906.43,257575.0,6.9,7.14,7.44,6.9,6.35,48.86,52.0,40.0,46255.0,573,474,1.0,22.0,386.0,635.0,69000.0,Lateral
950,2024-05-04,7.24,7.28,7.11,71120.0,63892.04,24368.69,1113509.0,3117.23,196263.95,575026.0,585.7,197129.25,210303.0,6.91,7.13,7.46,6.91,6.36,46.98,68.0,50.0,34251.0,407,472,0.0,14.0,203.0,232.0,49000.0,Bajista
951,2024-05-05,7.12,7.4,7.01,72928.0,64012.0,18526.75,992921.0,3136.41,218760.27,600693.0,592.0,180458.24,180794.0,6.94,7.15,7.51,6.94,6.37,50.0,37.0,52.0,29197.0,417,499,0.0,6.0,320.0,284.0,47000.0,Alcista
952,2024-05-06,7.3,7.47,7.06,94264.0,63165.19,34674.92,1392557.0,3062.6,355135.3,873200.0,588.2,278669.01,248490.0,6.96,7.15,7.53,6.96,6.39,47.1,49.0,71.0,40027.0,482,531,0.0,25.0,339.0,249.0,59000.0,Bajista
953,2024-05-07,7.12,7.29,6.98,64947.0,62312.08,25598.79,1272898.0,3005.69,298796.68,815246.0,576.5,289488.71,266127.0,6.98,7.13,7.52,6.98,6.44,45.1,21.0,25.0,31028.0,495,494,0.0,28.0,296.0,205.0,42000.0,Bajista
954,2024-05-08,6.99,7.19,6.89,75550.0,61193.03,26121.19,1415152.0,2974.21,266934.81,830635.0,588.6,297016.62,249379.0,6.99,7.12,7.52,6.99,6.46,44.94,17.0,24.0,32040.0,426,494,0.0,24.0,230.0,177.0,49000.0,Lateral
955,2024-05-09,6.98,7.09,6.78,75016.0,63074.01,30660.81,1381957.0,3036.23,238561.75,686147.0,596.8,464857.6,332988.0,7.01,7.11,7.52,7.01,6.5,46.32,18.0,17.0,29314.0,475,464,0.0,16.0,188.0,257.0,50000.0,Lateral


In [16]:
n_clases = 3 

validation_predictions = voting_model.predict(clasifier_validation[columns].drop(columns=["Open_time"]))
display(validation_predictions)

array(['Bajista', 'Alcista', 'Alcista', 'Alcista', 'Alcista', 'Alcista',
       'Bajista', 'Alcista', 'Alcista', 'Alcista'], dtype=object)

##### Datos generados por auto ml con prophet:

In [17]:
auto_ml_prophet_df = pd.read_csv('auto_timeseries_models_prophet/predicciones.csv')
auto_mp_prophet_predictions = voting_model.predict(auto_ml_prophet_df[columns].drop(columns=["Open_time"]))
display(auto_mp_prophet_predictions)

array(['Lateral', 'Lateral', 'Alcista', 'Alcista', 'Lateral', 'Alcista',
       'Alcista', 'Alcista', 'Alcista', 'Alcista'], dtype=object)

##### Datos generados por auto ml con stats:

In [19]:
auto_ml_stats_df = pd.read_csv('auto_timeseries_models/predicciones.csv')
auto_mp_stats_predictions = voting_model.predict(auto_ml_stats_df[columns].drop(columns=["Open_time"]))

display(auto_mp_stats_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista',
       'Bajista', 'Bajista', 'Bajista', 'Bajista'], dtype=object)

##### Datos generados por auto ml con modelos clasicos:

In [20]:
auto_ml_df = pd.read_csv('h2o_models/predicciones.csv')
auto_mp_predictions = voting_model.predict(auto_ml_df[columns].drop(columns=["Open_time"]))
display(auto_mp_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista',
       'Bajista', 'Bajista', 'Bajista', 'Bajista'], dtype=object)

##### Datos generados por auto ml con skforecast:

In [24]:
skforecast_df = pd.read_csv('skforecast/predicciones.csv')
skforecast_predictions = voting_model.predict(skforecast_df[columns[1:]])
display(skforecast_predictions)

array(['Bajista', 'Lateral', 'Alcista', 'Alcista', 'Alcista', 'Alcista',
       'Alcista', 'Alcista', 'Alcista', 'Alcista'], dtype=object)