### Dataset

In [84]:
import pandas as pd
from auto_ts import auto_timeseries
import dill
import talib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization
from keras.regularizers import l2
from keras.optimizers import Adam, RMSprop, SGD
from scikeras.wrappers import KerasRegressor
from keras.callbacks import EarlyStopping
from skopt import BayesSearchCV
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.backend as K
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from keras.losses import categorical_crossentropy
from sklearn.metrics import log_loss
from keras.utils import to_categorical
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

In [85]:
# Evito que ciertas columnas se transformen a notacion cientifica en las predicciones
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)

In [86]:
columns = [
    'Open_time',
    'Open',
    'High',
    'Low',
    # 'Close',
    'Number of trades',
    'Close_BTCUSDT',
    'Volume_BTCUSDT',
    'Number_of_trades_BTCUSDT',
    'Close_ETHUSDT',
    'Volume_ETHUSDT',
    'Number_of_trades_ETHUSDT',
    'Close_BNBUSDT',
    'Volume_BNBUSDT',
    'Number_of_trades_BNBUSDT',
    'SMA_20',
    'EMA_20',
    'Upper_Band',
    'Middle_Band',
    'Lower_Band',
    'RSI',
    'buy_1000x_high_coinbase',
    'sell_1000x_high_coinbase',
    'total_trades_coinbase',	
    'Tweets_Utilizados',
    'Tweets_Utilizados_coin',
    'Tweets_Utilizados_referentes',
    'Tweets_Utilizados_whale_alert',
    'Buy_1000x_high',
    'sell_1000x_high',
    'total_trades_binance'
]

### Armado y entrenamiento de un clasificador a partir de los datos originales

#### Modelo Light GBM

In [87]:
complete_dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv') 
classifier_dataset = complete_dataset[columns]
classifier_dataset['Open_time'] = pd.to_datetime(classifier_dataset['Open_time'])
classifier_dataset['Tendencia'] = complete_dataset['Tendencia']

clasifier_validation = classifier_dataset[-5:]
classifier_dataset = classifier_dataset[:-5]

In [88]:
display(classifier_dataset.tail())

Unnamed: 0,Open_time,Open,High,Low,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance,Tendencia
898,2024-03-13,10.74,11.46,10.64,276468.0,73072.41,52659.71,2501197.0,4004.79,482305.78,1536498.0,630.5,2526002.56,1265237.0,9.45,9.7,11.82,9.45,7.07,73.43,64.0,81.0,92576.0,275,205,1.0,73.0,578.0,553.0,164000.0,Alcista
899,2024-03-14,11.37,11.89,10.68,536988.0,71388.94,71757.63,2994869.0,3881.7,648237.52,1919963.0,603.2,2119540.3,1038297.0,9.65,9.88,12.03,9.65,7.27,74.51,102.0,133.0,145727.0,211,181,4.0,29.0,754.0,677.0,327000.0,Alcista
900,2024-03-15,11.56,11.71,9.97,557152.0,69499.85,103334.04,3904445.0,3742.19,947537.41,2487337.0,632.7,3066312.79,1365283.0,9.8,9.97,12.07,9.8,7.52,63.27,88.0,83.0,147460.0,238,106,0.0,25.0,493.0,430.0,360000.0,Bajista
901,2024-03-16,10.81,10.9,9.5,330505.0,65300.63,55926.95,2729019.0,3523.09,548288.16,1798939.0,576.4,1811838.04,1025452.0,9.89,9.94,11.99,9.89,7.78,50.82,30.0,49.0,88095.0,670,471,0.0,20.0,513.0,403.0,209000.0,Bajista
902,2024-03-17,9.68,10.25,9.19,229683.0,68393.48,49742.22,2449156.0,3644.71,517790.99,1721355.0,571.7,1712920.34,802297.0,9.98,9.95,11.93,9.98,8.04,54.27,36.0,48.0,71390.0,693,413,0.0,21.0,295.0,277.0,150000.0,Alcista


In [89]:
classifier_dataset.shape

(903, 31)

In [90]:
X = classifier_dataset.drop(columns=["Tendencia", "Open_time"])
y = classifier_dataset["Tendencia"]

# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(classifier_dataset["Tendencia"])

y = y.to_numpy().reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False)
y_one_hot = onehot_encoder.fit_transform(y)


In [91]:
display(y_one_hot)

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [92]:
classifier = LGBMClassifier(objective='multiclass', num_class=3, metric='multi_logloss', verbosity=-1)

# Definir el espacio de búsqueda de hiperparámetros
param_space = {
    'num_leaves': (10, 300),
    'learning_rate': (0.01, 0.5), 
    'n_estimators': (50, 300),
    'min_child_samples': (5, 100),
    'subsample': (0.5, 1.0),  
    'colsample_bytree': (0.5, 1.0)
}
# Definir función de pérdida para la búsqueda bayesiana
def multi_log_loss(estimator, X_test, y_test):
    y_pred = estimator.predict_proba(X_test)
    loss = log_loss(y_test, y_pred)
    return loss

# Definir validación cruzada temporal
cv = TimeSeriesSplit(n_splits=10).split(X)

# Realizar la búsqueda bayesiana de hiperparámetros
bayes_search = BayesSearchCV(
    classifier, 
    param_space, 
    scoring=multi_log_loss, 
    cv=cv,
    n_iter=50,
    verbose=0
)

# Realizar la búsqueda bayesiana
bayes_result = bayes_search.fit(X, y)

In [93]:
# Show best results
print("Best score:", bayes_result.best_score_)
print("Best parameters:", bayes_result.best_params_)

# Entrenar el modelo con los mejores hiperparámetros
best_model = bayes_result.best_estimator_
best_model.fit(X, y)

Best score: 3.3233412813590393
Best parameters: OrderedDict([('colsample_bytree', 1.0), ('learning_rate', 0.5), ('min_child_samples', 24), ('n_estimators', 300), ('num_leaves', 10), ('subsample', 0.5)])


In [94]:
import json

# Obtener los hiperparámetros y puntajes de los 5 mejores modelos
top_n_models = 5
best_params_list = []
best_scores_list = []

for i in range(min(top_n_models, len(bayes_search.cv_results_['params']))):
    best_params_list.append(bayes_search.cv_results_['params'][i])
    best_scores_list.append(bayes_search.cv_results_['mean_test_score'][i])

# Guardar los hiperparámetros de los 5 mejores modelos en un archivo JSON
with open('gbm_classifier/top_5_hyperparameters.json', 'w') as f:
    json.dump({'best_params': best_params_list, 'best_scores': best_scores_list}, f)

# O imprimir los hiperparámetros
print("Top 5 mejores modelos:")
for i in range(len(best_params_list)):
    print("Modelo", i+1)
    print("Hiperparámetros:", best_params_list[i])
    print("Puntaje:", best_scores_list[i])


Top 5 mejores modelos:
Modelo 1
Hiperparámetros: OrderedDict([('colsample_bytree', 0.5885752220313429), ('learning_rate', 0.17082884484681185), ('min_child_samples', 6), ('n_estimators', 192), ('num_leaves', 231), ('subsample', 0.5085623038169307)])
Puntaje: 2.093090507673048
Modelo 2
Hiperparámetros: OrderedDict([('colsample_bytree', 0.7313732549225939), ('learning_rate', 0.47007092381868965), ('min_child_samples', 19), ('n_estimators', 291), ('num_leaves', 209), ('subsample', 0.9196262867394146)])
Puntaje: 2.847121139240301
Modelo 3
Hiperparámetros: OrderedDict([('colsample_bytree', 0.9392242353895535), ('learning_rate', 0.26428839588200526), ('min_child_samples', 13), ('n_estimators', 187), ('num_leaves', 32), ('subsample', 0.70993998152444)])
Puntaje: 2.541656866917237
Modelo 4
Hiperparámetros: OrderedDict([('colsample_bytree', 0.7804555919276301), ('learning_rate', 0.34732279373697456), ('min_child_samples', 54), ('n_estimators', 228), ('num_leaves', 145), ('subsample', 0.68558748

#### Armado del ensamble

In [95]:
# Crear una lista para almacenar los modelos individuales
individual_models = []

# Entrenar los modelos individuales
for params_str in best_params_list:
    model = LGBMClassifier(verbosity=-1, **params_str)
    model.fit(X, y)
    individual_models.append(model)

voting_model = VotingClassifier(estimators=[('model_'+str(i), model) for i, model in enumerate(individual_models)], voting='soft', verbose=0)
voting_model.fit(X, y)

with open('gbm_classifier/gbm_boosting_classifier.pkl', 'wb') as f:
    dill.dump(voting_model, f)

#### Clasificacion con el ensamble sobre las redicciones de los modelos generativos

In [96]:
with open('gbm_classifier/gbm_boosting_classifier.pkl', 'rb') as f:
    voting_model = dill.load(f)

In [97]:
voting_model.fit(X, y)

##### Datos originales:

In [98]:
display(clasifier_validation)

Unnamed: 0,Open_time,Open,High,Low,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance,Tendencia
903,2024-03-18,10.08,10.46,9.6,245319.0,67609.99,55691.08,2464515.0,3520.46,570901.29,1906387.0,555.4,2284301.81,994512.0,10.06,9.95,11.86,10.06,8.26,52.48,34.0,43.0,84706.0,696,471,0.0,43.0,343.0,228.0,154000.0,Bajista
904,2024-03-19,9.9,9.99,8.6,341363.0,61937.4,101005.32,3593832.0,3158.64,1049629.69,2647385.0,507.7,2551361.51,1213572.0,10.08,9.84,11.81,10.08,8.35,42.93,120.0,126.0,135180.0,961,509,1.0,56.0,534.0,433.0,221000.0,Bajista
905,2024-03-20,8.77,9.57,8.49,267797.0,67840.51,90420.59,3549793.0,3516.53,1207322.82,2987953.0,556.8,1425296.58,809335.0,10.14,9.8,11.68,10.14,8.6,49.21,185.0,117.0,112997.0,866,555,1.0,40.0,473.0,386.0,171000.0,Alcista
906,2024-03-21,9.48,9.58,9.07,156774.0,65501.27,53357.48,2388390.0,3492.85,602755.21,1791989.0,553.8,953921.37,563996.0,10.17,9.74,11.63,10.17,8.71,46.85,64.0,81.0,66543.0,692,533,0.0,24.0,350.0,290.0,101000.0,Bajista
907,2024-03-22,9.18,9.37,8.69,147578.0,63796.64,51482.38,2492881.0,3336.35,558848.89,1747756.0,553.8,1181298.51,712381.0,10.14,9.67,11.67,10.14,8.62,45.0,57.0,66.0,68616.0,681,546,0.0,41.0,252.0,206.0,92000.0,Bajista


In [99]:
n_clases = 3 

validation_predictions = voting_model.predict(clasifier_validation.drop(columns=["Open_time", "Tendencia"]))
display(validation_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Alcista', 'Alcista'],
      dtype=object)

##### Datos generados por auto ml con prophet:

In [101]:
auto_ml_prophet_df = pd.read_csv('auto_timeseries_models_prophet/predicciones.csv')
auto_mp_prophet_predictions = voting_model.predict(auto_ml_prophet_df.drop(columns=["Open_time", "Close"]))
display(auto_mp_prophet_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista'],
      dtype=object)

##### Datos generados por auto ml con stats:

In [102]:
auto_ml_stats_df = pd.read_csv('auto_timeseries_models/predicciones.csv')
auto_mp_stats_predictions = voting_model.predict(auto_ml_stats_df.drop(columns=["Open_time", "Close"]))

display(auto_mp_stats_predictions)

array(['Bajista', 'Bajista', 'Bajista', 'Bajista', 'Bajista'],
      dtype=object)

##### Datos generados por auto ml con modelos clasicos:

In [103]:
auto_ml_df = pd.read_csv('h2o_models/predicciones.csv')
auto_mp_predictions = voting_model.predict(auto_ml_df.drop(columns=["Open_time", "Next_Day_Target", "Close"]))
display(auto_mp_predictions)

array(['Alcista', 'Alcista', 'Alcista', 'Alcista', 'Alcista'],
      dtype=object)