El objetivo de este enfoque, sera generar un/os modelo/s para predecir los atributos del dia siguiente al ultimo disponible en el dataset. Aqui se aplicaran 2 enfoques:

- Un modelo que prediga todas las variablse en simultaneo (con el objetivo de captar la interrelacion entre las mismas).
- Un modelo que prediga solamente la variable target (incialmente se realizaran pruebas con la variable Close, y luego se procedera a usar la variable Tendencia).

Una vez realiza la prediccion de los atributos del dia siguiente, se procedera a realizar la prediccion de la Tendencia/Close, se realimientara el dataset, y se procedere a predecir otro dia, repitiendo esto N veces.

### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import accuracy_score
from pmdarima.arima import auto_arima
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from neuralprophet import NeuralProphet
from tensorflow.keras.layers import Reshape
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
import datetime
from prophet import Prophet
from prophet.diagnostics import performance_metrics
from sklearn.model_selection import GridSearchCV
from sklearn.base import RegressorMixin
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import make_scorer
from keras.callbacks import Callback
from sklearn.base import clone
from sklearn.ensemble import VotingRegressor
from tensorflow.keras.regularizers import l2
from skopt import BayesSearchCV
import tensorflow.keras.backend as K

pd.set_option('display.max_columns', None)

2024-04-09 08:10:15.893781: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# LSTM Predictor

### Dataset

In [2]:
columns = [
    'Open',
    'High',
    'Low',
    'Close',
    'Number of trades',
    'Close_BTCUSDT',
    'Volume_BTCUSDT',
    'Number_of_trades_BTCUSDT',
    'Close_ETHUSDT',
    'Volume_ETHUSDT',
    'Number_of_trades_ETHUSDT',
    'Close_BNBUSDT',
    'Volume_BNBUSDT',
    'Number_of_trades_BNBUSDT',
    'SMA_20',
    'EMA_20',
    'Upper_Band',
    'Middle_Band',
    'Lower_Band',
    'RSI',
    'buy_1000x_high_coinbase',
    'sell_1000x_high_coinbase',
    'total_trades_coinbase',	
    'Tweets_Utilizados',
    'Tweets_Utilizados_coin',
    'Tweets_Utilizados_referentes',
    'Tweets_Utilizados_whale_alert',
    'Buy_1000x_high',
    'sell_1000x_high',
    'total_trades_binance'
]

In [3]:
dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv')

dataset['Open_time'] = pd.to_datetime(dataset['Open_time'])
dates = dataset['Open_time'][:-5]

# dataset.drop(['Sentimiento'], axis=1, inplace=True)
# dataset.drop(['Sentimiento_coin'], axis=1, inplace=True)
# dataset.drop(['Sentimiento_referentes'], axis=1, inplace=True)
# dataset.drop(columns=['Open_time'], inplace=True)

dataset = dataset.round(2) # Limitar los valores float a 2 decimales en todo el dataframe

feature_dataset = dataset[columns]
# feature_dataset.drop(['Tendencia'], axis=1, inplace=True)

validation = feature_dataset[-5:]
feature_dataset = feature_dataset[:-5]

n_days_to_predict = 5

In [4]:
display(feature_dataset.head())
print(feature_dataset.shape)

display(validation.head())
display(validation.shape)

Unnamed: 0,Open,High,Low,Close,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance
0,28.84,30.26,27.5,27.71,449178.0,42147.35,39776.84,1001487.0,2925.59,510130.73,1043885.0,335.5,956544.07,457187.0,31.85,30.83,38.3,31.85,25.39,44.11,0.0,0.0,0.0,139,135,1.0,81.0,525.0,364.0,270000.0
1,27.72,28.38,26.14,26.31,362304.0,41026.54,43372.26,1045389.0,2804.91,511325.46,928494.0,333.0,922077.23,417006.0,31.77,30.4,38.44,31.77,25.11,41.83,4.0,2.0,5273.0,93,122,2.0,87.0,472.0,331.0,204000.0
2,26.31,28.59,26.11,27.28,376232.0,41524.28,33511.53,884909.0,2850.45,411305.09,748804.0,367.7,1696420.04,653011.0,31.65,30.1,38.55,31.65,24.74,43.99,22.0,40.0,54144.0,112,145,0.0,64.0,594.0,495.0,216000.0
3,27.28,28.99,27.13,28.62,339737.0,43824.1,46381.23,1197815.0,3000.61,506896.76,992243.0,387.5,1163674.21,551245.0,31.62,29.96,38.56,31.62,24.67,46.92,15.0,23.0,39220.0,116,147,2.0,77.0,419.0,464.0,202000.0
4,28.61,32.33,28.5,31.94,735059.0,48141.61,66244.87,1771237.0,3309.91,648714.62,1446386.0,421.5,1440336.04,727854.0,31.64,30.15,38.58,31.64,24.69,53.42,24.0,35.0,63183.0,171,141,1.0,71.0,477.0,664.0,492000.0


(903, 30)


Unnamed: 0,Open,High,Low,Close,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance
903,10.08,10.46,9.6,9.9,245319.0,67609.99,55691.08,2464515.0,3520.46,570901.29,1906387.0,555.4,2284301.81,994512.0,10.06,9.95,11.86,10.06,8.26,52.48,34.0,43.0,84706.0,696,471,0.0,43.0,343.0,228.0,154000.0
904,9.9,9.99,8.6,8.77,341363.0,61937.4,101005.32,3593832.0,3158.64,1049629.69,2647385.0,507.7,2551361.51,1213572.0,10.08,9.84,11.81,10.08,8.35,42.93,120.0,126.0,135180.0,961,509,1.0,56.0,534.0,433.0,221000.0
905,8.77,9.57,8.49,9.48,267797.0,67840.51,90420.59,3549793.0,3516.53,1207322.82,2987953.0,556.8,1425296.58,809335.0,10.14,9.8,11.68,10.14,8.6,49.21,185.0,117.0,112997.0,866,555,1.0,40.0,473.0,386.0,171000.0
906,9.48,9.58,9.07,9.18,156774.0,65501.27,53357.48,2388390.0,3492.85,602755.21,1791989.0,553.8,953921.37,563996.0,10.17,9.74,11.63,10.17,8.71,46.85,64.0,81.0,66543.0,692,533,0.0,24.0,350.0,290.0,101000.0
907,9.18,9.37,8.69,8.94,147578.0,63796.64,51482.38,2492881.0,3336.35,558848.89,1747756.0,553.8,1181298.51,712381.0,10.14,9.67,11.67,10.14,8.62,45.0,57.0,66.0,68616.0,681,546,0.0,41.0,252.0,206.0,92000.0


(5, 30)

### Scalers

In [5]:
scalers = {}
for col in feature_dataset.columns:
    scaler = MinMaxScaler(feature_range=(0, 1))
    feature_dataset[col] = scaler.fit_transform(np.array(feature_dataset[col]).reshape(-1, 1))
    scalers[col] = scaler


In [6]:
display(feature_dataset.head())
print(feature_dataset.shape)

display(validation.head())
display(validation.shape)

Unnamed: 0,Open,High,Low,Close,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance
0,0.501993,0.517208,0.492593,0.479569,0.168108,0.460212,0.040979,0.046312,0.506304,0.126579,0.285966,0.303395,0.197596,0.238897,0.633771,0.615034,0.648008,0.633771,0.602961,0.345342,0.0,0.0,0.0,0.123457,0.136662,0.058824,0.536424,0.302018,0.212508,0.140182
1,0.479673,0.480653,0.464609,0.451664,0.134757,0.440649,0.045762,0.049256,0.474653,0.126913,0.24888,0.297919,0.18979,0.215055,0.63196,0.605239,0.650664,0.63196,0.595284,0.307138,0.016878,0.007092,0.026035,0.079772,0.11958,0.117647,0.576159,0.2686,0.191449,0.104869
2,0.451574,0.484737,0.463992,0.470999,0.140104,0.449336,0.032645,0.038494,0.486597,0.09888,0.191129,0.373932,0.365156,0.355091,0.629244,0.598405,0.652751,0.629244,0.585138,0.343331,0.092827,0.141844,0.267329,0.097816,0.149803,0.0,0.423841,0.345523,0.296107,0.111289
3,0.470905,0.492514,0.484979,0.497708,0.126093,0.489479,0.049765,0.059477,0.525979,0.125672,0.269369,0.417306,0.244504,0.294707,0.628565,0.595216,0.652941,0.628565,0.583219,0.392426,0.063291,0.08156,0.193644,0.101614,0.152431,0.117647,0.509934,0.235183,0.276324,0.103799
4,0.497409,0.557457,0.513169,0.563883,0.277858,0.56484,0.076188,0.097929,0.6071,0.165421,0.415326,0.491785,0.30716,0.3995,0.629018,0.599544,0.653321,0.629018,0.583767,0.50134,0.101266,0.124113,0.311958,0.153846,0.144547,0.058824,0.470199,0.271753,0.403957,0.258962


(903, 30)


Unnamed: 0,Open,High,Low,Close,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance
903,10.08,10.46,9.6,9.9,245319.0,67609.99,55691.08,2464515.0,3520.46,570901.29,1906387.0,555.4,2284301.81,994512.0,10.06,9.95,11.86,10.06,8.26,52.48,34.0,43.0,84706.0,696,471,0.0,43.0,343.0,228.0,154000.0
904,9.9,9.99,8.6,8.77,341363.0,61937.4,101005.32,3593832.0,3158.64,1049629.69,2647385.0,507.7,2551361.51,1213572.0,10.08,9.84,11.81,10.08,8.35,42.93,120.0,126.0,135180.0,961,509,1.0,56.0,534.0,433.0,221000.0
905,8.77,9.57,8.49,9.48,267797.0,67840.51,90420.59,3549793.0,3516.53,1207322.82,2987953.0,556.8,1425296.58,809335.0,10.14,9.8,11.68,10.14,8.6,49.21,185.0,117.0,112997.0,866,555,1.0,40.0,473.0,386.0,171000.0
906,9.48,9.58,9.07,9.18,156774.0,65501.27,53357.48,2388390.0,3492.85,602755.21,1791989.0,553.8,953921.37,563996.0,10.17,9.74,11.63,10.17,8.71,46.85,64.0,81.0,66543.0,692,533,0.0,24.0,350.0,290.0,101000.0
907,9.18,9.37,8.69,8.94,147578.0,63796.64,51482.38,2492881.0,3336.35,558848.89,1747756.0,553.8,1181298.51,712381.0,10.14,9.67,11.67,10.14,8.62,45.0,57.0,66.0,68616.0,681,546,0.0,41.0,252.0,206.0,92000.0


(5, 30)

### Preparo el dataset para train: cada conjunto de entrenamiento, sera una seried de N dias previos, para predecir 1 dia siguiente.

In [7]:
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        end_ix = i + n_steps
        seq_x = data.iloc[i:end_ix, :].values
        seq_y = data.iloc[end_ix, :].values
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)


n_steps = 30  # Longitud de la secuencia de entrada
n_features = feature_dataset.shape[1]  # Número de características

# Crear las secuencias de entrada y salida
X, y = create_sequences(feature_dataset, n_steps)

In [8]:
print(X[0].shape) # Cada dato de entrenamiento, es un conjunto de 30 dias con sus 64 features
print(y[0].shape) # El target de cada dato, son los 64 features del dia siguiente
print(X.shape)
print(y.shape)
print(feature_dataset.shape)

(30, 30)
(30,)
(873, 30, 30)
(873, 30)
(903, 30)


In [9]:
print(len(X))
print(len(y))

873
873


### Obtencion de los mejores hiperparametros

In [10]:
def custom_scoring(estimator, X, y):
    y_pred = estimator.predict(X)
    mse = mean_squared_error(y, y_pred)
    return -mse

In [11]:
def custom_scoring_validation(y, y_pred):
    mse = mean_squared_error(y, y_pred)
    return -mse

In [12]:
def vmse(y_true, y_pred):
    return K.mean(K.square(y_true - y_pred), axis=-1)

In [13]:
import warnings
warnings.filterwarnings('ignore')

def create_model(activation, units, dropout, learning_rate, l2_penalty, depth, optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units=int(units/2), activation=activation, input_shape=(n_steps, n_features), return_sequences=True, kernel_regularizer=l2(l2_penalty)))
    model.add(Dropout(dropout))
    model.add(BatchNormalization())
    
    for _ in range(depth - 1):
        model.add(LSTM(units=units, activation=activation, return_sequences=True, kernel_regularizer=l2(l2_penalty)),)
        model.add(Dropout(dropout))
        model.add(BatchNormalization())
    
    model.add(LSTM(units=int(units*2), activation=activation, kernel_regularizer=l2(l2_penalty)))
    model.add(Dropout(dropout))
    model.add(BatchNormalization())
    model.add(Dense(units=n_features))
    
    if optimizer == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    elif optimizer == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss=vmse, metrics=['accuracy'])
    return model

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
regressor = KerasRegressor(build_fn=create_model, verbose=0, activation='relu', units=50, dropout=0.2, learning_rate=0.1, l2_penalty=0.001, depth=2, optimizer='adam')

cv = TimeSeriesSplit(n_splits=20).split(X)
param_space = {
    'depth': [2, 3, 4, 5],
    'activation': ['relu', 'tanh', 'swish', 'selu'],
    'units': [64, 128, 256, 512],
    'dropout': [0.1, 0.2, 0.3, 0.4],
    'learning_rate': [0.01, 0.001, 0.0001],
    'epochs': [10, 20, 30, 50, 100],
    'batch_size': [32, 64, 128],
    'optimizer': ['adam', 'rmsprop', 'sgd'],
    'l2_penalty': [0.001, 0.01, 0.1]
}

bayes_search = BayesSearchCV(regressor, param_space, scoring=custom_scoring, cv=cv, verbose=0)#10)
bayes_result = bayes_search.fit(X, y, callbacks=[early_stopping])



In [None]:
# Show best results
print("Best score:", bayes_result.best_score_)
print("Best parameters:", bayes_result.best_params_)

# Entrenar el modelo con los mejores hiperparámetros
best_model = bayes_result.best_estimator_
best_model.fit(X, y)

Best score: -0.01407215163397417
Best parameters: OrderedDict([('activation', 'swish'), ('batch_size', 32), ('depth', 2), ('dropout', 0.3), ('epochs', 100), ('l2_penalty', 0.01), ('learning_rate', 0.0001), ('optimizer', 'adam'), ('units', 512)])


### Predicciones con el mejor conjunto de hiper parametros

In [None]:
n_days_to_predict = 5
future_dataset = feature_dataset

dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv')
dataset['Open_time'] = pd.to_datetime(dataset['Open_time'])
dates = dataset['Open_time']

# Crear un DataFrame vacío para almacenar las predicciones desnormalizadas
predicted_values_desnormalized = pd.DataFrame(columns=future_dataset.columns)

# Lista para almacenar las fechas de las predicciones
predicted_dates = []

for _ in range(n_days_to_predict):
    # Predecir 1 día posterior al último día disponible en el dataset
    last_sequence = future_dataset.iloc[-n_steps:, :].values.reshape((1, n_steps, n_features))
    predictions = best_model.predict(last_sequence)

    # Agregar las predicciones sin desnormalizar a future_dataset
    predicted_values_normalized = pd.DataFrame(predictions, columns=future_dataset.columns)
    future_dataset = pd.concat([future_dataset, predicted_values_normalized], axis=0, ignore_index=True)

    # Desnormalizar las predicciones y agregarlas al DataFrame de predicciones desnormalizadas
    inverted_predictions = []
    for i in range(len(future_dataset.columns)):
        col = future_dataset.columns[i]
        scaler = scalers[col]
        prediction = predictions[:, i].reshape(-1, 1)
        inverted_prediction = scaler.inverse_transform(prediction)
        inverted_predictions.append(inverted_prediction)

    # Calcular la fecha del próximo día
    next_day_date = dates.iloc[-1] + pd.DateOffset(days=1)
    predicted_dates.append(next_day_date)

    # Actualizar la fecha del próximo día en el DataFrame principal
    dates = dates.append(pd.Series([next_day_date], name='Fecha'))

    # Crear un DataFrame con las predicciones desnormalizadas
    predicted_values_desnormalized = pd.concat([predicted_values_desnormalized,
                                                pd.DataFrame(np.concatenate(inverted_predictions, axis=1),
                                                             columns=future_dataset.columns)], 
                                                ignore_index=True)

# Agregar las fechas al DataFrame de predicciones desnormalizadas
predicted_values_desnormalized['Fecha'] = predicted_dates

print("Valores predichos para los próximos {} días:".format(n_days_to_predict))
display(future_dataset.tail(n_days_to_predict + 1))

print("Valores predichos desnormalizados para los próximos {} días:".format(n_days_to_predict))
display(predicted_values_desnormalized.tail(n_days_to_predict))


Valores predichos para los próximos 5 días:


Unnamed: 0,Open,High,Low,Close,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance
902,0.120167,0.128135,0.115844,0.128164,0.083843,0.918331,0.054236,0.143388,0.694908,0.128726,0.503699,0.820811,0.368893,0.443672,0.138751,0.139408,0.147628,0.138751,0.127228,0.515583,0.151899,0.170213,0.352479,0.649573,0.501971,0.0,0.139073,0.156999,0.156988,0.075976
903,0.145938,0.139936,0.123826,0.127002,0.117857,0.944108,0.043603,0.136416,0.766444,0.151164,0.521597,0.749611,0.388442,0.459132,0.149115,0.148083,0.173791,0.151488,0.129398,0.660461,0.225255,0.203273,0.465957,0.309386,0.220505,-0.005968,0.299016,0.259091,0.210517,0.099516
904,0.142274,0.135385,0.119532,0.122008,0.114829,0.943397,0.039673,0.131208,0.766763,0.151632,0.524292,0.761903,0.390306,0.461762,0.149537,0.147666,0.175026,0.151434,0.128042,0.633117,0.215688,0.195648,0.464704,0.312922,0.218579,-0.007245,0.30225,0.250347,0.200213,0.097973
905,0.13888,0.131378,0.115788,0.117666,0.111902,0.942414,0.035301,0.125308,0.766938,0.151267,0.524935,0.770263,0.390368,0.461705,0.149832,0.147278,0.175866,0.151353,0.126835,0.609268,0.207779,0.189225,0.462304,0.315093,0.216597,-0.008629,0.304361,0.242077,0.191046,0.096684
906,0.135788,0.127934,0.112568,0.113988,0.109203,0.94161,0.030576,0.118943,0.767371,0.150255,0.524053,0.775542,0.389196,0.459778,0.150015,0.146947,0.176422,0.151271,0.125764,0.589175,0.20152,0.184046,0.459296,0.316223,0.214703,-0.010144,0.305641,0.234364,0.183017,0.095675
907,0.132907,0.124917,0.10972,0.110816,0.106741,0.94106,0.025625,0.11231,0.768111,0.148784,0.522095,0.778544,0.387236,0.456617,0.150036,0.146608,0.176719,0.151124,0.124728,0.572261,0.19657,0.179881,0.456013,0.316606,0.212948,-0.011763,0.306304,0.227222,0.176012,0.094896


Valores predichos desnormalizados para los próximos 5 días:


Unnamed: 0,Open,High,Low,Close,Number of trades,Close_BTCUSDT,Volume_BTCUSDT,Number_of_trades_BTCUSDT,Close_ETHUSDT,Volume_ETHUSDT,Number_of_trades_ETHUSDT,Close_BNBUSDT,Volume_BNBUSDT,Number_of_trades_BNBUSDT,SMA_20,EMA_20,Upper_Band,Middle_Band,Lower_Band,RSI,buy_1000x_high_coinbase,sell_1000x_high_coinbase,total_trades_coinbase,Tweets_Utilizados,Tweets_Utilizados_coin,Tweets_Utilizados_referentes,Tweets_Utilizados_whale_alert,Buy_1000x_high,sell_1000x_high,total_trades_binance,Fecha
0,10.973164,10.856914,9.577946,10.02171,318284.75,69870.3125,41749.699219,2345184.25,3917.467041,597845.9375,1777044.0,539.197571,1799242.75,828352.0625,10.437915,10.330834,13.308776,10.542747,8.119136,62.916302,53.385445,57.323086,94373.570312,334.782928,198.804352,-0.101455,45.151344,456.917908,360.880188,193994.484375,2024-03-23
1,10.789328,10.622845,9.36925,9.771141,310397.4375,69829.539062,38794.878906,2267518.25,3918.681152,599516.4375,1785428.75,544.808472,1807474.375,832785.625,10.456526,10.312521,13.373859,10.540338,8.069686,61.284393,51.118019,55.172874,94119.773438,338.506378,197.338562,-0.12316,45.639755,443.050598,344.733856,191111.34375,2024-03-24
2,10.618979,10.41678,9.187305,9.553308,302772.46875,69773.234375,35508.617188,2179543.75,3919.35083,598216.4375,1787432.0,548.625122,1807747.875,832689.1875,10.469585,10.295498,13.418129,10.536775,8.025675,59.861092,49.243637,53.361408,93633.640625,340.793243,195.830582,-0.14669,45.958565,429.934143,330.369751,188701.671875,2024-03-25
3,10.463839,10.23966,9.0308,9.368766,295742.4375,69727.179688,31956.802734,2084611.25,3921.001953,594603.6875,1784685.375,551.03479,1802572.0,829441.0625,10.477658,10.280959,13.447414,10.533132,7.986599,58.661968,47.760174,51.901089,93024.507812,341.982788,194.389267,-0.172451,46.151863,417.700867,317.788116,186816.8125,2024-03-26
4,10.31929,10.084472,8.892408,9.20963,289327.46875,69695.664062,28234.542969,1985700.875,3923.821045,589355.875,1778593.5,552.405579,1793918.5,824113.3125,10.478598,10.266093,13.463116,10.526654,7.948835,57.652557,46.587036,50.726379,92359.414062,342.385742,193.053604,-0.199964,46.251892,406.373962,306.810425,185360.828125,2024-03-27


### Guardado de los mejores hiperparametros

In [None]:
import json

# Obtener los hiperparámetros y puntajes de los 5 mejores modelos
top_n_models = 5
best_params_list = []
best_scores_list = []

for i in range(min(top_n_models, len(bayes_search.cv_results_['params']))):
    best_params_list.append(bayes_search.cv_results_['params'][i])
    best_scores_list.append(bayes_search.cv_results_['mean_test_score'][i])

# Guardar los hiperparámetros de los 5 mejores modelos en un archivo JSON
with open('top_5_hyperparameters.json', 'w') as f:
    json.dump({'best_params': best_params_list, 'best_scores': best_scores_list}, f)

# O imprimir los hiperparámetros
print("Top 5 mejores modelos:")
for i in range(len(best_params_list)):
    print("Modelo", i+1)
    print("Hiperparámetros:", best_params_list[i])
    print("Puntaje:", best_scores_list[i])


Top 5 mejores modelos:
Modelo 1
Hiperparámetros: OrderedDict([('activation', 'swish'), ('batch_size', 128), ('depth', 3), ('dropout', 0.2), ('epochs', 20), ('l2_penalty', 0.01), ('learning_rate', 0.001), ('optimizer', 'rmsprop'), ('units', 64)])
Puntaje: -0.02766625692726051
Modelo 2
Hiperparámetros: OrderedDict([('activation', 'relu'), ('batch_size', 64), ('depth', 4), ('dropout', 0.2), ('epochs', 50), ('l2_penalty', 0.001), ('learning_rate', 0.01), ('optimizer', 'rmsprop'), ('units', 128)])
Puntaje: -0.031976217665667626
Modelo 3
Hiperparámetros: OrderedDict([('activation', 'swish'), ('batch_size', 128), ('depth', 4), ('dropout', 0.2), ('epochs', 10), ('l2_penalty', 0.001), ('learning_rate', 0.001), ('optimizer', 'sgd'), ('units', 64)])
Puntaje: -0.029549170450538113
Modelo 4
Hiperparámetros: OrderedDict([('activation', 'swish'), ('batch_size', 128), ('depth', 5), ('dropout', 0.3), ('epochs', 20), ('l2_penalty', 0.01), ('learning_rate', 0.0001), ('optimizer', 'sgd'), ('units', 128)])

### Armado de un ensamble con los mejores 5 hiperparametros usando la mejor semilla en cada caso

In [None]:
def generate_prime_seeds(n):
    seeds = []
    num = 70001  # Comenzamos desde el primer número primo mayor que 70000
    while len(seeds) < n:
        is_prime = True
        for i in range(2, int(num**0.5) + 1):
            if num % i == 0:
                is_prime = False
                break
        if is_prime:
            seeds.append(num)
        num += 1
    return seeds


In [None]:
def predict_next_days(ensemble, feature_dataset, scalers, n_steps, n_features, n_days_to_predict):
    future_dataset = feature_dataset.copy()

    # Leer el conjunto de datos original para obtener las fechas
    dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv')
    dataset['Open_time'] = pd.to_datetime(dataset['Open_time'])
    dates = dataset['Open_time'][:-n_days_to_predict]

    # Crear un DataFrame vacío para almacenar las predicciones desnormalizadas
    predicted_values_desnormalized = pd.DataFrame(columns=future_dataset.columns)

    # Lista para almacenar las fechas de las predicciones
    predicted_dates = []

    for _ in range(n_days_to_predict):
        # Predecir 1 día posterior al último día disponible en el dataset
        last_sequence = future_dataset.iloc[-n_steps:, :].values.reshape((1, n_steps, n_features))
        predictions = ensemble.predict(last_sequence)

        # Agregar las predicciones sin desnormalizar a future_dataset
        predicted_values_normalized = pd.DataFrame(predictions, columns=future_dataset.columns)
        future_dataset = pd.concat([future_dataset, predicted_values_normalized], axis=0, ignore_index=True)

        # Desnormalizar las predicciones y agregarlas al DataFrame de predicciones desnormalizadas
        inverted_predictions = []
        for i in range(len(future_dataset.columns)):
            col = future_dataset.columns[i]
            scaler = scalers[col]
            prediction = predictions[:, i].reshape(-1, 1)
            inverted_prediction = scaler.inverse_transform(prediction)
            inverted_predictions.append(inverted_prediction)

        # Calcular la fecha del próximo día
        next_day_date = dates.iloc[-1] + pd.DateOffset(days=1)
        predicted_dates.append(next_day_date)

        # Actualizar la fecha del próximo día en el DataFrame principal
        dates = dates.append(pd.Series([next_day_date], name='Fecha'))

        # Crear un DataFrame con las predicciones desnormalizadas
        predicted_values_desnormalized = pd.concat([predicted_values_desnormalized,
                                                    pd.DataFrame(np.concatenate(inverted_predictions, axis=1),
                                                                 columns=future_dataset.columns)], 
                                                    ignore_index=True)

    # Agregar las fechas al DataFrame de predicciones desnormalizadas
    predicted_values_desnormalized['Fecha'] = predicted_dates

    return future_dataset, predicted_values_desnormalized

In [None]:
## Clase personalizada para hacer el ensamble, dado que sklearn no provee ninguna clase que permita hacer ensmble
## de modelos re regresion multivariados
class MultivariableVotingRegressor:
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)

    def predict(self, X):
        # Hacer predicciones con cada modelo
        predictions = [model.predict(X) for model in self.models]
    
        # Calcular el promedio de las predicciones
        average_predictions = np.mean(predictions, axis=0)
    
        return average_predictions


In [None]:

import json

# Leer los hiperparámetros desde el archivo JSON
with open('top_5_hyperparameters.json', 'r') as f:
    top_hyperparameters = json.load(f)


models = []
best_seeds= {}
prime_seeds = generate_prime_seeds(300)

for mode_number, params in enumerate(top_hyperparameters['best_params']):
    best_validation_errors = {}
    
    for seend_number, seed in enumerate(prime_seeds):
        model = KerasRegressor(build_fn=create_model, random_state=seed, **params)
        
        model.fit(X, y)
        
        model_predictions, _ = predict_next_days(model, feature_dataset, scalers, n_steps, n_features, 5)

        error = custom_scoring_validation(validation, model_predictions[-5:])
        print(f"model number: {mode_number}, seed number: {seend_number} error: {error}")
        
        if seed not in best_validation_errors or error < best_validation_errors[seed]:
            best_validation_errors[seed] = error
    
    best_seed_for_params = min(best_validation_errors, key=best_validation_errors.get)
    best_seeds[str(params)] = best_seed_for_params
    
    model = KerasRegressor(build_fn=create_model, random_state=best_seed_for_params, **params)
    model.fit(X, y)
    models.append(model)


ensemble = MultivariableVotingRegressor(models)
ensemble.fit(X, y)

with open('best_seeds.json', 'w') as f:
    json.dump(best_seeds, f)

Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
model number: 1, seed number: 59 error: -622831671352.3943
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch

ValueError: Input contains NaN.

In [None]:
future_dataset = feature_dataset

future_dataset, predicted_values_desnormalized = predict_next_days(ensemble, feature_dataset, scalers, n_steps, n_features, n_days_to_predict)

print("Valores predichos para los próximos {} días:".format(n_days_to_predict))
display(future_dataset.tail(n_days_to_predict + 1))

print("Valores predichos desnormalizados para los próximos {} días:".format(n_days_to_predict))
display(predicted_values_desnormalized.tail(n_days_to_predict))


### Rearmado del modelo a partir de las semillas

In [None]:
with open('best_seeds.json', 'r') as f:
    best_seeds = json.load(f)

# 21 Crear y entrenar los modelos con los hiperparámetros y semillas guardados
models = []
for params_str, seed in best_seeds.items():
    params = json.loads(params_str.replace("'", "\""))
    model = KerasRegressor(build_fn=create_model, random_state=seed, **params)
    model.fit(X, y)
    models.append(model)

In [None]:
# 2. Predecir 5 días en el futuro con los modelos entrenados
ensemble = MultivariableVotingRegressor(models)
ensemble.fit(X, y)
future_dataset, predicted_values_desnormalized = predict_next_days(ensemble, feature_dataset, scalers, n_steps, n_features, 5)
    
display(predicted_values_desnormalized.head())

# Light GBM Classifier