### Updated window extraction for the preprocessing file

In [None]:
def extract_valid_windows_v4(df, date_col, input_window, target_window, input_columns, target_columns,  train_end_date, valid_end_date):
    """
    Extract gap-free sliding windows from an hourly time series and split them by date.

    Every window covers ``input_window + target_window`` consecutive rows that
    are exactly one hour apart. The first ``input_window`` rows (restricted to
    ``input_columns``) become the model input. The remaining rows (restricted
    to ``target_columns``) are reduced to a single label vector:
    ``custom_sign(last_target_row - first_target_row)``.

    A window is assigned to a split by the timestamp of its last row: train if
    it is on or before ``train_end_date``, validation if it is on or before
    ``valid_end_date``, test otherwise.

    Args:
        df (pd.DataFrame): The time series DataFrame with a datetime column.
            It is not modified.
        date_col (str): The name of the datetime column.
        input_window (int): The number of timesteps for the input sequence.
        target_window (int): The number of timesteps for the target sequence.
        input_columns (list of str): Columns to include in the input data.
        target_columns (list of str): Columns the labels are derived from.
        train_end_date: Last timestamp (inclusive) of the training period;
            anything accepted by ``pd.to_datetime``.
        valid_end_date: Last timestamp (inclusive) of the validation period;
            anything accepted by ``pd.to_datetime``.

    Returns:
        tuple of six np.ndarray: ``(inputs_train, inputs_valid, inputs_test,
        targets_train, targets_valid, targets_test)``. Each inputs array stacks
        windows of shape ``(input_window, len(input_columns))``; each targets
        array stacks one ``custom_sign`` result per window. A split that
        received no window is an empty array of shape ``(0,)``.

    Raises:
        ValueError: If ``input_window`` is negative or ``target_window`` is
            smaller than 1 (the label needs at least one target row).
    """
    if input_window < 0 or target_window < 1:
        raise ValueError(
            "input_window must be non-negative and target_window must be at least 1"
        )

    train_end_date = pd.to_datetime(train_end_date)
    valid_end_date = pd.to_datetime(valid_end_date)

    # Parse the dates *before* sorting so that string-typed date columns are
    # ordered chronologically rather than lexicographically. Work on a copy so
    # the caller's frame is left untouched.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(by=date_col).reset_index(drop=True)

    seconds_per_step = 3600  # rows must be exactly one hour apart

    # Keep only rows that are one hour after the previous row or one hour
    # before the next row.
    hourly = df[date_col].diff().dt.total_seconds() == seconds_per_step
    df = df[hourly | hourly.shift(-1, fill_value=False)].reset_index(drop=True)

    total_window = input_window + target_window

    # gap_count[k] is the number of non-hourly steps among rows 0..k, so the
    # rows start..last form a gap-free window exactly when
    # gap_count[last] == gap_count[start] (no break in steps start+1..last).
    # This replaces a per-window pandas diff with a single pass.
    hourly = (df[date_col].diff().dt.total_seconds() == seconds_per_step).values
    gap_count = np.cumsum(~hourly)

    dates = df[date_col]
    input_values = df[input_columns].values
    target_values = df[target_columns].values

    input_train, input_valid, input_test = [], [], []
    target_train, target_valid, target_test = [], [], []

    for start in range(len(df) - total_window + 1):
        last = start + total_window - 1
        if gap_count[last] != gap_count[start]:
            continue  # the window contains a step that is not one hour

        split = start + input_window
        input_data = input_values[start:split]
        target_data = target_values[split:last + 1]

        # Label: custom_sign (defined outside this function) applied to the
        # change of each target column over the target window.
        differences = custom_sign(target_data[-1, :] - target_data[0, :])

        # Categorize the window based on the date of its last row
        window_end_date = dates.iloc[last]
        if window_end_date <= train_end_date:
            input_train.append(input_data)
            target_train.append(differences)
        elif window_end_date <= valid_end_date:
            input_valid.append(input_data)
            target_valid.append(differences)
        else:
            input_test.append(input_data)
            target_test.append(differences)

    # Convert to numpy arrays (np.array copies the per-window slices)
    inputs_train, inputs_valid, inputs_test = np.array(input_train), np.array(input_valid), np.array(input_test)
    targets_train, targets_valid, targets_test = np.array(target_train), np.array(target_valid), np.array(target_test)

    return inputs_train, inputs_valid, inputs_test, targets_train, targets_valid, targets_test

### Updated loop for the grid search file

The loop that creates the windows has been updated: I now use dates to make a hard cut between the train, validation, and test windows.
Because of this, some of the subsequent code also needs changes, including a slight change to the model function.

In [None]:
# Per-split accumulators: every successfully processed symbol contributes one
# array of windows to each of these lists.
X_train, X_valid, X_test = [], [], []
Y_train, Y_valid, Y_test = [], [], []

# Each symbol is processed on its own; if anything fails for a symbol the
# error is printed and that symbol is skipped.
symbols = filtered_df['crypto_symbol'].unique()
for s in symbols:
    try:
        sim_df = filtered_df[filtered_df['crypto_symbol'] == s].reset_index(drop=True)
        print(s)

        # Step 1: outlier bounds per rate column, computed on the symbol's
        # rows before any row is removed.
        intervals = {}
        for column in ['lending_rate', 'borrowing_rate', 'utilization_rate']:
            lower, upper = pre.calculate_iqr_bounds(sim_df[column], outlier_threshold)
            intervals[column] = dict(lower_bound=lower, upper_bound=upper)

        # Step 2: add hourly returns and drop rows where they are missing.
        reduced_df = sim_df.copy()
        reduced_df['returns'] = pre.calculate_hourly_returns(reduced_df, 'date', 'close')
        reduced_df = reduced_df[reduced_df['returns'].notna()].reset_index(drop=True)

        # Step 3: keep only rows strictly inside the pre-computed bounds of
        # every rate column.
        for column, bounds in intervals.items():
            above_lower = reduced_df[column] > bounds['lower_bound']
            below_upper = reduced_df[column] < bounds['upper_bound']
            reduced_df = reduced_df[above_lower & below_upper]
        reduced_df = reduced_df.reset_index(drop=True)

        # Step 4: min-max scale each feature to [0, 1]; the scaler is re-fit
        # for every column.
        # NOTE(review): the scaler is fit on the symbol's whole series, i.e.
        # before the date-based train/validation/test split performed by
        # extract_valid_windows_v4 -- confirm this look-ahead is acceptable.
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_df = reduced_df.copy()
        for feature in ['lending_rate', 'borrowing_rate', 'utilization_rate',
                        'close', 'volume', 'returns']:
            scaled_df[feature] = scaler.fit_transform(reduced_df[[feature]])

        # Step 5: cut the series into windows, split by the hard date limits.
        windows = pre.extract_valid_windows_v4(
            scaled_df,
            'date',
            input_window, output_window,
            input_columns,
            ['lending_rate', 'borrowing_rate'],
            train_end_date="2023-07-01 00:00:00",
            valid_end_date="2023-09-25 00:00:00",
        )
        (inputs_train, inputs_valid, inputs_test,
         targets_train, targets_valid, targets_test) = windows

        # Step 6: collect this symbol's windows.
        X_train.append(inputs_train)
        X_valid.append(inputs_valid)
        X_test.append(inputs_test)
        Y_train.append(targets_train)
        Y_valid.append(targets_valid)
        Y_test.append(targets_test)

    except Exception as e:
        # Best effort: report the failure and continue with the next symbol.
        print(f"Unexpected error in symbol {s}: {e}")

In [None]:
# Merge the per-symbol window arrays of every split into a single array.
def _drop_empty(chunks):
    """Return the arrays in `chunks` that contain at least one window."""
    return [arr for arr in chunks if len(arr) > 0]


def _stack(chunks):
    """Concatenate `chunks` along axis 0, or return an empty array if there are none."""
    return np.concatenate(chunks, axis=0) if chunks else np.array([])


# Symbols that produced no window for a split contribute an empty array,
# which is removed first.
X_train, X_valid, X_test = _drop_empty(X_train), _drop_empty(X_valid), _drop_empty(X_test)
Y_train, Y_valid, Y_test = _drop_empty(Y_train), _drop_empty(Y_valid), _drop_empty(Y_test)

X_train, X_valid, X_test = _stack(X_train), _stack(X_valid), _stack(X_test)
Y_train, Y_valid, Y_test = _stack(Y_train), _stack(Y_valid), _stack(Y_test)

In [None]:
# Split the label arrays into one (n_samples, 1) column per rate. The column
# order follows the target columns passed to the window extraction:
# index 0 = lending rate, index 1 = borrowing rate.

# Lending rate
lending_rate_Y_train = Y_train[:, [0]]
lending_rate_Y_valid = Y_valid[:, [0]]
lending_rate_Y_test = Y_test[:, [0]]

# Borrowing rate
borrow_rate_Y_train = Y_train[:, [1]]
borrow_rate_Y_valid = Y_valid[:, [1]]
borrow_rate_Y_test = Y_test[:, [1]]

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Masking, Reshape, Layer, Lambda, Concatenate, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.optimizers import Adadelta
from keras.layers import BatchNormalization

def train_v1(X_train,X_valid,X_test,Y_train,Y_valid,Y_test, epochs=10, batch_size=32, d1=0.1, d2 = 0.05, cell_size = 80, details=True):
    """
    Build, train and evaluate a two-layer stacked LSTM classifier.

    Architecture: LSTM(cell_size, returning sequences) -> BatchNormalization ->
    Dropout(d1) -> LSTM(cell_size // 2) -> BatchNormalization -> Dropout(d2) ->
    Dense(3, softmax). The model is compiled with Adadelta and sparse
    categorical cross-entropy and trained without shuffling the samples.

    Args:
        X_train, X_valid, X_test: Input arrays; ``X_train`` must be 3-D
            (samples, timesteps, features) because its last two dimensions
            define the model's input shape.
        Y_train, Y_valid, Y_test: Labels for the corresponding inputs.
            NOTE(review): sparse categorical cross-entropy expects integer
            class ids in ``[0, 3)`` -- confirm the labels are encoded that way.
        epochs (int): Number of training epochs.
        batch_size (int): Mini-batch size used for training.
        d1 (float): Dropout rate after the first LSTM layer.
        d2 (float): Dropout rate after the second LSTM layer.
        cell_size (int): Units of the first LSTM layer; the second layer uses
            ``cell_size // 2``.
        details (bool): If truthy, print the model summary and plot the
            training/validation loss and accuracy curves.

    Returns:
        tuple: ``(Y_test, y_pred)`` -- the test labels exactly as passed in,
        and the predicted class index (argmax of the softmax output) for every
        sample in ``X_test``.

    Raises:
        ValueError: If ``X_train`` is not 3-D (for example the empty array
            produced when no training window exists).
    """
    if len(X_train.shape) != 3:
        raise ValueError(
            "X_train must be 3-D (samples, timesteps, features); "
            f"got shape {tuple(X_train.shape)}"
        )

    # Clearing the TensorFlow session to ensure the model starts with fresh weights and biases
    tf.keras.backend.clear_session()
    n_classes = 3

    cell_size_1 = cell_size
    cell_size_2 = cell_size_1 // 2  # second LSTM layer is half as wide

    # Model definition
    inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))
    lstm_1 = LSTM(cell_size_1, return_sequences=True, stateful=False)(inputs)
    batch_norm_1 = BatchNormalization()(lstm_1)
    dropout_1 = Dropout(d1)(batch_norm_1)
    lstm_2 = LSTM(cell_size_2, return_sequences=False, stateful=False)(dropout_1)
    batch_norm_2 = BatchNormalization()(lstm_2)
    dropout_2 = Dropout(d2)(batch_norm_2)
    predictions = Dense(n_classes, activation='softmax')(dropout_2)
    LSTM_base = Model(inputs=inputs, outputs=predictions)

    # Optimizer
    optimizer = Adadelta(
        learning_rate=1.0,
        rho=0.8,
        epsilon=1e-7,  # guards against division by zero
    )

    LSTM_base.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    # Training the model; shuffle=False keeps the samples in the order given.
    history = LSTM_base.fit(x=X_train, y=Y_train,
                            validation_data=(X_valid, Y_valid),
                            epochs=epochs,
                            batch_size=batch_size,
                            shuffle=False)

    if details:
        LSTM_base.summary()
        fig, ax1 = plt.subplots()

        # Plot losses on the primary y-axis
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss', color='tab:red')
        ax1.plot(history.history['loss'], label='Train Loss', color='red', linestyle='-')
        ax1.plot(history.history['val_loss'], label='Validation Loss', color='red', linestyle='--')
        ax1.tick_params(axis='y', labelcolor='tab:red')

        # Create a second y-axis for accuracy
        ax2 = ax1.twinx()
        ax2.set_ylabel('Accuracy', color='tab:blue')
        ax2.plot(history.history['accuracy'], label='Train Accuracy', color='blue', linestyle='-')
        ax2.plot(history.history['val_accuracy'], label='Validation Accuracy', color='blue', linestyle='--')
        ax2.tick_params(axis='y', labelcolor='tab:blue')

        # Combine legends from both axes
        fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=2)  # Legend outside the plot

        plt.title('Model Accuracy and Loss')
        plt.tight_layout()  # Adjust layout to avoid clipping
        plt.show()

    # Predicted class = index of the largest softmax probability.
    y_pred = LSTM_base.predict(X_test)
    y_pred = np.argmax(y_pred, axis=-1)

    return Y_test, y_pred

# Example model: train and evaluate on the lending-rate labels.
# NOTE(review): train_v1 returns the test labels it was given, so this rebinds
# the module-level Y_test to the lending-rate test labels.
Y_test, y_pred = train_v1(
    X_train, X_valid, X_test,
    Y_train=lending_rate_Y_train,
    Y_valid=lending_rate_Y_valid,
    Y_test=lending_rate_Y_test,
)