In [52]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [53]:
# Define a function to calculate outlier bounds using IQR
def calculate_iqr_bounds(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = max(Q1 - 1.5 * IQR, 0)
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

# calculate returns on valid windows
def calculate_hourly_returns(df, date_col, close_col):
    """
    Calculates returns based on the close price, only if the date difference is 1 hour.

    Args:
        df (pd.DataFrame): The DataFrame containing the time series data.
        date_col (str): The name of the datetime column.
        close_col (str): The name of the close price column.

    Returns:
        pd.Series: A Series containing the calculated returns or None for invalid rows.
    """
    # Ensure the date column is in datetime format
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Sort by date to ensure sequential order
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Calculate the time difference between consecutive rows in hours
    time_diff = df[date_col].diff().dt.total_seconds() / 3600
    
    # Calculate returns only for rows where time_diff == 1 hour
    returns = np.where(
        time_diff == 1,
        (df[close_col] - df[close_col].shift(1)) / df[close_col].shift(1),
        None
    )
    
    return pd.Series(returns, index=df.index)


# now we have a dataframe that does not have any NA and ay outlier, but its time series is corrupted, therefore we need valid windows
def extract_valid_windows(df, date_col, input_window, target_window, input_columns, target_columns):
    """
    Extracts valid windows from a time series DataFrame for LSTM training.
    
    Args:
        df (pd.DataFrame): The time series DataFrame with a datetime column.
        date_col (str): The name of the datetime column.
        input_window (int): The number of timesteps for the input sequence.
        target_window (int): The number of timesteps for the target sequence.
        input_columns (list of str): List of column names to include in the input data.
        target_columns (list of str): List of column names to include in the target data.
        
    Returns:
        inputs (list of np.ndarray): List of valid input sequences.
        targets (list of np.ndarray): List of corresponding target sequences.
    """
    # Sort by the datetime column to ensure the time series is ordered
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Ensure the datetime column is in pandas datetime format
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Identify valid consecutive rows (1-hour apart)
    time_diffs = df[date_col].diff().dt.total_seconds()
    valid_indices = time_diffs == 3600  # 1 hour = 3600 seconds
    
    # Mark valid sequences
    valid_sequence_flags = valid_indices | valid_indices.shift(-1, fill_value=False)
    df = df[valid_sequence_flags].reset_index(drop=True)

    # Prepare inputs and targets
    inputs, targets = [], []
    total_window = input_window + target_window

    for i in range(len(df) - total_window + 1):
        # Extract a potential window of size `total_window`
        window = df.iloc[i:i+total_window]
        
        # Check if all rows in the window are 1-hour apart
        if (window[date_col].diff().dt.total_seconds()[1:] == 3600).all():
            # Split into input and target based on specified columns
            input_data = window.iloc[:input_window][input_columns].values
            target_data = window.iloc[input_window:][target_columns].values
            inputs.append(input_data)
            targets.append(target_data)

    return np.array(inputs), np.array(targets)

In [54]:
merged_df = pd.read_csv('merged_crypto_data.csv')

In [55]:
# Drop the second occurrence of a specific column
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
# data without NA rows if we needed 
filtered_df = merged_df[(merged_df['borrowing_rate'] != -50)&(merged_df['lending_rate'] != -50)&(merged_df['utilization_rate'] != -50)]
filtered_df.reset_index(inplace=True, drop=True)
# date formatting
filtered_df['date'] = pd.to_datetime(filtered_df['date'])
# taking the columns we want and converting them to floats
filtered_df = filtered_df[['crypto_symbol', 'date', 'lending_rate',	'borrowing_rate','utilization_rate','close', 'volume']]
filtered_df[['lending_rate',	'borrowing_rate','utilization_rate','close', 'volume']] = filtered_df[['lending_rate','borrowing_rate','utilization_rate','close', 'volume']].astype(float)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['date'] = pd.to_datetime(filtered_df['date'])


In [56]:

# Initialize empty lists to store aggregated inputs and targets
all_inputs = []
all_targets = []

# looping through symbols
symbols = filtered_df['crypto_symbol'].unique()
for s in symbols:
     sim_df = filtered_df[filtered_df['crypto_symbol'] == s]
     sim_df.reset_index(inplace=True, drop=True)
     print(s, len(sim_df))
     # First Loop: Calculate intervals for each column without modifying the DataFrame
     intervals = {}
     for column in ['lending_rate', 'borrowing_rate', 'utilization_rate']:
          lower, upper = calculate_iqr_bounds(sim_df[column])
          intervals[column] = {'lower_bound': lower, 'upper_bound': upper}

     # Second Loop: Filter rows based on the pre-calculated intervals
     reduced_df = sim_df.copy()
     for column in ['lending_rate', 'borrowing_rate', 'utilization_rate']:
          lower_bound = intervals[column]['lower_bound']
          upper_bound = intervals[column]['upper_bound']
          # Apply filtering based on pre-calculated bounds
          reduced_df = reduced_df[(reduced_df[column] > lower_bound) & (reduced_df[column] < upper_bound)]

     reduced_df['returns'] = calculate_hourly_returns(reduced_df, 'date', 'close')

     # MinMax scaling
     scaler = MinMaxScaler(feature_range=(0, 1))  # Default range is (0, 1)

     scaled_df = reduced_df.copy()
     scaled_df['lending_rate'] = scaler.fit_transform(reduced_df[['lending_rate']])
     scaled_df['borrowing_rate'] = scaler.fit_transform(reduced_df[['borrowing_rate']])
     scaled_df['utilization_rate'] = scaler.fit_transform(reduced_df[['utilization_rate']])
     scaled_df['close'] = scaler.fit_transform(reduced_df[['close']])
     scaled_df['volume'] = scaler.fit_transform(reduced_df[['volume']])
     scaled_df['returns'] = scaler.fit_transform(reduced_df[['returns']])

     inputs, targets = extract_valid_windows(
          scaled_df,
          'date', 
          40, 10, 
          ['lending_rate',	'borrowing_rate','utilization_rate','returns', 'volume'], 
          ['lending_rate','borrowing_rate']
          )
     
     # Append results from the current DataFrame
     all_inputs.append(inputs)
     all_targets.append(targets)
     

# Concatenate all inputs and targets into single arrays
all_inputs = np.concatenate(all_inputs, axis=0) if all_inputs else np.array([])
all_targets = np.concatenate(all_targets, axis=0) if all_targets else np.array([])
     
          

BATUSDT 28159
LINKUSDT 31934
KNCUSDT 27690
MKRUSDT 30094
MANAUSDT 27145
ZRXUSDT 28255
SNXUSDT 31087
WBTCUSDT 11729
ENJUSDT 27034
RENUSDT 27658
YFIUSDT 29900
UNIUSDT 29885
CRVUSDT 28455
BALUSDT 24855
ENSUSDT 20745
1INCHUSDT 14025


In [57]:
print(all_inputs.shape)
print(all_targets.shape)

(111053, 40, 5)
(111053, 10, 2)


In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

input_window = 40
target_window = 5

input_columns = ['high', 'low', 'close', 'volume', 'utilization_rate', 'stable_borrow_rate']

target_columns = ['borrowing_rate','lending_rate' ]

X_train, X_test, y_train, y_test = train_test_split(all_inputs, all_targets, test_size=0.2, random_state=12)


def create_lstm_model(input_shape, output_shape, dropout_rate=0.2):
    """Creates an LSTM model for time-series prediction."""
    model = Sequential([
        LSTM(64, activation='tanh', input_shape=(input_window, len(input_columns)), return_sequences=True),
        Dropout(dropout_rate),
        LSTM(32, activation='tanh', return_sequences=False),  # Return the last hidden state
        Dense(output_shape) 
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

lstm_model = create_lstm_model(input_shape=(input_window, len(input_columns)), 
                               output_shape=target_window * len(target_columns))

# Train the model
history = lstm_model.fit(
    X_train,
    y_train.reshape(y_train.shape[0], -1),  
    epochs=2,
    batch_size=32,
    validation_split=0.2  # Automatically splits 20% of X_train and y_train for validation
)


test_loss, test_mae = lstm_model.evaluate(X_test, y_test.reshape(y_test.shape[0], -1))
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

predictions = lstm_model.predict(X_test)

Epoch 1/2


  super().__init__(**kwargs)


ValueError: Exception encountered when calling LSTMCell.call().

[1mDimensions must be equal, but are 5 and 6 for '{{node sequential_5_1/lstm_10_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](sequential_5_1/lstm_10_1/strided_slice_2, sequential_5_1/lstm_10_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [?,5], [6,256].[0m

Arguments received by LSTMCell.call():
  • inputs=tf.Tensor(shape=(None, 5), dtype=float32)
  • states=('tf.Tensor(shape=(None, 64), dtype=float32)', 'tf.Tensor(shape=(None, 64), dtype=float32)')
  • training=True