In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Define a function to calculate outlier bounds using IQR
def calculate_iqr_bounds(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = max(Q1 - 1.5 * IQR, 0)
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

# calculate returns on valid windows
def calculate_hourly_returns(df, date_col, close_col):
    """
    Calculates returns based on the close price, only if the date difference is 1 hour.

    Args:
        df (pd.DataFrame): The DataFrame containing the time series data.
        date_col (str): The name of the datetime column.
        close_col (str): The name of the close price column.

    Returns:
        pd.Series: A Series containing the calculated returns or None for invalid rows.
    """
    # Ensure the date column is in datetime format
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Sort by date to ensure sequential order
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Calculate the time difference between consecutive rows in hours
    time_diff = df[date_col].diff().dt.total_seconds() / 3600
    
    # Calculate returns only for rows where time_diff == 1 hour
    returns = np.where(
        time_diff == 1,
        (df[close_col] - df[close_col].shift(1)) / df[close_col].shift(1),
        None
    )
    
    return pd.Series(returns, index=df.index)


# now we have a dataframe that does not have any NA and ay outlier, but its time series is corrupted, therefore we need valid windows
def extract_valid_windows(df, date_col, input_window, target_window, input_columns, target_columns):
    """
    Extracts valid windows from a time series DataFrame for LSTM training.
    
    Args:
        df (pd.DataFrame): The time series DataFrame with a datetime column.
        date_col (str): The name of the datetime column.
        input_window (int): The number of timesteps for the input sequence.
        target_window (int): The number of timesteps for the target sequence.
        input_columns (list of str): List of column names to include in the input data.
        target_columns (list of str): List of column names to include in the target data.
        
    Returns:
        inputs (list of np.ndarray): List of valid input sequences.
        targets (list of np.ndarray): List of corresponding target sequences.
    """
    # Sort by the datetime column to ensure the time series is ordered
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Ensure the datetime column is in pandas datetime format
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Identify valid consecutive rows (1-hour apart)
    time_diffs = df[date_col].diff().dt.total_seconds()
    valid_indices = time_diffs == 3600  # 1 hour = 3600 seconds
    
    # Mark valid sequences
    valid_sequence_flags = valid_indices | valid_indices.shift(-1, fill_value=False)
    df = df[valid_sequence_flags].reset_index(drop=True)

    # Prepare inputs and targets
    inputs, targets = [], []
    total_window = input_window + target_window

    for i in range(len(df) - total_window + 1):
        # Extract a potential window of size `total_window`
        window = df.iloc[i:i+total_window]
        
        # Check if all rows in the window are 1-hour apart
        if (window[date_col].diff().dt.total_seconds()[1:] == 3600).all():
            # Split into input and target based on specified columns
            input_data = window.iloc[:input_window][input_columns].values
            target_data = window.iloc[input_window:][target_columns].values
            inputs.append(input_data)
            targets.append(target_data)

    return np.array(inputs), np.array(targets)

In [2]:
import mysql.connector
from mysql.connector import Error
import pandas as pd

# MySQL database connection function
def connect_to_database():
    try:
        # Establishing connection to the database
        connection = mysql.connector.connect(
            host='crypto-matter.c5eq66ogk1mf.eu-central-1.rds.amazonaws.com',
            database='Crypto',
            user='Jing',  # Replace with your actual first name
            password='Crypto12!'
        )

        if connection.is_connected():
            db_info = connection.get_server_info()
            print("Connected to MySQL database, MySQL Server version: ", db_info)
            return connection

    except Error as e:
        print("Error while connecting to MySQL", e)
        return None



# Function to query merged data from crypto_lending_borrowing and crypto_price tables
def query_merged_crypto_data(connection):
    query = """
    SELECT clb.*, cp.*
    FROM crypto_lending_borrowing clb
    JOIN crypto_price cp 
        ON clb.crypto_symbol = cp.crypto_symbol
        AND clb.date = cp.date
    WHERE UPPER(clb.crypto_symbol) IN ('1INCHUSDT', 'BALUSDT', 'BATUSDT', 'CRVUSDT', 'ENJUSDT', 'ENSUSDT', 'KNCUSDT', 'LINKUSDT', 'MANAUSDT', 'MKRUSDT', 'RENUSDT', 'SNXUSDT', 'UNIUSDT', 'WBTCUSDT', 'YFIUSDT', 'ZRXUSDT')
    """
    cursor = connection.cursor()

    try:
        # Execute the query
        cursor.execute(query)

        # Fetch all results
        results = cursor.fetchall()

        # Get column names from cursor description
        columns = [desc[0] for desc in cursor.description]

        # Convert results to a Pandas DataFrame
        df = pd.DataFrame(results, columns=columns)

        return df

    except Error as e:
        print(f"Error: {e}")
        return None
    finally:
        cursor.close()

# Function to close the database connection
def query_quit(connection):
    if connection.is_connected():
        connection.close()
        print("MySQL connection is closed")


connection = connect_to_database()

if connection:
    # Query merged data
    merged_df = query_merged_crypto_data(connection)

    if merged_df is not None and not merged_df.empty:
        # Display first few rows of the DataFrame
        print("\nMerged DataFrame:")
        print(merged_df.head())

        # Save DataFrame to CSV
        merged_df.to_csv('merged_crypto_data.csv', index=False)
        print("\nMerged data saved to 'merged_crypto_data.csv'")
    else:
        print("\nNo data found after merging.")

    # Close the connection
    query_quit(connection)

Connected to MySQL database, MySQL Server version:  8.0.39

Merged DataFrame:
   id crypto_symbol                date lending_rate borrowing_rate  \
0   1       BATUSDT 2020-12-02 14:00:00   -50.000000     -50.000000   
1   2       BATUSDT 2020-12-02 15:00:00   -50.000000     -50.000000   
2   3       BATUSDT 2020-12-02 16:00:00   -50.000000     -50.000000   
3   4       BATUSDT 2020-12-02 17:00:00   -50.000000     -50.000000   
4   5       BATUSDT 2020-12-02 18:00:00   -50.000000     -50.000000   

  utilization_rate stable_borrow_rate crypto_symbol                date  high  \
0       -50.000000           0.030000       BATUSDT 2020-12-02 14:00:00  0.24   
1       -50.000000           0.030000       BATUSDT 2020-12-02 15:00:00  0.24   
2         0.013598           0.030000       BATUSDT 2020-12-02 16:00:00  0.24   
3         0.014834           0.030000       BATUSDT 2020-12-02 17:00:00  0.24   
4         0.014834           0.030000       BATUSDT 2020-12-02 18:00:00  0.25   

    low 

In [3]:
# Drop the second occurrence of a specific column
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
# data without NA rows if we needed 
filtered_df = merged_df[(merged_df['borrowing_rate'] != -50)&(merged_df['lending_rate'] != -50)&(merged_df['utilization_rate'] != -50)]
filtered_df.reset_index(inplace=True, drop=True)
# date formatting
filtered_df['date'] = pd.to_datetime(filtered_df['date'])
# taking the columns we want and converting them to floats
filtered_df = filtered_df[['crypto_symbol', 'date', 'lending_rate',	'borrowing_rate','utilization_rate','close', 'volume']]
filtered_df[['lending_rate',	'borrowing_rate','utilization_rate','close', 'volume']] = filtered_df[['lending_rate','borrowing_rate','utilization_rate','close', 'volume']].astype(float)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['date'] = pd.to_datetime(filtered_df['date'])


In [None]:

# Initialize empty lists to store aggregated inputs and targets
all_inputs = []
all_targets = []

# looping through symbols
symbols = filtered_df['crypto_symbol'].unique()
for s in symbols:
     sim_df = filtered_df[filtered_df['crypto_symbol'] == s]
     sim_df.reset_index(inplace=True, drop=True)
     print(s, len(sim_df))
     # First Loop: Calculate intervals for each column without modifying the DataFrame
     intervals = {}
     for column in ['lending_rate', 'borrowing_rate', 'utilization_rate']:
          lower, upper = calculate_iqr_bounds(sim_df[column])
          intervals[column] = {'lower_bound': lower, 'upper_bound': upper}

     # Second Loop: Filter rows based on the pre-calculated intervals
     reduced_df = sim_df.copy()
     for column in ['lending_rate', 'borrowing_rate', 'utilization_rate']:
          lower_bound = intervals[column]['lower_bound']
          upper_bound = intervals[column]['upper_bound']
          # Apply filtering based on pre-calculated bounds
          reduced_df = reduced_df[(reduced_df[column] > lower_bound) & (reduced_df[column] < upper_bound)]

     reduced_df['returns'] = calculate_hourly_returns(reduced_df, 'date', 'close')

     # MinMax scaling
     scaler = MinMaxScaler(feature_range=(0, 1))  # Default range is (0, 1)

     scaled_df = reduced_df.copy()
     scaled_df['lending_rate'] = scaler.fit_transform(reduced_df[['lending_rate']])
     scaled_df['borrowing_rate'] = scaler.fit_transform(reduced_df[['borrowing_rate']])
     scaled_df['utilization_rate'] = scaler.fit_transform(reduced_df[['utilization_rate']])
     scaled_df['close'] = scaler.fit_transform(reduced_df[['close']])
     scaled_df['volume'] = scaler.fit_transform(reduced_df[['volume']])
     scaled_df['returns'] = scaler.fit_transform(reduced_df[['returns']])

     inputs, targets = extract_valid_windows(
          scaled_df,
          'date', 
          40, 10, 
          ['lending_rate',	'borrowing_rate','utilization_rate','returns', 'volume'], 
          ['lending_rate','borrowing_rate']
          )
     
     # Append results from the current DataFrame
     all_inputs.append(inputs)
     all_targets.append(targets)
     

# Concatenate all inputs and targets into single arrays
all_inputs = np.concatenate(all_inputs, axis=0) if all_inputs else np.array([])
all_targets = np.concatenate(all_targets, axis=0) if all_targets else np.array([])
     
          

BATUSDT 28159
LINKUSDT 31934


In [11]:
print(all_inputs.shape)
print(all_targets.shape)

(23556, 40, 5)
(23556, 10, 2)
