In [33]:
import numpy as np
import pandas as pd
import mysql.connector
from mysql.connector import Error
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# MySQL database connection function
def connect_to_database():
    try:
        # Establishing connection to the database
        connection = mysql.connector.connect(
            host='crypto-matter.c5eq66ogk1mf.eu-central-1.rds.amazonaws.com',
            database='Crypto',
            user='Jing',  # Replace with your actual first name
            password='Crypto12!'
        )

        if connection.is_connected():
            db_info = connection.get_server_info()
            print("Connected to MySQL database, MySQL Server version: ", db_info)
            return connection

    except Error as e:
        print("Error while connecting to MySQL", e)
        return None

# Function to query merged data from crypto_lending_borrowing and crypto_price tables
def query_merged_crypto_data(connection):
    query = """
    SELECT clb.lending_rate, clb.borrowing_rate, clb.utilization_rate, clb.stable_borrow_rate,
    cp.*, usb.yield
    FROM crypto_lending_borrowing clb
    JOIN crypto_price cp 
        ON clb.crypto_symbol = cp.crypto_symbol
        AND clb.date = cp.date
    LEFT JOIN US_Bond_Yield usb
        ON clb.date = usb.date
    WHERE UPPER(clb.crypto_symbol) IN ('1INCHUSDT', 'BALUSDT', 'BATUSDT', 'CRVUSDT', 'ENJUSDT', 'ENSUSDT', 'KNCUSDT', 'LINKUSDT', 'MANAUSDT', 'MKRUSDT', 'RENUSDT', 'SNXUSDT', 'UNIUSDT', 'WBTCUSDT', 'YFIUSDT', 'ZRXUSDT')
    """
    cursor = connection.cursor()

    try:
        # Execute the query
        cursor.execute(query)

        # Fetch all results
        results = cursor.fetchall()

        # Get column names from cursor description
        columns = [desc[0] for desc in cursor.description]

        # Convert results to a Pandas DataFrame
        df = pd.DataFrame(results, columns=columns)

        return df

    except Error as e:
        print(f"Error: {e}")
        return None
    finally:
        cursor.close()

# Function to close the database connection
def query_quit(connection):
    if connection.is_connected():
        connection.close()
        print("MySQL connection is closed")

# Define a function to calculate outlier bounds using IQR
def calculate_iqr_bounds(series, multiplier=1.5):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = max(Q1 - multiplier * IQR, 0)
    upper_bound = Q3 + multiplier * IQR
    return lower_bound, upper_bound

# calculate returns on valid windows
def calculate_hourly_returns(df, date_col, close_col):
    """
    Calculates returns based on the close price, only if the date difference is 1 hour.

    Args:
        df (pd.DataFrame): The DataFrame containing the time series data.
        date_col (str): The name of the datetime column.
        close_col (str): The name of the close price column.

    Returns:
        pd.Series: A Series containing the calculated returns or None for invalid rows.
    """
    # Ensure the date column is in datetime format
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Sort by date to ensure sequential order
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Calculate the time difference between consecutive rows in hours
    time_diff = df[date_col].diff().dt.total_seconds() / 3600
    
    # Calculate returns only for rows where time_diff == 1 hour
    returns = np.where(
        time_diff == 1,
        (df[close_col] - df[close_col].shift(1)) / df[close_col].shift(1),
        None
    )
    
    return pd.Series(returns, index=df.index)

# now we have a dataframe that does not have any NA and ay outlier, but its time series is corrupted, therefore we need valid windows
def extract_valid_windows_v3(df, date_col, input_window, target_window, input_columns, target_columns,  train_end_date, valid_end_date):
    """
    Extracts valid windows from a time series DataFrame for LSTM training.
    
    Args:
        df (pd.DataFrame): The time series DataFrame with a datetime column.
        date_col (str): The name of the datetime column.
        input_window (int): The number of timesteps for the input sequence.
        target_window (int): The number of timesteps for the target sequence.
        input_columns (list of str): List of column names to include in the input data.
        target_columns (list of str): List of column names to include in the target data.
        
    Returns:
        inputs (list of np.ndarray): List of valid input sequences.
        targets (list of np.ndarray): two values
    """
    # Sort by the datetime column to ensure the time series is ordered
    df = df.sort_values(by=date_col).reset_index(drop=True)

    train_end_date = pd.to_datetime(train_end_date)
    valid_end_date = pd.to_datetime(valid_end_date)
    
    # Ensure the datetime column is in pandas datetime format
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Identify valid consecutive rows (1-hour apart)
    time_diffs = df[date_col].diff().dt.total_seconds()
    valid_indices = time_diffs == 3600  # 1 hour = 3600 seconds
    
    # Mark valid sequences
    valid_sequence_flags = valid_indices | valid_indices.shift(-1, fill_value=False)
    df = df[valid_sequence_flags].reset_index(drop=True)

    # Prepare inputs and targets
    input_train = []
    input_valid = []
    input_test = []
    target_train = []
    target_valid = []
    target_test = []


    total_window = input_window + target_window

    for i in range(len(df) - total_window + 1):
        # Extract a potential window of size `total_window`
        window = df.iloc[i:i+total_window]
        window_end_date = window[date_col].iloc[-1]
        
        # Check if all rows in the window are 1-hour apart
        if (window[date_col].diff().dt.total_seconds()[1:] == 3600).all():
            # Split into input and target based on specified columns
            input_data = window.iloc[:input_window][input_columns].values
            target_data = window.iloc[input_window:][target_columns].values
            
            # Calculate differences and sign
            differences = target_data[-1, :] - target_data[0, :]
            differences = custom_sign(differences)
            
            # Categorize the window based on its end date
            if window_end_date <= train_end_date:
                input_train.append(input_data)
                target_train.append(differences)
            elif window_end_date <= valid_end_date:
                input_valid.append(input_data)
                target_valid.append(differences)
            else:
                input_test.append(input_data)
                target_test.append(differences)

    # Convert to numpy arrays
    inputs_train, inputs_valid, inputs_test = np.array(input_train), np.array(input_valid), np.array(input_test)
    targets_train, targets_valid, targets_test = np.array(target_train), np.array(target_valid), np.array(target_test)

    return inputs_train, inputs_valid, inputs_test, targets_train, targets_valid, targets_test

# Custom sign function
def custom_sign(x):
    return np.where(x > 0, 1, np.where(x == 0, 0, 2))

In [29]:
def create_llm_embeddings(dataframe, col, n_components=10):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    unique_values = dataframe[col].unique()
    
    # Get embeddings for the unique values
    embeddings = model.encode(unique_values, show_progress_bar=False)

    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Create a DataFrame to hold the reduced embeddings
    reduced_embeddings_df = pd.DataFrame(reduced_embeddings, columns=[f'{col}_embedding_{i+1}' for i in range(n_components)])

    reduced_embeddings_df[col] = unique_values

    dataframe = dataframe.merge(reduced_embeddings_df, on=col, how='left')

    return dataframe

In [None]:
connect = connect_to_database()

merged_df = query_merged_crypto_data(connect)

merged_df_emb = create_llm_embeddings(merged_df, "crypto_symbol", n_components=1)

merged_df_emb.head()

Connected to MySQL database, MySQL Server version:  8.0.39




       lending_rate borrowing_rate utilization_rate stable_borrow_rate  \
66769      0.000091       0.016295        -0.099061           0.038281   
381288     0.000001       0.201231        -0.137197           0.009337   
280447     0.000013       0.373045         0.201463           0.082865   
200594     0.000000       0.201408        -0.117472           0.033946   
414312     0.000000       0.200220         0.762957           0.031154   

       crypto_symbol                date   high    low  close adj_close  \
66769       LINKUSDT 2024-10-04 10:00:00  11.05  10.96  10.99     10.99   
381288       UNIUSDT 2024-10-04 10:00:00   6.71   6.67   6.67      6.67   
280447       ENJUSDT 2024-10-04 10:00:00   0.15   0.15   0.15      0.15   
200594       ZRXUSDT 2024-10-04 10:00:00   0.31   0.31   0.31      0.31   
414312       CRVUSDT 2024-10-04 10:00:00   0.26   0.26   0.26      0.26   

         volume market_cap coin_supply         yield  \
66769     78854       None        None  0.000069

In [41]:
merged_df_emb['date'] = pd.to_datetime(merged_df_emb['date'])

# Sort in descending order
merged_df_emb = merged_df_emb.sort_values(by='date', ascending=False)

# Print the top rows after sorting
print(merged_df_emb['date'].head())

66769    2024-10-04 10:00:00
381288   2024-10-04 10:00:00
280447   2024-10-04 10:00:00
200594   2024-10-04 10:00:00
414312   2024-10-04 10:00:00
Name: date, dtype: datetime64[ns]


In [None]:
symbols = merged_df_emb["crypto_symbol"].unique()
X_train,X_valid,X_test = []
Y_train,Y_valid,Y_test = []
loop = 0
for s in symbols:
     loop = loop + 1
     print(loop)
     sim_df = merged_df_emb[merged_df_emb['crypto_symbol'] == s]
     
     inputs_train, inputs_valid, inputs_test, targets_train, targets_valid, targets_test = extract_valid_windows_v3(sim_df, 'date', 48,48, ["date", "crypto_symbol_embedding_1","lending_rate"], ["date", "crypto_symbol_embedding_1","lending_rate"], train_end_date="2024-01-01 00:00:00", valid_end_date="2024-07-01 00:00:00")
     all_inputs.append(inputs)
     all_targets.append(targets)


X_train  = np.concatenate(X_train, axis=0) if X_train else np.array([])
X_valid = np.concatenate(X_valid, axis=0) if X_valid else np.array([])
X_test = np.concatenate(X_test, axis=0) if X_test else np.array([])

Y_train  = np.concatenate(Y_train, axis=0) if Y_train else np.array([])
Y_valid = np.concatenate(Y_valid, axis=0) if Y_valid else np.array([])
Y_test  = np.concatenate(Y_test, axis=0) if Y_test else np.array([])

     

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


In [35]:
print(all_inputs.shape)
print(all_targets.shape)

(472515, 48, 3)
(472515, 48, 3)


In [36]:
print(all_inputs[200,:,0])


[Timestamp('2020-12-10 22:00:00') Timestamp('2020-12-10 23:00:00')
 Timestamp('2020-12-11 00:00:00') Timestamp('2020-12-11 01:00:00')
 Timestamp('2020-12-11 02:00:00') Timestamp('2020-12-11 03:00:00')
 Timestamp('2020-12-11 04:00:00') Timestamp('2020-12-11 05:00:00')
 Timestamp('2020-12-11 06:00:00') Timestamp('2020-12-11 07:00:00')
 Timestamp('2020-12-11 08:00:00') Timestamp('2020-12-11 09:00:00')
 Timestamp('2020-12-11 10:00:00') Timestamp('2020-12-11 11:00:00')
 Timestamp('2020-12-11 12:00:00') Timestamp('2020-12-11 13:00:00')
 Timestamp('2020-12-11 14:00:00') Timestamp('2020-12-11 15:00:00')
 Timestamp('2020-12-11 16:00:00') Timestamp('2020-12-11 17:00:00')
 Timestamp('2020-12-11 18:00:00') Timestamp('2020-12-11 19:00:00')
 Timestamp('2020-12-11 20:00:00') Timestamp('2020-12-11 21:00:00')
 Timestamp('2020-12-11 22:00:00') Timestamp('2020-12-11 23:00:00')
 Timestamp('2020-12-12 00:00:00') Timestamp('2020-12-12 01:00:00')
 Timestamp('2020-12-12 02:00:00') Timestamp('2020-12-12 03:00:

In [37]:
print(all_targets[200,:,0])

[Timestamp('2020-12-13 22:00:00') Timestamp('2020-12-13 23:00:00')
 Timestamp('2020-12-14 00:00:00') Timestamp('2020-12-14 01:00:00')
 Timestamp('2020-12-14 02:00:00') Timestamp('2020-12-14 03:00:00')
 Timestamp('2020-12-14 04:00:00') Timestamp('2020-12-14 05:00:00')
 Timestamp('2020-12-14 06:00:00') Timestamp('2020-12-14 07:00:00')
 Timestamp('2020-12-14 08:00:00') Timestamp('2020-12-14 09:00:00')
 Timestamp('2020-12-14 10:00:00') Timestamp('2020-12-14 11:00:00')
 Timestamp('2020-12-14 12:00:00') Timestamp('2020-12-14 13:00:00')
 Timestamp('2020-12-14 14:00:00') Timestamp('2020-12-14 15:00:00')
 Timestamp('2020-12-14 16:00:00') Timestamp('2020-12-14 17:00:00')
 Timestamp('2020-12-14 18:00:00') Timestamp('2020-12-14 19:00:00')
 Timestamp('2020-12-14 20:00:00') Timestamp('2020-12-14 21:00:00')
 Timestamp('2020-12-14 22:00:00') Timestamp('2020-12-14 23:00:00')
 Timestamp('2020-12-15 00:00:00') Timestamp('2020-12-15 01:00:00')
 Timestamp('2020-12-15 02:00:00') Timestamp('2020-12-15 03:00: