# Investigating Missing Values & Data IMputation
-------------------

> <i>Description: The first part of the notebook consists of visualisation and print outs for missing values, represented through different granularites. The last part is data imputation using regression and aave plattform logic. This was not in the end needed, as we used a valid windows approach versus a panel data with masked values approach. </i>

In [1]:
import os
import pandas as pd
import numpy as np

file_path = os.getcwd()

rates_path = f"{file_path}\\rates_data_v4.csv"  # based on old csv

rates_df = pd.read_csv(rates_path)

In [3]:
# convert to datatetime
rates_df["date"] = pd.to_datetime(rates_df["Timestamp"])
rates_df.drop(columns=["Timestamp", "Unnamed: 0"], inplace=True)

Checking NaN Counts:

In [7]:
nan_counts = rates_df.isna().sum()
print("NaN counts in each column:")
print(nan_counts)

NaN counts in each column:
Symbol                    0
liquidityRate_avg         0
variableBorrowRate_avg    0
utilizationRate_avg       0
stableBorrowRate_avg      0
date                      0
dtype: int64


In [8]:
# checking total zero counts

zero_counts = (rates_df == 0).sum()
print("Zero counts in each column:")
print(zero_counts)

print(rates_df['liquidityRate_avg'].value_counts()[0])
print(rates_df["Symbol"].nunique())

Zero counts in each column:
Symbol                         0
liquidityRate_avg         364822
variableBorrowRate_avg    370888
utilizationRate_avg         1080
stableBorrowRate_avg      551248
date                           0
dtype: int64
364822
68


In [9]:
# Function for counting consecutive zeroes

def max_consecutive_zeros(df):
    max_zeros = {}
    
    for col in df.columns:
        # Create a mask of where values are zero
        mask = df[col] == 0
        
        # Use the mask to find groups of consecutive zeros
        groups = mask.astype(int).groupby((mask != mask.shift()).cumsum()).sum()
        
        # Get the maximum length of consecutive zeros
        max_zeros[col] = groups.max() if not groups.empty else 0
        
    return max_zeros

max_zeros_counts = max_consecutive_zeros(rates_df)
print("Maximum consecutive zeros in each column:")
print(max_zeros_counts)

Maximum consecutive zeros in each column:
{'Symbol': 0, 'liquidityRate_avg': 14428, 'variableBorrowRate_avg': 34340, 'utilizationRate_avg': 243, 'stableBorrowRate_avg': 109874, 'date': 0}


### Overview of zeroes per Coin per Column 

In [10]:
import pandas as pd

# Sample DataFrame
# rates_df = pd.DataFrame({
#     'symbol': ['A', 'B', 'A', 'B', 'A', 'C', 'C', 'C', 'A', 'A'],
#     'lending_rate': [0, 0.05, 0, 0, 0.02, 0, 0, 0, 0, 0],
#     'borrowing_rate': [0, 0, 0, 0.03, 0, 0, 0, 0, 0, 0.01]
# })

def count_consecutive_zeros(series):
    """Count the maximum number of consecutive zeros in a Series."""
    max_count = 0
    current_count = 0

    for value in series:
        if value == 0:
            current_count += 1
            max_count = max(max_count, current_count)
        else:
            current_count = 0

    return max_count

# Initialize an empty list to store results
results = []

# Group by 'symbol'
grouped = rates_df.groupby('Symbol')

# Iterate over each numeric column
for column in rates_df.select_dtypes(include='number').columns:
    # Calculate zeros and max consecutive zeros for each symbol and column
    summary = grouped[column].agg(
        zero_count=lambda x: (x == 0).sum(),  # Count of zeros
        max_consecutive_zeros=count_consecutive_zeros,  # Maximum consecutive zeros
        count_ = lambda x: (x.notnull()).sum()
    ).reset_index()
    
    # Add column name to the results
    summary['column'] = column
    results.append(summary)

# Concatenate all results into a single DataFrame
final_result = pd.concat(results, ignore_index=True)

# Reorder columns for clarity
final_result = final_result[['Symbol', 'column', 'zero_count', 'max_consecutive_zeros', 'count_']]

final_result_sorted = final_result.sort_values(by=['max_consecutive_zeros', 'zero_count'], ascending=False)

from IPython.display import HTML

# Display DataFrame as scrollable output
display(HTML(final_result_sorted.to_html(max_rows=300, max_cols=5)))


Unnamed: 0,Symbol,column,zero_count,max_consecutive_zeros,count_
70,AAAVE,variableBorrowRate_avg,34319,34319,34319
77,AAVE,variableBorrowRate_avg,34319,34319,34319
206,AAAVE,stableBorrowRate_avg,34319,34319,34319
213,AAVE,stableBorrowRate_avg,34319,34319,34319
236,ASUSD,stableBorrowRate_avg,34257,34257,34257
263,SUSD,stableBorrowRate_avg,34257,34257,34257
234,ASNX,stableBorrowRate_avg,34203,34203,34203
262,SNX,stableBorrowRate_avg,34203,34203,34203
245,AXSUSHI,stableBorrowRate_avg,32309,32309,32309
233,ARENFIL,stableBorrowRate_avg,29021,29021,29021


### Data Imputation using AAVE Plattform Algorithm Logic

Variable Borrow Rate={ 
Base Rate+ U*Multiplier + (if U > threshold for U High multiplier * U)
​
if Utilization<Kink
if Utilization≥Kink}
​
liquidation rate = variable borrow rate 
 
variable_borrow_rate/liqudidation_rate = alpha + ß * U  + (Max(U-ß1, 0)/(U-ß1))* ß2 * U

liqudidation_rate = alpha + ß * variable_borrow_rate

variable_borrow_rate = alpha + ß * liqudidation_rate

1. find missing values
2. create lists of consecutive missing entries
3. look left and right from missing entries, try to get length in each direction equivalent to length of missing entries or nearest amount and concatenate these together
4. drop sparse entries
5. for missing variable borrow rate: calculate parameters for variable borrow rate from U, 
calculate parameters for variable borrow rate from liduidation rate
for missing variable liquidation rate: calculate paramaters for liqudidation rate from U, calculate paramters for variable borrow rate
6. check for Error, accept imputation according to threshold
7. predict in for missing entries if either rate exists first (less entropy) and if not exists use U for prediction, if neither exists

In [None]:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')


class MissingDataHandler:
    def __init__(self, df, variable_borrow_rate_column='variable_borrow_rate',
                 liquidation_rate_column='liquidation_rate', utilization_rate_column='utilization_rate'):
        self.df = df
        self.variable_borrow_rate_column = variable_borrow_rate_column
        self.liquidation_rate_column = liquidation_rate_column
        self.utilization_rate_column = utilization_rate_column
        self.missing_indices = {self.variable_borrow_rate_column: [], self.liquidation_rate_column: []}
        self.training_data = {self.variable_borrow_rate_column: {
        'utilization_regression_store': [],
        'liquidation_regression_store': []},self.liquidation_rate_column: {
        'utilization_regression_store': [],
        'liquidation_regression_store': []}}
        self.params = {self.variable_borrow_rate_column: {'from_utilization': [], 'from_liquidation_rate':[]},
                       self.liquidation_rate_column: {'from_utilization': [], 'from_variable_borrow_rate':[]}}
        self.r_squared = {self.variable_borrow_rate_column: {'from_utilization': [], 'from_liquidation_rate':[]},
                          self.liquidation_rate_column: {'from_utilization': [], 'from_variable_borrow_rate':[]}}

        # Initialize the missing data identification and training data preparation
        self._identify_missing_windows()
        self._prepare_training_data()

    def _identify_missing_windows(self):
            df = self.df
        # For each column we want to check, find missing windows for each symbol
            for column in self.missing_indices.keys():
                missing_indices = self._get_consecutive_missing(df, column)
                self.missing_indices[column] = missing_indices

    def _get_consecutive_missing(self, df, column):
        # Find start and end indices of consecutive missing values in the column
        missing_indices = []
        start = None
        is_missing = df[column] == 0
        
        for i, missing in enumerate(is_missing):
            if missing and start is None:
                start = i
            elif not missing and start is not None:
                missing_indices.append((start, i - 1))
                start = None
        if start is not None:
            missing_indices.append((start, len(is_missing) - 1))
        
        return missing_indices

    def _prepare_training_data(self):
            
            for column in self.missing_indices:
                missing_windows = self.missing_indices[column]

                df = self.df

                for start, end in missing_windows:
                    window_length = end - start + 1
                    if window_length < 24:
                        window_length = max(24, window_length * 2)

                    left_borrow = df[self.variable_borrow_rate_column].iloc[max(0, start - window_length):start].reset_index(drop=True)
                    right_borrow = df[self.variable_borrow_rate_column].iloc[end + 1:min(len(self.df), end + 1 + window_length)]

                    left_util = df[self.utilization_rate_column].iloc[max(0, start - window_length):start].reset_index(drop=True)
                    right_util = df[self.utilization_rate_column].iloc[end + 1:min(len(self.df), end + 1 + window_length)]

                    left_liq = df[self.liquidation_rate_column].iloc[max(0, start - window_length):start].reset_index(drop=True)
                    right_liq = df[self.liquidation_rate_column].iloc[end + 1:min(len(self.df), end + 1 + window_length)]

                    borrow = pd.concat([left_borrow, right_borrow]).replace(0, np.nan)
                    util = pd.concat([left_util, right_util]).replace(0, np.nan)
                    liq = pd.concat([left_liq, right_liq]).replace(0, np.nan)

            
                    valid_borrow = ~np.isnan(borrow)
                    valid_util = ~np.isnan(util)
                    valid_liq = ~np.isnan(liq)
                
                    # match borrow wih util
                    common_valid_indices = np.logical_and(valid_util, valid_borrow)  # Valid indices for both util and borrow
                    matched_util = util[common_valid_indices]
                    matched_borrow = borrow[common_valid_indices]

                    # For liquidation rate and borrow rate regression
                    common_valid_indices_liq = np.logical_and(valid_liq, valid_borrow)  # Valid indices for both liq and borrow
                    matched_liq = liq[common_valid_indices_liq]
                    matched_borrow_liq = borrow[common_valid_indices_liq]

                    # # match liq wih util
                    common_valid_indices_liq_util = np.logical_and(valid_liq, valid_util)
                    matched_liq_util = liq[common_valid_indices_liq_util]
                    matched_util_liq = util[common_valid_indices_liq_util]

                    if column == self.variable_borrow_rate_column:
                        if matched_util.size > 3 and matched_borrow.size > 3:
                            self.training_data[column]['utilization_regression_store'].append({
                            'utilization_rate': matched_util,
                            'variable_borrow_rate': matched_borrow})
                        else:
                            self.training_data[column]['utilization_regression_store'].append({
                            'utilization_rate': [],  
                            'variable_borrow_rate': []})
                    else:
                        if matched_liq_util.size > 3 and matched_util_liq.size > 1:
                            self.training_data[column]['utilization_regression_store'].append({
                            'utilization_rate': matched_util_liq,
                            'liquidation_rate': matched_liq_util})
                        else:
                            self.training_data[column]['utilization_regression_store'].append({
                            'utilization_rate': [],  
                            'liquidation_rate': [] })
                
                    if matched_liq.size > 1 and matched_borrow_liq.size > 1:
                        self.training_data[column]['liquidation_regression_store'].append({
                        'liquidation_rate': matched_liq,
                        'variable_borrow_rate': matched_borrow_liq})
                    else:
                        self.training_data[column]['liquidation_regression_store'].append({
                        'liquidation_rate': [], 
                        'variable_borrow_rate': []})
        
    def utilization_formula(self, U, alpha, beta, beta1, beta2):
        return alpha + beta * U + (np.maximum(U - beta1, 0) / (np.maximum(U - beta1, 1e-10))) * beta2 * U

    def fit_utilization_to_borrow_rate(self):
        for training_data in self.training_data[self.variable_borrow_rate_column]['utilization_regression_store']:
            U = training_data['utilization_rate']
            y = training_data['variable_borrow_rate']

            if len(U) == 0 or len(y) == 0:
                params = np.empty((0,))
                r2 = np.empty((0,))
                self.params[self.variable_borrow_rate_column]['from_utilization'].append(params)
                self.r_squared[self.variable_borrow_rate_column]['from_utilization'].append(r2)
            else:
                params, _ = curve_fit(self.utilization_formula, U, y, maxfev=10000)
                self.params[self.variable_borrow_rate_column]['from_utilization'].append(params)
            
                y_pred = self.utilization_formula(U, *params)
                ss_res = np.sum((y - y_pred) ** 2)
                ss_tot = np.sum((y - np.mean(y)) ** 2)
                r2 = 1 - (ss_res / ss_tot)
                self.r_squared[self.variable_borrow_rate_column]['from_utilization'].append(r2)
        
        for training_data in self.training_data[self.variable_borrow_rate_column]['liquidation_regression_store']:
            L = training_data['liquidation_rate']
            y = training_data['variable_borrow_rate']
            
            if len(L) == 0 or len(y) == 0:
                lin_params = np.empty((0,))
                r2 = np.empty((0,))
                self.r_squared[self.variable_borrow_rate_column][f'from_liquidation_rate'].append(r2)
                self.params[self.variable_borrow_rate_column][f'from_liquidation_rate'].append(lin_params)
            else:
                L = training_data['liquidation_rate'].values.reshape(-1,1)
                lin_reg = LinearRegression().fit(L,y)
                r2 = lin_reg.score(L, y)
                lin_params = lin_reg.coef_
                self.r_squared[self.variable_borrow_rate_column][f'from_liquidation_rate'].append(r2)
                self.params[self.variable_borrow_rate_column][f'from_liquidation_rate'].append(lin_params)
     
    def fit_utilization_to_liquidation_rate(self):
        for training_data in self.training_data[self.liquidation_rate_column]['utilization_regression_store']:
            U = training_data['utilization_rate']
            y = training_data['liquidation_rate']

            if len(U) == 0 or len(y) == 0:
                params = np.empty((0,))
                r2 = np.empty((0,))
                self.params[self.liquidation_rate_column]['from_utilization'].append(params)
                self.r_squared[self.liquidation_rate_column]['from_utilization'].append(r2)
            else:
                params, _ = curve_fit(self.utilization_formula, U, y, maxfev=10000)
                self.params[self.liquidation_rate_column]['from_utilization'].append(params)
                
                y_pred = self.utilization_formula(U, *params)
                ss_res = np.sum((y - y_pred) ** 2)
                ss_tot = np.sum((y - np.mean(y)) ** 2)
                r2 = 1 - (ss_res / ss_tot)
                self.r_squared[self.liquidation_rate_column]['from_utilization'].append(r2)
        
        for training_data in self.training_data[self.liquidation_rate_column]['liquidation_regression_store']:
            V = training_data['variable_borrow_rate']
            y = training_data['liquidation_rate']
            
            if len(V) == 0 or len(y) == 0:
                lin_params = np.empty((0,))
                r2 = np.empty((0,))
                self.r_squared[self.liquidation_rate_column][f'from_variable_borrow_rate'].append(r2)
                self.params[self.liquidation_rate_column][f'from_variable_borrow_rate'].append(lin_params)
            else:
                V = training_data['variable_borrow_rate'].values.reshape(-1,1)
                lin_reg = LinearRegression().fit(V,y)
                r2 = lin_reg.score(V, y)
                lin_params = lin_reg.coef_
                self.r_squared[self.liquidation_rate_column][f'from_variable_borrow_rate'].append(r2)
                self.params[self.liquidation_rate_column][f'from_variable_borrow_rate'].append(lin_params)
    
    def impute_data(self):

        def utilization_formula(U, alpha, beta, beta1, beta2):
            return alpha + beta * U + (np.maximum(U - beta1, 0) / (np.maximum(U - beta1, 1e-10))) * beta2 * U

        # missing indices
        variable_borrow_rate_missing = self.missing_indices[self.variable_borrow_rate_column]
        liquidation_rate_missing = self.missing_indices[self.liquidation_rate_column]
        
        # regression scores
        variable_borrow_rate_r2_U = self.r_squared[self.variable_borrow_rate_column]['from_utilization']
        variable_borrow_rate_r2_L = self.r_squared[self.variable_borrow_rate_column]['from_liquidation_rate']
        liquidation_rate_r2_U = self.r_squared[self.liquidation_rate_column]['from_utilization']
        liquidation_rate_r2_V = self.r_squared[self.liquidation_rate_column]['from_variable_borrow_rate']

        # parameters
        variable_borrow_rate_params_U = self.params[self.variable_borrow_rate_column]['from_utilization']
        variable_borrow_rate_params_L = self.params[self.variable_borrow_rate_column]['from_liquidation_rate']
        liquidation_rate_params_U = self.params[self.liquidation_rate_column]['from_utilization']
        liquidation_rate_params_V = self.params[self.liquidation_rate_column]['from_variable_borrow_rate']

        df = self.df

        for idx, (start, end) in enumerate(variable_borrow_rate_missing):
            r2_U = variable_borrow_rate_r2_U[idx]
            r2_L = variable_borrow_rate_r2_L[idx]
            U_params = variable_borrow_rate_params_U[idx]
            L_params = variable_borrow_rate_params_L[idx]

            if U_params.size == 0 or L_params.size == 0:
                continue
            
            data = self.df.iloc[start:end+1]
            imputed_mask = data[self.variable_borrow_rate_column] != 0

            if r2_L >= 0.9:
                valid_liq = ~imputed_mask & (data[self.liquidation_rate_column] != 0)
                self.df.loc[data[valid_liq].index, self.variable_borrow_rate_column] = (
                    L_params[0] * data.loc[valid_liq, self.liquidation_rate_column]
                )
                imputed_mask = df[self.variable_borrow_rate_column] != 0

            if r2_U >= 0.9:
                valid_util = ~imputed_mask & (data[self.utilization_rate_column] != 0)
                self.df.loc[data[valid_util].index, self.variable_borrow_rate_column] = utilization_formula(
                    data.loc[valid_util, self.utilization_rate_column], *U_params
                )

        for idx, (start, end) in enumerate(liquidation_rate_missing):
            r2_U = liquidation_rate_r2_U[idx]
            r2_V = liquidation_rate_r2_V[idx]
            U_params = liquidation_rate_params_U[idx]
            V_params = liquidation_rate_params_V[idx]
            
            if U_params.size == 0 or V_params.size == 0:
                continue

            data = self.df.iloc[start:end+1]
            imputed_mask = data[self.liquidation_rate_column] != 0  

            if r2_V >= 0.9:
                valid_var = ~imputed_mask & (data[self.variable_borrow_rate_column] != 0)
                self.df.loc[data[valid_var].index, self.liquidation_rate_column] = (
                    L_params[0] * data.loc[valid_var, self.variable_borrow_rate_column]
                )
                imputed_mask = df[self.liquidation_rate_column] != 0

            if r2_U >= 0.9:
                valid_util = ~imputed_mask & (data[self.utilization_rate_column] != 0)
                self.df.loc[data[valid_util].index, self.liquidation_rate_column] = utilization_formula(
                    data.loc[valid_util, self.utilization_rate_column], *U_params
                )

        return df

Has to be run per coin in a loop:

In [12]:
rates_data = pd.DataFrame(columns=rates_df.columns.to_list())

symbols = rates_df["Symbol"].unique()

for symbol in symbols:
    df = rates_df[rates_df["Symbol"]==symbol]
    print(symbol)
    data_handler = MissingDataHandler(df, "variableBorrowRate_avg", 
                                     "liquidityRate_avg", "utilizationRate_avg")
    data_handler.fit_utilization_to_borrow_rate()
    data_handler.fit_utilization_to_liquidation_rate()
    imputed_data = data_handler.impute_data()

    rates_data = pd.concat([rates_data, imputed_data], ignore_index=True)

print(rates_data['liquidityRate_avg'].value_counts()[0])
print(rates_data['variableBorrowRate_avg'].value_counts()[0])

DAI
TUSD
USDC
USDT
SUSD
BAT
LINK
KNC
MKR
MANA
ZRX
SNX
WBTC
BUSD
ENJ
REN
YFI
AAVE
UNI
AUSDT
AWBTC
AWETH
AYFI
AZRX
AUNI
AAAVE
ABAT
ABUSD
ADAI
AENJ
AKNC
ALINK
AMANA
AMKR
AREN
ASNX
ASUSD
ATUSD
AUSDC
ACRV
AGUSD
ABAL
AXSUSHI
ARENFIL
ARAI
AAMPL
AUSDP
ADPI
AFRAX
AFEI
ASTETH
AENS
AUST
ACVX
A1INCH
ALUSD
AAMMWETH
AAMMDAI
AAMMUSDC
AAMMUSDT
AAMMWBTC
WETH
LUSD
CRV
BAL
ENS
1INCH
FRAX
346218
348114


In [13]:
# storing file
file_path_imputed_data = f"{file_path}\\rates_data_imputed.csv"
rates_data.to_csv(file_path_imputed_data)