## Cross-Validation Naive Baseline

In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Define percentage-based Theil's U (consistent with your modeling)
def theil_u(y_true, y_pred):
    """Return a percentage-change-based Theil's U statistic for a forecast.

    Both series are turned into one-step relative changes measured against
    the previous *actual* value:

        actual change    = (y_true[t] - y_true[t-1]) / y_true[t-1]
        predicted change = (y_pred[t] - y_true[t-1]) / y_true[t-1]

    The statistic is RMS(actual - predicted) / (RMS(actual) + RMS(predicted)).

    Parameters
    ----------
    y_true : array-like
        Observed values; at least two observations are required.
    y_pred : array-like
        Forecasts aligned element-wise with ``y_true`` (same shape).
        ``y_pred[0]`` is not used because no previous actual value exists
        for it.

    Returns
    -------
    float
        The statistic. ``nan`` is returned when the denominator is zero
        (both change series are identically zero). Zeros in ``y_true[:-1]``
        still go through numpy's division semantics and produce inf/nan.

    Raises
    ------
    ValueError
        If the inputs differ in shape or contain fewer than two observations.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this check numpy may silently broadcast inputs of different
    # lengths against each other and yield a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.ndim == 0 or y_true.shape[0] < 2:
        raise ValueError("theil_u needs at least two observations")

    previous_actual = y_true[:-1]
    y_true_pct = (y_true[1:] - previous_actual) / previous_actual
    y_pred_pct = (y_pred[1:] - previous_actual) / previous_actual

    numerator = np.sqrt(np.mean((y_true_pct - y_pred_pct) ** 2))
    denominator = np.sqrt(np.mean(y_true_pct ** 2)) + np.sqrt(np.mean(y_pred_pct ** 2))

    # A zero denominator means both change series are all zeros, so the ratio
    # is 0/0; return NaN explicitly instead of relying on a runtime warning.
    if denominator == 0:
        return float("nan")

    return numerator / denominator

# Load the training data and order it chronologically so the positional
# slices below correspond to contiguous, time-ordered spans.
df = pd.read_csv('../../Data/Data_larger dataset/train_set.csv')
df['block_time'] = pd.to_datetime(df['block_time'])
df = df.sort_values('block_time')

# Cross-validation setup: expanding training window, fixed-size test folds
n_total = len(df)
fold_size = 144            # rows per test fold; also the averaging window
initial_train_size = 9665  # rows in the first training window
n_folds = 5

# Fail early with a clear message if the data cannot supply every fold;
# otherwise a late fold would be truncated and the metrics would error out
# on mismatched lengths.
required_rows = initial_train_size + n_folds * fold_size
if n_total < required_rows:
    raise ValueError(
        f"Need at least {required_rows} rows for {n_folds} folds, got {n_total}"
    )

mae_list = []
rmse_list = []
theil_u_list = []

for i in range(n_folds):
    # The training window grows by one fold per iteration; the test fold is
    # the block of rows immediately after it.
    train_end = initial_train_size + i * fold_size
    test_start = train_end
    test_end = test_start + fold_size

    train_data = df.iloc[:train_end]
    test_data = df.iloc[test_start:test_end]

    # Naive prediction: mean of last 144 values in training
    avg_fee = train_data['block_median_fee_rate'].iloc[-fold_size:].mean()
    actual = test_data['block_median_fee_rate'].values
    naive_pred = [avg_fee] * len(actual)

    # Evaluation. RMSE is taken as sqrt(MSE) because the `squared=False`
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    mae = mean_absolute_error(actual, naive_pred)
    rmse = np.sqrt(mean_squared_error(actual, naive_pred))
    u = theil_u(actual, naive_pred)

    # Store results
    mae_list.append(mae)
    rmse_list.append(rmse)
    theil_u_list.append(u)

# Report mean ± standard deviation of each metric across the folds
print("Naive Baseline (CV avg):")
print(f"MAE = {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE = {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"Theil's U = {np.mean(theil_u_list):.4f} ± {np.std(theil_u_list):.4f}")



Naive Baseline (CV avg):
MAE = 2.7975 ± 1.3002
RMSE = 3.2924 ± 1.4758
Theil's U = 0.5582 ± 0.0741




## Using the Entire Training Set to Predict the Next 144 Blocks and Compare with Actual Values

In [4]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def theil_u(y_true, y_pred):
    """Return a percentage-change-based Theil's U statistic for a forecast.

    Both series are turned into one-step relative changes measured against
    the previous *actual* value:

        actual change    = (y_true[t] - y_true[t-1]) / y_true[t-1]
        predicted change = (y_pred[t] - y_true[t-1]) / y_true[t-1]

    The statistic is RMS(actual - predicted) / (RMS(actual) + RMS(predicted)).

    Parameters
    ----------
    y_true : array-like
        Observed values; at least two observations are required.
    y_pred : array-like
        Forecasts aligned element-wise with ``y_true`` (same shape).
        ``y_pred[0]`` is not used because no previous actual value exists
        for it.

    Returns
    -------
    float
        The statistic. ``nan`` is returned when the denominator is zero
        (both change series are identically zero). Zeros in ``y_true[:-1]``
        still go through numpy's division semantics and produce inf/nan.

    Raises
    ------
    ValueError
        If the inputs differ in shape or contain fewer than two observations.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this check numpy may silently broadcast inputs of different
    # lengths against each other and yield a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.ndim == 0 or y_true.shape[0] < 2:
        raise ValueError("theil_u needs at least two observations")

    previous_actual = y_true[:-1]
    y_true_pct = (y_true[1:] - previous_actual) / previous_actual
    y_pred_pct = (y_pred[1:] - previous_actual) / previous_actual

    numerator = np.sqrt(np.mean((y_true_pct - y_pred_pct) ** 2))
    denominator = np.sqrt(np.mean(y_true_pct ** 2)) + np.sqrt(np.mean(y_pred_pct ** 2))

    # A zero denominator means both change series are all zeros, so the ratio
    # is 0/0; return NaN explicitly instead of relying on a runtime warning.
    if denominator == 0:
        return float("nan")

    return numerator / denominator

# Load the training and test sets
train_df = pd.read_csv('../../Data/Data_larger dataset/train_set.csv')
test_df = pd.read_csv('../../Data/Data_larger dataset/test_set.csv')

# Parse timestamps and order both frames chronologically so that the tail of
# the training frame holds its most recent rows.
train_df['block_time'] = pd.to_datetime(train_df['block_time'])
test_df['block_time'] = pd.to_datetime(test_df['block_time'])

train_df = train_df.sort_values('block_time')
test_df = test_df.sort_values('block_time')

# Naive prediction: average of last 144 fee rates in training set,
# repeated once for every row of the test set.
naive_window = 144
naive_avg = train_df['block_median_fee_rate'].iloc[-naive_window:].mean()
naive_pred = [naive_avg] * len(test_df)

# Actual values from test set
actual = test_df['block_median_fee_rate'].values

# Evaluation. RMSE is taken as sqrt(MSE) because the `squared=False`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
mae = mean_absolute_error(actual, naive_pred)
rmse = np.sqrt(mean_squared_error(actual, naive_pred))
theils_u_score = theil_u(actual, naive_pred)

print(f"Naive Baseline on Test Set:\nMAE = {mae:.4f}, RMSE = {rmse:.4f}, Theil's U = {theils_u_score:.4f}")


Naive Baseline on Test Set:
MAE = 1.4479, RMSE = 1.7789, Theil's U = 0.5230


