In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

import pandas as pd


Define Evaluation Metrics

Since  predicting a continuous variable (block_median_fee_rate), regression metrics like:

Mean Absolute Error (MAE)

Root Mean Squared Error (RMSE)

R² Score

In [2]:
!pip install xgboost statsmodels tensorflow




In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense



In [4]:
df = pd.read_csv('bitcoin_data_cleaned.csv')
df = df.set_index('block_time')
df = df.drop(columns=['timestamp'])

# Define features (X) and target (y)
X = df.drop(columns=['block_median_fee_rate'])
y = df['block_median_fee_rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [9]:
# Store performance metrics for each model
results = {}

# 1. Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
results['Random Forest'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R²': r2_score(y_test, rf_y_pred)
}



In [10]:
results

{'Random Forest': {'MAE': 5.991443900256219e-08,
  'RMSE': 1.3488465262316422e-07,
  'R²': 0.9517162245256123}}

In [11]:
feature_importances = rf_model.feature_importances_
feature_names = X_train.columns

# Sort feature importance values
sorted_idx = np.argsort(feature_importances)[::-1]

for idx in sorted_idx:
    print(f"{feature_names[idx]}: {feature_importances[idx]}")


block_height: 0.5113282096245942
hash_rate: 0.24036485127137847
difficulty: 0.06863439553879135
block_interval: 0.043315673031735724
tx_count: 0.028883753372984874
transaction_count: 0.025152585656664197
mempool_size_mb: 0.024923505941739393
total_fee: 0.018950333903754227
max_fee_rate: 0.015773619218924513
mempool_usage: 0.011386717708480727
block_weight: 0.005651126148097612
block_version: 0.005374000626670254
mempool_min_fee: 0.00025440063400363285
fee_rate_std: 6.8273221807573654e-06
fee_rate_90th: 0.0
fee_rate_10th: 0.0
median_fee_rate: 0.0
avg_fee_rate: 0.0


Key Observations:

Most Important Features:

block_height: This is the most important feature with a significant weight of 0.511.
hash_rate: This is the second most important feature, contributing around 0.240 to the model.
difficulty and block_interval: These also have moderate importance, suggesting that they contribute to the predictive power of the model.

Less Important Features:

Features like fee_rate_std, fee_rate_90th, fee_rate_10th, median_fee_rate, and avg_fee_rate have extremely low or even zero importance. This indicates that the model does not find these features useful for prediction.

Mempool Features:

mempool_size_mb and mempool_usage have some importance but are not very strong contributors compared to features like block_height and hash_rate.

Suggested Action:

Based on these observations:

1. Remove Features with Very Low Importance:
 remove features like fee_rate_90th, fee_rate_10th, median_fee_rate, and avg_fee_rate, as their importance values are 0.0.

Consider removing mempool_min_fee and fee_rate_std as they contribute very little to the model's performance.

2. Retain Key Features:

Keep the highly important features (block_height, hash_rate, difficulty, block_interval) and other moderately important ones like tx_count and mempool_size_mb.

3. Rebuild the Model:

Retrain the Random Forest model without the low-importance features and observe if the performance metrics (MAE, RMSE, R²) improve or stay the same.

In [12]:
# Define a subset of important features based on the feature importance results
important_features = ['block_height', 'hash_rate', 'difficulty', 'block_interval',
                      'tx_count', 'transaction_count', 'mempool_size_mb', 'total_fee', 'max_fee_rate']

# Use the important features for training the Random Forest
X_train_imp = X_train[important_features]
X_test_imp = X_test[important_features]

# Train the model with only important features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_imp, y_train)
rf_y_pred = rf_model.predict(X_test_imp)

# Store the performance results for the Random Forest model
results['Random Forest (Important Features)'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R²': r2_score(y_test, rf_y_pred)
}


In [13]:
results

{'Random Forest': {'MAE': 5.991443900256219e-08,
  'RMSE': 1.3488465262316422e-07,
  'R²': 0.9517162245256123},
 'Random Forest (Important Features)': {'MAE': 5.6041769383172356e-08,
  'RMSE': 1.2974173105356173e-07,
  'R²': 0.9553279870653149}}

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor


# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']  # Removed 'auto'
}

# Initialize RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Initialize RandomizedSearchCV with error_score='raise' to catch any errors during search
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid,
                               n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1, error_score='raise')

# Fit the random search model
rf_random.fit(X_train_imp, y_train)

# Get the best parameters
best_params = rf_random.best_params_
print("Best parameters found: ", best_params)

# Evaluate the best model on the test set
best_rf_model = rf_random.best_estimator_
rf_y_pred_best = best_rf_model.predict(X_test_imp)

# Store the performance results
results = {}
results['Random Forest (Tuned)'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred_best),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred_best)),
    'R²': r2_score(y_test, rf_y_pred_best)
}

# Print the results
print("Performance after tuning:")
for metric, value in results['Random Forest (Tuned)'].items():
    print(f"{metric}: {value:.6f}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   3.1s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  12.8s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   8.6s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  12.1s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=  33.0s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   3.0s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimato



[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   3.6s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   5.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  36.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  12.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   3.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.2s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  11.3s
[CV] END max_depth=50, max_features

In [16]:
print("Scale of target variable (y_test):")
print(y_test.describe())


Scale of target variable (y_test):
count    9.584000e+03
mean     4.556317e-07
std      6.138816e-07
min      3.013940e-08
25%      1.280832e-07
50%      2.400000e-07
75%      4.775455e-07
max      3.298996e-06
Name: block_median_fee_rate, dtype: float64


Our MAE and RMSE are close to zero, likely due to the small scale of the target variable, block_median_fee_rate. 
 low error values are expected and appropriate for this scale.

However, the block median fee rate is typically expressed in satoshis per virtual byte (sat/vB), with typical values around 2-3 sat/vB, as seen in the the website https://mempool.space/.

To address this, I’ve updated the Python script (DO_real_time.py) to convert the fee rate from BTC/vB to sat/vB. We should ensure all code reflects this change and handles the new units consistently.



In [6]:
# 2. XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)
results['XGBoost'] = {
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R²': r2_score(y_test, xgb_y_pred)
}



In [7]:
# 3. LSTM Model
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(units=50))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Reshape data for LSTM
X_train_lstm = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))

lstm_model = create_lstm_model((X_train_lstm.shape[1], 1))
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=2)
lstm_y_pred = lstm_model.predict(X_test_lstm)
results['LSTM'] = {
    'MAE': mean_absolute_error(y_test, lstm_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lstm_y_pred)),
    'R²': r2_score(y_test, lstm_y_pred)
}



Epoch 1/10


  super().__init__(**kwargs)


1198/1198 - 8s - 6ms/step - loss: 7.7420e-05
Epoch 2/10
1198/1198 - 6s - 5ms/step - loss: 5.5063e-06
Epoch 3/10
1198/1198 - 6s - 5ms/step - loss: 4.8869e-06
Epoch 4/10
1198/1198 - 6s - 5ms/step - loss: 5.4228e-06
Epoch 5/10
1198/1198 - 6s - 5ms/step - loss: 6.0531e-06
Epoch 6/10
1198/1198 - 6s - 5ms/step - loss: 1.7262e-06
Epoch 7/10
1198/1198 - 6s - 5ms/step - loss: 2.2949e-06
Epoch 8/10
1198/1198 - 6s - 5ms/step - loss: 2.1165e-06
Epoch 9/10
1198/1198 - 6s - 5ms/step - loss: 1.5940e-06
Epoch 10/10
1198/1198 - 6s - 5ms/step - loss: 1.4012e-06
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [8]:
# 4. Compare results
print("Model Comparison Results:")
for model, metrics in results.items():
    print(f"{model}: MAE = {metrics['MAE']}, RMSE = {metrics['RMSE']}, R² = {metrics['R²']}")


Model Comparison Results:
Random Forest: MAE = 5.991443900256219e-08, RMSE = 1.3488465262316422e-07, R² = 0.9517162245256123
XGBoost: MAE = 3.881243214433048e-07, RMSE = 6.139094598098831e-07, R² = -0.00019527177549383268
LSTM: MAE = 0.000373715335135604, RMSE = 0.0003785754692153475, R² = -380346.9542639499
