Define Evaluation Metrics

Since  predicting a continuous variable (block_median_fee_rate), regression metrics like:

Mean Absolute Error (MAE)

Root Mean Squared Error (RMSE)

R² Score

In [38]:
!pip install xgboost statsmodels tensorflow




In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense



Multiply each of these columns by 10^8 to convert BTC/vB to sat/vB  (for BTC/vB, Our MAE and RMSE are close to zero, likely due to the small scale of the target variable, block_median_fee_rate.
 low error values are expected and appropriate for this scale.

However, the block median fee rate is typically expressed in satoshis per virtual byte (sat/vB), with typical values around 2-3 sat/vB, as seen in the the website https://mempool.space/.

To address this, I’ve updated the Python script (DO_real_time.py) to convert the fee rate from BTC/vB to sat/vB. We should ensure all code reflects this change and handles the new units consistently.)

In [141]:
df = pd.read_csv('bitcoin_history1_clean.csv')
df = df.set_index('block_time')


# List of fee rate columns that need to be converted from BTC/vB to sat/vB
fee_rate_columns = ['max_fee_rate', 'avg_fee_rate', 'median_fee_rate', 'fee_rate_10th', 'fee_rate_90th', 'fee_rate_std', 'block_median_fee_rate']


for col in fee_rate_columns:
    df[col] = df[col] * 10**8


print(df[fee_rate_columns].head())


                     max_fee_rate  avg_fee_rate  median_fee_rate  \
block_time                                                         
2023-08-15 13:04:26         200.0      2.546653          2.31746   
2023-08-15 13:07:55         200.0      2.547332          2.31746   
2023-08-15 13:08:04         200.0      2.547714          2.31746   
2023-08-15 13:16:21         200.0      2.548790          2.31746   
2023-08-15 13:26:12         200.0      2.550336          2.31746   

                     fee_rate_10th  fee_rate_90th  fee_rate_std  \
block_time                                                        
2023-08-15 13:04:26       2.206349       2.492063      2.449531   
2023-08-15 13:07:55       2.206349       2.492754      2.449685   
2023-08-15 13:08:04       2.206349       2.500000      2.449535   
2023-08-15 13:16:21       2.206349       2.500000      2.451369   
2023-08-15 13:26:12       2.206349       2.500000      2.456636   

                     block_median_fee_rate  
block_ti

In [142]:

# Define features (X) and target (y)
X = df.drop(columns=['block_median_fee_rate'])
y = df['block_median_fee_rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [143]:
df.isnull().sum()

block_height             0
tx_count                 0
mempool_size_mb          0
max_fee_rate             0
avg_fee_rate             0
median_fee_rate          0
fee_rate_10th            0
fee_rate_90th            0
fee_rate_std             0
difficulty               0
hash_rate                0
total_fee                0
mempool_usage            0
transaction_count        0
block_weight             0
block_version            0
block_interval           0
block_median_fee_rate    0
dtype: int64

In [112]:
# Store performance metrics for each model
results = {}

# 1. Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
results['Random Forest'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R²': r2_score(y_test, rf_y_pred)
}



In [113]:
results

{'Random Forest': {'MAE': 6.755992554455091,
  'RMSE': 13.976045597797938,
  'R²': 0.955314983919364}}

Normalize Features

In [114]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [115]:
# Fit Random Forest with scaled features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict and evaluate
rf_y_pred = rf_model.predict(X_test_scaled)
results['Random Forest (Normalize Features)'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R²': r2_score(y_test, rf_y_pred)
}


In [116]:
results

{'Random Forest': {'MAE': 6.755992554455091,
  'RMSE': 13.976045597797938,
  'R²': 0.955314983919364},
 'Random Forest (Normalize Features)': {'MAE': 6.757978417242709,
  'RMSE': 13.977807581468374,
  'R²': 0.9553037161782597}}

In [117]:
feature_importances = rf_model.feature_importances_
feature_names = X_train.columns

# Sort feature importance values
sorted_idx = np.argsort(feature_importances)[::-1]

for idx in sorted_idx:
    print(f"{feature_names[idx]}: {feature_importances[idx]}")


hash_rate: 0.42599317369576944
block_height: 0.2107037097402561
fee_rate_90th: 0.15170122700978725
block_interval: 0.045959488363430036
tx_count: 0.029306655739644125
fee_rate_10th: 0.024811832195348375
fee_rate_std: 0.02263195733161012
transaction_count: 0.01736424369478485
avg_fee_rate: 0.013302675949254217
mempool_size_mb: 0.0108250943066205
total_fee: 0.010226004737173397
mempool_usage: 0.00847592559289571
max_fee_rate: 0.008399314604768419
median_fee_rate: 0.007401598992715999
block_weight: 0.0055012651639994884
block_version: 0.004497917223795293
difficulty: 0.0028979156581467737


Key Observations:

Most Important Features:

block_height: This is the most important feature with a significant weight of 0.511.
hash_rate: This is the second most important feature, contributing around 0.240 to the model.
difficulty and block_interval: These also have moderate importance, suggesting that they contribute to the predictive power of the model.

Less Important Features:

Features like fee_rate_std, fee_rate_90th, fee_rate_10th, median_fee_rate, and avg_fee_rate have extremely low or even zero importance. This indicates that the model does not find these features useful for prediction.

Mempool Features:

mempool_size_mb and mempool_usage have some importance but are not very strong contributors compared to features like block_height and hash_rate.

Suggested Action:

Based on these observations:

1. Remove Features with Very Low Importance:
 remove features like fee_rate_90th, fee_rate_10th, median_fee_rate, and avg_fee_rate, as their importance values are 0.0.

Consider removing mempool_min_fee and fee_rate_std as they contribute very little to the model's performance.

2. Retain Key Features:

Keep the highly important features (block_height, hash_rate, difficulty, block_interval) and other moderately important ones like tx_count and mempool_size_mb.

3. Rebuild the Model:

Retrain the Random Forest model without the low-importance features and observe if the performance metrics (MAE, RMSE, R²) improve or stay the same.

In [118]:
# Define a subset of important features based on the feature importance results
important_features = ['block_height', 'hash_rate', 'difficulty', 'block_interval',
                      'tx_count', 'transaction_count', 'mempool_size_mb', 'total_fee', 'max_fee_rate']

# # Get the indices of important features from the original DataFrame
# important_feature_indices = [X_train.columns.get_loc(col) for col in important_features]
# 
# # Use the indices to select important features from the scaled arrays
# X_train_imp = X_train_scaled[:, important_feature_indices]
# X_test_imp = X_test_scaled[:, important_feature_indices]


X_train_imp = X_train[important_features]
X_test_imp = X_test[important_features]

# Train the model with only important features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_imp, y_train)
rf_y_pred = rf_model.predict(X_test_imp)

# Store the performance results for the Random Forest model
results['Random Forest (Important Features)'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R²': r2_score(y_test, rf_y_pred)
}


In [119]:
results

{'Random Forest': {'MAE': 6.755992554455091,
  'RMSE': 13.976045597797938,
  'R²': 0.955314983919364},
 'Random Forest (Normalize Features)': {'MAE': 6.757978417242709,
  'RMSE': 13.977807581468374,
  'R²': 0.9553037161782597},
 'Random Forest (Important Features)': {'MAE': 6.64908811049062,
  'RMSE': 13.69954116933218,
  'R²': 0.9570656053048433}}

In [120]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor


# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']  # Removed 'auto'
}

# Initialize RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Initialize RandomizedSearchCV with error_score='raise' to catch any errors during search
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid,
                               n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1, error_score='raise')

# Fit the random search model
rf_random.fit(X_train_imp, y_train)

# Get the best parameters
best_params = rf_random.best_params_
print("Best parameters found: ", best_params)

# Evaluate the best model on the test set
best_rf_model = rf_random.best_estimator_
rf_y_pred_best = best_rf_model.predict(X_test_imp)

# Store the performance results
results = {}
results['Random Forest (Tuned)'] = {
    'MAE': mean_absolute_error(y_test, rf_y_pred_best),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred_best)),
    'R²': r2_score(y_test, rf_y_pred_best)
}




Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   2.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=  16.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=  12.0s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estima



[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   1.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   2.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   3.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   1.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   4.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   5.7s
[CV] END max_depth=10, max_feature

In [121]:
results

{'Random Forest (Tuned)': {'MAE': 6.292347635890187,
  'RMSE': 12.50958145205259,
  'R²': 0.9642003435286666}}

In [122]:
print("Scale of target variable (y_test):")
print(y_test.describe())


Scale of target variable (y_test):
count    5340.000000
mean       56.612392
std        66.121701
min         2.880136
25%        16.000000
50%        31.960500
75%        67.066104
max       348.000000
Name: block_median_fee_rate, dtype: float64


In [123]:
y_test

block_time
2023-10-21 13:38:02    16.000000
2023-09-08 02:29:49    15.000000
2023-09-02 20:07:32    22.056738
2023-10-16 11:51:40    15.068809
2023-09-18 17:05:38    51.631206
                         ...    
2023-11-06 09:24:06    50.042553
2024-01-29 19:36:05    43.000000
2023-10-04 02:31:44     4.028303
2023-12-09 15:51:07    57.000000
2023-09-08 10:00:01    15.926471
Name: block_median_fee_rate, Length: 5340, dtype: float64

In [124]:
# 2. # Use the important features for training the XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train_imp, y_train)
xgb_y_pred = xgb_model.predict(X_test_imp)
results['XGBoost (Important Features)'] = {
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R²': r2_score(y_test, xgb_y_pred)
}



In [125]:
results

{'Random Forest (Tuned)': {'MAE': 6.292347635890187,
  'RMSE': 12.50958145205259,
  'R²': 0.9642003435286666},
 'XGBoost (Important Features)': {'MAE': 8.157239716745382,
  'RMSE': 15.012403004893006,
  'R²': 0.9484422769075563}}

In [126]:
# use all the features  for XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)
results['XGBoost'] = {
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R²': r2_score(y_test, xgb_y_pred)
}

In [127]:
# Fit XGBoost with scaled features
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predict and evaluate
xgb_y_pred = xgb_model.predict(X_test_scaled)
results['XGBoost (Scaled features)'] = {
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R²': r2_score(y_test, xgb_y_pred)
}


In [128]:
results

{'Random Forest (Tuned)': {'MAE': 6.292347635890187,
  'RMSE': 12.50958145205259,
  'R²': 0.9642003435286666},
 'XGBoost (Important Features)': {'MAE': 8.157239716745382,
  'RMSE': 15.012403004893006,
  'R²': 0.9484422769075563},
 'XGBoost': {'MAE': 8.42647087287646,
  'RMSE': 15.469208442586403,
  'R²': 0.9452568878154366},
 'XGBoost (Scaled features)': {'MAE': 8.086808847095037,
  'RMSE': 14.688989870059165,
  'R²': 0.9506397712782884}}

 Since use impotant features not improve the performance, So we try improve the performance of XGBoost model through feature engineering and fine-tuning:

In [129]:
df

Unnamed: 0_level_0,block_height,tx_count,mempool_size_mb,max_fee_rate,avg_fee_rate,median_fee_rate,fee_rate_10th,fee_rate_90th,fee_rate_std,difficulty,hash_rate,total_fee,mempool_usage,transaction_count,block_weight,block_version,block_interval,block_median_fee_rate
block_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-08-15 13:04:26,803299,29424.0,6.205224,200.000000,2.546653,2.317460,2.206349,2.492063,2.449531,5.239118e+13,6.574572e+20,0.155298,47014961.6,3401.00,3993402.0,793116672,405.0,8.000000
2023-08-15 13:07:55,803300,29433.0,6.205224,200.000000,2.547332,2.317460,2.206349,2.492754,2.449685,5.239118e+13,6.574572e+20,0.155378,47014961.6,5048.00,3993409.0,557588480,209.0,6.397059
2023-08-15 13:08:04,803301,29441.0,6.205224,200.000000,2.547714,2.317460,2.206349,2.500000,2.449535,5.239118e+13,6.574572e+20,0.155421,47014961.6,6228.03,3997929.0,536870912,9.0,6.345588
2023-08-15 13:16:21,803302,29452.0,6.205224,200.000000,2.548790,2.317460,2.206349,2.500000,2.451369,5.239118e+13,6.574572e+20,0.155512,47014961.6,4308.00,3993126.0,783720448,497.0,6.389706
2023-08-15 13:26:12,803303,29460.0,6.205224,200.000000,2.550336,2.317460,2.206349,2.500000,2.456636,5.239118e+13,6.574572e+20,0.155637,47014961.6,3335.00,3993490.0,830971904,591.0,7.059715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-11 16:54:13,829994,114970.0,40.197975,77.659574,2.166685,2.301587,1.098837,2.507937,0.838035,7.550217e+13,6.198125e+20,0.747854,263447344.0,2171.00,3993477.0,849379328,570.0,20.000000
2024-02-11 17:14:00,829995,115225.0,40.232427,77.659574,2.167829,2.301587,1.098837,2.507937,0.843237,7.550217e+13,6.198125e+20,0.748885,263766816.0,3031.00,3993059.0,797360128,1187.0,25.199050
2024-02-11 17:24:07,829996,115399.0,40.257759,77.659574,2.169418,2.301587,1.098837,2.507937,0.855287,7.550217e+13,6.198125e+20,0.749860,263989824.0,2676.00,3993783.0,574103552,607.0,17.000000
2024-02-11 17:36:01,829997,115652.0,40.294566,77.659574,2.170365,2.301587,1.098837,2.507937,0.856538,7.550217e+13,6.198125e+20,0.750889,264319392.0,2256.00,3992981.0,547356676,714.0,21.248718


Feature engineering

a) Interaction Terms:
Create interaction features that combine existing features to capture non-linear relationships. For example:

Transaction Count × Mempool Size: This might capture congestion in the network.
Fee Rate × Transaction Count: Can help model how fee rates change with transaction volumes.

In [130]:

X_train['tx_count_mempool_size'] = X_train['tx_count'] * X_train['mempool_size_mb']
X_test['tx_count_mempool_size'] = X_test['tx_count'] * X_test['mempool_size_mb']


b) Historical Rolling Features:
Create features based on historical data trends, such as:

Rolling averages (mean fee rate over the past N blocks).
Rolling standard deviation (how variable the fee rates are).

In [131]:
# Example: Calculate rolling average of 'avg_fee_rate'
X_train['rolling_avg_fee'] = X_train['avg_fee_rate'].rolling(window=10).mean()
X_test['rolling_avg_fee'] = X_test['avg_fee_rate'].rolling(window=10).mean()


c) Lagged Features:
Introduce lagged values of important features like fee rates, transaction counts, or mempool size to capture temporal dependencies.

In [132]:
X_train['lagged_avg_fee_rate'] = X_train['avg_fee_rate'].shift(1)
X_test['lagged_avg_fee_rate'] = X_test['avg_fee_rate'].shift(1)


d) Log Transformation:
Apply log transformation to skewed data. Features like transaction counts or mempool size might have long tails. Applying a log transformation can help.

In [133]:
X_train['log_tx_count'] = np.log1p(X_train['tx_count'])
X_test['log_tx_count'] = np.log1p(X_test['tx_count'])


In [134]:
# Feature engineering  for XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)
results['XGBoost (Feature engineering)'] = {
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R²': r2_score(y_test, xgb_y_pred)
}

In [135]:
results

{'Random Forest (Tuned)': {'MAE': 6.292347635890187,
  'RMSE': 12.50958145205259,
  'R²': 0.9642003435286666},
 'XGBoost (Important Features)': {'MAE': 8.157239716745382,
  'RMSE': 15.012403004893006,
  'R²': 0.9484422769075563},
 'XGBoost': {'MAE': 8.42647087287646,
  'RMSE': 15.469208442586403,
  'R²': 0.9452568878154366},
 'XGBoost (Scaled features)': {'MAE': 8.086808847095037,
  'RMSE': 14.688989870059165,
  'R²': 0.9506397712782884},
 'XGBoost (Feature engineering)': {'MAE': 8.56154125996463,
  'RMSE': 15.311160299774247,
  'R²': 0.9463697887726877}}

In [136]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Initialize XGBoost model
xgb_model = xgb.XGBRegressor(random_state=42)

# Initialize RandomizedSearchCV
xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
xgb_random.fit(X_train, y_train)

# Get the best parameters
best_params = xgb_random.best_params_
print("Best parameters found: ", best_params)

# Evaluate the best model
xgb_best = xgb_random.best_estimator_
xgb_y_pred_best = xgb_best.predict(X_test)

results['XGBoost (Tuned)'] = {
    'MAE': mean_absolute_error(y_test, xgb_y_pred_best),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred_best)),
    'R²': r2_score(y_test, xgb_y_pred_best)
}




Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found:  {'subsample': 1.0, 'reg_lambda': 1.5, 'reg_alpha': 0.5, 'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.5}


In [137]:
results

{'Random Forest (Tuned)': {'MAE': 6.292347635890187,
  'RMSE': 12.50958145205259,
  'R²': 0.9642003435286666},
 'XGBoost (Important Features)': {'MAE': 8.157239716745382,
  'RMSE': 15.012403004893006,
  'R²': 0.9484422769075563},
 'XGBoost': {'MAE': 8.42647087287646,
  'RMSE': 15.469208442586403,
  'R²': 0.9452568878154366},
 'XGBoost (Scaled features)': {'MAE': 8.086808847095037,
  'RMSE': 14.688989870059165,
  'R²': 0.9506397712782884},
 'XGBoost (Feature engineering)': {'MAE': 8.56154125996463,
  'RMSE': 15.311160299774247,
  'R²': 0.9463697887726877},
 'XGBoost (Tuned)': {'MAE': 6.697133522330637,
  'RMSE': 12.838870429801553,
  'R²': 0.9622908335367845}}

Neural net works- LSTM

In [138]:
!pip install scikeras




Use google colab  GPU to training the following LSTM model to save time

In [144]:
# 3. LSTM Model
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(units=50))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Reshape data for LSTM
X_train_lstm = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))

lstm_model = create_lstm_model((X_train_lstm.shape[1], 1))
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=2)
lstm_y_pred = lstm_model.predict(X_test_lstm)
results['LSTM'] = {
    'MAE': mean_absolute_error(y_test, lstm_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lstm_y_pred)),
    'R²': r2_score(y_test, lstm_y_pred)
}

Epoch 1/10


  super().__init__(**kwargs)


668/668 - 4s - 7ms/step - loss: 5924.9253
Epoch 2/10
668/668 - 3s - 5ms/step - loss: 4803.0620
Epoch 3/10
668/668 - 4s - 6ms/step - loss: 4626.8213
Epoch 4/10
668/668 - 4s - 5ms/step - loss: 4613.3721
Epoch 5/10
668/668 - 3s - 5ms/step - loss: 4613.5342
Epoch 6/10
668/668 - 3s - 5ms/step - loss: 4596.4043
Epoch 7/10
668/668 - 3s - 5ms/step - loss: 4579.4526
Epoch 8/10
668/668 - 3s - 5ms/step - loss: 4519.6309
Epoch 9/10
668/668 - 3s - 5ms/step - loss: 3516.1948
Epoch 10/10
668/668 - 3s - 5ms/step - loss: 3043.9177
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [145]:
results

{'Random Forest (Tuned)': {'MAE': 6.292347635890187,
  'RMSE': 12.50958145205259,
  'R²': 0.9642003435286666},
 'XGBoost (Important Features)': {'MAE': 8.157239716745382,
  'RMSE': 15.012403004893006,
  'R²': 0.9484422769075563},
 'XGBoost': {'MAE': 8.42647087287646,
  'RMSE': 15.469208442586403,
  'R²': 0.9452568878154366},
 'XGBoost (Scaled features)': {'MAE': 8.086808847095037,
  'RMSE': 14.688989870059165,
  'R²': 0.9506397712782884},
 'XGBoost (Feature engineering)': {'MAE': 8.56154125996463,
  'RMSE': 15.311160299774247,
  'R²': 0.9463697887726877},
 'XGBoost (Tuned)': {'MAE': 6.697133522330637,
  'RMSE': 12.838870429801553,
  'R²': 0.9622908335367845},
 'LSTM': {'MAE': 27.85122086798246,
  'RMSE': 52.48905780690147,
  'R²': 0.3697238767812937}}