In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np



In [2]:
# Location of the raw transaction-fee CSV.
csv_path = r"f:\Paper\ResearchData\TxnPrediction Data\TxFeeData\TxFeeData_100K.csv"

# Load the CSV, discard its first column (presumably a saved row index -- confirm
# against the file), then remove the two fee columns that are not used below.
data = (
    pd.read_csv(csv_path)
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .drop(columns=['maxFeePerGas', 'transactionFee'])
)
data.head()

Unnamed: 0,gasUsed,gasPrice,maxPriorityFeePerGas,baseFees,value,priorityFee,voteCount,activeValidators
0,267804.0,21.946848,0.0,5877454.0,0.030827,0.0,23471,753577
1,169261.0,23.946848,2.0,3714745.0,0.0,338522.0,23471,753577
2,195610.0,23.946848,2.0,4293023.0,0.0,391220.0,23471,753577
3,55825.0,25.126125,3.179277,1225183.0,0.0,177483.14595,23471,753577
4,55863.0,24.946848,3.0,1226017.0,0.0,167589.0,23471,753577


In [3]:
# Target is the baseFees column; features are everything left once both fee
# columns (baseFees, priorityFee) are removed.
y = data['baseFees']
x = data.drop(columns=['baseFees', 'priorityFee'])

# Order-preserving split: the leading 70% of rows train the models and the
# trailing 30% are held out for evaluation (no shuffling).
total_samples = len(x)
split_index = int(0.7 * total_samples)

x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training features.
x_train.describe()

Unnamed: 0,gasUsed,gasPrice,maxPriorityFeePerGas,value,voteCount,activeValidators
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,112550.1915,13.83183,0.623559,13923960.0,24606.446943,753716.439186
std,98574.860963,2.412163,0.873083,26992360.0,1645.768235,94.907091
min,21000.0,10.368861,0.0,0.0,21939.0,753577.0
25%,37110.0,12.28702,0.1,0.0,23494.0,753633.0
50%,70000.0,13.269418,0.1,0.0,23862.0,753709.0
75%,177054.75,14.46089,1.0,12899700.0,25155.0,753800.0
max,485715.0,26.468588,4.85,125000000.0,30397.0,753887.0


In [5]:
# Comparison table seeded with the held-out targets; train_model() appends one
# prediction column per fitted model to this frame.
df = pd.DataFrame({"Actual baseFee": y_test})

In [6]:
def train_model(model):
    """Fit ``model`` on the training split and score it on the test split.

    This function reads the notebook-level variables ``x_train``, ``y_train``,
    ``x_test`` and ``y_test`` and, as a side effect, writes the test-set
    predictions into the notebook-level ``df`` under the column
    ``'Predicted baseFee(<ModelClassName>)'``. Two models of the same class
    therefore overwrite each other's prediction column.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    dict
        Keys ``'Model'`` (class name), ``'MeanAbsoluteError'``,
        ``'RootMeanSquaredError'`` and ``'R2Score'``, all computed on the
        test split.
    """
    model_name = type(model).__name__
    print('Model => {} '.format(model_name))

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Keep the raw predictions next to the actual values for later inspection.
    df['Predicted baseFee({})'.format(model_name)] = y_pred

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is reported instead of MSE so it is in the same units as the target.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    result = {
        'Model': model_name,
        'MeanAbsoluteError': mae,
        'RootMeanSquaredError': rmse,
        'R2Score': r2
    }

    # The label previously contained a stray '"]' that leaked into the output.
    print('Mean Absolute Error (MAE):', mae)
    print('Root Mean Squared Error (RMSE):', rmse)
    print('R2 Score:', r2)
    print()

    return result


In [7]:
# Fixed seed so the randomized ensembles (extra trees, random forest, gradient
# boosting, XGBoost) give the same scores on every Restart & Run All.
SEED = 42

results = []
models = [
    XGBRegressor(random_state=SEED),
    ExtraTreesRegressor(random_state=SEED),
    KNeighborsRegressor(),
    LinearRegression(),
    RandomForestRegressor(random_state=SEED),
    GradientBoostingRegressor(random_state=SEED),
]
# Fit and score every candidate; each call returns one metrics dict.
for model in models:
    results.append(train_model(model))

Model => XGBRegressor 
Mean Absolute Error (MAE): "] 15999.419754095583
Root Mean Squared Error (RMSE): 35900.38743539779
R2 Score: 0.9991874628808147

Model => ExtraTreesRegressor 
Mean Absolute Error (MAE): "] 13509.214422019168
Root Mean Squared Error (RMSE): 38519.01414344256
R2 Score: 0.9990646045231433

Model => KNeighborsRegressor 
Mean Absolute Error (MAE): "] 182158.39769269756
Root Mean Squared Error (RMSE): 320379.7117180254
R2 Score: 0.9352894707888452

Model => LinearRegression 
Mean Absolute Error (MAE): "] 126600.82352498935
Root Mean Squared Error (RMSE): 176949.71849444954
R2 Score: 0.9802601116859357

Model => RandomForestRegressor 
Mean Absolute Error (MAE): "] 14482.520243567677
Root Mean Squared Error (RMSE): 40218.71381830916
R2 Score: 0.9989802321996555

Model => GradientBoostingRegressor 
Mean Absolute Error (MAE): "] 35694.18219613367
Root Mean Squared Error (RMSE): 62799.425667228556
R2 Score: 0.9975136842399596



In [8]:
# Raw list of the metric dicts returned by train_model(), one per fitted model.
print(results)

[{'Model': 'XGBRegressor', 'MeanAbsoluteError': 15999.419754095583, 'RootMeanSquaredError': 35900.38743539779, 'R2Score': 0.9991874628808147}, {'Model': 'ExtraTreesRegressor', 'MeanAbsoluteError': 13509.214422019168, 'RootMeanSquaredError': 38519.01414344256, 'R2Score': 0.9990646045231433}, {'Model': 'KNeighborsRegressor', 'MeanAbsoluteError': 182158.39769269756, 'RootMeanSquaredError': 320379.7117180254, 'R2Score': 0.9352894707888452}, {'Model': 'LinearRegression', 'MeanAbsoluteError': 126600.82352498935, 'RootMeanSquaredError': 176949.71849444954, 'R2Score': 0.9802601116859357}, {'Model': 'RandomForestRegressor', 'MeanAbsoluteError': 14482.520243567677, 'RootMeanSquaredError': 40218.71381830916, 'R2Score': 0.9989802321996555}, {'Model': 'GradientBoostingRegressor', 'MeanAbsoluteError': 35694.18219613367, 'RootMeanSquaredError': 62799.425667228556, 'R2Score': 0.9975136842399596}]


In [9]:
# Tabulate the per-model metric dicts: one row per model, one column per metric.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,MeanAbsoluteError,RootMeanSquaredError,R2Score
0,XGBRegressor,15999.419754,35900.387435,0.999187
1,ExtraTreesRegressor,13509.214422,38519.014143,0.999065
2,KNeighborsRegressor,182158.397693,320379.711718,0.935289
3,LinearRegression,126600.823525,176949.718494,0.98026
4,RandomForestRegressor,14482.520244,40218.713818,0.99898
5,GradientBoostingRegressor,35694.182196,62799.425667,0.997514


In [10]:
def moving_average_baseline(history, future, window=1000):
    """Trailing moving-average forecast for every element of ``future``.

    The estimate for ``future[i]`` is the mean of the ``window`` observations
    that immediately precede it in the concatenation ``history + future``
    (the tail of ``history`` first, then earlier ``future`` values).

    The previous loop used ``history[-window + i:]`` and ``future[:i]``. Once
    ``i`` reached ``window`` the start index became non-negative, so nearly
    the whole of ``history`` plus an ever-growing prefix of ``future`` was
    summed and still divided by ``window``. A rolling mean keeps the window
    at exactly ``window`` values and avoids the per-element Python sums.

    Parameters
    ----------
    history : pandas.Series
        Observations available before the first forecast.
    future : pandas.Series
        Observations to forecast, in order.
    window : int, default 1000
        Number of preceding observations averaged for each forecast.

    Returns
    -------
    list of float
        One estimate per element of ``future``. Entries with fewer than
        ``window`` preceding observations are NaN.
    """
    combined = pd.concat([history, future], ignore_index=True)
    # shift(1) so the estimate at position p only uses values strictly before p.
    trailing_mean = combined.rolling(window).mean().shift(1)
    return trailing_mean.iloc[len(history):].tolist()


y_naive = moving_average_baseline(y_train, y_test)



In [11]:
# Attach the naive baseline values as a column alongside the model predictions.
df["Estimated"] = y_naive
df.head()

Unnamed: 0,Actual baseFee,Predicted baseFee(XGBRegressor),Predicted baseFee(ExtraTreesRegressor),Predicted baseFee(KNeighborsRegressor),Predicted baseFee(LinearRegression),Predicted baseFee(RandomForestRegressor),Predicted baseFee(GradientBoostingRegressor),Estimated
70000,228235.9,227514.0,228243.3,327168.9,27881.01,227759.4,215967.8,1295994.0
70001,1086838.0,1086688.0,1111429.0,1288473.0,1063431.0,1106588.0,1151626.0,1294464.0
70002,228235.9,227532.9,228021.3,251304.9,27320.35,227773.7,215967.8,1294069.0
70003,3298856.0,3343645.0,3340758.0,4097852.0,3729761.0,3386374.0,3484553.0,1293478.0
70004,1986185.0,1960628.0,2028201.0,2247066.0,2148024.0,2028519.0,2002703.0,1295086.0


In [12]:
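# Export of the prediction table is disabled. NOTE: the file name says "500K"
# while this notebook loads TxFeeData_100K.csv -- confirm the name before re-enabling.
# df.to_csv("baseFee_500K_prediction.csv")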

In [13]:
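# Export of the metrics table is disabled. NOTE: the file name says "500K"
# while this notebook loads TxFeeData_100K.csv -- confirm the name before re-enabling.
# results_df.to_csv("baseFee_500K_results.csv")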