In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import requests
import os
import time

In [None]:
import os
import requests
import pandas as pd
import numpy as np

# Etherscan API details
# The key is read from the ETHERSCAN_API_KEY environment variable so that the
# secret is never stored in (or shared with) the notebook. If the variable is
# unset the key is empty and Etherscan will reject the requests; the cached CSV
# path below does not need a key at all.
API_KEY = os.environ.get('ETHERSCAN_API_KEY', '')
URL = f'https://api.etherscan.io/api?module=gastracker&action=gasoracle&apikey={API_KEY}'
csv_file = 'gas_data.csv'

# Sample the Etherscan gas oracle and build a dataframe of back-dated rows
def fetch_historical_data(days=500):
    """Query the gas oracle once per hourly slot over ``days`` days.

    NOTE: ``URL`` is requested without any time parameter, so each row stores
    the ``ProposeGasPrice`` returned at request time; only the ``timestamp``
    column is back-dated (by ``day`` days and ``hour`` hours from "now").

    Parameters
    ----------
    days : int
        Number of days to cover; 24 requests are issued per day.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``gas_fee`` and ``trading_volume`` (always NaN).
        Slots whose request failed are skipped, so the frame may be empty.
    """
    request_timeout = 10  # seconds; without a timeout a stalled connection blocks forever
    # Capture "now" once so every slot is an exact whole-hour offset from the
    # same reference instead of drifting with request latency.
    now = pd.Timestamp.now()
    historical_data = []
    for day in range(days):
        for hour in range(24):  # one sample per hour of the day
            timestamp = now - pd.Timedelta(days=day, hours=hour)
            try:
                response = requests.get(URL, timeout=request_timeout)
            except requests.RequestException:
                # Best effort: a transient network failure skips this slot
                # rather than aborting the whole run.
                continue
            if response.status_code != 200:
                continue
            try:
                gas_data = response.json()
            except ValueError:
                # Body was not valid JSON; skip this slot.
                continue
            if gas_data['status'] == '1':
                gas_fee = float(gas_data['result']['ProposeGasPrice'])
                historical_data.append({'timestamp': timestamp, 'gas_fee': gas_fee, 'trading_volume': np.nan})
    return pd.DataFrame(historical_data)

# Load data - read the cached CSV if it exists, otherwise fetch and cache it.
# NOTE: any existing CSV (including one left by an interrupted fetch) is
# treated as complete; delete the file to force a fresh fetch.
if os.path.exists(csv_file):
    dataset = pd.read_csv(csv_file, parse_dates=['timestamp'])
    dataset.set_index('timestamp', inplace=True)
else:
    total_days = 500
    daily_frames = []
    for day in range(total_days):
        print(f"Fetching data for day {day + 1} out of {total_days}")
        day_data = fetch_historical_data(days=1)
        if day_data.empty:
            # Nothing fetched for this day; never write an empty CSV, which
            # would be picked up as a (broken) cache on the next run.
            continue
        # fetch_historical_data(days=1) always stamps rows within the last 24
        # hours, so shift each batch back by `day` days; otherwise all 500
        # batches would share the same timestamps.
        day_data['timestamp'] = day_data['timestamp'] - pd.Timedelta(days=day)
        daily_frames.append(day_data)
        # Save intermediate results to avoid data loss on interruption.
        # Append mode with index=False keeps the file layout identical to the
        # final save (timestamp first, no stray integer index column).
        day_data.to_csv(csv_file, mode='a', header=not os.path.exists(csv_file), index=False)
    if not daily_frames:
        raise ValueError("Failed to fetch historical data. Please check the API or network connection.")
    # Build the frame once instead of growing it with concat inside the loop.
    dataset = pd.concat(daily_frames, ignore_index=True).set_index('timestamp')

# Save the dataset to CSV
dataset.to_csv(csv_file)
print("Data extraction completed and saved to CSV.")


Fetching data for day 1 out of 500
Fetching data for day 2 out of 500
Fetching data for day 3 out of 500
Fetching data for day 4 out of 500
Fetching data for day 5 out of 500
Fetching data for day 6 out of 500
Fetching data for day 7 out of 500
Fetching data for day 8 out of 500
Fetching data for day 9 out of 500
Fetching data for day 10 out of 500
Fetching data for day 11 out of 500
Fetching data for day 12 out of 500
Fetching data for day 13 out of 500
Fetching data for day 14 out of 500
Fetching data for day 15 out of 500
Fetching data for day 16 out of 500
Fetching data for day 17 out of 500
Fetching data for day 18 out of 500
Fetching data for day 19 out of 500
Fetching data for day 20 out of 500
Fetching data for day 21 out of 500
Fetching data for day 22 out of 500
Fetching data for day 23 out of 500
Fetching data for day 24 out of 500
Fetching data for day 25 out of 500
Fetching data for day 26 out of 500
Fetching data for day 27 out of 500
Fetching data for day 28 out of 500
F

In [None]:
# Load the historical gas fee data from CSV
csv_file = '/public_gas_fee_data.csv'
if os.path.exists(csv_file):
    dataset = pd.read_csv(csv_file, parse_dates=['timestamp'])
    dataset.set_index('timestamp', inplace=True)
else:
    raise FileNotFoundError("The CSV file containing historical gas fee data could not be found.")

# Resample data to half-hour intervals: forward-fill gaps, then zero-fill
# anything still missing (e.g. columns that are entirely NaN).
# '30min' replaces the deprecated '30T' alias.
dataset = dataset.resample('30min').ffill().fillna(0)

# Add time of day and day of week as features. This is done AFTER resampling so
# that rows created by the resample get features derived from their own
# timestamp instead of values forward-filled from an earlier row.
dataset['time_of_day'] = dataset.index.hour / 23.0  # Normalize to [0, 1]
dataset['day_of_week'] = dataset.index.dayofweek / 6.0  # Normalize to [0, 1]

# If the dataset is empty or contains only zeros, add random placeholder data.
# NOTE(review): the placeholder values are unseeded random numbers, so anything
# trained on them is neither meaningful nor reproducible.
if dataset['gas_fee'].sum() == 0:
    print("Dataset is empty or contains only zeros. Adding placeholder data.")
    placeholder_data = {
        'timestamp': pd.date_range(start=pd.Timestamp.now() - pd.Timedelta(days=1), periods=48, freq='30min'),
        'gas_fee': np.random.uniform(10, 100, 48),
        'time_of_day': np.linspace(0, 1, 48),
        'day_of_week': np.linspace(0, 1, 48)
    }
    dataset = pd.DataFrame(placeholder_data)
    dataset.set_index('timestamp', inplace=True)

# Data quality checks
# Check for missing values
missing_values = dataset.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Check for duplicate timestamps
duplicate_count = dataset.index.duplicated().sum()
print(f"Number of duplicate timestamps: {duplicate_count}")

# Check for any rows with all zeros
dataset_zero_rows = (dataset == 0).all(axis=1).sum()
print(f"Number of rows with all zero values: {dataset_zero_rows}")

# Feature Scaling
# NOTE(review): the scaler is fit on the full series before the train/test
# split further down, so test-set min/max values influence the training data.
scaler = MinMaxScaler()
dataset[['gas_fee', 'time_of_day', 'day_of_week']] = scaler.fit_transform(dataset[['gas_fee', 'time_of_day', 'day_of_week']])

# Prepare the data for time series forecasting
window_size = 15  # Number of consecutive half-hour rows fed to the model per sample

def create_dataset(data, window_size):
    """Slice a 2D array into overlapping windows and next-step targets.

    For every start position ``s`` in ``range(len(data) - window_size)`` the
    sample is ``data[s:s + window_size]`` (all columns) and the target is
    column 0 of the row immediately after the window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``; both are empty arrays when ``data`` has no more than
        ``window_size`` rows.
    """
    n_samples = len(data) - window_size
    windows = [data[start:start + window_size] for start in range(n_samples)]
    targets = [data[start + window_size, 0] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

# Convert the dataset to numpy array
data = dataset[['gas_fee', 'time_of_day', 'day_of_week']].values

# Create dataset
X, y = create_dataset(data, window_size)

# Ensure that there is enough data for training and testing
if len(X) == 0 or len(y) == 0:
    raise ValueError("Not enough data available to create training and testing sets. Please ensure the dataset contains sufficient historical data.")

# X is already 3D (samples, time steps, features): create_dataset stacks
# window_size-row slices of the 3-column array, so no reshape is needed.

# Split the dataset into training and testing sets.
# shuffle=False keeps chronological order: the windows overlap, so a shuffled
# split would put near-duplicates of test windows (and future values) into the
# training set and inflate the test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

Missing values in each column:
gas_fee           0
trading_volume    0
time_of_day       0
day_of_week       0
dtype: int64
Number of duplicate timestamps: 0
Number of rows with all zero values: 1


  dataset = dataset.resample('30T').ffill().fillna(0)
