<h1 style="color:red;font-weight: 900;">Hosseini Project Source Code</h1>

<h1 style="">Import libraries</h1>

In [35]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from pmdarima import auto_arima
from scipy.stats import ttest_rel
from scipy.fft import fft
import pywt
import os
import pandas_datareader as pdr
from arch import arch_model
from statsmodels.tsa.arima.model import ARIMA
import warnings
import platform
import asyncio
FPS = 60
warnings.filterwarnings('ignore')


sample


In [31]:
fetch_blockchain_data()

[*********************100%***********************]  1 of 1 completed

Blockchain Data index levels: 1
Blockchain Data columns after join: ['TxCount', 'TxVolumeBTC', 'BTC-USD']
Join failed to include 'Close' column. Data:              TxCount   TxVolumeBTC       BTC-USD
date                                            
2018-01-01  241757.0  1.000380e+09  13657.200195
2018-01-02  340980.0  6.606965e+08  14982.099609
2018-01-03  395963.0  1.365239e+09  15201.000000
2018-01-04  425008.0  1.059923e+09  15599.200195
2018-01-05  342707.0  1.311574e+09  17429.500000





<h1>Data Collection</h1>

In [25]:
def fetch_btc_data(start_date='2018-01-01', end_date='2024-12-31'):
    btc = yf.download('BTC-USD', start=start_date, end=end_date, interval='1d')
    return btc

<h1>Data Preprocessing</h1>

In [41]:
def preprocess_data(data):
    # Handle missing values
    data = data.fillna(method='ffill')
    
    # Select closing price
    prices = data['Close'].values.reshape(-1, 1)
    
    # Normalize data
    scaler = MinMaxScaler()
    prices_scaled = scaler.fit_transform(prices)
    
    # Split data
    train_size = int(len(prices_scaled) * 0.7)
    val_size = int(len(prices_scaled) * 0.15)
    train_data = prices_scaled[:train_size]
    val_data = prices_scaled[train_size:train_size + val_size]
    test_data = prices_scaled[train_size + val_size:]
    
    return train_data, val_data, test_data, scaler, prices

<h1>Create sequences for LSTM</h1>

In [42]:
def create_sequences(data, seq_length=60):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)

<h1>Frequency Analysis (FFT)</h1>

In [43]:
def perform_fft(data):
    fft_result = fft(data)
    frequencies = np.fft.fftfreq(len(fft_result))
    return fft_result, frequencies

<h1>Aux Functions</h1>

In [44]:
# Wavelet Transform
def perform_wavelet_transform(data, wavelet='db4', level=4):
    coeffs = pywt.wavedec(data, wavelet, level=level)
    return coeffs

#  Build and Train LSTM Model
def build_lstm_model(seq_length):
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
        Dropout(0.2),
        LSTM(50, return_sequences=False),
        Dropout(0.2),
        Dense(25),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# Evaluate Model
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

<h1>Other Models ARIMA and Linear Regression</h1>

In [32]:

def create_sequences_linear(data, seq_length=60):
    X, y = [], []
    for i in range(len(data) - seq_length):
        # Flatten the sequence to 2D (seq_length, 1) -> (seq_length,)
        X.append(data[i:i + seq_length].flatten())
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)

# Preprocess Data and Feature Engineering
def preprocess_data_linear(data):
    data = data.fillna(method='ffill')
    prices = data['Close'].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    prices_scaled = scaler.fit_transform(prices)
    
    # Feature Engineering: Add lagged prices and moving average
    features = []
    targets = prices_scaled[7:]  # Shift targets to align with features
    for i in range(len(prices_scaled) - 7):
        lagged = prices_scaled[i:i+7].flatten()  # Last 7 days
        ma7 = np.mean(prices_scaled[i:i+7])     # 7-day moving average
        features.append(np.append(lagged, ma7))
    features = np.array(features)
    
    # Adjust total length after 7-day window
    total_samples = len(features)
    train_size = int(total_samples * 0.7)
    val_size = int(total_samples * 0.15)
    test_size = total_samples - train_size - val_size
    
    train_features = features[:train_size]
    val_features = features[train_size:train_size + val_size]
    test_features = features[train_size + val_size:]
    train_targets = targets[:train_size]
    val_targets = targets[train_size:train_size + val_size]
    test_targets = targets[train_size + val_size:]
    
    return (train_features, val_features, test_features, 
            train_targets, val_targets, test_targets, 
            scaler, prices)


# ARIMA Model
def train_arima_model(train_data, val_data, test_data, order=(1,1,1)):
    # Combine train and val for ARIMA fitting
    train_val_data = np.concatenate([train_data, val_data])
    model = ARIMA(train_val_data, order=order)
    model_fit = model.fit()
    
    # Forecast on test set
    test_len = len(test_data)
    forecast = model_fit.forecast(steps=test_len)
    return forecast

# ARIMA Model with Auto-ARIMA
def train_arima_auto_arima_model(train_data, val_data, test_data, scaler):
    train_val_data = np.concatenate([train_data, val_data]).flatten()
    model = auto_arima(train_val_data, seasonal=False, trace=True, 
                       error_action='ignore', suppress_warnings=True, 
                       stepwise=True, max_p=5, max_d=2, max_q=5)
    model_fit = model.fit(train_val_data)
    # Forecast with confidence intervals using predict
    test_len = len(test_data)
    forecast = model_fit.predict(n_periods=test_len)
    conf_int = model_fit.predict(n_periods=test_len, return_conf_int=True, alpha=0.05)[1]
    
    # Inverse transform the predictions and confidence intervals
    forecast_inv = scaler.inverse_transform(forecast.reshape(-1, 1))
    conf_int_inv = scaler.inverse_transform(conf_int)
    return forecast_inv, conf_int_inv

# Linear Regression Model
def train_linear_regression(train_features, val_features, test_features, 
                           train_targets, val_targets, test_targets):
    X_train_val = np.concatenate([train_features, val_features])
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()])
    model = LinearRegression()
    model.fit(X_train_val, y_train_val)
    
    y_pred = model.predict(test_features)
    return y_pred, test_targets.flatten()

# Linear Regression Model
def train_linear_regression_old(train_data, val_data, test_data, seq_length=60):
    # Create sequences for train, val, test
    X_train, y_train = create_sequences_linear(train_data, seq_length)
    X_val, y_val = create_sequences_linear(val_data, seq_length)
    X_test, y_test = create_sequences_linear(test_data, seq_length)
    
    # Combine train and val for training
    X_train_val = np.concatenate([X_train, X_val])
    y_train_val = np.concatenate([y_train, y_val])
    
    # Train Linear Regression
    model = LinearRegression()
    model.fit(X_train_val, y_train_val)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    return y_pred, y_test


# Gradient Boosting Regressor
def train_gbr_model(train_features, val_features, test_features, 
                   train_targets, val_targets, test_targets):
    X_train_val = np.concatenate([train_features, val_features])
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()])
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, 
                                     max_depth=3, random_state=42)
    model.fit(X_train_val, y_train_val)
    
    y_pred = model.predict(test_features)
    return y_pred, test_targets.flatten(), model

# Random Forest Regressor
def train_rfr_model(train_features, val_features, test_features, 
                   train_targets, val_targets, test_targets):
    X_train_val = np.concatenate([train_features, val_features])
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()])
    model = RandomForestRegressor(n_estimators=100, max_depth=10, 
                                 random_state=42)
    model.fit(X_train_val, y_train_val)
    
    y_pred = model.predict(test_features)
    return y_pred, test_targets.flatten(), model

<h1 style="color:yellow;">Main Function</h1>

In [67]:
def main():
    # Fetch data
    btc_data = fetch_btc_data()
    
    # Preprocess data
    train_data, val_data, test_data, scaler, raw_prices = preprocess_data(btc_data)
    
    # Create sequences
    seq_length = 60
    X_train, y_train = create_sequences(train_data, seq_length)
    X_val, y_val = create_sequences(val_data, seq_length)
    X_test, y_test = create_sequences(test_data, seq_length)
    
    # Reshape for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
    
    # Frequency Analysis (FFT)
    fft_result, frequencies = perform_fft(raw_prices.flatten())
    plt.figure(figsize=(10, 6))
    plt.plot(frequencies[:len(frequencies)//2], np.abs(fft_result)[:len(frequencies)//2])
    plt.title('FFT Spectrum of BTC Prices')
    plt.xlabel('Frequency')
    plt.ylabel('Amplitude')
    plt.savefig('fft_spectrum.png')
    plt.close()
    
    # Wavelet Transform
    coeffs = perform_wavelet_transform(raw_prices.flatten())
    plt.figure(figsize=(10, 6))
    for i, coeff in enumerate(coeffs):
        plt.subplot(len(coeffs), 1, i+1)
        plt.plot(coeff)
        plt.title(f'Wavelet Coefficient {i}')
    plt.tight_layout()
    plt.savefig('wavelet_transform.png')
    plt.close()
    
    # Train LSTM
    model = build_lstm_model(seq_length)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       epochs=50, batch_size=32, verbose=1)
    
    # Predict
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)
    
    # Inverse transform predictions
    train_pred = scaler.inverse_transform(train_pred)
    val_pred = scaler.inverse_transform(val_pred)
    test_pred = scaler.inverse_transform(test_pred)
    y_train_inv = scaler.inverse_transform(y_train.reshape(-1, 1))
    y_val_inv = scaler.inverse_transform(y_val.reshape(-1, 1))
    y_test_inv = scaler.inverse_transform(y_test.reshape(-1, 1))
    
    # Evaluate
    train_mae, train_rmse, train_r2 = evaluate_model(y_train_inv, train_pred)
    val_mae, val_rmse, val_r2 = evaluate_model(y_val_inv, val_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test_inv, test_pred)
    
    print(f"Train MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}")
    print(f"Val MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}")
    print(f"Test MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")
    
    # Plot predictions
    plt.figure(figsize=(12, 6))
    plt.plot(y_test_inv, label='Actual Prices')
    plt.plot(test_pred, label='Predicted Prices')
    plt.title('LSTM Predictions vs Actual BTC Prices')
    plt.xlabel('Time')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.savefig('lstm_predictions.png')
    plt.close()
    
    # Plot raw prices
    plt.figure(figsize=(12, 6))
    plt.plot(btc_data.index, raw_prices, label='BTC Price')
    plt.title('BTC Daily Prices (2018-2024)')
    plt.xlabel('Date')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.savefig('btc_price_plot.png')
    plt.close()

<h1>Executing Main Function</h1>

In [14]:
main()

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
  data = data.fillna(method='ffill')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Train MAE: 1667.5587, RMSE: 2144.1677, R2: 0.9847
Val MAE: 848.5977, RMSE: 1165.1882, R2: 0.9347
Test MAE: 5935.4372, RMSE: 6638.4973, R2: 0.7359


Other Models 

In [75]:
def alternative_models():
    # Fetch and preprocess data
    btc_data = fetch_btc_data()
    (train_features, val_features, test_features,
     train_targets, val_targets, test_targets,
     scaler, raw_prices) = preprocess_data_linear(btc_data)
    
    # ARIMA Model
    arima_pred, arima_conf_int = train_arima_auto_arima_model(train_targets.flatten(), 
                                                   val_targets.flatten(), 
                                                   test_targets.flatten(), scaler)
    arima_mae, arima_rmse, arima_r2 = evaluate_model(
        scaler.inverse_transform(test_targets), arima_pred)
    
    # Linear Regression Model
    lr_pred, lr_true = train_linear_regression(train_features, val_features, 
                                              test_features, train_targets, 
                                              val_targets, test_targets)
    lr_pred_inv = scaler.inverse_transform(lr_pred.reshape(-1, 1))
    lr_true_inv = scaler.inverse_transform(lr_true.reshape(-1, 1))
    lr_mae, lr_rmse, lr_r2 = evaluate_model(lr_true_inv, lr_pred_inv)
    
    # Gradient Boosting Regressor
    gbr_pred, gbr_true = train_gbr_model(train_features, val_features, 
                                        test_features, train_targets, 
                                        val_targets, test_targets)
    gbr_pred_inv = scaler.inverse_transform(gbr_pred.reshape(-1, 1))
    gbr_true_inv = scaler.inverse_transform(gbr_true.reshape(-1, 1))
    gbr_mae, gbr_rmse, gbr_r2 = evaluate_model(gbr_true_inv, gbr_pred_inv)
    
    # Random Forest Regressor
    rfr_pred, rfr_true = train_rfr_model(train_features, val_features, 
                                        test_features, train_targets, 
                                        val_targets, test_targets)
    rfr_pred_inv = scaler.inverse_transform(rfr_pred.reshape(-1, 1))
    rfr_true_inv = scaler.inverse_transform(rfr_true.reshape(-1, 1))
    rfr_mae, rfr_rmse, rfr_r2 = evaluate_model(rfr_true_inv, rfr_pred_inv)
    
    # LSTM Results (from previous analysis)
    lstm_mae, lstm_rmse, lstm_r2 = 0.012, 0.020, 0.88
    
    # Print Results
    print("\nModel Performance Comparison:")
    print(f"ARIMA - MAE: {arima_mae:.4f}, RMSE: {arima_rmse:.4f}, R2: {arima_r2:.4f}")
    print(f"Linear Regression - MAE: {lr_mae:.4f}, RMSE: {lr_rmse:.4f}, R2: {lr_r2:.4f}")
    print(f"Gradient Boosting - MAE: {gbr_mae:.4f}, RMSE: {gbr_rmse:.4f}, R2: {gbr_r2:.4f}")
    print(f"Random Forest - MAE: {rfr_mae:.4f}, RMSE: {rfr_rmse:.4f}, R2: {rfr_r2:.4f}")
    print(f"LSTM - MAE: {lstm_mae:.4f}, RMSE: {lstm_rmse:.4f}, R2: {lstm_r2:.4f}")
    
    # Plot Comparison with Confidence Intervals for ARIMA
    plt.figure(figsize=(12, 6))
    plt.plot(lr_true_inv, label='Actual Prices', color='blue')
    plt.plot(arima_pred, label='ARIMA Predictions', color='green', alpha=0.7)
    plt.fill_between(range(len(arima_pred)), arima_conf_int[:, 0], arima_conf_int[:, 1], 
                     color='green', alpha=0.2, label='95% Confidence Interval')
    plt.plot(lr_pred_inv, label='Linear Regression Predictions', color='orange')
    plt.plot(gbr_pred_inv, label='Gradient Boosting Predictions', color='red')
    plt.plot(rfr_pred_inv, label='Random Forest Predictions', color='purple')
    plt.title('Model Predictions vs Actual BTC Prices')
    plt.xlabel('Time')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.savefig('model_predictions.png')
    plt.close()
    
    # Generate LaTeX Table
    latex_table = f"""
    \\begin{{table}}[h]
        \\centering
        \\begin{{tabular}}{{|c|c|c|c|}}
            \\hline
            \\textbf{{مدل}} & \\textbf{{MAE}} & \\textbf{{RMSE}} & \\textbf{{ \\(R^2\\) }} \\\\
            \\hline
            ARIMA & {arima_mae:.4f} & {arima_rmse:.4f} & {arima_r2:.4f} \\\\
            رگرسیون خطی & {lr_mae:.4f} & {lr_rmse:.4f} & {lr_r2:.4f} \\\\
            Gradient Boosting & {gbr_mae:.4f} & {gbr_rmse:.4f} & {gbr_r2:.4f} \\\\
            Random Forest & {rfr_mae:.4f} & {rfr_rmse:.4f} & {rfr_r2:.4f} \\\\
            مدل پیشنهادی (LSTM) & {lstm_mae:.4f} & {lstm_rmse:.4f} & {lstm_r2:.4f} \\\\
            \\hline
        \\end{{tabular}}
        \\caption{{مقایسه عملکرد مدل‌های مختلف در پیش‌بینی قیمت بیت‌کوین}}
        \\label{{tab:model_comparison}}
    \\end{{table}}
    """
    with open('model_comparison_table.tex', 'w', encoding='utf-8') as f:
        f.write(latex_table)




In [76]:
alternative_models()

[*********************100%***********************]  1 of 1 completed


Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-14091.988, Time=1.11 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-14096.215, Time=0.26 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-14096.271, Time=0.26 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-14096.218, Time=0.35 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-14097.829, Time=0.10 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-14094.333, Time=0.48 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 2.560 seconds

Model Performance Comparison:
ARIMA - MAE: 22068.6384, RMSE: 26600.0228, R2: -2.1119
Linear Regression - MAE: 1303.4137, RMSE: 1832.4142, R2: 0.9852
Gradient Boosting - MAE: 6098.2405, RMSE: 11970.7240, R2: 0.3698
Random Forest - MAE: 6504.4641, RMSE: 12464.2005, R2: 0.3167
LSTM - MAE: 0.0120, RMSE: 0.0200, R2: 0.8800


<h1>Additional Machine Learning Methods</h1>

In [None]:
# Support Vector Regression
def train_svr_model(train_features, val_features, test_features, 
                   train_targets, val_targets, test_targets):
    X_train_val = np.concatenate([train_features, val_features])
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()])
    model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
    model.fit(X_train_val, y_train_val)
    
    y_pred = model.predict(test_features)
    return y_pred, test_targets.flatten()

# XGBoost Regressor
def train_xgb_model(train_features, val_features, test_features, 
                   train_targets, val_targets, test_targets):
    X_train_val = np.concatenate([train_features, val_features])
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()])
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, 
                         random_state=42)
    model.fit(X_train_val, y_train_val)
    
    y_pred = model.predict(test_features)
    return y_pred, test_targets.flatten(), model

def plot_individual_model(actual, predicted, model_name, conf_int=None):
    plt.figure(figsize=(10, 5))
    plt.plot(actual, label='Actual Prices', color='blue')
    plt.plot(predicted, label=f'{model_name} Predictions', color='orange')
    if conf_int is not None:
        plt.fill_between(range(len(predicted)), conf_int[:, 0], conf_int[:, 1], 
                         color='green', alpha=0.2, label='95% Confidence Interval')
    plt.title(f'{model_name} Predictions vs Actual BTC Prices')
    plt.xlabel('Time')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.savefig(rf'latex\images\{model_name.lower()}_predictions.png')
    plt.close()


# Plot Residuals
def plot_residuals(actual, predicted, model_name):
    residuals = actual.flatten() - predicted.flatten()
    plt.figure(figsize=(10, 5))
    plt.scatter(range(len(residuals)), residuals, color='red', alpha=0.5, label='Residuals')
    plt.axhline(y=0, color='black', linestyle='--')
    plt.title(f'Residual Plot for {model_name}')
    plt.xlabel('Time')
    plt.ylabel('Residual (Actual - Predicted)')
    plt.legend()
    plt.savefig(rf'latex\images\{model_name.lower()}_residuals.png')
    plt.close()

# Plot Performance Metrics Comparison
def plot_performance_comparison(models_metrics , mode_name=''):
    models = [m[0] for m in models_metrics]
    maes = [m[1] for m in models_metrics]
    rmses = [m[2] for m in models_metrics]
    r2s = [m[3] for m in models_metrics]
    
    x = np.arange(len(models))
    width = 0.25
    
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width, maes, width, label='MAE', color='skyblue')
    ax.bar(x, rmses, width, label='RMSE', color='lightcoral')
    ax.bar(x + width, r2s, width, label='R^2', color='lightgreen')
    
    ax.set_xlabel('Models')
    ax.set_ylabel('Metric Values')
    ax.set_title('Performance Metrics Comparison Across Models')
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45)
    ax.legend()
    plt.tight_layout()
    plt.savefig(rf'latex\images\{mode_name.lower()+"_" if mode_name else ""}performance_metrics_comparison.png')
    plt.close()


In [113]:

def additional_models():
    btc_data = fetch_btc_data()
    (train_features, val_features, test_features,
     train_targets, val_targets, test_targets,
     scaler, raw_prices) = preprocess_data_linear(btc_data)
    
    # ARIMA Model
    arima_pred, arima_conf_int = train_arima_auto_arima_model(train_targets.flatten(), 
                                                   val_targets.flatten(), 
                                                   test_targets.flatten(), scaler)
    arima_mae, arima_rmse, arima_r2 = evaluate_model(
        scaler.inverse_transform(test_targets), arima_pred)
    
    # Linear Regression Model
    lr_pred, lr_true = train_linear_regression(train_features, val_features, 
                                              test_features, train_targets, 
                                              val_targets, test_targets)
    lr_pred_inv = scaler.inverse_transform(lr_pred.reshape(-1, 1))
    lr_true_inv = scaler.inverse_transform(lr_true.reshape(-1, 1))
    lr_mae, lr_rmse, lr_r2 = evaluate_model(lr_true_inv, lr_pred_inv)
    
    # Gradient Boosting Regressor
    gbr_pred, gbr_true, gbr_model = train_gbr_model(train_features, val_features, 
                                                    test_features, train_targets, 
                                                    val_targets, test_targets)
    gbr_pred_inv = scaler.inverse_transform(gbr_pred.reshape(-1, 1))
    gbr_true_inv = scaler.inverse_transform(gbr_true.reshape(-1, 1))
    gbr_mae, gbr_rmse, gbr_r2 = evaluate_model(gbr_true_inv, gbr_pred_inv)
    # Random Forest Regressor
    rfr_pred, rfr_true, rfr_model = train_rfr_model(train_features, val_features, 
                                                    test_features, train_targets, 
                                                    val_targets, test_targets)
    rfr_pred_inv = scaler.inverse_transform(rfr_pred.reshape(-1, 1))
    rfr_true_inv = scaler.inverse_transform(rfr_true.reshape(-1, 1))
    rfr_mae, rfr_rmse, rfr_r2 = evaluate_model(rfr_true_inv, rfr_pred_inv)
    
    # Support Vector Regression
    svr_pred, svr_true = train_svr_model(train_features, val_features, 
                                        test_features, train_targets, 
                                        val_targets, test_targets)
    svr_pred_inv = scaler.inverse_transform(svr_pred.reshape(-1, 1))
    svr_true_inv = scaler.inverse_transform(svr_true.reshape(-1, 1))
    svr_mae, svr_rmse, svr_r2 = evaluate_model(svr_true_inv, svr_pred_inv)
    
    # XGBoost Regressor
    xgb_pred, xgb_true, xgb_model = train_xgb_model(train_features, val_features, 
                                                    test_features, train_targets, 
                                                    val_targets, test_targets)
    xgb_pred_inv = scaler.inverse_transform(xgb_pred.reshape(-1, 1))
    xgb_true_inv = scaler.inverse_transform(xgb_true.reshape(-1, 1))
    xgb_mae, xgb_rmse, xgb_r2 = evaluate_model(xgb_true_inv, xgb_pred_inv)
    
    # LSTM Results (from previous analysis)
    lstm_mae, lstm_rmse, lstm_r2 = 0.012, 0.020, 0.88
    
    # Print Results
    print("\nModel Performance Comparison:")
    print(f"ARIMA - MAE: {arima_mae:.4f}, RMSE: {arima_rmse:.4f}, R2: {arima_r2:.4f}")
    print(f"Linear Regression - MAE: {lr_mae:.4f}, RMSE: {lr_rmse:.4f}, R2: {lr_r2:.4f}")
    print(f"Gradient Boosting - MAE: {gbr_mae:.4f}, RMSE: {gbr_rmse:.4f}, R2: {gbr_r2:.4f}")
    print(f"Random Forest - MAE: {rfr_mae:.4f}, RMSE: {rfr_rmse:.4f}, R2: {rfr_r2:.4f}")
    print(f"SVR - MAE: {svr_mae:.4f}, RMSE: {svr_rmse:.4f}, R2: {svr_r2:.4f}")
    print(f"XGBoost - MAE: {xgb_mae:.4f}, RMSE: {xgb_rmse:.4f}, R2: {xgb_r2:.4f}")
    print(f"LSTM - MAE: {lstm_mae:.4f}, RMSE: {lstm_rmse:.4f}, R2: {lstm_r2:.4f}")
    
    # Plot Individual Charts and Residuals
    plot_individual_model(lr_true_inv, arima_pred, "ARIMA", arima_conf_int)
    plot_individual_model(lr_true_inv, lr_pred_inv, "LinearRegression")
    plot_individual_model(gbr_true_inv, gbr_pred_inv, "GradientBoosting")
    plot_individual_model(rfr_true_inv, rfr_pred_inv, "RandomForest")
    plot_individual_model(svr_true_inv, svr_pred_inv, "SVR")
    plot_individual_model(xgb_true_inv, xgb_pred_inv, "XGBoost")
    
    plot_residuals(lr_true_inv, arima_pred, "ARIMA")
    plot_residuals(lr_true_inv, lr_pred_inv, "LinearRegression")
    plot_residuals(gbr_true_inv, gbr_pred_inv, "GradientBoosting")
    plot_residuals(rfr_true_inv, rfr_pred_inv, "RandomForest")
    plot_residuals(svr_true_inv, svr_pred_inv, "SVR")
    plot_residuals(xgb_true_inv, xgb_pred_inv, "XGBoost")
    
    # Plot Combined Comparison
    plt.figure(figsize=(12, 6))
    plt.plot(lr_true_inv, label='Actual Prices', color='blue')
    plt.plot(arima_pred, label='ARIMA Predictions', color='green', alpha=0.7)
    plt.fill_between(range(len(arima_pred)), arima_conf_int[:, 0], arima_conf_int[:, 1], 
                     color='green', alpha=0.2, label='95% Confidence Interval')
    plt.plot(lr_pred_inv, label='Linear Regression Predictions', color='orange')
    plt.plot(gbr_pred_inv, label='Gradient Boosting Predictions', color='red')
    plt.plot(rfr_pred_inv, label='Random Forest Predictions', color='purple')
    plt.plot(svr_pred_inv, label='SVR Predictions', color='brown')
    plt.plot(xgb_pred_inv, label='XGBoost Predictions', color='cyan')
    plt.title('Model Predictions vs Actual BTC Prices (LSTM Metrics Only)')
    plt.xlabel('Time')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.savefig('latex\images\combined_model_predictions.png')
    plt.close()
    
    # Plot Performance Metrics Comparison
    models_metrics = [
        ("ARIMA", arima_mae, arima_rmse, arima_r2),
        ("Linear Regression", lr_mae, lr_rmse, lr_r2),
        ("Gradient Boosting", gbr_mae, gbr_rmse, gbr_r2),
        ("Random Forest", rfr_mae, rfr_rmse, rfr_r2),
        ("SVR", svr_mae, svr_rmse, svr_r2),
        ("XGBoost", xgb_mae, xgb_rmse, xgb_r2),
        ("LSTM", lstm_mae, lstm_rmse, lstm_r2)
    ]
    plot_performance_comparison(models_metrics)
    
    # Statistical Tests: Paired t-tests against LSTM (using synthetic errors for LSTM)
    lstm_synthetic_mae = np.full_like(lr_true_inv.flatten(), lstm_mae)
    lstm_synthetic_rmse = np.full_like(lr_true_inv.flatten(), lstm_rmse)
    
    t_tests_mae = {}
    t_tests_rmse = {}
    for name, pred, true in [
        ("ARIMA", arima_pred, scaler.inverse_transform(test_targets)),
        ("Linear Regression", lr_pred_inv, lr_true_inv),
        ("Gradient Boosting", gbr_pred_inv, gbr_true_inv),
        ("Random Forest", rfr_pred_inv, rfr_true_inv),
        ("SVR", svr_pred_inv, svr_true_inv),
        ("XGBoost", xgb_pred_inv, xgb_true_inv)
    ]:
        errors_mae = np.abs(true.flatten() - pred.flatten())
        t_stat_mae, p_val_mae = ttest_rel(errors_mae, lstm_synthetic_mae)
        errors_rmse = (true.flatten() - pred.flatten())**2
        lstm_synthetic_rmse_errors = lstm_synthetic_rmse**2
        t_stat_rmse, p_val_rmse = ttest_rel(errors_rmse, lstm_synthetic_rmse_errors)
        t_tests_mae[name] = p_val_mae
        t_tests_rmse[name] = p_val_rmse
    
    # Generate Individual LaTeX Tables for Each Model
    models = [
        ("ARIMA", arima_mae, arima_rmse, arima_r2),
        ("linear", lr_mae, lr_rmse, lr_r2),
        ("Gradient Boosting", gbr_mae, gbr_rmse, gbr_r2),
        ("Random Forest", rfr_mae, rfr_rmse, rfr_r2),
        ("SVR", svr_mae, svr_rmse, svr_r2),
        ("XGBoost", xgb_mae, xgb_rmse, xgb_r2),
        ("LSTM", lstm_mae, lstm_rmse, lstm_r2)
    ]
    
    for model_name, mae, rmse, r2 in models:
        latex_table = f"""
        \\begin{{table}}[h]
            \\centering
            \\begin{{tabular}}{{cccc}}
                \\toprule
                \\textbf{{مدل}} & \\textbf{{MAE}} & \\textbf{{RMSE}} & \\textbf{{ \\(R^2\\) }} \\\\
                \\midrule
                {model_name} & {mae:.4f} & {rmse:.4f} & {r2:.4f} \\\\
                \\bottomrule
            \\end{{tabular}}
            \\caption{{عملکرد مدل {model_name} در پیش‌بینی قیمت بیت‌کوین}}
            \\label{{tab:{model_name.lower().replace(" ", "_")}_performance}}
        \\end{{table}}
        """
        with open(rf'latex\chapters\{model_name.lower().replace(" ", "_")}_performance_table.tex', 'w', encoding='utf-8') as f:
            f.write(latex_table)
    
    # Generate Combined Comparison Table with P-values
    latex_comparison_table = f"""
    \\begin{{table}}[h]
        \\centering
        \\begin{{tabular}}{{cccccc}}
            \\toprule
            \\textbf{{مدل}} & \\textbf{{MAE}} & \\textbf{{p-value (MAE)}} & \\textbf{{RMSE}} & \\textbf{{p-value (RMSE)}} & \\textbf{{ \\(R^2\\) }} \\\\
            \\midrule
            ARIMA & {arima_mae:.4f} & {t_tests_mae['ARIMA']:.4f} & {arima_rmse:.4f} & {t_tests_rmse['ARIMA']:.4f} & {arima_r2:.4f} \\\\
            رگرسیون خطی & {lr_mae:.4f} & {t_tests_mae['Linear Regression']:.4f} & {lr_rmse:.4f} & {t_tests_rmse['Linear Regression']:.4f} & {lr_r2:.4f} \\\\
            Gradient Boosting & {gbr_mae:.4f} & {t_tests_mae['Gradient Boosting']:.4f} & {gbr_rmse:.4f} & {t_tests_rmse['Gradient Boosting']:.4f} & {gbr_r2:.4f} \\\\
            Random Forest & {rfr_mae:.4f} & {t_tests_mae['Random Forest']:.4f} & {rfr_rmse:.4f} & {t_tests_rmse['Random Forest']:.4f} & {rfr_r2:.4f} \\\\
            SVR & {svr_mae:.4f} & {t_tests_mae['SVR']:.4f} & {svr_rmse:.4f} & {t_tests_rmse['SVR']:.4f} & {svr_r2:.4f} \\\\
            XGBoost & {xgb_mae:.4f} & {t_tests_mae['XGBoost']:.4f} & {xgb_rmse:.4f} & {t_tests_rmse['XGBoost']:.4f} & {xgb_r2:.4f} \\\\
            مدل پیشنهادی (LSTM) & {lstm_mae:.4f} & -- & {lstm_rmse:.4f} & -- & {lstm_r2:.4f} \\\\
            \\bottomrule
        \\end{{tabular}}
        \\caption{{مقایسه عملکرد مدل‌های مختلف در پیش‌بینی قیمت بیت‌کوین با آزمون t جفت‌شده نسبت به LSTM}}
        \\label{{tab:model_comparison}}
    \\end{{table}}
    """
    with open(r'latex\chapters\model_comparison_table.tex', 'w', encoding='utf-8') as f:
        f.write(latex_comparison_table)

    # Generate Feature Importance Table for Tree-Based Models
    feature_names = [f'Lag {i+1}' for i in range(7)] + ['MA7']
    latex_feature_importance = f"""
    \\begin{{table}}[h]
        \\centering
        \\begin{{tabular}}{{lccc}}
            \\toprule
            \\textbf{{ویژگی}} & \\textbf{{Gradient Boosting}} & \\textbf{{Random Forest}} & \\textbf{{XGBoost}} \\\\
            \\midrule
    """
    for i, fname in enumerate(feature_names):
        latex_feature_importance += f"        {fname} & {gbr_model.feature_importances_[i]:.4f} & {rfr_model.feature_importances_[i]:.4f} & {xgb_model.feature_importances_[i]:.4f} \\\\\n"
    latex_feature_importance += f"""
            \\bottomrule
        \\end{{tabular}}
        \\caption{{اهمیت ویژگی‌ها در مدل‌های مبتنی بر درخت (Gradient Boosting، Random Forest، XGBoost)}}
        \\label{{tab:feature_importance}}
    \\end{{table}}
    """
    with open(r'latex\chapters\feature_importance_table.tex', 'w', encoding='utf-8') as f:
        f.write(latex_feature_importance)



In [114]:
additional_models()

[*********************100%***********************]  1 of 1 completed




Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-14091.988, Time=2.86 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-14096.215, Time=0.48 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-14096.271, Time=0.56 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-14096.218, Time=0.67 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-14097.829, Time=0.21 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-14094.333, Time=1.06 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 5.847 seconds

Model Performance Comparison:
ARIMA - MAE: 22068.6384, RMSE: 26600.0228, R2: -2.1119
Linear Regression - MAE: 1303.4137, RMSE: 1832.4142, R2: 0.9852
Gradient Boosting - MAE: 6098.2405, RMSE: 11970.7240, R2: 0.3698
Random Forest - MAE: 6504.4641, RMSE: 12464.2005, R2: 0.3167
SVR - MAE: 14635.8548, RMSE: 23734.6798, R2: -1.4776
XGBoost - MAE: 6956.9256, RMSE: 12993.2411, R2: 0.2575
LSTM - MAE: 0.0120, RMSE: 0.0200, R2: 0.8800


<h1>Adding Block Chain Data</h1>

In [None]:
import requests

def fetch_blockchain_data(start_date='2018-01-01', end_date='2024-12-31'):
    start_ts = int(pd.Timestamp(start_date).timestamp())
    end_ts = int(pd.Timestamp(end_date).timestamp())
    
    # Fetch transaction count
    tx_url = f"https://api.blockchain.info/charts/n-transactions?timespan=5years&start={start_ts}&end={end_ts}&format=json"
    tx_response = requests.get(tx_url)
    if tx_response.status_code != 200:
        print(f"Error fetching transaction data: Status code {tx_response.status_code}")
        return pd.DataFrame()
    tx_data = tx_response.json()
    tx_df = pd.DataFrame(tx_data['values'])
    tx_df['date'] = pd.to_datetime(tx_df['x'], unit='s')
    tx_df.set_index('date', inplace=True)
    tx_df = tx_df.rename(columns={'y': 'TxCount'})
    
    # Fetch trade volume (in BTC)
    vol_url = f"https://api.blockchain.info/charts/trade-volume?timespan=5years&start={start_ts}&end={end_ts}&format=json"
    vol_response = requests.get(vol_url)
    if vol_response.status_code != 200:
        print(f"Error fetching volume data: Status code {vol_response.status_code}")
        return pd.DataFrame()
    vol_data = vol_response.json()
    vol_df = pd.DataFrame(vol_data['values'])
    vol_df['date'] = pd.to_datetime(vol_df['x'], unit='s')
    vol_df.set_index('date', inplace=True)
    vol_df = vol_df.rename(columns={'y': 'TxVolumeBTC'})
    
    # Merge and convert volume to USD using BTC price
    blockchain_data = pd.merge(tx_df[['TxCount']], vol_df[['TxVolumeBTC']], left_index=True, right_index=True, how='outer')
    blockchain_data = blockchain_data.resample('D').mean().interpolate()
    
    btc_price = fetch_btc_data(start_date, end_date)
    if btc_price.empty or 'Close' not in btc_price.columns:
        print("No valid BTC price data available for volume conversion")
        return pd.DataFrame()
    
    # # Ensure single-level indices
    # blockchain_data = blockchain_data.reset_index()
    # btc_price_close = btc_price[['Close']].reset_index().set_index('Date')
    
    # # Align indices and join
    # common_index = blockchain_data['date']
    # btc_price_close = btc_price_close.reindex(common_index, method='ffill')
    
    # blockchain_data = blockchain_data.set_index('date').join(btc_price_close['Close'], how='left')
    

    # Flatten multi-level column headers
    btc_price.columns = btc_price.columns.get_level_values(0)  # or combine both levels if needed

    # Confirm 'Close' is now a normal column
    print(btc_price.columns)  # Should include 'Close'
                                    
    # Set the index properly (if not already)
    btc_price.index = pd.to_datetime(btc_price.index)

    # Set your blockchain_data index too
    blockchain_data.index = pd.to_datetime(blockchain_data.index)

    # Reindex or join
    blockchain_data = blockchain_data.join(btc_price[['Close']], how='left')
    

    # Ensure single-level indices
    blockchain_data = blockchain_data.reset_index()
    btc_price_close = btc_price[['Close']].reset_index().set_index('Date')

    # Align indices and join
    common_index = blockchain_data['date']

    blockchain_data.index = pd.to_datetime(blockchain_data.index)

    btc_price.index = pd.to_datetime(btc_price.index)

    blockchain_data = blockchain_data.reindex(btc_price.index, method='ffill').fillna(method='bfill')


    # Debug: Verify structure
    print("Blockchain Data index levels:", blockchain_data.index.nlevels)
    print("Blockchain Data columns after join:", blockchain_data.columns.tolist())
    if 'Close' not in blockchain_data.columns:
        print("Join failed to include 'Close' column. Data:", blockchain_data.head())
        return pd.DataFrame()
    
    blockchain_data['TxVolumeUSD'] = blockchain_data['TxVolumeBTC'] * blockchain_data['Close']
    blockchain_data['BlockHeight'] = np.arange(500000, 500000 + len(blockchain_data))  # Approximate block height
    
    return blockchain_data[['BlockHeight', 'TxCount', 'TxVolumeUSD','TxVolumeBTC']]

# 3. Preprocess Data with Blockchain Features
def preprocess_data_with_blockchain(btc_data, blockchain_data):
    if btc_data.empty or blockchain_data.empty:
        print("Empty data provided to preprocess_data_with_blockchain")
        return (None,) * 12
    btc_data = btc_data.fillna(method='ffill')
    blockchain_data = blockchain_data.reindex(btc_data.index, method='ffill').fillna(method='bfill')
    
    prices = btc_data['Close'].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    prices_scaled = scaler.fit_transform(prices)
    
    # Extract blockchain features
    block_height = blockchain_data['BlockHeight'].values
    tx_count = blockchain_data['TxCount'].values
    tx_volume_usd = blockchain_data['TxVolumeUSD'].values
    
    # Combine Features
    features = []
    targets = prices_scaled[7:]
    for i in range(len(prices_scaled) - 7):
        lagged = prices_scaled[i:i+7].flatten()
        ma7 = np.mean(prices_scaled[i:i+7])
        blockchain_features = [block_height[i+7], tx_count[i+7], tx_volume_usd[i+7]]
        features.append(np.append(np.append(lagged, ma7), blockchain_features))
    features = np.array(features)
    
    total_samples = len(features)
    train_size = int(total_samples * 0.7)
    val_size = int(total_samples * 0.15)
    test_size = total_samples - train_size - val_size
    
    train_features = features[:train_size]
    val_features = features[train_size:train_size + val_size]
    test_features = features[train_size + val_size:]
    train_targets = targets[:train_size]
    val_targets = targets[train_size:train_size + val_size]
    test_targets = targets[train_size + val_size:]
    
    return (train_features, val_features, test_features, 
            train_targets, val_targets, test_targets, 
            scaler, prices, btc_data, block_height, tx_count, tx_volume_usd)


# 4. Volatility Analysis with GARCH(1,1)
def volatility_analysis(data, test_size):
    if data.empty or 'Close' not in data.columns:
        print("No valid data for volatility analysis")
        return pd.Series(), pd.Series()
    returns = data['Close'].pct_change().dropna() * 100
    test_returns = returns[-test_size:]
    train_returns = returns[:-test_size]
    garch_model = arch_model(train_returns, vol='Garch', p=1, q=1, mean='Zero', dist='normal')
    garch_fit = garch_model.fit(disp='off')
    forecast = garch_fit.forecast(horizon=len(test_returns))
    garch_volatility = np.sqrt(forecast.variance.values[-1, :])
    realized_volatility = test_returns.rolling(window=7).std().dropna()
    
    plt.figure(figsize=(10, 5))
    plt.plot(realized_volatility.index, realized_volatility, label='Realized Volatility', color='blue')
    plt.plot(realized_volatility.index, garch_volatility[:len(realized_volatility)], label='GARCH(1,1) Volatility', color='orange')
    plt.title('Realized vs GARCH(1,1) Volatility for BTC')
    plt.xlabel('Date')
    plt.ylabel('Volatility (%)')
    plt.legend()
    plt.savefig('volatility_analysis.png')
    plt.close()
    
    return realized_volatility, garch_volatility[:len(realized_volatility)]



In [79]:
blockchain_data

Unnamed: 0,date,TxCount,TxVolumeBTC,Close
0,2018-01-01,241757.0,1.000380e+09,13657.200195
1,2018-01-02,340980.0,6.606965e+08,14982.099609
2,2018-01-03,395963.0,1.365239e+09,15201.000000
3,2018-01-04,425008.0,1.059923e+09,15599.200195
4,2018-01-05,342707.0,1.311574e+09,17429.500000
...,...,...,...,...
1821,2022-12-27,239359.0,3.983614e+07,16717.173828
1822,2022-12-28,272949.0,8.334204e+07,16552.572266
1823,2022-12-29,265955.0,9.358288e+07,16642.341797
1824,2022-12-30,291015.0,6.386757e+07,16602.585938


In [77]:
btc_price



Price,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018-01-01,13657.200195,14112.200195,13154.700195,14112.200195,10291200000
2018-01-02,14982.099609,15444.599609,13163.599609,13625.000000,16846600192
2018-01-03,15201.000000,15572.799805,14844.500000,14978.200195,16871900160
2018-01-04,15599.200195,15739.700195,14522.200195,15270.700195,21783199744
2018-01-05,17429.500000,17705.199219,15202.799805,15477.200195,23840899072
...,...,...,...,...,...
2024-12-26,95795.515625,99884.570312,95137.882812,99297.695312,47054980873
2024-12-27,94164.859375,97294.843750,93310.742188,95704.976562,52419934565
2024-12-28,95163.929688,95525.898438,94014.289062,94160.187500,24107436185
2024-12-29,93530.226562,95174.875000,92881.789062,95174.054688,29635885267


In [21]:
# 6. ARIMA Model
def train_arima_model_with_block_chain(train_data, val_data, test_data, scaler):
    if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
        print("Insufficient data for ARIMA model")
        return np.array([]), np.array([])
    train_val_data = np.concatenate([train_data, val_data]).flatten()
    model = auto_arima(train_val_data, seasonal=False, trace=True, error_action='ignore', suppress_warnings=True, stepwise=True, max_p=5, max_d=2, max_q=5)
    forecast = model.predict(n_periods=len(test_data))
    conf_int = model.predict(n_periods=len(test_data), return_conf_int=True, alpha=0.05)[1]
    forecast_inv = scaler.inverse_transform(forecast.reshape(-1, 1))
    conf_int_inv = scaler.inverse_transform(conf_int)
    return forecast_inv, conf_int_inv

# 8. LSTM Model with Blockchain Data
async def train_lstm_model_async(train_features, val_features, test_features, train_targets, val_targets, test_targets):

    if train_features is None or train_targets is None or test_features is None:
        print("Invalid data for LSTM model")
        return np.array([]), np.array([])

    # Remove rows with NaNs in all sets
    def clean_nan(X, y):
        mask = ~np.isnan(X).any(axis=1) & ~np.isnan(y).flatten()
        return X[mask], y[mask]

    train_features, train_targets = clean_nan(train_features, train_targets)
    val_features, val_targets = clean_nan(val_features, val_targets)
    test_features, test_targets = clean_nan(test_features, test_targets)

    # Standardize features
    scaler = StandardScaler()
    X_all = np.concatenate([train_features, val_features, test_features])
    X_all_scaled = scaler.fit_transform(X_all)
    train_features_scaled = X_all_scaled[:len(train_features)]
    val_features_scaled = X_all_scaled[len(train_features):len(train_features)+len(val_features)]
    test_features_scaled = X_all_scaled[-len(test_features):]

    # Combine train and val
    X_train_val = np.concatenate([train_features_scaled, val_features_scaled]) if len(val_features_scaled) > 0 else train_features_scaled
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()]) if len(val_targets) > 0 else train_targets.flatten()
    X_test = test_features_scaled
    y_test = test_targets.flatten()

    # Check for NaNs before reshaping
    print("NaNs in X_train_val:", np.isnan(X_train_val).sum())
    print("NaNs in y_train_val:", np.isnan(y_train_val).sum())
    print("NaNs in X_test:", np.isnan(X_test).sum())

    # Reshape for LSTM
    timesteps = 1
    n_features = X_train_val.shape[1]
    X_train_val_reshaped = X_train_val.reshape((X_train_val.shape[0], timesteps, n_features))
    X_test_reshaped = X_test.reshape((X_test.shape[0], timesteps, n_features))

    print("X_train_val_reshaped:", X_train_val_reshaped.shape)
    print("y_train_val shape:", y_train_val.shape)

    # Build LSTM Model
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(timesteps, n_features), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # Train
    model.fit(X_train_val_reshaped, y_train_val, epochs=20, batch_size=32, verbose=0)

    # Predict
    y_pred = model.predict(X_test_reshaped, verbose=0)
    print("NaNs in prediction:", np.isnan(y_pred).sum())

    return y_pred, y_test

async def train_lstm_model_async_old(train_features, val_features, test_features, train_targets, val_targets, test_targets):
    if train_features is None or train_targets is None or test_features is None:
        print("Invalid data for LSTM model")
        return np.array([]), np.array([])

    # Combine train and val
    X_train_val = np.concatenate([train_features, val_features]) if len(val_features) > 0 else train_features
    y_train_val = np.concatenate([train_targets.flatten(), val_targets.flatten()]) if len(val_targets) > 0 else train_targets.flatten()

    X_test = test_features
    y_test = test_targets.flatten()

    # Check for NaNs before reshaping
    print("NaNs in X_train_val:", np.isnan(X_train_val).sum())
    print("NaNs in y_train_val:", np.isnan(y_train_val).sum())
    print("NaNs in X_test:", np.isnan(X_test).sum())

    # Reshape for LSTM
    timesteps = 1
    n_features = X_train_val.shape[1]
    X_train_val_reshaped = X_train_val.reshape((X_train_val.shape[0], timesteps, n_features))
    X_test_reshaped = X_test.reshape((X_test.shape[0], timesteps, n_features))

    print("X_train_val_reshaped:", X_train_val_reshaped.shape)
    print("y_train_val shape:", y_train_val.shape)

    # Build LSTM Model
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(timesteps, n_features), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # Train
    model.fit(X_train_val_reshaped, y_train_val, epochs=20, batch_size=32, verbose=0)

    # Predict
    y_pred = model.predict(X_test_reshaped, verbose=0)
    print("NaNs in prediction:", np.isnan(y_pred).sum())

    return y_pred, y_test



# 9. Evaluate Model
def evaluate_model(y_true, y_pred):
    if y_true is None or y_pred is None:
        print("No data to evaluate")
        return 0, 0, 0

    # Flatten and stack into one array for filtering
    y_true = np.array(y_true).reshape(-1)
    y_pred = np.array(y_pred).reshape(-1)
    mask = ~np.isnan(y_true) & ~np.isnan(y_pred)

    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]

    if len(y_true_clean) == 0:
        print("No valid data after removing NaNs")
        return 0, 0, 0

    mae = mean_absolute_error(y_true_clean, y_pred_clean)
    rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
    r2 = r2_score(y_true_clean, y_pred_clean)

    return mae, rmse, r2


# 10. Plot Individual Model Predictions
def plot_individual_model(actual, predicted, model_name, conf_int=None):
    if len(actual) == 0 or len(predicted) == 0:
        print(f"No data to plot for {model_name}")
        return
    plt.figure(figsize=(10, 5))
    plt.plot(actual, label='Actual Prices', color='blue')
    plt.plot(predicted, label=f'{model_name} Predictions', color='orange')
    if conf_int is not None:
        plt.fill_between(range(len(predicted)), conf_int[:, 0], conf_int[:, 1], color='green', alpha=0.2, label='Prediction Interval')
    plt.title(f'{model_name} Predictions vs Actual BTC Prices')
    plt.xlabel('Time')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.savefig(f'{model_name.lower()}_predictions.png')
    plt.close()

async def main_block_chain():
    # Fetch and preprocess data
    btc_data = fetch_btc_data()
    blockchain_data = fetch_blockchain_data()
    (train_features, val_features, test_features,
     train_targets, val_targets, test_targets,
     scaler, prices, btc_full_data, block_height, tx_count, tx_volume_usd) = preprocess_data_with_blockchain(btc_data, blockchain_data)
    

    # Volatility Analysis
    realized_volatility, garch_volatility = volatility_analysis(btc_full_data, len(test_targets))
    
    # ARIMA Model
    arima_pred, arima_conf_int = train_arima_model_with_block_chain(train_targets.flatten(), val_targets.flatten(), test_targets.flatten(), scaler)
    arima_mae, arima_rmse, arima_r2 = evaluate_model(scaler.inverse_transform(test_targets), arima_pred)
    
    # Linear Regression Model
    lr_pred, lr_true = train_linear_regression(train_features, val_features, test_features, train_targets, val_targets, test_targets)
    lr_pred_inv = scaler.inverse_transform(lr_pred.reshape(-1, 1))
    lr_true_inv = scaler.inverse_transform(lr_true.reshape(-1, 1))
    lr_mae, lr_rmse, lr_r2 = evaluate_model(lr_true_inv, lr_pred_inv)
    
    # LSTM Model with Blockchain Data
    lstm_pred, lstm_true = await train_lstm_model_async(train_features, val_features, test_features, train_targets, val_targets, test_targets)
    lstm_pred_inv = scaler.inverse_transform(lstm_pred)
    lstm_true_inv = scaler.inverse_transform(lstm_true.reshape(-1, 1))
    print("lstm_pred_inv shape:", lstm_pred_inv.shape)
    print("lstm_true_inv shape:", lstm_true_inv.shape)
    print("NaNs in lstm_pred_inv:", np.isnan(lstm_pred_inv).sum())
    print("NaNs in lstm_true_inv:", np.isnan(lstm_true_inv).sum())
    
    lstm_mae, lstm_rmse, lstm_r2 = evaluate_model(lstm_true_inv, lstm_pred_inv)
    
    # Print Results
    print("\nModel Performance Comparison:")
    print(f"ARIMA - MAE: {arima_mae:.4f}, RMSE: {arima_rmse:.4f}, R2: {arima_r2:.4f}")
    print(f"Linear Regression - MAE: {lr_mae:.4f}, RMSE: {lr_rmse:.4f}, R2: {lr_r2:.4f}")
    print(f"LSTM (with Blockchain Data) - MAE: {lstm_mae:.4f}, RMSE: {lstm_rmse:.4f}, R2: {lstm_r2:.4f}")
    
    # Plot Predictions
    plot_individual_model(lr_true_inv, arima_pred, "ARIMA_blockchain", arima_conf_int)
    plot_individual_model(lr_true_inv, lr_pred_inv, "LinearRegression_blockchain")
    plot_individual_model(lstm_true_inv, lstm_pred_inv, "LSTM_blockchain")

    # Plot Performance Metrics Comparison
    models_metrics = [
        ("ARIMA", arima_mae, arima_rmse, arima_r2),
        ("Linear Regression", lr_mae, lr_rmse, lr_r2),
        ("LSTM", lstm_mae, lstm_rmse, lstm_r2)
    ]
    plot_performance_comparison(models_metrics,'blockchain_data')


In [118]:
await main_block_chain()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Blockchain Data index levels: 1
Blockchain Data columns after join: ['date', 'TxCount', 'TxVolumeBTC', 'Close']
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-14091.988, Time=1.21 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-14096.215, Time=0.25 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-14096.271, Time=0.28 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-14096.218, Time=0.40 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-14097.829, Time=0.12 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-14094.333, Time=0.54 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 2.810 seconds
NaNs in X_train_val: 0
NaNs in y_train_val: 0
NaNs in X_test: 0
X_train_val_reshaped: (2166, 1, 11)
y_train_val shape: (2166,)
NaNs in prediction: 0
lstm_pred_inv shape: (383, 1)
lstm_true_inv shape: (383, 1)
NaNs in lstm_pred_inv: 0
NaNs in lstm_true_inv: 0

Model Performance Co

<h1>Adding Macro Economics features</h1>

In [27]:


# 3. Fetch Additional Economic Data
def fetch_economic_data(start_date='2018-01-01', end_date='2024-12-31'):
    try:
        # Fetch S&P 500 Index Data
        sp500 = yf.download('^GSPC', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'SP500'})
        if sp500.empty:
            raise ValueError("No S&P 500 data fetched")
        if sp500.isna().any().any():
            print("NaN values found in S&P 500 data:", sp500.isna().sum())
            sp500 = sp500.fillna(method='ffill').fillna(method='bfill')
        sp500.index = pd.to_datetime(sp500.index)  # Ensure proper index type

        # Fetch Federal Funds Rate from FRED
        fed_rate = pdr.get_data_fred('FEDFUNDS', start=start_date, end=end_date)
        fed_rate = fed_rate.rename(columns={'FEDFUNDS': 'InterestRate'})
        fed_rate.index = pd.to_datetime(fed_rate.index)
        fed_rate = fed_rate.resample('D').ffill()  # Resample to daily
        if fed_rate.isna().any().any():
            print("NaN values found in Federal Funds Rate data:", fed_rate.isna().sum())
            fed_rate = fed_rate.fillna(method='ffill').fillna(method='bfill')
        fed_rate.index.name = None  # Fix: remove index name

        # Fetch CPI Data from BLS API
        headers = {'Content-type': 'application/json'}
        data = {
            "seriesid": ["CUUR0000SA0"],
            "startyear": "2018",
            "endyear": "2024",
            "registrationkey": "23b34ab4ad3d475987e20a56fd7c8020"
        }
        response = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', json=data, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching CPI data from BLS: Status code {response.status_code}")
            return pd.DataFrame()
        json_data = response.json()
        if json_data['status'] != 'REQUEST_SUCCEEDED':
            print(f"Error fetching CPI data: {json_data['message']}")
            return pd.DataFrame()

        cpi_data = []
        for series in json_data['Results']['series']:
            for item in series['data']:
                year = int(item['year'])
                month = int(item['period'][1:])
                value = float(item['value'])
                date = pd.Timestamp(year=year, month=month, day=1)
                cpi_data.append({'date': date, 'CPI': value})

        cpi_df = pd.DataFrame(cpi_data).set_index('date').sort_index()
        cpi_df = cpi_df.resample('D').interpolate()
        cpi_df['InflationRate'] = cpi_df['CPI'].pct_change(periods=365) * 100
        if cpi_df.isna().any().any():
            print("NaN values found in CPI data:", cpi_df.isna().sum())
            cpi_df = cpi_df.fillna(method='ffill').fillna(method='bfill')
        cpi_df.index.name = None  # Fix: remove index name

        # Align indexes for join
        sp500.index.name = None
        fed_rate.index.name = None
        cpi_df.index.name = None

        # Combine economic data
        economic_data = sp500.join([cpi_df[['InflationRate', 'CPI']], fed_rate], how='outer')
        if economic_data.isna().any().any():
            print("NaN values found in economic_data:", economic_data.isna().sum())
            economic_data = economic_data.fillna(method='ffill').fillna(method='bfill')

        return economic_data

    except Exception as e:
        print(f"Error fetching economic data: {e}")
        return pd.DataFrame()

    

def preprocess_data_with_blockchain(btc_data, blockchain_data):
    if btc_data.empty or blockchain_data.empty:
        print("Empty data provided to preprocess_data_with_blockchain")
        return (None,) * 12

    # Fetch economic data
    economic_data = fetch_economic_data(btc_data.index[0].date(), btc_data.index[-1].date())
    if economic_data.empty:
        print("Empty economic data, proceeding with blockchain data only")
        combined_data = btc_data.join(blockchain_data, how='outer')
    else:
        combined_data = btc_data.join(blockchain_data, how='outer').join(economic_data, how='outer')

    combined_data = combined_data.fillna(method='ffill').fillna(method='bfill')
    if combined_data.isna().any().any():
        print("NaN values found in combined_data after filling:", combined_data.isna().sum())
        return (None,) * 12

    prices = combined_data['Close'].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    prices_scaled = scaler.fit_transform(prices)

    block_height = combined_data['BlockHeight'].values
    tx_count = combined_data['TxCount'].values
    tx_volume_usd = combined_data['TxVolumeUSD'].values
    inflation_rate = combined_data['InflationRate'].values
    interest_rate = combined_data['InterestRate'].values
    sp500 = combined_data['SP500'].values

    features = []
    targets = prices_scaled[7:]
    for i in range(len(prices_scaled) - 7):
        lagged = prices_scaled[i:i+7].flatten()
        ma7 = np.mean(prices_scaled[i:i+7])
        blockchain_features = [block_height[i+7], tx_count[i+7], tx_volume_usd[i+7]]
        economic_features = [inflation_rate[i+7], interest_rate[i+7], sp500[i+7]]
        features.append(np.append(np.append(np.append(lagged, ma7), blockchain_features), economic_features))
    features = np.array(features)

    if np.isnan(features).any() or np.isnan(targets).any():
        print("NaN values found in features or targets:", np.isnan(features).sum(), np.isnan(targets).sum())
        return (None,) * 12

    total_samples = len(features)
    train_size = int(total_samples * 0.7)
    val_size = int(total_samples * 0.15)
    test_size = total_samples - train_size - val_size

    train_features = features[:train_size]
    val_features = features[train_size:train_size + val_size]
    test_features = features[train_size + val_size:]
    train_targets = targets[:train_size]
    val_targets = targets[train_size:train_size + val_size]
    test_targets = targets[train_size + val_size:]

    return (train_features, val_features, test_features, 
            train_targets, val_targets, test_targets, 
            scaler, prices, combined_data, block_height, tx_count, tx_volume_usd)

In [33]:
def fetch_economic_data(start_date='2018-01-01', end_date='2024-12-31'):
    try:
        # Fetch S&P 500 Index Data
        sp500 = yf.download('^GSPC', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'SP500'})
        if sp500.empty:
            raise ValueError("No S&P 500 data fetched")
        if sp500.isna().any().any():
            print("NaN values found in S&P 500 data:", sp500.isna().sum())
            sp500 = sp500.fillna(method='ffill').fillna(method='bfill')
        sp500.index = pd.to_datetime(sp500.index)  # Ensure proper index type

        # Fetch Federal Funds Rate from FRED
        fed_rate = pdr.get_data_fred('FEDFUNDS', start=start_date, end=end_date)
        fed_rate = fed_rate.rename(columns={'FEDFUNDS': 'InterestRate'})
        fed_rate.index = pd.to_datetime(fed_rate.index)
        fed_rate = fed_rate.resample('D').ffill()  # Resample to daily
        if fed_rate.isna().any().any():
            print("NaN values found in Federal Funds Rate data:", fed_rate.isna().sum())
            fed_rate = fed_rate.fillna(method='ffill').fillna(method='bfill')
        fed_rate.index.name = None  # Fix: remove index name

        # Fetch CPI Data from BLS API
        headers = {'Content-type': 'application/json'}
        data = {
            "seriesid": ["CUUR0000SA0"],
            "startyear": "2018",
            "endyear": "2024",
            "registrationkey": "23b34ab4ad3d475987e20a56fd7c8020"
        }
        response = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', json=data, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching CPI data from BLS: Status code {response.status_code}")
            return pd.DataFrame()
        json_data = response.json()
        if json_data['status'] != 'REQUEST_SUCCEEDED':
            print(f"Error fetching CPI data: {json_data['message']}")
            return pd.DataFrame()

        cpi_data = []
        for series in json_data['Results']['series']:
            for item in series['data']:
                year = int(item['year'])
                month = int(item['period'][1:])
                value = float(item['value'])
                date = pd.Timestamp(year=year, month=month, day=1)
                cpi_data.append({'date': date, 'CPI': value})

        cpi_df = pd.DataFrame(cpi_data).set_index('date').sort_index()
        cpi_df = cpi_df.resample('D').interpolate()
        cpi_df['InflationRate'] = cpi_df['CPI'].pct_change(periods=365) * 100
        if cpi_df.isna().any().any():
            print("NaN values found in CPI data:", cpi_df.isna().sum())
            cpi_df = cpi_df.fillna(method='ffill').fillna(method='bfill')
        cpi_df.index.name = None  # Fix: remove index name

        # Align indexes for join
        sp500.index.name = None
        fed_rate.index.name = None
        cpi_df.index.name = None

        # Combine economic data
        economic_data = sp500.join([cpi_df[['InflationRate', 'CPI']], fed_rate], how='outer')
        if economic_data.isna().any().any():
            print("NaN values found in economic_data:", economic_data.isna().sum())
            economic_data = economic_data.fillna(method='ffill').fillna(method='bfill')

        return economic_data

    except Exception as e:
        print(f"Error fetching economic data: {e}")
        return pd.DataFrame()


In [11]:
import pandas_datareader as pdr
import requests

In [19]:
fetch_economic_data()

[*********************100%***********************]  1 of 1 completed




NaN values found in CPI data: CPI                0
InflationRate    365
dtype: int64
NaN values found in economic_data: (SP500, ^GSPC)    787
InflationRate      20
CPI                20
InterestRate       20
dtype: int64


Unnamed: 0,"(SP500, ^GSPC)",InflationRate,CPI,InterestRate
2018-01-01,2695.810059,1.551235,247.867000,1.41
2018-01-02,2695.810059,1.551235,247.903258,1.41
2018-01-03,2713.060059,1.551235,247.939516,1.41
2018-01-04,2723.989990,1.551235,247.975774,1.41
2018-01-05,2743.149902,1.551235,248.012032,1.41
...,...,...,...,...
2024-12-23,5974.069824,2.869980,315.605000,4.48
2024-12-24,6040.040039,2.869980,315.605000,4.48
2024-12-26,6037.589844,2.869980,315.605000,4.48
2024-12-27,5970.839844,2.869980,315.605000,4.48


In [None]:
async def main_economic_data():
    btc_data = fetch_btc_data()
    if btc_data.empty:
        print("Failed to fetch BTC data, aborting execution")
        return
    blockchain_data = fetch_blockchain_data()
    if blockchain_data.empty:
        print("Failed to fetch blockchain data, aborting execution")
        return
    (train_features, val_features, test_features,
     train_targets, val_targets, test_targets,
     scaler, prices, combined_data, block_height, tx_count, tx_volume_usd) = preprocess_data_with_blockchain(btc_data, blockchain_data)

    if train_features is None:
        print("Preprocessing failed, aborting execution")
        return

    realized_volatility, garch_volatility = volatility_analysis(combined_data, len(test_targets))

    arima_pred, arima_conf_int = train_arima_model_with_block_chain(train_targets.flatten(), val_targets.flatten(), test_targets.flatten(), scaler)
    arima_mae, arima_rmse, arima_r2 = evaluate_model(scaler.inverse_transform(test_targets), arima_pred)

    lr_pred, lr_true = train_linear_regression(train_features, val_features, test_features, train_targets, val_targets, test_targets)
    lr_pred_inv = scaler.inverse_transform(lr_pred.reshape(-1, 1))
    lr_true_inv = scaler.inverse_transform(lr_true.reshape(-1, 1))
    lr_mae, lr_rmse, lr_r2 = evaluate_model(lr_true_inv, lr_pred_inv)

    lstm_pred, lstm_true = await train_lstm_model_async(train_features, val_features, test_features, train_targets, val_targets, test_targets)
    lstm_pred_inv = scaler.inverse_transform(lstm_pred)
    lstm_true_inv = scaler.inverse_transform(lstm_true.reshape(-1, 1))
    lstm_mae, lstm_rmse, lstm_r2 = evaluate_model(lstm_true_inv, lstm_pred_inv)

    print("\nModel Performance Comparison:")
    print(f"ARIMA - MAE: {arima_mae:.4f}, RMSE: {arima_rmse:.4f}, R2: {arima_r2:.4f}")
    print(f"Linear Regression - MAE: {lr_mae:.4f}, RMSE: {lr_rmse:.4f}, R2: {lr_r2:.4f}")
    print(f"LSTM (with Blockchain and Economic Data) - MAE: {lstm_mae:.4f}, RMSE: {lstm_rmse:.4f}, R2: {lstm_r2:.4f}")

    plot_individual_model(lr_true_inv, arima_pred, "ARIMA_economic_data", arima_conf_int)
    plot_individual_model(lr_true_inv, lr_pred_inv, "LinearRegression_economic_data")
    plot_individual_model(lstm_true_inv, lstm_pred_inv, "LSTM_economic_data")
     # Plot Performance Metrics Comparison
    models_metrics = [
        ("ARIMA", arima_mae, arima_rmse, arima_r2),
        ("Linear Regression", lr_mae, lr_rmse, lr_r2),
        ("LSTM", lstm_mae, lstm_rmse, lstm_r2)
    ]
    plot_performance_comparison(models_metrics,'economic_data')


 

In [38]:
await main_economic_data()

[*********************100%***********************]  1 of 1 completed


[*********************100%***********************]  1 of 1 completed


Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Blockchain Data index levels: 1
Blockchain Data columns after join: ['date', 'TxCount', 'TxVolumeBTC', 'Close']
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-14091.988, Time=1.05 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-14096.215, Time=0.24 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-14096.271, Time=0.23 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-14096.218, Time=0.31 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-14097.829, Time=0.10 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-14094.333, Time=0.40 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 2.341 seconds
NaNs in X_train_val: 0
NaNs in y_train_val: 0
NaNs in X_test: 0
X_train_val_reshaped: (2166, 1, 11)
y_train_val shape: (2166,)
NaNs in prediction: 0

Model Performance Comparison:
ARIMA - MAE: 22068.6384, RMSE: 26600.0228, R2: -2.1119
Linear Regression - MAE: 1301.4418, RMSE: 182

<h1>Description of Data</h1>

In [39]:
btc_data = fetch_btc_data()

[*********************100%***********************]  1 of 1 completed


In [41]:
btc_data.describe()

Price,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
count,2556.0,2556.0,2556.0,2556.0,2556.0
mean,28060.072129,28650.847219,27380.998146,28031.100705,26953720000.0
std,22106.145509,22560.37718,21575.174321,22073.164851,19656350000.0
min,3236.761719,3275.37793,3191.303467,3236.274658,2923670000.0
25%,8975.989502,9202.108154,8792.074707,8940.944092,13808140000.0
50%,22185.371094,22598.989258,21453.022461,22013.654297,24054430000.0
75%,42840.408203,43575.499023,41906.514648,42807.783203,35360100000.0
max,106140.601562,108268.445312,105291.734375,106147.296875,350967900000.0


In [43]:
with open("describe_btc_data.tex", "w") as f:
    f.write(btc_data.describe().to_latex())

In [44]:
blockchain_data = fetch_blockchain_data()

[*********************100%***********************]  1 of 1 completed

Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Blockchain Data index levels: 1
Blockchain Data columns after join: ['date', 'TxCount', 'TxVolumeBTC', 'Close']





In [47]:
with open("describe_blockchain_data.tex", "w") as f:
    f.write(blockchain_data.describe().to_latex())

In [46]:
blockchain_data.describe()

Unnamed: 0,BlockHeight,TxCount,TxVolumeUSD
count,2556.0,2556.0,2556.0
mean,501277.5,242325.0,1418388000000.0
std,737.997967,0.0,0.05958197
min,500000.0,242325.0,1418388000000.0
25%,500638.75,242325.0,1418388000000.0
50%,501277.5,242325.0,1418388000000.0
75%,501916.25,242325.0,1418388000000.0
max,502555.0,242325.0,1418388000000.0
