In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid, GridSearchCV
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [9]:
# Load your dataset (replace this with your actual dataset loading code)
# For example, assuming you have a CSV file 'your_dataset.csv'
dataset = pd.read_csv('BTC-USD.csv')

# Assuming you have columns like 'Date', 'Low', and 'High'
# Make sure to preprocess your data accordingly

# Function to train ARIMA model
def train_arima(train_data, order):
    model = ARIMA(train_data, order=order)
    fitted_model = model.fit()
    return fitted_model

# Function to make ARIMA predictions
def arima_predict(model, start, end):
    predictions = model.predict(start=start, end=end, typ='levels')
    return predictions

# Function to perform time series cross-validation for ARIMA
def time_series_cross_validation_arima(data, order, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    rmse_scores = []

    for train_index, test_index in tscv.split(data):
        train_data, test_data = data[train_index], data[test_index]

        # Train ARIMA model
        arima_model = train_arima(train_data, order)

        # Make ARIMA predictions
        arima_preds = arima_predict(arima_model, start=len(train_data), end=len(train_data) + len(test_data) - 1)

        # Calculate RMSE for this fold
        rmse = np.sqrt(mean_squared_error(test_data, arima_preds))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)

# Function to train Random Forest model
def train_random_forest(X_train, y_train, optimal_params):
    rf_model = RandomForestRegressor(**optimal_params, random_state=42)
    rf_model.fit(X_train, y_train)
    return rf_model

# Function to make Random Forest predictions
def random_forest_predict(model, X_test):
    predictions = model.predict(X_test)
    return predictions

# Function to perform time series cross-validation for Random Forest
def time_series_cross_validation_rf(X, y, optimal_params, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    rmse_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train Random Forest model
        rf_model = train_random_forest(X_train, y_train, optimal_params)

        # Make Random Forest predictions
        rf_preds = random_forest_predict(rf_model, X_test)

        # Calculate RMSE for this fold
        rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)

# Combine ARIMA and Random Forest predictions
def hybrid_predict(arima_preds, rf_preds, arima_weight=0.5):
    hybrid_preds = arima_weight * arima_preds + (1 - arima_weight) * rf_preds
    return hybrid_preds

# Use auto_arima to find optimal ARIMA orders
arima_order_low = pm.auto_arima(dataset['Low'], seasonal=False).order
arima_order_high = pm.auto_arima(dataset['High'], seasonal=False).order

# Perform time series cross-validation for ARIMA
cv_rmse_arima_low = time_series_cross_validation_arima(dataset['Low'].values, arima_order_low)
cv_rmse_arima_high = time_series_cross_validation_arima(dataset['High'].values, arima_order_high)
print(f'ARIMA Cross-Validation RMSE for Low: {cv_rmse_arima_low}')
print(f'ARIMA Cross-Validation RMSE for High: {cv_rmse_arima_high}')

# Train ARIMA models on the entire dataset
arima_model_low = train_arima(dataset['Low'].values, arima_order_low)
arima_model_high = train_arima(dataset['High'].values, arima_order_high)


dataset['Low_Lag1'] = dataset['Low'].shift(1)
dataset['High_Lag1'] = dataset['High'].shift(1)

# Drop the rows with NaN values resulting from the shift
dataset = dataset.dropna()

# Now you can proceed with the rest of the code
X_rf = dataset[['Low_Lag1', 'High_Lag1']].values


# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Function to find optimal Random Forest parameters
def find_optimal_rf_parameters(X, y, param_grid):
    optimal_params = None
    min_rmse = float('inf')

    for params in ParameterGrid(param_grid):
        cv_rmse = time_series_cross_validation_rf(X, y, params)
        if cv_rmse < min_rmse:
            min_rmse = cv_rmse
            optimal_params = params

    return optimal_params

# Find the optimal Random Forest parameters
optimal_rf_params_low = find_optimal_rf_parameters(X_rf, dataset['Low'].values, param_grid_rf)
optimal_rf_params_high = find_optimal_rf_parameters(X_rf, dataset['High'].values, param_grid_rf)
print(f'Optimal Random Forest Parameters for Low: {optimal_rf_params_low}')
print(f'Optimal Random Forest Parameters for High: {optimal_rf_params_high}')

# Perform time series cross-validation for Random Forest
cv_rmse_rf_low = time_series_cross_validation_rf(X_rf, dataset['Low'].values, optimal_rf_params_low)
cv_rmse_rf_high = time_series_cross_validation_rf(X_rf, dataset['High'].values, optimal_rf_params_high)
print(f'Random Forest Cross-Validation RMSE for Low: {cv_rmse_rf_low}')
print(f'Random Forest Cross-Validation RMSE for High: {cv_rmse_rf_high}')

# Train Random Forest models on the entire dataset
rf_model_low = train_random_forest(X_rf, dataset['Low'].values, optimal_rf_params_low)
rf_model_high = train_random_forest(X_rf, dataset['High'].values, optimal_rf_params_high)



ARIMA Cross-Validation RMSE for Low: 4245.468716699945
ARIMA Cross-Validation RMSE for High: 4168.154980726558
Optimal Random Forest Parameters for Low: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Optimal Random Forest Parameters for High: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Cross-Validation RMSE for Low: 1888.9011579531684
Random Forest Cross-Validation RMSE for High: 2076.9762950038703


In [10]:
num_rows = len(dataset)

print(f"Number of rows in the CSV file: {num_rows}")

Number of rows in the CSV file: 365


In [11]:
def predict_next_values(dataset, num_days, num_next_values):
    # Extract the specified number of days for prediction
    last_days_data = dataset.tail(num_days)

    # Feature engineering for ARIMA
    arima_features_low = last_days_data['Low'].values
    arima_features_high = last_days_data['High'].values

    # Feature engineering for Random Forest
    X_rf_predict = last_days_data[['Low_Lag1', 'High_Lag1']].values

    # Make predictions using ARIMA
    arima_pred_low_next = arima_predict(arima_model_low, start=len(last_days_data['Low']), end=len(last_days_data['Low']) + num_next_values - 1)
    arima_pred_high_next = arima_predict(arima_model_high, start=len(last_days_data['High']), end=len(last_days_data['High']) + num_next_values - 1)

    # Make predictions using Random Forest
    rf_pred_low_next = random_forest_predict(rf_model_low, X_rf_predict[-1].reshape(1, -1))
    rf_pred_high_next = random_forest_predict(rf_model_high, X_rf_predict[-1].reshape(1, -1))

    # Combine predictions for the next values
    hybrid_pred_low_next = hybrid_predict(arima_pred_low_next, rf_pred_low_next)
    hybrid_pred_high_next = hybrid_predict(arima_pred_high_next, rf_pred_high_next)

    # Display or use hybrid_pred_low_next and hybrid_pred_high_next for further analysis
    print(f'Hybrid Prediction for Low (Next {num_next_values} Values): {hybrid_pred_low_next}')
    print(f'Hybrid Prediction for High (Next {num_next_values} Values): {hybrid_pred_high_next}')

# Example usage:
print(dataset.tail(5))
num_days = 181
num_next_values = 3
predict_next_values(dataset, num_days, num_next_values)


           Date          Open          High           Low         Close  \
361  2024-02-06  42657.390625  43344.148438  42529.019531  43084.671875   
362  2024-02-07  43090.019531  44341.949219  42775.957031  44318.222656   
363  2024-02-08  44332.125000  45575.839844  44332.125000  45301.566406   
364  2024-02-09  45297.382813  48152.496094  45260.824219  47147.199219   
365  2024-02-10  47153.527344  48087.664063  46926.738281  47762.769531   

        Adj Close       Volume      Low_Lag1     High_Lag1  
361  43084.671875  16798476726  42264.816406  43494.250000  
362  44318.222656  21126587775  42529.019531  43344.148438  
363  45301.566406  26154524080  42775.957031  44341.949219  
364  47147.199219  39316770844  44332.125000  45575.839844  
365  47762.769531  16489554944  45260.824219  48152.496094  
Hybrid Prediction for Low (Next 3 Values): [37003.795678  36992.6189205 36942.1540765]
Hybrid Prediction for High (Next 3 Values): [38812.99585225 38599.16231375 38531.12960439]


In [13]:


# Extract the last 100 days for prediction
last_100_days_data = dataset.tail(180)

# Feature engineering for ARIMA
arima_features_low = last_100_days_data['Low'].values
arima_features_high = last_100_days_data['High'].values

# Feature engineering for Random Forest
X_rf_predict = last_100_days_data[['Low_Lag1', 'High_Lag1']].values

# Make predictions using ARIMA
arima_pred_low_next = arima_predict(arima_model_low, start=len(last_100_days_data['Low']), end=len(last_100_days_data['Low']))
arima_pred_high_next = arima_predict(arima_model_high, start=len(last_100_days_data['High']), end=len(last_100_days_data['High']))

# Make predictions using Random Forest
rf_pred_low_next = random_forest_predict(rf_model_low, X_rf_predict[-1].reshape(1, -1))
rf_pred_high_next = random_forest_predict(rf_model_high, X_rf_predict[-1].reshape(1, -1))

# Combine predictions for the next value
hybrid_pred_low_next = hybrid_predict(arima_pred_low_next, rf_pred_low_next)
hybrid_pred_high_next = hybrid_predict(arima_pred_high_next, rf_pred_high_next)

# Display or use hybrid_pred_low_next and hybrid_pred_high_next for further analysis
print(f'Hybrid Prediction for Low (Next Value): {hybrid_pred_low_next[0]}')
print(f'Hybrid Prediction for High (Next Value): {hybrid_pred_high_next[0]}')

Hybrid Prediction for Low (Next Value): 36872.30251400298
Hybrid Prediction for High (Next Value): 38931.11690263591
