In [11]:
# Clear every user-defined name from the interactive namespace before the notebook
# runs; -f skips the confirmation prompt.
%reset -f

# 0. Import dependencies and load data

In [1]:
from datetime import timedelta
import os
import sys
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

from xgboost import XGBRegressor
from prophet import Prophet

# Make the parent of the working directory (needed for the `src.<module>` imports)
# and its `src/` sub-folder importable.
_project_root = os.path.dirname(os.getcwd())
for _extra_path in (_project_root, os.path.join(_project_root, "src")):
    # Skip entries that are already present, so re-running this cell does not
    # keep appending duplicates to sys.path.
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

Importing plotly failed. Interactive plots will not work.


In [2]:
from src.model_utility_functions import (
    preprocess_data,
    split_data,
    evaluate_model,
    cross_validation_scores,
    prepare_prophet_data,
)

from src.lag_utility_functions import (
    plot_acf_pacf,
    evaluate_ar_model,
    compare_lag_sets,
    plot_predictions,
    extract_lags,
)

from src.ma_utility_functions import (
    evaluate_moving_average,
    compare_moving_average_windows,
    plot_moving_average_predictions,
    extract_ma,
)

In [3]:
# Folder with the raw CSV files: "<parent of the working directory>/data".
_notebook_parent = os.path.dirname(os.getcwd())
DATA_FOLDER_PATH = os.path.join(_notebook_parent, "data")
# Suffix shared by the price files; the ticker symbol is prepended to form the file name.
BASE_FILE_NAME = "_in_USD_historical_data.csv"

In [4]:
def _load_prices(ticker):
    """Read the CSV named `ticker + BASE_FILE_NAME` from DATA_FOLDER_PATH and run it through preprocess_data."""
    csv_path = os.path.join(DATA_FOLDER_PATH, f"{ticker}{BASE_FILE_NAME}")
    return preprocess_data(pd.read_csv(csv_path))


# Files are read and preprocessed one ticker at a time, in this order.
btc, eth, ltc = (_load_prices(ticker) for ticker in ("BTC", "ETH", "LTC"))

In [5]:
# Split the preprocessed BTC data into a training part and a held-out part using the
# project helper. test_size=0.2 is presumably the fraction reserved for testing —
# confirm against split_data.
train, test = split_data(btc, test_size=0.2)

# 1.1. Find best lags

In [None]:
# Inspect the BTC data with the project's ACF/PACF helper, requesting 20 lags;
# the lag candidates chosen below are read off its output.
plot_acf_pacf(btc, lags=20)

Based on the ACF and PACF results, I choose the lags 1, 2, 7 and 10.

In [8]:
# Candidate lags to evaluate; each value is passed on its own to evaluate_ar_model
# in the next cell.
lag_set = (1, 2, 7, 10)

In [None]:
# Run the AR evaluation helper once per candidate lag and keep only the second
# element of the pair it returns (the predictions), keyed by the lag.
predictions_dict = dict.fromkeys(lag_set)
for candidate_lag in lag_set:
    _ignored, lag_predictions = evaluate_ar_model(train, test, candidate_lag)
    predictions_dict[candidate_lag] = lag_predictions

# Hand the held-out data and the per-lag predictions to the plotting helper.
plot_predictions(test, predictions_dict)

In [None]:
# Compare the candidate lags on the BTC data with the project helper; the note
# below this cell refers to the MSE values it reports.
compare_lag_sets(btc, lag_set)

Based on the lag graph and the MSE of each lag, I will choose the lags 1, 7 and 10.

# 1.2. Choose moving average windows

In [None]:
# Five arbitrarily chosen window sizes to compare.
window_sizes = [3, 5, 7, 10, 14]

# The helper hands back two objects: a mapping read below as window size -> MSE,
# and the predictions that are passed on to the plotting helper.
mse_results, predictions_dict = compare_moving_average_windows(btc, window_sizes)
plot_moving_average_predictions(test, predictions_dict)

print("Mean Squared Error for each window size:")
for size, error in mse_results.items():
    print(f"Window Size {size}: MSE = {error:.4f}")

Based on the graph and the MSE, I will choose the window sizes 3, 5 and 7.

# 2. Extract features

In [6]:
# Wrap each of the three price objects in a DataFrame (BTC, ETH, LTC in that order).
btc, eth, ltc = (pd.DataFrame(prices) for prices in (btc, eth, ltc))

In [7]:
# Lags and moving-average windows selected in sections 1.1 and 1.2.
chosen_lags = [1, 7, 10]
chosen_windows = [3, 5, 7]

# First pass: lag features for every currency.
btc, eth, ltc = (extract_lags(frame, chosen_lags) for frame in (btc, eth, ltc))

# Second pass: moving-average features on top of the lag-augmented frames.
btc, eth, ltc = (extract_ma(frame, chosen_windows) for frame in (btc, eth, ltc))

# 3. Prepare the train and test data

In [8]:
# Tag every frame with a numeric currency code (BTC=0, ETH=1, LTC=2) so the rows
# can still be told apart once they are stacked into one frame.
for frame, currency_code in ((btc, 0), (eth, 1), (ltc, 2)):
    frame["currency_id"] = currency_code

# Stack the three frames in BTC, ETH, LTC order; the original indexes are kept.
combined_data = pd.concat([btc, eth, ltc])

In [9]:
# Target column: the price found one row further down within the same currency.
# The shift is applied per currency_id group, so the last row of each currency gets
# NaN rather than the first price of the next currency.
combined_data["y"] = combined_data.groupby("currency_id")["price"].shift(-1)

In [10]:
# Remove every row with a missing value in any column. This includes the last row of
# each currency, whose target y is NaN after the per-currency shift(-1); rows with
# incomplete lag / moving-average features are presumably dropped here too (verify
# against extract_lags / extract_ma).
combined_data = combined_data.dropna()

In [11]:
# Features are every column except the target; the target is the next-row price.
X = combined_data.drop(columns=["y"])
y = combined_data["y"]

# Share of each currency's rows that goes into the training set.
TRAIN_FRACTION = 0.8

# combined_data is stacked currency after currency (BTC, ETH, LTC). Cutting the
# stacked frame at 80% of its total length would therefore put only the tail of the
# last currency into the test set. Instead, split every currency on its own: the
# first 80% of its rows (in row order) are used for training and the remaining rows
# for testing.
# NOTE(review): this relies on rows being in chronological order within each
# currency, exactly as the positional split did — confirm against preprocess_data.
currency = combined_data["currency_id"]
rows_per_currency = currency.map(currency.value_counts()).to_numpy()
position_in_currency = combined_data.groupby("currency_id").cumcount().to_numpy()
is_train = position_in_currency < (rows_per_currency * TRAIN_FRACTION).astype(int)

# A boolean NumPy mask selects rows by position, so the duplicated index labels
# shared by the currencies cannot cause mis-alignment.
X_train, X_test = X[is_train], X[~is_train]
y_train, y_test = y[is_train], y[~is_train]

# Kept for backward compatibility: number of rows in the training set.
train_size = int(is_train.sum())

# 4. Model Comparison

In [13]:
# Candidate values sampled by the randomized search for the Random Forest.
rf_param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Candidate values sampled by the randomized search for XGBoost.
xgb_param_grid = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7, 10],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

In [None]:
# Randomized hyperparameter search for the Random Forest: 50 sampled candidates,
# 5-fold cross-validation, all cores, fixed seeds for repeatability.
# NOTE(review): cv=5 uses scikit-learn's default splitter on time-ordered rows —
# confirm that this is the intended validation scheme.
rf = RandomForestRegressor(random_state=42)
rf_search_options = {
    "n_iter": 50,
    "cv": 5,
    "verbose": 2,
    "random_state": 42,
    "n_jobs": -1,
}
rf_random = RandomizedSearchCV(rf, rf_param_grid, **rf_search_options)
rf_random.fit(X_train, y_train)
print(f"Best Random Forest Params: {rf_random.best_params_}")

In [None]:
# Randomized hyperparameter search for XGBoost with the same budget as the Random
# Forest search: 50 sampled candidates, 5-fold cross-validation, all cores.
xgb = XGBRegressor(random_state=42)
xgb_search_options = {
    "n_iter": 50,
    "cv": 5,
    "verbose": 2,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_random = RandomizedSearchCV(xgb, xgb_param_grid, **xgb_search_options)
xgb_random.fit(X_train, y_train)
print(f"Best XGBoost Params: {xgb_random.best_params_}")

In [24]:
# Build the two-column frame Prophet is fitted on: `ds` is taken from the index of
# combined_data (used as the date) and `y` is the price column.
prophet_df = (
    combined_data[["price"]]
    .rename(columns={"price": "y"})
    .assign(ds=combined_data.index)
    .loc[:, ["ds", "y"]]
)

In [None]:
# Fit Prophet on the training rows only and predict on the dates of the test rows, so
# that its predictions line up one-to-one with y_test.
#
# The previous version fitted on every row (test period included) and then took the
# last len(y_test) rows of a forecast that also contained 30 dates beyond the end of
# the data, so the values scored against y_test did not belong to the test rows.
#
# `y` is the same target the tree models learn (the next-row price), dated by the row
# it belongs to. The index is assumed to hold dates, as the `ds` column built for
# prophet_df already assumes.
# NOTE(review): Prophet is univariate, so rows of all currencies found in the
# training set are fitted as a single series — confirm this is the intended baseline.
prophet_train = pd.DataFrame(
    {"ds": pd.to_datetime(y_train.index), "y": y_train.to_numpy()}
)

model = Prophet()
model.fit(prophet_train)

# Predict once per distinct test date, then map the values back onto the test rows:
# Prophet returns its forecast ordered by date, and several currencies may share a
# date.
test_dates = pd.to_datetime(y_test.index)
future = pd.DataFrame({"ds": test_dates.unique()})
forecast = model.predict(future)
yhat_by_date = forecast.set_index("ds")["yhat"]
prophet_predictions = yhat_by_date.reindex(test_dates).to_numpy()

In [None]:
# Score the best Random Forest found by the search on the held-out rows, then report
# cross-validation scores on the training data.
best_rf = rf_random.best_estimator_
rf_predictions = best_rf.predict(X_test)
evaluate_model(y_test.values, rf_predictions, "Random Forest")
cross_validation_scores(best_rf, X_train, y_train)

In [None]:
# Score the best XGBoost model found by the search on the held-out rows, then report
# cross-validation scores on the training data.
best_xgb = xgb_random.best_estimator_
xgb_predictions = best_xgb.predict(X_test)
evaluate_model(y_test.values, xgb_predictions, "XGBoost")
cross_validation_scores(best_xgb, X_train, y_train)

In [None]:
# Evaluate the Prophet predictions against the same held-out targets (y_test) that
# the Random Forest and XGBoost models are scored on.
evaluate_model(y_test.values, prophet_predictions, "Prophet")

It seems that the Random Forest is the best-performing model.

# 5. Forecast

In [35]:
# Locate the persisted model: "<parent of the working directory>/models/rf_03.pkl".
root_dir = os.path.dirname(os.getcwd())
models_dir = os.path.join(root_dir, "models")
# File the tuned Random Forest is pickled to / loaded from in the next cells.
model_file_path = os.path.join(models_dir, "rf_03.pkl")

In [36]:
# # save the model for future use
# with open(model_file_path, "wb") as f:
#     pickle.dump(rf_random.best_estimator_, f)

In [37]:
# Load the persisted model. The file is opened in a `with` block so the handle is
# closed as soon as unpickling finishes (or fails).
# NOTE: unpickling can execute arbitrary code — only load model files produced by
# this project.
with open(model_file_path, "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# One-step-ahead forecast for BTC (currency_id == 0), appended to a copy of its
# feature history.
predictions = []
history = X[X["currency_id"] == 0].copy()
last_row = history.iloc[-1]

# Predict from a one-row DataFrame rather than a list holding a Series: the column
# names are preserved, so an estimator fitted on a DataFrame receives matching
# feature names and does not emit a missing-feature-names warning.
pred = model.predict(history.iloc[[-1]])[0]
predictions.append(pred)

# The new row is dated one day after the last known row and carries the predicted
# price.
# NOTE(review): only `price` is replaced on the appended row; every other column
# (lags, moving averages, currency_id) is copied unchanged from the last known row.
tomorrow_date = last_row.name + timedelta(days=1)
new_row = last_row.copy()
new_row["price"] = pred
new_row.name = tomorrow_date
history = pd.concat([history, new_row.to_frame().T])

# Show only the end of the frame, where the appended row is, instead of dumping it all.
history.tail()

### Test the forecast method

In [12]:
from src.crypto_forecast_model import CryptoForecastModel

In [13]:
# Exercise the packaged forecasting class: create it, have it read its model
# (presumably the pickled estimator — confirm in CryptoForecastModel.read_model),
# then request a forecast for BTC from the full feature frame X.
cfm = CryptoForecastModel()
cfm.read_model()
cfm.forecast(X, "BTC")