In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
import joblib

from utilities import get_evaluation_metrics


In [2]:
# Output directory for the fitted ARIMA result pickles (one file per ticker).
MODELS_PATH = os.path.join("models", "ARIMA")

# Input directory (with trailing slash) whose entries are read as per-ticker
# CSV files.
SUBDATASET_PATH = "data/subdataset/"


In [3]:
# Fit one ARIMA(1, 1, 0) model per ticker file, persist it, and collect the
# hold-out error metrics. After the loop, RMSEs / MAEs / MAPEs each hold one
# entry per processed ticker, in processing order.
RMSEs, MAEs, MAPEs = list(), list(), list()

# Create the output directory once, up front. exist_ok=True makes re-running
# the cell safe and avoids re-checking the directory on every iteration.
os.makedirs(MODELS_PATH, exist_ok=True)

# os.listdir also returns sub-directories (for example a ".ipynb_checkpoints"
# folder), which cannot be read as CSV, so only regular files are kept.
# Sorting makes the processing order reproducible, since os.listdir returns
# entries in arbitrary order.
ticker_files = sorted(
    entry for entry in os.listdir(SUBDATASET_PATH)
    if os.path.isfile(os.path.join(SUBDATASET_PATH, entry))
)

for ticker in tqdm_notebook(ticker_files, desc="Train and Evaluate ARIMA"):
    # The model file is named after everything before the first dot of the
    # data file name (kept as-is so existing model paths stay valid).
    model_path = os.path.join(MODELS_PATH, f"{ticker.split('.')[0]}.pkl")

    # os.path.join works whether or not SUBDATASET_PATH ends with a separator.
    df = pd.read_csv(os.path.join(SUBDATASET_PATH, ticker), index_col="Date")

    # Target series: the "Close" value five rows ahead. The shift leaves NaN
    # in the last five rows; dropna removes those (and any other row that
    # contains a missing value).
    df["Next 5 Days Close"] = df["Close"].shift(-5)
    df = df.dropna()

    y = df["Next 5 Days Close"]

    # Chronological split (no shuffling): the first 90% of rows are used for
    # fitting and the final 10% are held out for evaluation.
    y_train, y_test = train_test_split(y, test_size=.1, shuffle=False)

    # Replace the "Date" index with a plain 0..n-1 positional index.
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    mod = ARIMA(y_train, order=(1, 1, 0))
    res = mod.fit()
    res.save(model_path)

    # Reload from disk so the forecast is produced by the persisted artifact,
    # which also exercises the save/load round trip.
    model = ARIMAResults.load(model_path)

    # Multi-step forecast covering the whole hold-out window in one call.
    y_pred = model.forecast(len(y_test))

    # get_evaluation_metrics is a project helper; it returns three values,
    # unpacked here as RMSE, MAE and MAPE.
    RMSE, MAE, MAPE = get_evaluation_metrics(y_test, y_pred)
    RMSEs.append(RMSE)
    MAEs.append(MAE)
    MAPEs.append(MAPE)


Train and Evaluate ARIMA:   0%|          | 0/493 [00:00<?, ?it/s]

In [4]:
# Average each metric over all tickers, then report the three means.
mean_rmse, mean_mae, mean_mape = (
    np.mean(scores) for scores in (RMSEs, MAEs, MAPEs)
)
print(f"RMSE: {mean_rmse}\nMAE: {mean_mae}\nMAPE: {mean_mape}")


RMSE: 21.252175374651312
MAE: 17.668906418796027
MAPE: 0.08602918633225576
