#### Model training and evaluation notebook, tests XGBoost, Random Forest, and compares with Moving Averages baseline approach includes mlflow experiment tracking.

Note that in pipeline training and evaluation is performed as pipeline steps in sagemaker jobs, this one is for experimentation and comparison

In [2]:
%pip install -q optuna mlflow==2.13.2 sagemaker-mlflow==0.1.0

import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
import xgboost as xgb
from optuna import create_study
import mlflow
import logging
import sagemaker
import os
import matplotlib.pyplot as plt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
root_folder = "data/processed"

data_filename = "data.csv"

#### Data loading

In [4]:
df = pd.read_csv(f"../{root_folder}/{data_filename}")

train_df = df[df["type"] == "train"]
validation_df = df[df["type"] == "validation"]
test_df = df[df["type"] == "test"]

In [4]:
y_train = train_df["close_target"]
X_train = train_df.drop(columns=["close_target", "datetime", "type", "version"])

y_validation = validation_df["close_target"]
X_validation = validation_df.drop(
    columns=["close_target", "datetime", "type", "version"]
)

y_test = test_df["close_target"]
X_test = test_df.drop(columns=["close_target", "datetime", "type", "version"])

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)
dtest = xgb.DMatrix(X_test, label=y_test)

#### Constants setting

In [5]:
tracking_server_arn = (
    "arn:aws:sagemaker:eu-central-1:567821811420:mlflow-tracking-server/wildfire-mj"
)
experiment_name = "index-predictor-model-training-notebook"

model_filename = "model_xgb_v0_0_1.xgb"
model_folder = "models"

#### Custom metrics computation

In [6]:
def compute_cumulative_reward(y_pred, close_prices):
    """Get the cumulative, reward, since the model predicts if at time t+3 the price will be higher
    or lower than at time t, if model predicts correctly, we get the difference between the price at
    time t+3 and t, if the model predicts incorrectly, we get the negative difference between the
    price at time t+3 and t"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append(close_prices[i + 3] - close_prices[i])
        else:
            rewards.append(close_prices[i] - close_prices[i + 3])
    return np.sum(rewards), rewards


def compute_cumulative_return(y_pred, close_prices):
    """Similar to the compute_cumulative_reward function, but in percentage terms"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append((close_prices[i + 3] - close_prices[i]) / close_prices[i])
        else:
            rewards.append((close_prices[i] - close_prices[i + 3]) / close_prices[i])
    return np.sum(rewards), rewards

In [7]:
%mkdir -p ../models

### XGBoost Model

In [8]:
mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)

optuna_logger = logging.getLogger("optuna")
optuna_logger.setLevel(logging.WARNING)

with mlflow.start_run(
    run_name=sagemaker.utils.name_from_base("train-xgboost-optuna")
) as run:

    def objective(trial):

        params = {
            "objective": "binary:logistic",
            "max_depth": trial.suggest_int("max_depth", 3, 7),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
            "eval_metric": "logloss",
        }
        num_boost_round = trial.suggest_int("num_boost_round", 50, 100)

        evals = [(dtrain, "train"), (dvalidation, "eval")]

        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=20,
            verbose_eval=False,
        )

        y_pred_validation = bst.predict(dvalidation)
        y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)

        validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)

        return validation_accuracy

    study = create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    print(study.best_params)

    best_params = study.best_params

    params = {
        "objective": "binary:logistic",
        "max_depth": best_params["max_depth"],
        "learning_rate": best_params["learning_rate"],
        "eval_metric": "logloss",
    }

    evals = [(dtrain, "train"), (dvalidation, "eval")]

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=best_params["num_boost_round"],
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    y_pred_validation = bst.predict(dvalidation)
    y_pred_test = bst.predict(dtest)

    y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)
    y_pred_test_binary = (y_pred_test > 0.5).astype(int)

    validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)
    test_accuracy = accuracy_score(y_test, y_pred_test_binary)

    cumulative_return, return_ = compute_cumulative_return(
        y_pred_test_binary, test_df["close"].values
    )
    cumulative_reward, reward_ = compute_cumulative_reward(
        y_pred_test_binary, test_df["close"].values
    )

    mlflow.log_params(
        {
            "train_dataset_size": len(y_train),
            "validation_dataset_size": len(y_validation),
            "test_dataset_size": len(y_test),
            "number_of_optuna_trials": 10,
            "final_model_hyperparams": best_params,
        },
        run_id=run.info.run_id,
    )

    mlflow.log_metrics(
        {
            "validation_accuracy": validation_accuracy * 100,
            "test_accuracy": test_accuracy * 100,
            "cumulative_return": cumulative_return,
            "cumulative_reward": cumulative_reward,
        }
    )

    bst.save_model(f"../{model_folder}/{model_filename}")
    mlflow.log_artifact(
        f"{os.path.pardir}/{model_folder}", model_filename, run_id=run.info.run_id
    )

    print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print(
        "Cumulative return:",
        cumulative_return,
    )
    print(
        "Cumulative reward:",
        cumulative_reward,
    )

    mlflow.end_run(status="FINISHED")

{'max_depth': 3, 'learning_rate': 0.00730110583727178, 'num_boost_round': 55}
Validation Accuracy: 56.11%
Test Accuracy: 55.46%
Cumulative return: 0.02824741766309955
Cumulative reward: 154.76953125


#### Plot cumulative return of the model during one day of trading

In [None]:
cumulative_returns = pd.Series(return_).cumsum()

plot_df = pd.DataFrame(
    {"Index": range(len(cumulative_returns)), "Cumulative Return": cumulative_returns}
)

plt.figure(figsize=(6, 3))
plt.plot(plot_df["Index"], plot_df["Cumulative Return"], label="Cumulative Return")
plt.xlabel("Number of minutes since start of trading day")
plt.ylabel("Cumulative return")
plt.title("Cumulative return of the model during one day of trading")
plt.legend()
plt.show()

### Random Forest

In [9]:
import mlflow
import optuna
from optuna import create_study
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import logging
import sagemaker
import os
import pickle

mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)

optuna_logger = logging.getLogger("optuna")
optuna_logger.setLevel(logging.WARNING)

with mlflow.start_run(
    run_name=sagemaker.utils.name_from_base("train-rf-optuna")
) as run:

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        }

        rf = RandomForestClassifier(**params)
        rf.fit(X_train, y_train)

        y_pred_validation = rf.predict(X_validation)
        validation_accuracy = accuracy_score(y_validation, y_pred_validation)

        return validation_accuracy

    study = create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    print(study.best_params)

    best_params = study.best_params

    rf = RandomForestClassifier(**best_params)
    rf.fit(X_train, y_train)

    y_pred_validation = rf.predict(X_validation)
    y_pred_test = rf.predict(X_test)

    validation_accuracy = accuracy_score(y_validation, y_pred_validation)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    cumulative_return, _ = compute_cumulative_return(
        y_pred_test, test_df["close"].values
    )
    cumulative_reward, _ = compute_cumulative_reward(
        y_pred_test, test_df["close"].values
    )

    mlflow.log_params(
        {
            "train_dataset_size": len(y_train),
            "validation_dataset_size": len(y_validation),
            "test_dataset_size": len(y_test),
            "number_of_optuna_trials": 10,
            "final_model_hyperparams": best_params,
        },
        run_id=run.info.run_id,
    )

    mlflow.log_metrics(
        {
            "validation_accuracy": validation_accuracy * 100,
            "test_accuracy": test_accuracy * 100,
            "cumulative_return": cumulative_return,
            "cumulative_reward": cumulative_reward,
        }
    )

    model_path = f"../{model_folder}/{model_filename}"
    with open(model_path, "wb") as f:
        pickle.dump(rf, f)
    mlflow.log_artifact(model_path, run_id=run.info.run_id)

    print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print("Cumulative return:", cumulative_return)
    print("Cumulative reward:", cumulative_reward)

    mlflow.end_run(status="FINISHED")

{'n_estimators': 113, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt'}
Validation Accuracy: 55.56%
Test Accuracy: 54.34%
Cumulative return: 0.017248126928588882
Cumulative reward: 94.5908203125


### Moving averages (Baseline)

In [10]:
import mlflow
import logging
import sagemaker
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)


def moving_average_signal(close_prices, window):
    ma = close_prices.rolling(window=window).mean()
    signals = np.where(close_prices > ma, 1, 0)
    return signals


def compute_metrics(predictions, actuals):
    accuracy = accuracy_score(actuals, predictions)
    cumulative_return, _ = compute_cumulative_return(
        predictions, test_df["close"].values
    )
    cumulative_reward, _ = compute_cumulative_reward(
        predictions, test_df["close"].values
    )
    return accuracy, cumulative_return, cumulative_reward


fixed_window_size = 8

with mlflow.start_run(
    run_name=sagemaker.utils.name_from_base("moving-averages")
) as run:

    y_pred_validation = moving_average_signal(X_validation["close"], fixed_window_size)
    y_pred_test = moving_average_signal(X_test["close"], fixed_window_size)

    validation_accuracy, validation_cumulative_return, validation_cumulative_reward = (
        compute_metrics(y_pred_validation, y_validation)
    )
    test_accuracy, test_cumulative_return, test_cumulative_reward = compute_metrics(
        y_pred_test, y_test
    )

    mlflow.log_params(
        {
            "train_dataset_size": len(y_train),
            "validation_dataset_size": len(y_validation),
            "test_dataset_size": len(y_test),
            "fixed_window_size": fixed_window_size,
        },
        run_id=run.info.run_id,
    )

    mlflow.log_metrics(
        {
            "validation_accuracy": validation_accuracy * 100,
            "test_accuracy": test_accuracy * 100,
            "test_cumulative_return": test_cumulative_return,
            "test_cumulative_reward": test_cumulative_reward,
        }
    )

    print("Window size:", fixed_window_size)
    print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print("Test Cumulative return:", test_cumulative_return)
    print("Test Cumulative reward:", test_cumulative_reward)

    mlflow.end_run(status="FINISHED")

Window size: 8
Validation Accuracy: 51.67%
Test Accuracy: 50.98%
Test Cumulative return: 0.012116591910712194
Test Cumulative reward: 66.328125
