#### First notebook used for model training and evaluation, was used before migrating to jobs, it includes mlflow experiments

In [61]:
%pip install optuna
%pip install mlflow==2.13.2 sagemaker-mlflow==0.1.0

import boto3
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from torch import nn
import torch
import numpy as np
import xgboost as xgb
from optuna import create_study
from xgboost import XGBClassifier
import mlflow
import logging
import sagemaker
import os

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [69]:
BUCKET_NAME = "team1-index-predictor-bucket"

root_folder = "data/processed"

data_filename = "data.csv"

In [70]:
df = pd.read_csv(f"../{root_folder}/{data_filename}")

train_df = df[df["type"] == "train"]
validation_df = df[df["type"] == "validation"]
test_df = df[df["type"] == "test"]

In [71]:
y_train = train_df["close_target"]
X_train = train_df.drop(columns=["close_target", "datetime", "type", "version"])

y_validation = validation_df["close_target"]
X_validation = validation_df.drop(
    columns=["close_target", "datetime", "type", "version"]
)

y_test = test_df["close_target"]
X_test = test_df.drop(columns=["close_target", "datetime", "type", "version"])

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)
dtest = xgb.DMatrix(X_test, label=y_test)

In [72]:
tracking_server_arn = (
    "arn:aws:sagemaker:eu-central-1:567821811420:mlflow-tracking-server/wildfire-mj"
)
experiment_name = "index-predictor-model-training-notebook"

model_filename = "model_xgb_v0_0_1.xgb"
model_folder = "models"

In [73]:
def compute_cumulative_reward(y_pred, close_prices):
    """Get the cumulative, reward, since the model predicts if at time t+3 the price will be higher
    or lower than at time t, if model predicts correctly, we get the difference between the price at
    time t+3 and t, if the model predicts incorrectly, we get the negative difference between the
    price at time t+3 and t"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append(close_prices[i + 3] - close_prices[i])
        else:
            rewards.append(close_prices[i] - close_prices[i + 3])
    return np.sum(rewards)


def compute_cumulative_return(y_pred, close_prices):
    """Similar to the compute_cumulative_reward function, but in percentage terms"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append((close_prices[i + 3] - close_prices[i]) / close_prices[i])
        else:
            rewards.append((close_prices[i] - close_prices[i + 3]) / close_prices[i])
    return np.sum(rewards)

In [74]:
%mkdir -p ../models

In [75]:
mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)

optuna_logger = logging.getLogger("optuna")
optuna_logger.setLevel(logging.WARNING)

with mlflow.start_run(
    run_name=sagemaker.utils.name_from_base("train-xgboost-optuna")
) as run:

    def objective(trial):

        params = {
            "objective": "binary:logistic",
            "max_depth": trial.suggest_int("max_depth", 3, 7),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
            "eval_metric": "logloss",
        }
        num_boost_round = trial.suggest_int("num_boost_round", 50, 100)

        evals = [(dtrain, "train"), (dvalidation, "eval")]

        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=20,
            verbose_eval=False,
        )

        y_pred_validation = bst.predict(dvalidation)
        y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)

        validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)

        return validation_accuracy

    study = create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    print(study.best_params)

    best_params = study.best_params

    params = {
        "objective": "binary:logistic",
        "max_depth": best_params["max_depth"],
        "learning_rate": best_params["learning_rate"],
        "eval_metric": "logloss",
    }

    evals = [(dtrain, "train"), (dvalidation, "eval")]

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=best_params["num_boost_round"],
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    y_pred_validation = bst.predict(dvalidation)
    y_pred_test = bst.predict(dtest)

    y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)
    y_pred_test_binary = (y_pred_test > 0.5).astype(int)

    validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)
    test_accuracy = accuracy_score(y_test, y_pred_test_binary)

    cumulative_return = compute_cumulative_return(
        y_pred_test_binary, test_df["close"].values
    )
    cumulative_reward = compute_cumulative_reward(
        y_pred_test_binary, test_df["close"].values
    )

    mlflow.log_params(
        {
            "train_dataset_size": len(y_train),
            "validation_dataset_size": len(y_validation),
            "test_dataset_size": len(y_test),
            "number_of_optuna_trials": 10,
            "final_model_hyperparams": best_params,
        },
        run_id=run.info.run_id,
    )

    mlflow.log_metrics(
        {
            "validation_accuracy": validation_accuracy * 100,
            "test_accuracy": test_accuracy * 100,
            "cumulative_return": cumulative_return,
            "cumulative_reward": cumulative_reward,
        }
    )

    bst.save_model(f"../{model_folder}/{model_filename}")
    mlflow.log_artifact(
        f"{os.path.pardir}/{model_folder}", model_filename, run_id=run.info.run_id
    )

    print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print(
        "Cumulative return:",
        cumulative_return,
    )
    print(
        "Cumulative reward:",
        cumulative_reward,
    )

    mlflow.end_run(status="FINISHED")

{'max_depth': 7, 'learning_rate': 0.023768748963184105, 'num_boost_round': 80}
Validation Accuracy: 55.83%
Test Accuracy: 55.46%
Cumulative return: 0.02685286705147641
Cumulative reward: 147.248046875


In [61]:
%pip install optuna
%pip install mlflow==2.13.2 sagemaker-mlflow==0.1.0

import boto3
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from torch import nn
import torch
import numpy as np
import xgboost as xgb
from optuna import create_study
from xgboost import XGBClassifier
import mlflow
import logging
import sagemaker
import os

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [69]:
BUCKET_NAME = "team1-index-predictor-bucket"

root_folder = "data/processed"

data_filename = "data.csv"

In [70]:
df = pd.read_csv(f"../{root_folder}/{data_filename}")

train_df = df[df["type"] == "train"]
validation_df = df[df["type"] == "validation"]
test_df = df[df["type"] == "test"]

In [71]:
y_train = train_df["close_target"]
X_train = train_df.drop(columns=["close_target", "datetime", "type", "version"])

y_validation = validation_df["close_target"]
X_validation = validation_df.drop(
    columns=["close_target", "datetime", "type", "version"]
)

y_test = test_df["close_target"]
X_test = test_df.drop(columns=["close_target", "datetime", "type", "version"])

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)
dtest = xgb.DMatrix(X_test, label=y_test)

In [72]:
tracking_server_arn = (
    "arn:aws:sagemaker:eu-central-1:567821811420:mlflow-tracking-server/wildfire-mj"
)
experiment_name = "index-predictor-model-training-notebook"

model_filename = "model_xgb_v0_0_1.xgb"
model_folder = "models"

In [73]:
def compute_cumulative_reward(y_pred, close_prices):
    """Get the cumulative, reward, since the model predicts if at time t+3 the price will be higher
    or lower than at time t, if model predicts correctly, we get the difference between the price at
    time t+3 and t, if the model predicts incorrectly, we get the negative difference between the
    price at time t+3 and t"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append(close_prices[i + 3] - close_prices[i])
        else:
            rewards.append(close_prices[i] - close_prices[i + 3])
    return np.sum(rewards)


def compute_cumulative_return(y_pred, close_prices):
    """Similar to the compute_cumulative_reward function, but in percentage terms"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append((close_prices[i + 3] - close_prices[i]) / close_prices[i])
        else:
            rewards.append((close_prices[i] - close_prices[i + 3]) / close_prices[i])
    return np.sum(rewards)

In [74]:
%mkdir -p ../models

In [75]:
mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)

optuna_logger = logging.getLogger("optuna")
optuna_logger.setLevel(logging.WARNING)

with mlflow.start_run(
    run_name=sagemaker.utils.name_from_base("train-xgboost-optuna")
) as run:

    def objective(trial):

        params = {
            "objective": "binary:logistic",
            "max_depth": trial.suggest_int("max_depth", 3, 7),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
            "eval_metric": "logloss",
        }
        num_boost_round = trial.suggest_int("num_boost_round", 50, 100)

        evals = [(dtrain, "train"), (dvalidation, "eval")]

        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=20,
            verbose_eval=False,
        )

        y_pred_validation = bst.predict(dvalidation)
        y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)

        validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)

        return validation_accuracy

    study = create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    print(study.best_params)

    best_params = study.best_params

    params = {
        "objective": "binary:logistic",
        "max_depth": best_params["max_depth"],
        "learning_rate": best_params["learning_rate"],
        "eval_metric": "logloss",
    }

    evals = [(dtrain, "train"), (dvalidation, "eval")]

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=best_params["num_boost_round"],
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    y_pred_validation = bst.predict(dvalidation)
    y_pred_test = bst.predict(dtest)

    y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)
    y_pred_test_binary = (y_pred_test > 0.5).astype(int)

    validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)
    test_accuracy = accuracy_score(y_test, y_pred_test_binary)

    cumulative_return = compute_cumulative_return(
        y_pred_test_binary, test_df["close"].values
    )
    cumulative_reward = compute_cumulative_reward(
        y_pred_test_binary, test_df["close"].values
    )

    mlflow.log_params(
        {
            "train_dataset_size": len(y_train),
            "validation_dataset_size": len(y_validation),
            "test_dataset_size": len(y_test),
            "number_of_optuna_trials": 10,
            "final_model_hyperparams": best_params,
        },
        run_id=run.info.run_id,
    )

    mlflow.log_metrics(
        {
            "validation_accuracy": validation_accuracy * 100,
            "test_accuracy": test_accuracy * 100,
            "cumulative_return": cumulative_return,
            "cumulative_reward": cumulative_reward,
        }
    )

    bst.save_model(f"../{model_folder}/{model_filename}")
    mlflow.log_artifact(
        f"{os.path.pardir}/{model_folder}", model_filename, run_id=run.info.run_id
    )

    print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print(
        "Cumulative return:",
        cumulative_return,
    )
    print(
        "Cumulative reward:",
        cumulative_reward,
    )

    mlflow.end_run(status="FINISHED")

{'max_depth': 7, 'learning_rate': 0.023768748963184105, 'num_boost_round': 80}
Validation Accuracy: 55.83%
Test Accuracy: 55.46%
Cumulative return: 0.02685286705147641
Cumulative reward: 147.248046875
