In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.30-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting tqdm (from optuna)
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting PyYAML (from optuna)
  Using cached PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4 (from alembic>=1.5.0->optuna)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Using cached MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl.metadata (

In [29]:
import boto3
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from torch import nn
import torch
import numpy as np
import xgboost as xgb
from optuna import create_study
from xgboost import XGBClassifier

In [30]:
BUCKET_NAME = "team1-index-predictor-bucket"

root_folder = "data/processed"

train_data_filename = "train-v0.csv"
validation_data_filename = "validation-v0.csv"
test_data_filename = "test-v0.csv"

s3 = boto3.client("s3")

In [107]:
train_s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=train_raw_data_filename)
validation_s3_object = s3.get_object(
    Bucket=BUCKET_NAME, Key=validation_raw_data_filename
)
test_s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=test_raw_data_filename)

train_data = train_s3_object["Body"].read().decode("utf-8")
train_df = pd.read_csv(StringIO(train_data))

validation_data = validation_s3_object["Body"].read().decode("utf-8")
validation_df = pd.read_csv(StringIO(validation_data))

test_data = test_s3_object["Body"].read().decode("utf-8")
test_df = pd.read_csv(StringIO(test_data))

In [31]:
train_df = pd.read_csv(f"../{root_folder}/{train_data_filename}")
validation_df = pd.read_csv(f"../{root_folder}/{validation_data_filename}")
test_df = pd.read_csv(f"../{root_folder}/{test_data_filename}")

In [32]:
X_train = train_df.drop(columns=["Close_target"])
y_train = train_df["Close_target"]

X_validation = validation_df.drop(columns=["Close_target"])
y_validation = validation_df["Close_target"]

X_test = test_df.drop(columns=["Close_target"])
y_test = test_df["Close_target"]

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)
dtest = xgb.DMatrix(X_test, label=y_test)

In [33]:
import logging

optuna_logger = logging.getLogger("optuna")
optuna_logger.setLevel(logging.WARNING)


def objective(trial):

    params = {
        "objective": "binary:logistic",
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "eval_metric": "logloss",
    }
    num_boost_round = trial.suggest_int("num_boost_round", 50, 100)

    evals = [(dtrain, "train"), (dvalidation, "eval")]

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=evals,
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    y_pred_validation = bst.predict(dvalidation)
    y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)

    validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)

    return validation_accuracy


study = create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print(study.best_params)

best_params = study.best_params

params = {
    "objective": "binary:logistic",
    "max_depth": best_params["max_depth"],
    "learning_rate": best_params["learning_rate"],
    "eval_metric": "logloss",
}

evals = [(dtrain, "train"), (dvalidation, "eval")]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=best_params["num_boost_round"],
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=False,
)

y_pred_validation = bst.predict(dvalidation)
y_pred_test = bst.predict(dtest)

y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)
y_pred_test_binary = (y_pred_test > 0.5).astype(int)

validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)
test_accuracy = accuracy_score(y_test, y_pred_test_binary)

print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

{'max_depth': 5, 'learning_rate': 0.01759489987922903, 'num_boost_round': 93}
Validation Accuracy: 53.33%
Test Accuracy: 47.78%


In [34]:
def compute_cumulative_reward(y_pred, close_prices):
    """Get the cumulative, reward, since the model predicts if at time t+3 the price will be higher
    or lower than at time t, if model predicts correctly, we get the difference between the price at
    time t+3 and t, if the model predicts incorrectly, we get the negative difference between the
    price at time t+3 and t"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append(close_prices[i + 3] - close_prices[i])
        else:
            rewards.append(close_prices[i] - close_prices[i + 3])
    return np.sum(rewards)


def compute_cumulative_return(y_pred, close_prices):
    """Similar to the compute_cumulative_reward function, but in percentage terms"""
    rewards = []
    for i in range(0, len(close_prices) - 3):
        if y_pred[i] == 1:
            rewards.append((close_prices[i + 3] - close_prices[i]) / close_prices[i])
        else:
            rewards.append((close_prices[i] - close_prices[i + 3]) / close_prices[i])
    return np.sum(rewards)


print(
    "Cumulative return:",
    compute_cumulative_return(y_pred_test_binary, test_df["Close"].values),
)
print(
    "Cumulative reward:",
    compute_cumulative_reward(y_pred_test_binary, test_df["Close"].values),
)

Cumulative return: -0.0032180125650403723
Cumulative reward: -17.5595703125


In [82]:
model_filename = "model_xgb_v0_0_1.xgb"
model_folder = "models"

In [83]:
%mkdir -p ../models

In [84]:
import tarfile

bst.save_model(f"../{model_folder}/{model_filename}")

with tarfile.open(f"../{model_folder}/{model_filename}.tar.gz", "w:gz") as tar:
    tar.add(f"../{model_folder}/{model_filename}", arcname=model_filename)



In [74]:
bst.save_model(f"../models/{model_filename}")

In [5]:
s3.upload_file(
    model_filename,
    BUCKET_NAME,
    f"models/{model_filename}",
)