In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.30-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting tqdm (from optuna)
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting PyYAML (from optuna)
  Using cached PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4 (from alembic>=1.5.0->optuna)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Using cached MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl.metadata (

In [62]:
import boto3
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from torch import nn
import torch
import numpy as np
import xgboost as xgb
from optuna import create_study
from xgboost import XGBClassifier

In [51]:
BUCKET_NAME = "team1-index-predictor-bucket"

root_folder = "data/processed"

train_data_filename = "train.csv"
validation_data_filename = "validation.csv"
test_data_filename = "test.csv"

s3 = boto3.client("s3")

In [107]:
train_s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=train_raw_data_filename)
validation_s3_object = s3.get_object(
    Bucket=BUCKET_NAME, Key=validation_raw_data_filename
)
test_s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=test_raw_data_filename)

train_data = train_s3_object["Body"].read().decode("utf-8")
train_df = pd.read_csv(StringIO(train_data))

validation_data = validation_s3_object["Body"].read().decode("utf-8")
validation_df = pd.read_csv(StringIO(validation_data))

test_data = test_s3_object["Body"].read().decode("utf-8")
test_df = pd.read_csv(StringIO(test_data))

In [53]:
train_df = pd.read_csv(f"../{root_folder}/{train_data_filename}")
validation_df = pd.read_csv(f"../{root_folder}/{validation_data_filename}")
test_df = pd.read_csv(f"../{root_folder}/{test_data_filename}")

In [57]:
X_train = train_df.drop(columns=["Close_target"])
y_train = train_df["Close_target"]

X_validation = validation_df.drop(columns=["Close_target"])
y_validation = validation_df["Close_target"]

X_test = test_df.drop(columns=["Close_target"])
y_test = test_df["Close_target"]

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)
dtest = xgb.DMatrix(X_test, label=y_test)

In [65]:
# Define the parameter dictionary
params = {
    "objective": "binary:logistic",
    "max_depth": 6,
    "learning_rate": 0.1,
    "eval_metric": "logloss",
}

# Train the model
evals = [(dtrain, "train"), (dvalidation, "eval")]
bst = xgb.train(
    params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10
)

# Make predictions on the validation and test sets
y_pred_validation = bst.predict(dvalidation)
y_pred_test = bst.predict(dtest)

# Convert probabilities to binary predictions
y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)
y_pred_test_binary = (y_pred_test > 0.5).astype(int)

# Calculate accuracy
validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)
test_accuracy = accuracy_score(y_test, y_pred_test_binary)

print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

[0]	train-logloss:0.68899	eval-logloss:0.69523
[1]	train-logloss:0.68183	eval-logloss:0.69666
[2]	train-logloss:0.67837	eval-logloss:0.69755
[3]	train-logloss:0.67141	eval-logloss:0.69968
[4]	train-logloss:0.66877	eval-logloss:0.70009
[5]	train-logloss:0.66355	eval-logloss:0.70116
[6]	train-logloss:0.65817	eval-logloss:0.70258
[7]	train-logloss:0.65664	eval-logloss:0.70325
[8]	train-logloss:0.65107	eval-logloss:0.70276
[9]	train-logloss:0.64679	eval-logloss:0.70309
[10]	train-logloss:0.64381	eval-logloss:0.70407
Validation Accuracy: 48.61%
Test Accuracy: 52.22%


In [71]:
import logging

optuna_logger = logging.getLogger("optuna")
optuna_logger.setLevel(logging.WARNING)


def objective(trial):

    params = {
        "objective": "binary:logistic",
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "eval_metric": "logloss",
    }
    num_boost_round = trial.suggest_int("num_boost_round", 50, 200)

    evals = [(dtrain, "train"), (dvalidation, "eval")]

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    y_pred_validation = bst.predict(dvalidation)
    y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)

    validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)

    return validation_accuracy


study = create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print(study.best_params)

best_params = study.best_params

params = {
    "objective": "binary:logistic",
    "max_depth": best_params["max_depth"],
    "learning_rate": best_params["learning_rate"],
    "eval_metric": "logloss",
}

evals = [(dtrain, "train"), (dvalidation, "eval")]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=best_params["num_boost_round"],
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=False,
)

y_pred_validation = bst.predict(dvalidation)
y_pred_test = bst.predict(dtest)

y_pred_validation_binary = (y_pred_validation > 0.5).astype(int)
y_pred_test_binary = (y_pred_test > 0.5).astype(int)

validation_accuracy = accuracy_score(y_validation, y_pred_validation_binary)
test_accuracy = accuracy_score(y_test, y_pred_test_binary)

print(f"Validation Accuracy: {validation_accuracy*100:.2f}%")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

{'max_depth': 3, 'learning_rate': 0.06051875312554363, 'num_boost_round': 85}
Validation Accuracy: 53.06%
Test Accuracy: 53.61%


In [74]:
model_filename = "xgboost_model.v0.0.1.json"
bst.save_model(model_filename)

In [None]:
s3.upload_file(
    model_filename,
    BUCKET_NAME,
    f"models/{model_filename}",
)