In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.30-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting tqdm (from optuna)
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting PyYAML (from optuna)
  Using cached PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4 (from alembic>=1.5.0->optuna)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Using cached MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl.metadata (

In [110]:
import boto3
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from torch import nn
import torch

In [None]:
# S3 bucket the dataset splits are requested from.
BUCKET_NAME = "team1-index-predictor-bucket"

# Common prefix shared by every split key built below.
root_folder = "data/processed"

# These are S3 object keys rather than local paths: the train/validation/test
# values are passed as `Key=` to `s3.get_object`.
train_raw_data_filename = f"{root_folder}/train.csv"
validation_raw_data_filename = f"{root_folder}/validation.csv"
test_raw_data_filename = f"{root_folder}/test.csv"
# NOTE(review): not referenced in the visible part of the notebook — confirm it is still needed.
inference_raw_data_filename = f"{root_folder}/inference.csv"

# Shared S3 client; no credentials or region are passed here, so boto3's
# default configuration is used.
s3 = boto3.client("s3")

In [107]:
def _fetch_s3_object(key):
    """Request ``key`` from ``BUCKET_NAME`` and return the GetObject response."""
    return s3.get_object(Bucket=BUCKET_NAME, Key=key)


def _read_body_as_text(s3_object):
    """Read the whole response body and decode it as UTF-8 text."""
    return s3_object["Body"].read().decode("utf-8")


# Request all three splits up front.
train_s3_object = _fetch_s3_object(train_raw_data_filename)
validation_s3_object = _fetch_s3_object(validation_raw_data_filename)
test_s3_object = _fetch_s3_object(test_raw_data_filename)

# Then turn each body into text and parse that text as CSV.
train_data = _read_body_as_text(train_s3_object)
train_df = pd.read_csv(StringIO(train_data))

validation_data = _read_body_as_text(validation_s3_object)
validation_df = pd.read_csv(StringIO(validation_data))

test_data = _read_body_as_text(test_s3_object)
test_df = pd.read_csv(StringIO(test_data))

In [122]:
# Load the three splits from local CSV files (paths are relative to the
# notebook's working directory).
# NOTE(review): this rebinds train_df / validation_df / test_df, so the frames
# an earlier cell assigned to those names (the S3 load) are replaced —
# confirm which data source is the intended one.
train_df = pd.read_csv("../data/processed/train.csv")
validation_df = pd.read_csv("../data/processed/validation.csv")
test_df = pd.read_csv("../data/processed/test.csv")

In [124]:
def create_lag_features(df, lag=1):
    """Return a copy of ``df`` extended with lagged ``Close`` columns.

    For every ``i`` in ``1..lag`` a column ``lag_i`` holding ``Close`` shifted
    down by ``i`` rows is appended (an existing column of that name is
    overwritten). Rows containing a missing value in any column are then
    dropped, which removes at least the first ``lag`` rows.

    The input frame is left untouched, so calling this twice on the same
    source frame gives the same result both times.

    Args:
        df: Frame with a ``Close`` column.
        lag: Number of lag columns to add; ``0`` adds none.

    Returns:
        A new DataFrame that keeps the surviving rows' original index labels.
    """
    close = df["Close"]
    lag_columns = {f"lag_{i}": close.shift(i) for i in range(1, lag + 1)}
    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    return df.assign(**lag_columns).dropna()

In [126]:
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear head applied to the last timestep.

    Args:
        input_size: Number of features per timestep (default 1).
        hidden_size: Width of the LSTM hidden state (default 64).
        output_size: Number of values predicted per sequence (default 1).

    The defaults reproduce the previously hard-coded architecture, so
    ``LSTMModel()`` behaves as before and existing ``state_dict`` files still load.
    """

    def __init__(self, input_size=1, hidden_size=64, output_size=1):
        super().__init__()
        # Attribute names and creation order (lstm, then linear) are kept so
        # state_dict keys and seeded initialisation stay the same.
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``(batch, seq_len, input_size)`` to ``(batch, output_size)``."""
        lstm_out, _ = self.lstm(x)
        # batch_first=True, so dim 1 is time: keep only the final timestep.
        last_step = lstm_out[:, -1, :]
        return self.linear(last_step)

In [127]:
lag = 30  # number of lagged Close columns added to each split

# NOTE(review): each split name is rebound to its lagged version, so running
# this cell again applies the lagging (and its NaN-row drop) a second time.
# Re-run the data-loading cell first if this cell has to be repeated.
train_df = create_lag_features(train_df, lag)
validation_df = create_lag_features(validation_df, lag)
test_df = create_lag_features(test_df, lag)

# Seed the RNG before building the model so its initial weights are the same
# on every run; without this the losses reported below cannot be reproduced.
torch.manual_seed(42)
model = LSTMModel()

In [130]:
def train_lstm(model, train_df, validation_df, lag, n_epochs=10, lr=0.001):
    """Train ``model`` with full-batch Adam on MSE and print progress.

    Each row of a frame is one sample: the ``Close`` column is the target and
    every other column becomes one timestep of a single-feature sequence.

    Args:
        model: Module called with a ``(rows, timesteps, 1)`` float32 tensor;
            its output is compared against a ``(rows, 1)`` target. Its
            parameters are updated in place.
        train_df: Frame with a ``Close`` column plus feature columns.
        validation_df: Frame with the same layout; used only for reporting.
        lag: Number of lag columns used to build the frames. Kept for
            backward compatibility; the sequence length is read from the
            frame itself rather than assumed to be ``lag + 7``, which failed
            or misaligned samples whenever the column count differed.
        n_epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.

    Returns:
        None. Training and validation loss are printed for epochs 0, 10, 20, ...
    """

    def _as_sequences(frame):
        # One sequence per row, one timestep per non-target column.
        features = frame.drop(columns=["Close"])
        n_rows, n_steps = features.shape
        inputs = torch.as_tensor(features.values, dtype=torch.float32)
        targets = torch.as_tensor(frame["Close"].values, dtype=torch.float32)
        return inputs.reshape(n_rows, n_steps, 1), targets.reshape(-1, 1)

    # The data does not change between epochs, so convert it once up front.
    train_inputs, train_targets = _as_sequences(train_df)
    val_inputs, val_targets = _as_sequences(validation_df)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(train_inputs), train_targets)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            # Validation is evaluated after this epoch's weight update.
            model.eval()
            with torch.no_grad():
                val_loss = criterion(model(val_inputs), val_targets)
                print(
                    f"Epoch {epoch} - Loss: {loss.item()} - Val Loss: {val_loss.item()}"
                )

In [129]:
# Full-batch training run; every argument is spelled out by name.
train_lstm(
    model=model,
    train_df=train_df,
    validation_df=validation_df,
    lag=lag,
    n_epochs=400,
    lr=0.001,
)

Epoch 0 - Loss: 0.19946429133415222 - Val Loss: 0.6568797826766968
Epoch 10 - Loss: 0.08037499338388443 - Val Loss: 0.35540950298309326
Epoch 20 - Loss: 0.04074123129248619 - Val Loss: 0.05274505540728569
Epoch 30 - Loss: 0.027115914970636368 - Val Loss: 0.16485995054244995
Epoch 40 - Loss: 0.02264607883989811 - Val Loss: 0.10754813253879547
Epoch 50 - Loss: 0.01985975354909897 - Val Loss: 0.07121189683675766
Epoch 60 - Loss: 0.015204137191176414 - Val Loss: 0.06065426021814346
Epoch 70 - Loss: 0.007279721554368734 - Val Loss: 0.0023650529328733683
Epoch 80 - Loss: 0.004458197392523289 - Val Loss: 0.018891694024205208
Epoch 90 - Loss: 0.002273330232128501 - Val Loss: 0.00039925932651385665
Epoch 100 - Loss: 0.0015602742787450552 - Val Loss: 0.0004249253252055496
Epoch 110 - Loss: 0.0013858929742127657 - Val Loss: 0.0003990131081081927
Epoch 120 - Loss: 0.0012682595988735557 - Val Loss: 0.0010976779740303755
Epoch 130 - Loss: 0.0011993483640253544 - Val Loss: 0.0007781537133269012
Epoch

In [145]:
# Split the held-out frame into feature columns and the Close target.
test_features = test_df.drop(columns=["Close"])
y_test = test_df["Close"]

# One sequence per row, one timestep per feature column. Both sizes come from
# the frame itself instead of assuming it has exactly lag + 7 feature columns,
# which would fail or misalign rows for any other layout.
X_test = test_features.values.reshape(
    len(test_features), test_features.shape[1], 1
)

model.eval()

criterion = nn.MSELoss()

# No gradients are needed for evaluation.
with torch.no_grad():
    outputs = model(torch.Tensor(X_test))
    test_loss = criterion(outputs, torch.Tensor(y_test.values).view(-1, 1))
    print(f"Test Loss: {test_loss.item()}")

Test Loss: 0.0002928580797743052


In [146]:
# TODO: scaler params should be loaded from the feature store; for now they
# are read from a local JSON file.
# NOTE(review): these two imports sit mid-notebook; they would normally live
# in the import cell at the top.
from sklearn.preprocessing import MinMaxScaler
import json

with open("../data/processed/scaler_params.json", "r") as f:
    scaler_params = json.load(f)

# Rebuild a scaler without calling fit(): its attributes are assigned directly
# from the saved parameters.
scaler = MinMaxScaler()
# Values are stored exactly as json.load returned them (presumably plain lists
# rather than NumPy arrays — confirm).
scaler.min_, scaler.scale_ = scaler_params["min_"], scaler_params["scale_"]
# The JSON keys for the data bounds and range have no trailing underscore,
# unlike "min_" and "scale_".
scaler.data_min_, scaler.data_max_ = (
    scaler_params["data_min"],
    scaler_params["data_max"],
)
scaler.data_range_ = scaler_params["data_range"]

In [148]:
def _to_original_scale(scaled_values):
    """Run an array through ``scaler.inverse_transform`` as one column and flatten it."""
    as_column = scaled_values.reshape(-1, 1)
    return scaler.inverse_transform(as_column).flatten()


# Undo the scaling on both the targets and the predictions before scoring.
y_test_inv = _to_original_scale(y_test.values)
outputs_inv = _to_original_scale(outputs.numpy())

mse = mean_squared_error(y_test_inv, outputs_inv)
mape = mean_absolute_percentage_error(y_test_inv, outputs_inv)

print(f"MSE: {mse}")
print(f"MAPE: {mape}")

MSE: 18.81880458328459
MAPE: 0.0005960154743389435


In [149]:
# Show the first five inverse-transformed targets next to the first five
# inverse-transformed predictions.
y_test_inv[:5], outputs_inv[:5]

(array([5414.91992188, 5414.66015625, 5413.37988281, 5410.81982422,
        5411.45019531]),
 array([5412.3457, 5413.9897, 5415.487 , 5416.999 , 5418.139 ],
       dtype=float32))

In [136]:
# Saves only the model's parameters (its state_dict), not the module object,
# to a path relative to the current working directory.
# NOTE(review): this cell's execution count (136) is lower than the
# evaluation cells above it — confirm the saved weights are the evaluated ones.
torch.save(model.state_dict(), "lstm_model.pth")

In [None]:
# TODO: once the model is trained, it should be registered in the model registry.