In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import json
import numpy as np
from sklearn.metrics import f1_score
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
data_dir = "data"
model_dir = "saved_models"
feature_file_path = os.path.join(model_dir, "features.txt")

In [3]:
data_version = "v5.0"
train_data_path = os.path.join(data_dir, "train.parquet")
features_json_path = os.path.join(data_dir, "features.json")

# Load feature metadata
with open(features_json_path, "r") as f:
    feature_metadata = json.load(f)
feature_cols = feature_metadata["feature_sets"]["medium"]
target_col = "target"

# Load training data
training_data = pd.read_parquet(train_data_path, columns=["era"] + feature_cols + [target_col])
training_data["era"] = training_data["era"].astype(str)

# Filter recent training data if needed
training_data = training_data[-200000:]

In [4]:
corr_list = {feature: training_data[feature].corr(training_data[target_col]) for feature in feature_cols}
sorted_features = sorted(corr_list, key=corr_list.get, reverse=True)
final_feature = sorted_features[:40]  # Selecting top 40 features

# Save the selected features to a text file
with open(feature_file_path, "w") as f:
    for feature in final_feature:
        f.write(feature + "\n")
print(f"Selected features saved to {feature_file_path}")

targets_df = training_data[["era", target_col] + final_feature]

Selected features saved to saved_models/features.txt


In [5]:
def create_dataset_with_window(Xraw, yraw, window_len: int):
    Xraw_filled = Xraw.fillna(-1)
    yraw_filled = yraw.fillna(-1)
    
    new_data = []
    new_cols = []
    new_labels = []
    
    for col_idx in range(window_len):
        local_new_cols = [f"{col}_ts{col_idx}" for col in Xraw_filled.columns]
        new_cols.extend(local_new_cols)

    for start in tqdm(range(0, len(Xraw_filled) - window_len + 1), desc="Creating Dataset"):
        new_row_data = Xraw_filled.iloc[start : start + window_len].values.reshape(-1)
        new_label_data = yraw_filled.iloc[start + window_len - 1]
        new_data.append(new_row_data)
        new_labels.append(new_label_data)

    return pd.DataFrame(new_data, columns=new_cols).astype(float), pd.Series(new_labels).astype(float)


Xraw = targets_df[final_feature]
yraw = targets_df[target_col]
window_len = 5

X, y = create_dataset_with_window(Xraw, yraw, window_len)

Creating Dataset:  10%|▉         | 19595/199996 [00:00<00:02, 64782.06it/s]

Creating Dataset: 100%|██████████| 199996/199996 [00:03<00:00, 56967.46it/s]


In [6]:
# Convert data to PyTorch tensors
X_numpy = X.values
y_numpy = y.values
feature_size = len(final_feature)
X_reshaped = X_numpy.reshape(-1, window_len, feature_size)

X_tensor = torch.tensor(X_reshaped, dtype=torch.float32)
y_tensor = torch.tensor(y_numpy, dtype=torch.float32)

# Define DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [7]:
class ClassificationLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, lr):
        super(ClassificationLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()
        self.class_to_bucket = {0: 0, 1: 0.25, 2: 0.5, 3: 0.75, 4: 1}

        self.history = {'loss': [], 'f1_score': []}

    def forward(self, x):
        h, _ = self.lstm(x)
        x = self.linear(h[:, -1, :])  # Take last output for classification
        return x

    def train_model(self, train_loader, num_epochs, device):
        self.to(device)
        for epoch in range(num_epochs):
            epoch_loss = 0
            all_preds, all_labels = [], []
            for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
                inputs, labels = inputs.to(device), labels.to(device)
                self.optimizer.zero_grad()
                outputs = self(inputs)
                label_indices = torch.tensor([self.bucket_to_class(val) for val in labels.cpu().numpy()], dtype=torch.long).to(device)
                loss = self.criterion(outputs, label_indices)
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()
                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(label_indices.cpu().numpy())

            avg_loss = epoch_loss / len(train_loader)
            f1 = f1_score(all_labels, all_preds, average='macro')
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, F1 Score: {f1:.4f}")
            self.history['loss'].append(avg_loss)
            self.history['f1_score'].append(f1)

    def bucket_to_class(self, val):
        bucket_to_class = {v: k for k, v in self.class_to_bucket.items()}
        return bucket_to_class[val]

In [9]:
input_size = len(final_feature)
hidden_size = 128
output_size = 5
lr = 0.001
num_epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ClassificationLSTM(input_size=input_size, hidden_size=hidden_size, output_size=output_size, lr=lr)
model.train_model(train_loader, num_epochs, device)

Epoch 1/50: 100%|██████████| 6250/6250 [00:16<00:00, 371.02it/s]


Epoch [1/50], Loss: 1.2804, F1 Score: 0.1334


Epoch 2/50: 100%|██████████| 6250/6250 [00:16<00:00, 379.38it/s]


Epoch [2/50], Loss: 1.2720, F1 Score: 0.1345


Epoch 3/50: 100%|██████████| 6250/6250 [00:16<00:00, 383.98it/s]


Epoch [3/50], Loss: 1.2682, F1 Score: 0.1364


Epoch 4/50: 100%|██████████| 6250/6250 [00:16<00:00, 383.40it/s]


Epoch [4/50], Loss: 1.2644, F1 Score: 0.1402


Epoch 5/50: 100%|██████████| 6250/6250 [00:16<00:00, 377.90it/s]


Epoch [5/50], Loss: 1.2595, F1 Score: 0.1457


Epoch 6/50: 100%|██████████| 6250/6250 [00:16<00:00, 380.29it/s]


Epoch [6/50], Loss: 1.2539, F1 Score: 0.1551


Epoch 7/50: 100%|██████████| 6250/6250 [00:16<00:00, 375.13it/s]


Epoch [7/50], Loss: 1.2478, F1 Score: 0.1635


Epoch 8/50: 100%|██████████| 6250/6250 [00:16<00:00, 371.87it/s]


Epoch [8/50], Loss: 1.2405, F1 Score: 0.1750


Epoch 9/50: 100%|██████████| 6250/6250 [00:16<00:00, 378.57it/s]


Epoch [9/50], Loss: 1.2332, F1 Score: 0.1860


Epoch 10/50: 100%|██████████| 6250/6250 [00:16<00:00, 378.90it/s]


Epoch [10/50], Loss: 1.2252, F1 Score: 0.1986


Epoch 11/50: 100%|██████████| 6250/6250 [00:16<00:00, 375.49it/s]


Epoch [11/50], Loss: 1.2177, F1 Score: 0.2113


Epoch 12/50: 100%|██████████| 6250/6250 [00:16<00:00, 372.04it/s]


Epoch [12/50], Loss: 1.2102, F1 Score: 0.2219


Epoch 13/50: 100%|██████████| 6250/6250 [00:16<00:00, 376.08it/s]


Epoch [13/50], Loss: 1.2020, F1 Score: 0.2330


Epoch 14/50: 100%|██████████| 6250/6250 [00:16<00:00, 378.90it/s]


Epoch [14/50], Loss: 1.1948, F1 Score: 0.2457


Epoch 15/50: 100%|██████████| 6250/6250 [00:16<00:00, 380.78it/s]


Epoch [15/50], Loss: 1.1871, F1 Score: 0.2572


Epoch 16/50: 100%|██████████| 6250/6250 [00:16<00:00, 374.83it/s]


Epoch [16/50], Loss: 1.1795, F1 Score: 0.2690


Epoch 17/50: 100%|██████████| 6250/6250 [00:16<00:00, 377.79it/s]


Epoch [17/50], Loss: 1.1731, F1 Score: 0.2766


Epoch 18/50: 100%|██████████| 6250/6250 [00:16<00:00, 383.32it/s]


Epoch [18/50], Loss: 1.1659, F1 Score: 0.2847


Epoch 19/50: 100%|██████████| 6250/6250 [00:16<00:00, 383.13it/s]


Epoch [19/50], Loss: 1.1594, F1 Score: 0.2948


Epoch 20/50: 100%|██████████| 6250/6250 [00:16<00:00, 384.34it/s]


Epoch [20/50], Loss: 1.1521, F1 Score: 0.3042


Epoch 21/50: 100%|██████████| 6250/6250 [00:16<00:00, 377.41it/s]


Epoch [21/50], Loss: 1.1460, F1 Score: 0.3107


Epoch 22/50: 100%|██████████| 6250/6250 [00:16<00:00, 383.03it/s]


Epoch [22/50], Loss: 1.1399, F1 Score: 0.3193


Epoch 23/50: 100%|██████████| 6250/6250 [00:16<00:00, 382.44it/s]


Epoch [23/50], Loss: 1.1335, F1 Score: 0.3254


Epoch 24/50: 100%|██████████| 6250/6250 [00:16<00:00, 377.32it/s]


Epoch [24/50], Loss: 1.1282, F1 Score: 0.3331


Epoch 25/50: 100%|██████████| 6250/6250 [00:16<00:00, 370.33it/s]


Epoch [25/50], Loss: 1.1224, F1 Score: 0.3399


Epoch 26/50: 100%|██████████| 6250/6250 [00:16<00:00, 375.58it/s]


Epoch [26/50], Loss: 1.1166, F1 Score: 0.3443


Epoch 27/50: 100%|██████████| 6250/6250 [00:17<00:00, 365.69it/s]


Epoch [27/50], Loss: 1.1116, F1 Score: 0.3488


Epoch 28/50: 100%|██████████| 6250/6250 [00:16<00:00, 375.42it/s]


Epoch [28/50], Loss: 1.1060, F1 Score: 0.3556


Epoch 29/50: 100%|██████████| 6250/6250 [00:16<00:00, 368.36it/s]


Epoch [29/50], Loss: 1.1016, F1 Score: 0.3603


Epoch 30/50: 100%|██████████| 6250/6250 [00:16<00:00, 381.14it/s]


Epoch [30/50], Loss: 1.0969, F1 Score: 0.3649


Epoch 31/50: 100%|██████████| 6250/6250 [00:16<00:00, 375.36it/s]


Epoch [31/50], Loss: 1.0919, F1 Score: 0.3708


Epoch 32/50: 100%|██████████| 6250/6250 [00:16<00:00, 373.55it/s]


Epoch [32/50], Loss: 1.0878, F1 Score: 0.3746


Epoch 33/50: 100%|██████████| 6250/6250 [00:16<00:00, 372.95it/s]


Epoch [33/50], Loss: 1.0829, F1 Score: 0.3783


Epoch 34/50: 100%|██████████| 6250/6250 [00:17<00:00, 365.65it/s]


Epoch [34/50], Loss: 1.0792, F1 Score: 0.3824


Epoch 35/50: 100%|██████████| 6250/6250 [00:16<00:00, 372.38it/s]


Epoch [35/50], Loss: 1.0747, F1 Score: 0.3862


Epoch 36/50: 100%|██████████| 6250/6250 [00:16<00:00, 379.12it/s]


Epoch [36/50], Loss: 1.0719, F1 Score: 0.3902


Epoch 37/50: 100%|██████████| 6250/6250 [00:16<00:00, 377.48it/s]


Epoch [37/50], Loss: 1.0677, F1 Score: 0.3934


Epoch 38/50: 100%|██████████| 6250/6250 [00:16<00:00, 383.20it/s]


Epoch [38/50], Loss: 1.0633, F1 Score: 0.3965


Epoch 39/50: 100%|██████████| 6250/6250 [00:16<00:00, 380.77it/s]


Epoch [39/50], Loss: 1.0602, F1 Score: 0.4004


Epoch 40/50: 100%|██████████| 6250/6250 [00:16<00:00, 373.73it/s]


Epoch [40/50], Loss: 1.0566, F1 Score: 0.4044


Epoch 41/50: 100%|██████████| 6250/6250 [00:16<00:00, 376.88it/s]


Epoch [41/50], Loss: 1.0534, F1 Score: 0.4073


Epoch 42/50: 100%|██████████| 6250/6250 [00:17<00:00, 362.43it/s]


Epoch [42/50], Loss: 1.0494, F1 Score: 0.4107


Epoch 43/50: 100%|██████████| 6250/6250 [00:16<00:00, 377.85it/s]


Epoch [43/50], Loss: 1.0465, F1 Score: 0.4146


Epoch 44/50: 100%|██████████| 6250/6250 [00:16<00:00, 380.18it/s]


Epoch [44/50], Loss: 1.0437, F1 Score: 0.4158


Epoch 45/50: 100%|██████████| 6250/6250 [00:16<00:00, 368.36it/s]


Epoch [45/50], Loss: 1.0403, F1 Score: 0.4183


Epoch 46/50: 100%|██████████| 6250/6250 [00:16<00:00, 378.61it/s]


Epoch [46/50], Loss: 1.0377, F1 Score: 0.4209


Epoch 47/50: 100%|██████████| 6250/6250 [00:16<00:00, 370.50it/s]


Epoch [47/50], Loss: 1.0357, F1 Score: 0.4231


Epoch 48/50: 100%|██████████| 6250/6250 [00:17<00:00, 364.35it/s]


Epoch [48/50], Loss: 1.0320, F1 Score: 0.4274


Epoch 49/50: 100%|██████████| 6250/6250 [00:16<00:00, 370.71it/s]


Epoch [49/50], Loss: 1.0294, F1 Score: 0.4271


Epoch 50/50: 100%|██████████| 6250/6250 [00:16<00:00, 373.04it/s]


Epoch [50/50], Loss: 1.0269, F1 Score: 0.4281


In [10]:
model_save_path = os.path.join(model_dir, "model.pth")
torch.save(model, model_save_path)
print(f"Trained model saved to {model_save_path}")

Trained model saved to saved_models/model.pth
