# Initial

## Constants

In [1]:
# Constants for easy reference and modification
GROUPBY_COL = 'unique_id'

DATETIME_COL = 'datetime'

TARGET_COL = 'vehicle_type'

FEATURE_COLS = [
    'vehicle_speed',
    'vehicle_angle_sine', 'vehicle_angle_cosine',
    'vehicle_x', 'vehicle_y', 'vehicle_z'
]

# HYPERPARAMETERS
BATCH_SIZE = 32
NUM_CLASSES = 10
NUM_FEATURES = 6
NUM_LAYERS = 2
HIDDEN_SIZE = 128
DROPOUT = 0.5
NUM_EPOCHS = 10

## Imports

In [2]:
# Standard library imports
import os

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

# PyTorch and related imports
import torch
import torch_directml
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from torch.optim.lr_scheduler import CyclicLR
from torch.utils.data import DataLoader, Dataset

dml = torch_directml.device()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(dml)

privateuseone:0


## Helper Functions and Class Definitions

### For creating dataloaders

In [3]:
def create_sequences(df):
    sequences = []
    for vehicle_id, group in df.groupby(GROUPBY_COL):
        sorted_group = group.sort_values(by=[DATETIME_COL]).copy()
        sequence_features = sorted_group[FEATURE_COLS].values
        label = sorted_group.iloc[0][TARGET_COL]
        sequences.append((sequence_features, label, vehicle_id))
    return sequences

class VehicleDataset(Dataset):
    def __init__(self, df):
        self.sequences = create_sequences(df)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, label, vehicle_id = self.sequences[idx]
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(label, dtype=torch.long), vehicle_id

def collate_fn(batch):
    sequences, labels, vehicle_ids = zip(*batch)

    # Clone and detach the tensors in sequences before padding
    padded_sequences = pad_sequence([seq.clone().detach() for seq in sequences], batch_first=True)

    labels_tensor = torch.tensor(labels, dtype=torch.long)
    lengths = torch.tensor([len(seq) for seq in sequences])

    return padded_sequences, labels_tensor, vehicle_ids, lengths

# ######################## USING GENERATORS ########################
# def create_sequences_generator(df):
#     for vehicle_id, group in df.groupby(GROUPBY_COL):
#          # Ensure that data is sorted by  datetime
#         group[DATETIME_COL] = pd.to_datetime(group[DATETIME_COL])
#         group.sort_values(by=[DATETIME_COL], inplace=True)

#         sequence_features = group[FEATURE_COLS].values
#         label = group.iloc[0][TARGET_COL]
#         yield sequence_features, label, vehicle_id

# class VehicleIterableDataset(IterableDataset):
#     def __init__(self, df):
#         # IterableDataset stores the raw data and parameters needed to create the generator
#         self.df = df
#         self.groupby_col = GROUPBY_COL
#         self.feature_cols = FEATURE_COLS
#         self.target_col = TARGET_COL

#     def __iter__(self):
#         # Re-initialises the generator each time it is called, as generators are exhausted after one complete iteration
#         return create_sequences_generator(self.df)

# def collate_fn_generator(batch):
#     # Separate features, labels, and vehicle IDs
#     sequences, labels, vehicle_ids = zip(*batch)

#     # Pad sequences to have the same length
#     padded_sequences = pad_sequence([torch.tensor(seq, dtype=torch.float32) for seq in sequences], batch_first=True)

#     # Convert labels to tensor
#     labels_tensor = torch.tensor(labels, dtype=torch.long)

#     # Calculate lengths of sequences
#     lengths = torch.tensor([len(seq) for seq in sequences])

#     return padded_sequences, labels_tensor, vehicle_ids, lengths

### LSTM class

In [4]:
class LSTMClassifier(nn.Module):
    """
    The output of an LSTM layer in PyTorch is a 3D tensor, and each dim represents:
    1. Batch size: The batch size of the DataLoader.
        In a batch, multiple sequences are processed simultaneously by the LSTM.
        Each item in this dimension corresponds to a different sequence in the batch.
    2. Sequence length: The number of rows in each sequence.
        When padding is used to handle variable-length sequences,
        this dimension represents the max length in the batch.
    3. Hidden size: A hyperparameter when creating the LSTM layer.
        The size of the LSTM's hidden state and
        also the output features from the LSTM cell at each time step.
    """
    def __init__(self, num_classes, input_size, hidden_size, num_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.num_classes = num_classes
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        # Pack the padded sequences
        x_packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        # Forward propagate LSTM
        packed_output, (hidden, cell) = self.lstm(x_packed)

        # Unpack the output
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Extract the hidden state of the last time step
        last_hidden = hidden[-1]

        # Pass through the linear layer
        out = self.fc(last_hidden)

        return out

### Trainer class

In [5]:
class EarlyStopping:
    """Utility class for early stopping"""
    def __init__(self, patience=7, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print(f"Validation loss decreased ({self.val_loss_min:.6f} to {val_loss:.6f}). Saving model...")
        torch.save(model.state_dict(), 'checkpoint.pt')
        self.val_loss_min = val_loss


class ModelTrainer:
    """
    Class to encapsulate all methods and utilities for training and evaluating a model.
    """
    def __init__(self, model, criterion, optimizer, scheduler=None, device=None):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.criterion.to(self.device)


    def train_for_one_epoch(self, data_loader, print_every=10):
        """
        Train the model for one epoch, iterating over batches from the data_loader.

        Args:
            data_loader (DataLoader): The PyTorch DataLoader to fetch data from.
            print_every (int): Interval of batches after which to print training metrics.

        Returns:
            tuple: A tuple containing the average loss, accuracy, and F1 score for the epoch.
        """

        # Set the model to training mode
        self.model.train()

        # Initialize accumulators for loss and accuracy metrics
        total_loss, correct, total = 0, 0, 0
        all_labels, all_predictions = [], []

        # Process each batch from the data loader
        for batch_count, (sequences, labels, _, lengths) in enumerate(data_loader):
            # Move data to the appropriate device (GPU or CPU)
            sequences, labels = sequences.to(self.device), labels.to(self.device)

            # Zero the gradients before running the forward pass.
            self.optimizer.zero_grad()

            # Forward pass: Compute predicted y by passing data through the model
            outputs = self.model(sequences, lengths)

            # Compute loss
            loss = self.criterion(outputs, labels)

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Perform a single optimization step (parameter update)
            self.optimizer.step()

            # Update training loss
            total_loss += loss.item()

            # Calculate batch accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Extend the list for later calculating whole epoch metrics
            all_labels.extend(labels.tolist())
            all_predictions.extend(predicted.tolist())

            # Print training metrics every "print_every" batches
            if (batch_count + 1) % print_every == 0:
                avg_loss = total_loss / (batch_count + 1)
                accuracy = correct / total
                batch_f1 = f1_score(labels.cpu(), predicted.cpu(), average='weighted')
                print(f"Batch [{batch_count + 1}/{len(data_loader)}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, F1: {batch_f1:.4f}")

        # Scheduler step if scheduler is set
        if self.scheduler:
            self.scheduler.step()

        # Calculate average metrics over all batches
        avg_loss = total_loss / len(data_loader)
        accuracy = correct / total
        f1 = f1_score(all_labels, all_predictions, average='weighted')

        return avg_loss, accuracy, f1


    def evaluate(self, data_loader):
        """
        Evaluate the model on a given dataset using the specified data loader.

        Args:
            data_loader (DataLoader): The DataLoader to provide batches of data.

        Returns:
            tuple: A tuple containing the average loss, accuracy, and F1 score for the evaluation.
        """

        # Set the model to evaluation mode. This will turn off dropout and batch normalization
        self.model.eval()

        # Initialize accumulators for calculating average loss, accuracy, and F1 score
        total_loss, correct, total = 0, 0, 0
        all_labels, all_predictions = [], []

        # Disable gradient calculations for efficiency since gradients are not needed for evaluation
        with torch.no_grad():
            for sequences, labels, _, lengths in data_loader:
                # Ensure data is on the correct device (GPU or CPU) to avoid unnecessary transfers
                sequences, labels = sequences.to(self.device), labels.to(self.device)

                # Forward pass: compute the model's predictions for the batch
                outputs = self.model(sequences, lengths)

                # Compute loss for the batch using the criterion defined for the model
                loss = self.criterion(outputs, labels)

                # Accumulate the loss for averaging later
                total_loss += loss.item()

                # Compute prediction accuracy for the batch
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)  # Count total labels processed in this batch
                correct += (predicted == labels).sum().item()  # Count correct predictions

                # Store all labels and predictions for calculating global F1 score later
                all_labels.extend(labels.tolist())
                all_predictions.extend(predicted.tolist())

        # Calculate and return average loss, accuracy, and F1 score across all batches
        avg_loss = total_loss / len(data_loader)  # Average loss across batches
        accuracy = correct / total  # Overall accuracy
        f1 = f1_score(all_labels, all_predictions, average='weighted')  # Overall F1 score

        return avg_loss, accuracy, f1

    
    def train_loop(self, train_loader, val_loader, num_epochs=100, print_every=10, save_internal=False, save_interval=10, filename="model", save_dir="../"):
        """
        Full training loop including training and validation evaluation for a given number of epochs.
        """
        # Ensure save directories exist
        model_save_dir = os.path.join(save_dir, "models")
        img_save_dir = os.path.join(save_dir, "imgs")
        os.makedirs(model_save_dir, exist_ok=True)
        os.makedirs(img_save_dir, exist_ok=True)

        # Initialize early stopping utility
        early_stopping = EarlyStopping(patience=7, verbose=True)

        # Initialize lists to track metrics
        train_losses, train_accs, train_f1s = [], [], []
        val_losses, val_accs, val_f1s = [], [], []

        for epoch in range(num_epochs):
            print(f"Epoch {epoch + 1}/{num_epochs}")

            # Train for one epoch and collect metrics
            train_loss, train_acc, train_f1 = self.train_for_one_epoch(train_loader, print_every)
            train_losses.append(train_loss)
            train_accs.append(train_acc)
            train_f1s.append(train_f1)

            # Evaluate on the validation set
            val_loss, val_acc, val_f1 = self.evaluate(val_loader)
            val_losses.append(val_loss)
            val_accs.append(val_acc)
            val_f1s.append(val_f1)
            print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}\n")

            # Early stopping check
            early_stopping(val_loss, self.model)
            if early_stopping.early_stop:
                print("Early stopping triggered")
                break

            # Save model periodically
            if save_internal:
                if (epoch + 1) % save_interval == 0:
                    internal_filename = f"{filename}_epoch{epoch + 1}"
                    self.save_checkpoint(epoch, model_save_dir, internal_filename)

        # Save the final model
        final_model_path = os.path.join(model_save_dir, f"{filename}_final.pth")
        torch.save(self.model.state_dict(), final_model_path)
        print(f"Final model saved to {final_model_path}")

        # Plot and save metrics
        self.plot_metrics(train_losses, val_losses, train_accs, val_accs, train_f1s, val_f1s, num_epochs, img_save_dir, filename)

    def save_checkpoint(self, epoch, save_dir, filename):
        """
        Save model checkpoint.
        """
        save_path = os.path.join(save_dir, f"{filename}_epoch_{epoch + 1}.pth")
        torch.save(self.model.state_dict(), save_path)
        print(f"Model saved to {save_path}")

    def plot_metrics(self, train_losses, val_losses, train_accs, val_accs, train_f1s, val_f1s, num_epochs, img_save_dir, filename):
        """
        Plot and save training and validation metrics.
        """
        epochs = list(range(1, num_epochs + 1))
        plt.figure(figsize=(18, 6))

        plt.subplot(1, 3, 1)
        plt.plot(epochs, train_losses, 'r-', label='Train Loss')
        plt.plot(epochs, val_losses, 'b-', label='Validation Loss')
        plt.title('Loss Over Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 3, 2)
        plt.plot(epochs, train_accs, 'r-', label='Train Accuracy')
        plt.plot(epochs, val_accs, 'b-', label='Validation Accuracy')
        plt.title('Accuracy Over Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.subplot(1, 3, 3)
        plt.plot(epochs, train_f1s, 'r-', label='Train F1 Score')
        plt.plot(epochs, val_f1s, 'b-', label='Validation F1 Score')
        plt.title('F1 Score Over Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('F1 Score')
        plt.legend()

        plt.tight_layout()
        plot_filepath = os.path.join(img_save_dir, f"metrics_{filename}.png")
        plt.savefig(plot_filepath)
        print(f"Metrics plot saved to {plot_filepath}")

        plt.show()



# Reshape Data for input to LSTM

In [6]:
train_df = pd.read_csv('../data/train_data.csv')
train_df[DATETIME_COL] = pd.to_datetime(train_df[DATETIME_COL])

val_df = pd.read_csv('../data/val_data.csv')
val_df[DATETIME_COL] = pd.to_datetime(val_df[DATETIME_COL])

test_df = pd.read_csv('../data/test_data.csv')
test_df[DATETIME_COL] = pd.to_datetime(test_df[DATETIME_COL])

In [7]:
# Creating datasets and dataloaders
train_dataset = VehicleDataset(train_df)
val_dataset = VehicleDataset(val_df)
test_dataset = VehicleDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [8]:
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
# test_loader1 = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# assert(len(test_loader) == len(test_loader1))
# for i, batch in enumerate(test_loader):
#     for j, batch1 in enumerate(test_loader1):
#         if (i == j):
#             assert(torch.all(torch.eq(batch[0], batch1[0])))
#             assert(torch.all(torch.eq(batch[1], batch1[1])))
#             assert(batch[2] == batch1[2])
#             assert(torch.all(torch.eq(batch[3], batch1[3])))
#         else:
#             continue

# Model Training

## Initialise model

In [9]:
# Initialise model with hyperparameters
model = LSTMClassifier(
    num_classes=NUM_CLASSES,
    input_size=NUM_FEATURES,
    num_layers=NUM_LAYERS,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT
)

## Initialise optimiser and scheduler

In [10]:
# Initialise Adam optimizer with CyclicLR scheduler
optimizer = optim.Adam(model.parameters(), lr=0.0001)
lr_scheduler = CyclicLR(optimizer, base_lr=0.0001, max_lr=0.01,
                         step_size_up=len(train_df), cycle_momentum=False)

## Initialise criterion

In [11]:
# Initialise criterion as Cross Entropy Loss for multiclassification task
criterion = nn.CrossEntropyLoss()

# Create weighted criterion
class_frequencies = train_df.groupby(TARGET_COL)[GROUPBY_COL].nunique()
class_weights = 1.0 / class_frequencies
class_weights = class_weights / class_weights.min() # Normalize the weights so that the smallest one is 1.0 (optional)
class_weights_tensor = torch.tensor(class_weights.values, dtype=torch.float32)
weighted_criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

## Instantiate ModelTrainer object

In [12]:
trainer = ModelTrainer(
    model, 
    criterion, 
    optimizer, 
    scheduler=lr_scheduler, 
    device=dml
)
trainer.__dict__

{'model': LSTMClassifier(
   (lstm): LSTM(6, 128, num_layers=2, batch_first=True, dropout=0.5)
   (fc): Linear(in_features=128, out_features=10, bias=True)
 ),
 'criterion': CrossEntropyLoss(),
 'optimizer': Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     initial_lr: 0.0001
     lr: 0.0001
     maximize: False
     weight_decay: 0
 ),
 'scheduler': <torch.optim.lr_scheduler.CyclicLR at 0x7fac6c6ba170>,
 'device': device(type='privateuseone', index=0)}

## Train model for num epochs

In [14]:
trainer.train_loop(
    train_loader, 
    val_loader, 
    num_epochs=NUM_EPOCHS, 
    print_every=10, 
    save_internal=True,
    save_interval=2, 
    filename="LSTM_test", 
)