In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Use GPU

In [5]:
# Set MPS as the device
device = torch.device("mps")

## Load raw data

In [22]:
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
data_folder = os.path.join(parent_path, "data")
raw_data_path = os.path.join(data_folder, "parquet", "raw", "raw.parquet")
df = pd.read_parquet(raw_data_path)

## Drop T01 - T03 trials

In [23]:
dropids = [
    id for id in df.trial_id.unique() if any(x in id for x in ["T01", "T02", "T03"])
]
df = df.loc[~df["trial_id"].isin(dropids)]

In [27]:
# Scale Data
from sklearn.preprocessing import StandardScaler

non_id_cols = [
    "index",
    "acc_1",
    "acc_2",
    "acc_3",
    "gyr_1",
    "gyr_2",
    "gyr_3",
    "grf_1",
    "grf_2",
    "grf_3",
]

scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[non_id_cols] = scaler.fit_transform(df[non_id_cols])

In [37]:
feature_cols = ["acc_1", "acc_2", "acc_3", "gyr_1", "gyr_2", "gyr_3"]
target_cols = ["grf_1", "grf_2", "grf_3"]

In [111]:
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch import nn
from sklearn.model_selection import train_test_split

trial_id_list = df_scaled.trial_id.unique().tolist()

train_ids, test_ids = train_test_split(trial_id_list, test_size=0.2)
val_ids, test_ids = train_test_split(test_ids, test_size=0.5)

print(
    f"Train Size:{len(train_ids)}\nVal Size:{len(val_ids)}\nTest Size:{len(test_ids)}\n "
)

# Separate data into the three dfs

train_df = df_scaled.loc[df_scaled.trial_id.isin(train_ids)]
train_df = train_df.reset_index(drop=True)

val_df = df_scaled.loc[df_scaled.trial_id.isin(val_ids)]
val_df = val_df.reset_index(drop=True)

test_df = df_scaled.loc[df_scaled.trial_id.isin(test_ids)]
test_df = test_df.reset_index(drop=True)


# Define Function to split the dfs into X and Y sequences
def prep_sequences(df):

    df_grouped = df.groupby("trial_id")
    X_list = []
    Y_list = []

    for _, trial in df_grouped:
        X_list.append(trial[feature_cols].values)
        Y_list.append(trial[target_cols].values)

    return X_list, Y_list


X_train, Y_train = prep_sequences(train_df)
X_val, Y_val = prep_sequences(val_df)
X_test, Y_test = prep_sequences(test_df)


# Create the Torch Datasets
class CreateDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.outputs[idx]


train_dataset = CreateDataset(X_train, Y_train)
val_dataset = CreateDataset(X_val, Y_val)
test_dataset = CreateDataset(X_test, Y_test)

# Check datashape
sample_input, sample_output = train_dataset[0]  # Get the first sample
print(f"Input Shape: {sample_input.shape}, Output Shape: {sample_output.shape}")

Train Size:148
Val Size:19
Test Size:19
 
Input Shape: (4500, 6), Output Shape: (4500, 3)


In [123]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


# Define collate function for Dataloader
def collate_fn(batch):

    inputs, outputs = zip(*batch)

    # Convert to tensors
    inputs = [torch.tensor(seq, dtype=torch.float32) for seq in inputs]
    outputs = [torch.tensor(seq, dtype=torch.float32) for seq in outputs]

    # Sort the sequences in descending orders
    lengths = torch.tensor([len(seq) for seq in inputs])  # Get length of each sequence
    lengths, sorted_idx = lengths.sort(descending=True)

    inputs = [inputs[i] for i in sorted_idx]
    outputs = [outputs[i] for i in sorted_idx]

    # Pad the sorted sequence
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=-100)
    outputs_padded = pad_sequence(outputs, batch_first=True, padding_value=-100)

    return inputs_padded, outputs_padded, lengths


train_dataloader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn
)
val_dataloader = DataLoader(
    val_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn
)
test_dataloader = DataLoader(
    test_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn
)

In [124]:
# Define RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        # Sort the lengths in descending order and get the sorted indices
        lengths, sorted_idx = lengths.sort(0, descending=True)
        x = x[sorted_idx]  # Sort the input accordingly
        x_packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )

        out_packed, _ = self.rnn(x_packed)

        out, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)

        out = self.fc(out)
        return out

In [132]:
# Define model training function
def train_model(
    model, dataloader, val_dataloader, num_epochs=50, early_stop_patience=10
):
    for epoch in range(num_epochs):

        epoch_loss = 0.0
        best_val_loss = np.inf
        patience_count = 0

        for inputs, targets, lengths in dataloader:
            inputs, targets, lengths = (
                inputs.to(device),
                targets.to(device),
                lengths.cpu(),
            )

            # Forward pass
            predictions = model(inputs, lengths)

            # Mask padded values
            masks = (targets != -100).float()
            predictions = predictions * masks
            targets = targets * masks

            # Loss Calculation
            loss = criterion(predictions, targets)
            loss = loss * masks  # Mask out padded positions
            loss = loss.sum() / masks.sum()  # Normalize by valid tokens

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(dataloader)
        training_losses.append(avg_train_loss)

        val_loss = eval_model(model, val_dataloader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_count = 0
        else:
            patience_count += 1

        if patience_count == early_stop_patience:
            print(f"Early Stop Triggered: Best Validation Loss = {best_val_loss}")
            return

        print(
            f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}"
        )

    return model, training_losses, best_val_loss


def eval_model(model, dataloader):

    epoch_loss = 0.0

    for inputs, targets, lengths in dataloader:
        inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.cpu()

        # Forward pass
        predictions = model(inputs, lengths)

        # Mask padded values
        masks = (targets != -100).float()
        predictions = predictions * masks
        targets = targets * masks

        # Loss Calculation
        loss = criterion(predictions, targets)
        loss = loss * masks
        loss = loss.sum() / masks.sum()

        epoch_loss += loss.item()

    avg_eval_loss = epoch_loss / len(dataloader)

    return avg_eval_loss


input_size = X_train[0].shape[1]
output_size = Y_train[0].shape[1]
model = RNNModel(
    input_size=input_size, hidden_size=64, output_size=output_size
)  # Adjust sizes as needed
model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(
    model, train_dataloader, val_dataloader, num_epochs=50, early_stop_patience=10
)

Epoch [1/50], Training Loss: 0.8592, Validation Loss: 0.6256


KeyboardInterrupt: 