In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import copy
import sys
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.model_selection import train_test_split


KeyboardInterrupt: 

In [None]:
# Recording identifiers and dataset location.
fileroot='2024_03_08_TRF_Maleen/'
filename='2024_03_08_TRF_Maleen_01'

datapath='/home/maleen/rosbags/Transformers/datasets/training/'

# Read the pickled DataFrame for this recording.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df1 = pd.read_pickle(datapath + filename + '.pkl')

# One NumPy array per row for the keypoints and for the timestamps.
arrays = list(map(np.array, df1['Skeleton_3D']))
timestamps = list(map(np.array, df1['Skeleton_Timestamp']))

# Express every timestamp relative to the first one. Subtracting an ndarray from the
# list hands the operation to NumPy, so `timestamps` is an ndarray from here on.
# NOTE(review): no unit conversion happens here; whether the result is in seconds
# depends on how 'Skeleton_Timestamp' is stored -- confirm.
timestamps = (timestamps - timestamps[0])

# Pile the per-frame keypoint arrays along a new leading axis:
# index 0 of the result selects a frame.
skeleton_3d_frames = np.stack(arrays)

In [None]:
# Presence mask per (frame, keypoint): 1 when none of the keypoint's coordinates
# is NaN, 0 when at least one coordinate is missing.
masks = np.where(~np.isnan(skeleton_3d_frames).any(axis=2), 1, 0)

In [None]:
# Frame-to-frame changes in position and in time.
position_diff = skeleton_3d_frames[1:] - skeleton_3d_frames[:-1]

# Two trailing singleton axes let the time step broadcast over keypoints and coordinates.
time_diff = np.expand_dims(np.diff(timestamps), axis=(1, 2))

# A position difference is only meaningful when the keypoint is present in both the
# earlier and the later frame; the extra trailing axis broadcasts over coordinates.
masks_pos = np.bitwise_and(masks[:-1], masks[1:])[..., np.newaxis]

# Finite-difference velocity; entries without two valid frames become NaN.
skel_vel = np.where(masks_pos, position_diff / time_diff, np.nan)

# An acceleration sample needs two consecutive valid velocity samples.
masks_velocity = np.bitwise_and(masks_pos[:-1], masks_pos[1:])

# Change in velocity between consecutive velocity samples.
velocity_diff = skel_vel[1:] - skel_vel[:-1]

# Finite-difference acceleration, divided by the earlier of the two time steps
# (time_diff without its last entry); invalid entries become NaN.
skel_acc = np.where(masks_velocity, velocity_diff / time_diff[:-1], np.nan)

In [None]:
# Two differencing steps make skel_acc two frames shorter than the positions and one
# frame shorter than the velocities; drop the leading samples so all arrays line up.
# NOTE: this cell re-slices skel_vel and masks in place, so run it only once per
# kernel session -- a second run trims them again.
skel_pos = skeleton_3d_frames[2:, ...]
skel_vel = skel_vel[1:, ...]
masks = masks[2:, ...]

In [None]:
# Sanity check: display the shape of the trimmed presence mask
# (presumably (frames, keypoints), matching skel_pos.shape[:2] -- confirm on the output).
masks.shape

In [None]:
# Pre-allocate the output buffers for normalisation. Each signal (position, velocity,
# acceleration) gets an uninitialised array for the normalised values plus one
# (keypoint, coordinate) table each for the medians and the IQRs.
norm_pos, norm_vel, norm_acc = (np.empty_like(signal) for signal in (skel_pos, skel_vel, skel_acc))

medians_per_joint_axis_skel_pos = np.empty(skel_pos.shape[1:3])
iqrs_per_joint_axis_skel_pos = np.empty(skel_pos.shape[1:3])

medians_per_joint_axis_vel = np.empty(skel_vel.shape[1:3])
iqrs_per_joint_axis_vel = np.empty(skel_vel.shape[1:3])

medians_per_joint_axis_acc = np.empty(skel_acc.shape[1:3])
iqrs_per_joint_axis_acc = np.empty(skel_acc.shape[1:3])

def robust_normalize_data_with_clipping(data, masks, medians_per_joint_axis, iqrs_per_joint_axis, normalized_data, clipping_percentiles=(1, 99)):
    """Clip and robustly scale each (joint, axis) series by its median and IQR.

    For every joint and axis the valid samples are clipped to the given
    percentiles, then centred on the median and divided by the inter-quartile
    range of the clipped valid samples.

    A sample is treated as valid only when the mask marks the joint as present
    AND the value itself is not NaN. The NaN check matters because
    ``np.percentile`` returns NaN as soon as one input is NaN, which would turn
    the whole series into NaN (e.g. velocities that are NaN although the
    position mask of that frame is 1).

    Args:
        data: Array indexed as ``data[frame, joint, axis]``.
        masks: Array indexed as ``masks[frame, joint]``; 1 marks a present joint.
        medians_per_joint_axis: Output array ``[joint, axis]``, filled in place.
        iqrs_per_joint_axis: Output array ``[joint, axis]``, filled in place.
        normalized_data: Output array with the layout of ``data``, filled in place.
        clipping_percentiles: ``(lower, upper)`` percentiles used for clipping.

    Returns:
        ``(normalized_data, medians_per_joint_axis, iqrs_per_joint_axis)`` --
        the same objects that were passed in. Invalid samples are NaN in
        ``normalized_data``; a series without any valid sample gets NaN median
        and IQR. When the IQR is zero or NaN the clipped values are kept
        unscaled.
    """
    for joint in range(data.shape[1]):  # For each joint
        joint_present = masks[:, joint] == 1

        for axis in range(data.shape[2]):  # For each axis (x, y, z)
            joint_axis_data = data[:, joint, axis]

            # Present according to the mask and actually holding a number.
            valid = joint_present & ~np.isnan(joint_axis_data)

            if valid.any():
                # Clipping thresholds come from the valid samples only.
                lower_threshold, upper_threshold = np.percentile(joint_axis_data[valid], clipping_percentiles)
                clipped_values = np.clip(joint_axis_data, lower_threshold, upper_threshold)

                # Robust location/scale of the clipped valid samples.
                clipped_valid = clipped_values[valid]
                median = np.median(clipped_valid)
                q75, q25 = np.percentile(clipped_valid, [75, 25])
                iqr = q75 - q25
            else:
                # Nothing to estimate from; every output sample becomes NaN below.
                clipped_values = joint_axis_data
                median = np.nan
                iqr = np.nan

            # Store the statistics so the scaling can be inverted later.
            medians_per_joint_axis[joint, axis] = median
            iqrs_per_joint_axis[joint, axis] = iqr

            # Scale only when the IQR is usable (a NaN IQR also fails this test).
            if iqr > 0:
                normalized_values = (clipped_values - median) / iqr
            else:
                normalized_values = clipped_values

            # Invalid samples are reported as NaN.
            normalized_data[:, joint, axis] = np.where(valid, normalized_values, np.nan)

    return normalized_data, medians_per_joint_axis, iqrs_per_joint_axis

# Normalise positions, velocities and accelerations.
# Each signal is paired with the mask that describes ITS validity:
#   * positions     -> masks (already trimmed to start at frame 2)
#   * velocities    -> masks_pos, which needs two consecutive valid frames; its first
#                      entry is dropped to line up with the trimmed skel_vel
#   * accelerations -> masks_velocity, which needs two consecutive valid velocities
# Using the position mask for the derivatives would let NaN samples into the
# percentile computation, and a single NaN turns the whole series into NaN.
# The trailing broadcast axis of the derivative masks is removed with [..., 0].
vel_masks = masks_pos[1:, :, 0]
acc_masks = masks_velocity[:, :, 0]

norm_pos, medians_pos, iqrs_pos = robust_normalize_data_with_clipping(skel_pos, masks, medians_per_joint_axis_skel_pos, iqrs_per_joint_axis_skel_pos, norm_pos)
norm_vel, medians_vel, iqrs_vel = robust_normalize_data_with_clipping(skel_vel, vel_masks, medians_per_joint_axis_vel, iqrs_per_joint_axis_vel, norm_vel)
norm_acc, medians_acc, iqrs_acc = robust_normalize_data_with_clipping(skel_acc, acc_masks, medians_per_joint_axis_acc, iqrs_per_joint_axis_acc, norm_acc)


In [None]:
# Compare the raw and the normalised distribution of one joint/axis.
# `raw_data` is used instead of `data` so the `torch.utils.data as data` import
# is not shadowed by an array.
raw_data = skel_acc
norm_data = norm_acc
joint, axis = 0, 0  # Change as needed

# Missing samples are stored as NaN, and the histogram cannot derive a bin range
# from NaN input, so drop them before plotting.
raw_values = raw_data[:, joint, axis]
raw_values = raw_values[~np.isnan(raw_values)]
norm_values = norm_data[:, joint, axis]
norm_values = norm_values[~np.isnan(norm_values)]

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.hist(raw_values, bins=20, alpha=0.7, label='Original')
plt.title("Original Data Distribution")
plt.xlabel("Value")
plt.ylabel("Frequency")

plt.subplot(1, 2, 2)
plt.hist(norm_values, bins=20, alpha=0.7, label='Normalized')
plt.title("Normalized Data Distribution")
plt.xlabel("Value")
plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

# Check the median and range of the normalized data
normalized_median = np.nanmedian(norm_data[:, joint, axis])
print("Median of normalized data:", normalized_median)

# The printed ranges now state the thresholds that are actually tested.
# NaN samples compare False and are therefore not counted.
within_iqr = ((norm_data[:, joint, axis] > -2) & (norm_data[:, joint, axis] < 2)).sum()
print(f"Normalized data points within (-2, 2): {within_iqr} out of {norm_data.shape[0]}")

within_iqr2 = ((raw_data[:, joint, axis] > -1) & (raw_data[:, joint, axis] < 1)).sum()
print(f"Original data points within (-1, 1): {within_iqr2} out of {norm_data.shape[0]}")

In [None]:

class SkeletalInputEmbedding(nn.Module):
    """Embed per-joint position, velocity and acceleration into one token per joint.

    The three signals are projected by separate linear layers and summed. Tokens
    of missing joints are zeroed with the mask, then a fixed sinusoidal
    positional encoding is added. The encoding is indexed over the flattened
    ``(frame, joint)`` grid, so every (frame, joint) pair has its own code.

    Args:
        num_joints: Number of joints per frame.
        dof: Number of coordinates per joint.
        embed_dim: Size of each token embedding (odd values are supported).
        seq_len: Number of frames per sequence; inputs must have exactly this
            many frames because the positional encoding is built for it.
    """

    def __init__(self, num_joints=18, dof=3, embed_dim=128, seq_len=60):
        super().__init__()
        self.num_joints = num_joints
        self.dof = dof
        self.seq_len = seq_len
        self.embed_dim = embed_dim

        self.joint_embed = nn.Linear(dof, embed_dim)
        self.vel_embed = nn.Linear(dof, embed_dim)
        self.acc_embed = nn.Linear(dof, embed_dim)

        # Registered as a buffer: moves with the module across devices and is
        # saved in the state dict, but is not trained.
        self.register_buffer('positional_encoding', self.get_sinusoidal_encoding(seq_len * num_joints, embed_dim))

    def forward(self, joint_positions, velocities, accelerations, mask=None):
        """Return embeddings of shape ``(batch, seq_len, num_joints, embed_dim)``.

        Args:
            joint_positions, velocities, accelerations: Tensors of shape
                ``(batch, seq_len, num_joints, dof)``; NaN entries are allowed.
            mask: Optional ``(batch, seq_len, num_joints)`` tensor, non-zero/True
                where the joint is present.
        """
        # NaN marks missing data upstream; replace it so the linear layers
        # never see NaN.
        joint_positions = torch.nan_to_num(joint_positions)
        velocities = torch.nan_to_num(velocities)
        accelerations = torch.nan_to_num(accelerations)

        combined_embeddings = (
            self.joint_embed(joint_positions)
            + self.vel_embed(velocities)
            + self.acc_embed(accelerations)
        )

        # Zero the content of missing joints (the layer biases would otherwise
        # leak through). The positional code is still added afterwards, so a
        # masked token equals its positional encoding.
        if mask is not None:
            combined_embeddings = combined_embeddings * mask.unsqueeze(-1)

        pos_encodings = self.positional_encoding[:self.seq_len * self.num_joints, :].view(
            self.seq_len, self.num_joints, self.embed_dim
        )
        # Out-of-place add; the leading axis broadcasts over the batch.
        combined_embeddings = combined_embeddings + pos_encodings.unsqueeze(0)

        return combined_embeddings.view(-1, self.seq_len, self.num_joints, self.embed_dim)

    def get_sinusoidal_encoding(self, seq_len, embed_dim):
        """Build a ``(seq_len, embed_dim)`` table of sinusoidal position codes.

        Even columns hold sines and odd columns cosines. For an odd ``embed_dim``
        there is one more sine column than cosine columns, so the frequency
        vector is truncated for the cosine half.
        """
        pe = torch.zeros(seq_len, embed_dim)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * -(np.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term[: embed_dim // 2])
        return pe


In [None]:
# # Define the module
# embed_module = SkeletalInputEmbedding(num_joints=18, dof=3, embed_dim=128, seq_len=60)

# # Example setup for testing the module
# batch_size = 1
# seq_len = 2  # Number of frames
# num_joints = 3
# dof = 3

# # Initialize module
# embedding_module = SkeletalInputEmbedding(num_joints=num_joints, dof=dof, embed_dim=6, seq_len=seq_len)

# # Generate sample data
# joint_positions = torch.randn(batch_size, seq_len, num_joints, dof)
# velocities = torch.randn(batch_size, seq_len, num_joints, dof)
# accelerations = torch.randn(batch_size, seq_len, num_joints, dof)
# mask = torch.randint(0, 2, (batch_size, seq_len, num_joints))

# print(mask)
# # Test the forward pass
# output_embeddings = embedding_module(joint_positions, velocities, accelerations, mask)

# # Print output shapes
# print("Output shape:", output_embeddings.shape)

# # Output shape and example output
# print("Output shape:", output_embeddings.shape)
# print("Sample output embeddings:", output_embeddings[0][0])

In [None]:

# # Initialize your embedding layer
# embedding_layer = SkeletalInputEmbedding()

# # Create a dummy input tensor for joint positions, velocities, and accelerations
# # Shape: (batch_size, seq_len, num_joints, dof)
# joint_positions = torch.randn(1, 1, 18, 3)
# velocities = torch.randn(1, 1, 18, 3)
# accelerations = torch.randn(1, 1, 18, 3)

# # Create a mask tensor
# # For simplicity, let's mask out half of the joints
# # Shape: (batch_size, seq_len, num_joints)
# mask = torch.tensor([[[1] * 9 + [0] * 9]])

# # Run the forward pass with the test inputs and mask
# with torch.no_grad():
#     output_embeddings = embedding_layer(joint_positions, velocities, accelerations, mask)

# # Check if the masked positions in the output embeddings are set to zero
# masked_output = output_embeddings[:, :, mask.view(-1) == 0]
# assert torch.all(masked_output == 0), "Masked positions in the output are not all zero!"

# print("Masking applied correctly:", torch.all(masked_output == 0))


In [None]:

class TransformerEncoder(nn.Module):
    """Transformer encoder over the flattened (frame, joint) token grid.

    Args:
        embed_dim: Token embedding size (``d_model``).
        num_heads: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout_rate: Dropout probability inside each layer.
        dim_feedforward: Hidden width of each layer's feed-forward network.
            Defaults to 2048, the ``nn.TransformerEncoderLayer`` default, so
            existing callers and checkpoints are unaffected.
    """

    def __init__(self, embed_dim, num_heads, num_layers, dropout_rate=0.1, dim_feedforward=2048):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.dim_feedforward = dim_feedforward

        # Transformer Encoder Layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.embed_dim,
            nhead=self.num_heads,
            dim_feedforward=self.dim_feedforward,
            dropout=self.dropout_rate,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=self.num_layers)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Encode a batch of skeleton token grids.

        Args:
            src: Tensor of shape (batch_size, seq_len, num_joints, embed_dim).
            src_mask: Optional attention mask forwarded to the encoder.
            src_key_padding_mask: Optional tensor of shape
                (batch_size, seq_len * num_joints); passed through unchanged to
                ``nn.TransformerEncoder``.

        Returns:
            Tensor of shape (batch_size, seq_len, num_joints, embed_dim).
        """
        batch_size, seq_len, num_joints, embed_dim = src.size()

        # Flatten frames and joints into one token axis. reshape (not view) so
        # non-contiguous inputs, e.g. transposed tensors, are accepted too.
        src = src.reshape(batch_size, seq_len * num_joints, embed_dim)

        output = self.transformer_encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)

        # Restore the (frame, joint) grid.
        return output.reshape(batch_size, seq_len, num_joints, embed_dim)


In [None]:
# # Parameters
# num_joints = 18
# dof = 3
# embed_dim = 128
# seq_len = 60
# num_heads = 4
# num_layers = 3
# batch_size = 10

# # Create instances
# input_embedding = SkeletalInputEmbedding(num_joints=num_joints, dof=dof, embed_dim=embed_dim, seq_len=seq_len)
# transformer_encoder = TransformerEncoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_layers)

# # Generate synthetic joint positions, velocities, accelerations
# joint_positions = torch.randn(batch_size, seq_len, num_joints, dof)
# velocities = torch.randn(batch_size, seq_len, num_joints, dof)
# accelerations = torch.randn(batch_size, seq_len, num_joints, dof)

# # Generate mask (1 for data present, 0 for data missing)
# mask = torch.randint(0, 2, (batch_size, seq_len, num_joints)).float()  # Random binary mask

# def test_transformer_encoder():
#     # Embedding inputs
#     embeddings = input_embedding(joint_positions, velocities, accelerations, mask=mask)
#     print("Embeddings shape:", embeddings.shape)

#     # Apply encoder
#     # Note: src_key_padding_mask needs to be reshaped properly to match the expected dimensions in the encoder
#     src_key_padding_mask = mask.view(batch_size, seq_len * num_joints)
#     output = transformer_encoder(embeddings, src_key_padding_mask=1-src_key_padding_mask)
#     print("Encoder output shape:", output.shape)

#     # Check output shape
#     assert output.shape == (batch_size, seq_len, num_joints, embed_dim), "Output shape mismatch"
#     print("Test passed!")

# def test_transformer_encoder_with_masking():
#     # Embedding inputs
#     embeddings = input_embedding(joint_positions, velocities, accelerations, mask=mask)
#     print("Embeddings shape:", embeddings.shape)

#     # Create an all-ones mask (no data is considered missing)
#     no_mask = torch.ones_like(mask)

#     # Apply encoder with actual mask
#     src_key_padding_mask = mask.view(batch_size, seq_len * num_joints)
#     masked_output = transformer_encoder(embeddings, src_key_padding_mask=1-src_key_padding_mask)
#     print("Masked Encoder output shape:", masked_output.shape)

#     # Apply encoder without any mask (all data is considered present)
#     no_mask_key_padding_mask = no_mask.view(batch_size, seq_len * num_joints)
#     unmasked_output = transformer_encoder(embeddings, src_key_padding_mask=1-no_mask_key_padding_mask)
#     print("Unmasked Encoder output shape:", unmasked_output.shape)

#     # Check output shapes
#     assert masked_output.shape == (batch_size, seq_len, num_joints, embed_dim), "Output shape mismatch with mask"
#     assert unmasked_output.shape == (batch_size, seq_len, num_joints, embed_dim), "Output shape mismatch without mask"

#     # Check if outputs are different (which they should be if masking is effective)
#     difference = torch.abs(masked_output - unmasked_output).sum()
#     print("Difference between masked and unmasked outputs:", difference.item())
#     assert difference > 0, "Mask seems to have no effect"

#     print("Masking functionality test passed!")

# # Run the test
# test_transformer_encoder_with_masking()


# # Run the test
# test_transformer_encoder()


In [None]:

class TransformerDecoder(nn.Module):
    """Transformer decoder that maps target tokens to per-joint coordinates.

    Args:
        embed_dim: Token embedding size (``d_model``).
        num_heads: Number of attention heads.
        num_layers: Number of stacked decoder layers.
        num_joints: Number of joints per frame (stored for reference; the
            forward pass reads the joint count from the input shape).
        dropout_rate: Dropout probability inside each layer.
        dof: Number of output coordinates per joint (default 3, i.e. a 3D position).
        dim_feedforward: Hidden width of each layer's feed-forward network.
            Defaults to 2048, the ``nn.TransformerDecoderLayer`` default.
    """

    def __init__(self, embed_dim, num_heads, num_layers, num_joints, dropout_rate=0.1, dof=3, dim_feedforward=2048):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.num_joints = num_joints
        self.dof = dof
        self.dim_feedforward = dim_feedforward

        # Transformer Decoder Layer
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=self.embed_dim,
            nhead=self.num_heads,
            dim_feedforward=self.dim_feedforward,
            dropout=self.dropout_rate,
            batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=self.num_layers)

        # Projects each decoded token to the joint's output coordinates.
        self.output_layer = nn.Linear(self.embed_dim, self.dof)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Decode target tokens against the encoder memory.

        Args:
            tgt: Tensor of shape (batch_size, output_seq_len, num_joints, embed_dim).
            memory: Tensor of shape (batch_size, input_seq_len, num_joints, embed_dim),
                i.e. the 4-D encoder output; it is flattened here.
            tgt_mask: Optional attention mask over the flattened target tokens
                (e.g. a causal mask); none is applied when omitted.
            memory_mask: Optional attention mask over the memory tokens.
            tgt_key_padding_mask: Optional tensor of shape
                (batch_size, output_seq_len * num_joints).
            memory_key_padding_mask: Optional tensor of shape
                (batch_size, input_seq_len * num_joints).

        Returns:
            Tensor of shape (batch_size, output_seq_len, num_joints, dof).
        """
        batch_size, input_seq_len, num_joints, embed_dim = memory.size()
        batch_size, output_seq_len, num_joints, embed_dim = tgt.size()

        # Flatten frames and joints into one token axis. reshape (not view) so
        # non-contiguous tensors are accepted as well.
        memory = memory.reshape(batch_size, input_seq_len * num_joints, embed_dim)
        tgt = tgt.reshape(batch_size, output_seq_len * num_joints, embed_dim)

        output = self.transformer_decoder(
            tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask
        )

        # Restore the (frame, joint) grid and project to coordinates.
        output = output.reshape(batch_size, output_seq_len, num_joints, embed_dim)
        return self.output_layer(output)


In [None]:

# # Initialize the decoder
# embed_dim = 128
# num_heads = 8
# num_layers = 4
# num_joints = 18
# dropout_rate = 0.1
# decoder = TransformerDecoder(embed_dim, num_heads, num_layers, num_joints, dropout_rate)

# # Mock data setup
# batch_size = 5
# seq_len = 10  # Length of the output sequence
# input_seq_len = 10  # Length of the encoder output sequence
# tgt = torch.rand(batch_size, seq_len, num_joints, embed_dim)  # Random target for simulation
# memory = torch.rand(batch_size, input_seq_len, num_joints, embed_dim)  # Encoder output

# # Masks setup
# tgt_key_padding_mask = torch.zeros(batch_size, seq_len * num_joints, dtype=torch.bool)
# memory_key_padding_mask = torch.zeros(batch_size, input_seq_len * num_joints, dtype=torch.bool)

# # Simulate some missing data
# tgt_key_padding_mask[0, 50:] = True
# memory_key_padding_mask[0, 50:] = True

# # Causal mask to prevent looking ahead in the target sequence
# tgt_mask = torch.triu(torch.ones(seq_len * num_joints, seq_len * num_joints), diagonal=1).bool()

# # Forward pass through the decoder
# output = decoder(tgt, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask)

# # Check outputs
# print("Output shape:", output.shape)  # Expected: (batch_size, seq_len, num_joints, 3)

# # Check if padding affects the output
# print("Output for padded data (should be zero or unchanged):")
# print(output[0, -1])  # Check last few outputs of the first batch where padding is applied


In [None]:
def generate_sequences(norm_pos, norm_vel, norm_acc, mask, input_length=60, predict_length=60):
    """Cut sliding input/target windows out of the per-frame signals.

    Window ``i`` uses frames ``[i, i + input_length)`` as input and the
    following ``predict_length`` frames as target.

    Args:
        norm_pos, norm_vel, norm_acc: Arrays of shape (frames, joints, dof).
        mask: Array of shape (frames, joints).
        input_length: Number of frames per input window.
        predict_length: Number of frames per target window.

    Returns:
        ``(X_pos, X_vel, X_acc, X_mask, Y_pos, Y_vel, Y_acc, Y_mask)`` as
        float64 arrays. The first axis indexes the windows; it has length 0
        when the recording is shorter than ``input_length + predict_length``.
    """
    # The coordinate count is read from the data instead of assuming 3.
    num_frames, num_joints, dof = norm_pos.shape[:3]

    # Clamp at zero so a short recording yields empty arrays rather than a
    # negative dimension error.
    num_sequences = max(num_frames - input_length - predict_length + 1, 0)

    def _empty(length, with_dof=True):
        # Zero-filled buffer for `num_sequences` windows of `length` frames.
        shape = (num_sequences, length, num_joints) + ((dof,) if with_dof else ())
        return np.zeros(shape)

    X_pos, X_vel, X_acc = _empty(input_length), _empty(input_length), _empty(input_length)
    Y_pos, Y_vel, Y_acc = _empty(predict_length), _empty(predict_length), _empty(predict_length)
    X_mask = _empty(input_length, with_dof=False)
    Y_mask = _empty(predict_length, with_dof=False)

    for i in range(num_sequences):
        split = i + input_length       # first target frame
        stop = split + predict_length  # one past the last target frame

        X_pos[i] = norm_pos[i:split]
        X_vel[i] = norm_vel[i:split]
        X_acc[i] = norm_acc[i:split]
        X_mask[i] = mask[i:split]

        Y_pos[i] = norm_pos[split:stop]
        Y_vel[i] = norm_vel[split:stop]
        Y_acc[i] = norm_acc[split:stop]
        Y_mask[i] = mask[split:stop]

    return X_pos, X_vel, X_acc, X_mask, Y_pos, Y_vel, Y_acc, Y_mask


In [None]:
# Report whether PyTorch can see a CUDA device (this only prints, it does not enforce).
cuda_ok = torch.cuda.is_available()
print(cuda_ok)
del cuda_ok  # keep the notebook namespace unchanged

# Batch size, frames per window and joints per frame used by the training cell.
batch_size, seq_length, num_joints = 1, 60, 18

class TransformerModel(nn.Module):
    """Encoder-decoder transformer that predicts future joint positions.

    Args:
        batch_size: Kept for backward compatibility; not used.
        seq_len: Frames per window, forwarded to the input embedding.
        num_joints: Joints per frame.
        dof: Coordinates per joint.
        embed_dim: Token embedding size.
        num_heads: Attention heads per layer.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Accepted for interface compatibility but currently not
            forwarded; the sub-modules keep their own feed-forward width.
        dropout: Dropout probability for encoder and decoder layers.
    """

    def __init__(self, batch_size=1, seq_len=60, num_joints=18, dof=3, embed_dim=128, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=512, dropout=0.1):
        super(TransformerModel, self).__init__()
        # seq_len is forwarded so a non-default window length also sizes the
        # positional encoding correctly.
        self.input_embedding = SkeletalInputEmbedding(num_joints=num_joints, dof=dof, embed_dim=embed_dim, seq_len=seq_len)
        self.encoder = TransformerEncoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_encoder_layers, dropout_rate=dropout)
        # The decoder depth comes from num_decoder_layers (it used to reuse the encoder's).
        self.decoder = TransformerDecoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_decoder_layers, num_joints=num_joints, dropout_rate=dropout)
        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-initialise every linear layer and zero its bias."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                torch.nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    torch.nn.init.constant_(module.bias, 0)

    def forward(self, batch_size, seq_len, num_joints, xpos, xvel, xacc, xmask, ypos, yvel, yacc, ymask):
        """Run one teacher-forced pass and return predicted positions.

        Args:
            batch_size, seq_len, num_joints: Kept for backward compatibility;
                the mask shapes are taken from the tensors themselves, so a
                smaller final batch or a different target length cannot break
                the reshape.
            xpos, xvel, xacc: Input signals, (batch, frames, joints, dof).
            xmask: Input presence mask, (batch, frames, joints); True = present.
            ypos, yvel, yacc: Target signals fed to the decoder.
            ymask: Target presence mask; True = present.

        Returns:
            Tensor of shape (batch, target_frames, joints, 3).

        NOTE(review): the decoder receives the embedded ground-truth target
        frames without a causal mask and without a one-frame shift, so it can
        read the very values it is trained to predict. Consider shifting the
        decoder input and passing a causal ``tgt_mask`` -- confirm the intended
        training scheme.
        """
        inputembeddings = self.input_embedding(xpos, xvel, xacc, xmask)
        tgtembeddings = self.input_embedding(ypos, yvel, yacc, ymask)

        # Key-padding masks flag tokens to IGNORE, hence the inversion of the
        # presence masks. .bool() makes the inversion valid for 0/1 masks too.
        src_key_padding_mask = ~xmask.bool().reshape(xmask.size(0), -1)
        tgt_key_padding_mask = ~ymask.bool().reshape(ymask.size(0), -1)

        encoder_output = self.encoder(inputembeddings, src_key_padding_mask=src_key_padding_mask)
        # Teacher forcing: use true target embeddings as decoder input during training
        decoder_output = self.decoder(tgtembeddings, encoder_output, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=src_key_padding_mask)
        return decoder_output

model = TransformerModel()
# Use the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Setup DataLoaders

X_pos, X_vel, X_acc, X_mask, Y_pos, Y_vel, Y_acc, Y_mask = generate_sequences(norm_pos, norm_vel, norm_acc, masks)

# Convert to PyTorch tensors
X_pos_tensor = torch.tensor(X_pos, dtype=torch.float32)
X_vel_tensor = torch.tensor(X_vel, dtype=torch.float32)
X_acc_tensor = torch.tensor(X_acc, dtype=torch.float32)
X_mask_tensor = torch.tensor(X_mask, dtype=torch.bool)

# The target velocity/acceleration tensors are built from the TARGET windows
# (Y_vel / Y_acc); they used to be copies of the input windows.
Y_pos_tensor = torch.tensor(Y_pos, dtype=torch.float32)
Y_vel_tensor = torch.tensor(Y_vel, dtype=torch.float32)
Y_acc_tensor = torch.tensor(Y_acc, dtype=torch.float32)
Y_mask_tensor = torch.tensor(Y_mask, dtype=torch.bool)

# Create the full dataset
full_dataset = TensorDataset(X_pos_tensor, X_vel_tensor, X_acc_tensor, X_mask_tensor, Y_pos_tensor, Y_vel_tensor, Y_acc_tensor, Y_mask_tensor)

# Contiguous split based on time: the first 80% of the windows train the model,
# the remaining 20% validate it.
train_size = int(0.8 * len(full_dataset))
train_dataset = Subset(full_dataset, range(train_size))
val_dataset = Subset(full_dataset, range(train_size, len(full_dataset)))

# Creating the DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# Custom Loss Function
def custom_masked_mse_loss(output, target, xmask, ymask):
    """Mean squared error over the target entries that are actually observed.

    An entry contributes only when the target mask marks the joint as present
    and the target value is finite. Missing targets are stored as NaN, and
    ``NaN * 0`` is still NaN, so they are excluded by selection instead of
    being multiplied by the mask.

    Args:
        output: Predictions, shape (batch, frames, joints, dof).
        target: Ground truth with the same shape as ``output``; may contain NaN.
        xmask: Presence mask of the INPUT window. Kept for interface
            compatibility; it does not describe the predicted frames and is
            therefore not used.
        ymask: Presence mask of the target window, shape (batch, frames, joints);
            True / non-zero = present.

    Returns:
        Scalar tensor. When no entry is valid it is a zero that is still
        connected to ``output``, so calling ``backward()`` remains possible.
    """
    finite_target = torch.isfinite(target)
    valid = ymask.bool().unsqueeze(-1).expand_as(target) & finite_target

    # Neutralise non-finite targets so they cannot poison the subtraction;
    # those entries are dropped by `valid` anyway.
    safe_target = torch.where(finite_target, target, torch.zeros_like(target))

    squared_diff = (output - safe_target) ** 2
    masked_squared_diff = squared_diff[valid]

    if masked_squared_diff.numel() == 0:
        return output.sum() * 0.0
    return masked_squared_diff.mean()

optimizer = optim.Adam(model.parameters(), lr=0.00001)

# Run every batch on whatever device the model already lives on.
train_device = next(model.parameters()).device

# Training and validation
train_losses = []
val_losses = []
best_loss = float('inf')
num_epochs = 50

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        xpos, xvel, xacc, xmask, ypos, yvel, yacc, ymask = (t.to(train_device) for t in batch)

        optimizer.zero_grad()
        # The actual batch size is passed so a smaller final batch is described correctly.
        output = model(xpos.size(0), seq_length, num_joints, xpos, xvel, xacc, xmask, ypos, yvel, yacc, ymask)

        loss = custom_masked_mse_loss(output, ypos, xmask, ymask)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_train_loss = total_loss / len(train_dataloader)
    train_losses.append(average_train_loss)

    # ---- validation pass (no gradient, dropout disabled) ----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            xpos, xvel, xacc, xmask, ypos, yvel, yacc, ymask = (t.to(train_device) for t in batch)
            output = model(xpos.size(0), seq_length, num_joints, xpos, xvel, xacc, xmask, ypos, yvel, yacc, ymask)
            total_val_loss += custom_masked_mse_loss(output, ypos, xmask, ymask).item()

    # With an empty validation split there is nothing to average; NaN makes that visible.
    average_val_loss = total_val_loss / len(val_dataloader) if len(val_dataloader) > 0 else float('nan')
    val_losses.append(average_val_loss)

    print(f'Epoch {epoch+1}: Training Loss: {average_train_loss:.4f}, Validation Loss: {average_val_loss:.4f}')

    # Checkpoint on the validation loss, which is what the message reports.
    # A NaN loss never compares as smaller, so it never triggers a save.
    if average_val_loss < best_loss:
        best_loss = average_val_loss
        torch.save(model.state_dict(), 'best_model_weights.pth')
        print(f'Saved new best model with validation loss: {average_val_loss:.4f}')

# Plot training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# print(torch.cuda.is_available())

# class TransformerModel(nn.Module):
#     def __init__(self, num_joints=18, dof=3, embed_dim=128, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=512, dropout=0.1):
#         super(TransformerModel, self).__init__()
#         self.input_embedding = SkeletalInputEmbedding(num_joints=num_joints, dof=dof, embed_dim=embed_dim)
#         self.encoder = TransformerEncoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_layers)
#         self.decoder = TransformerDecoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_layers)
#         self._initialize_weights()
    
#     def _initialize_weights(self):
#         for module in self.modules():
#             if isinstance(module, nn.Linear):
#                 torch.nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
#                 if module.bias is not None:
#                     torch.nn.init.constant_(module.bias, 0)
    
#     def forward(self, pos, vel, acc, mask):
#         embeddings = self.input_embedding(pos, vel, acc, mask)
#         # if torch.isnan(embeddings).any():
#         #print(embeddings[0][0])
 
    
#         encoder_output = self.encoder(embeddings)
#         # if torch.isnan(encoder_output).any():
#         #     print("NaN detected in encoder output")
    
#         predicted_output = self.decoder(encoder_output, encoder_output)
#         # if torch.isnan(predicted_output).any():
#         #     print(predicted_output)
        
#         return predicted_output

# # Assuming the previous imports and TransformerModel definition
# model = TransformerModel()
# model.cuda()

# X_pos, X_vel, X_acc, X_mask, Y_pos, Y_vel, Y_acc, Y_mask = generate_sequences(norm_pos, norm_vel, norm_acc, masks)

# # Convert to PyTorch tensors
# X_pos_tensor = torch.tensor(X_pos, dtype=torch.float32)
# X_vel_tensor = torch.tensor(X_vel, dtype=torch.float32)
# X_acc_tensor = torch.tensor(X_acc, dtype=torch.float32)
# X_mask_tensor = torch.tensor(X_mask, dtype=torch.bool)
# Y_pos_tensor = torch.tensor(Y_pos, dtype=torch.float32)

# # Create the full dataset
# full_dataset = TensorDataset(X_pos_tensor, X_vel_tensor, X_acc_tensor, X_mask_tensor, Y_pos_tensor)

# # Contiguous split based on time
# train_size = int(0.8 * len(full_dataset))
# train_dataset = Subset(full_dataset, range(train_size))
# val_dataset = Subset(full_dataset, range(train_size, len(full_dataset)))

# # Creating the DataLoaders
# train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=False)  # Shuffling is generally not done in time-series
# val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# # Loss and Optimizer
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Lists for storing loss values
# train_losses = []
# val_losses = []

# # Training loop
# best_loss = float('inf')
# num_epochs = 50
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for pos, vel, acc, mask, y_pos in train_dataloader:
#         pos, vel, acc, mask = pos.cuda(), vel.cuda(), acc.cuda(), mask.cuda()
#         y_pos = y_pos.cuda()

#         optimizer.zero_grad()
#         output = model(pos, vel, acc, mask)
#         #print(mask.shape)
#         mask = mask.unsqueeze(-1).expand_as(output)
#         output = output.where(~torch.isnan(output), torch.zeros_like(output))

#         y_pos = y_pos.where(~torch.isnan(y_pos), torch.zeros_like(y_pos))

#         masked_output = output * mask
#         masked_y_pos = y_pos * mask

#         loss = criterion(masked_output, masked_y_pos)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#     epoch_loss = running_loss / len(train_dataloader)
#     train_losses.append(epoch_loss)
#     print(f'Epoch {epoch+1}, Training Loss: {epoch_loss}')

#     # Validation phase
#     model.eval()
#     val_running_loss = 0.0
#     with torch.no_grad():
#         for pos, vel, acc, mask, y_pos in val_dataloader:
#             pos, vel, acc, mask = pos.cuda(), vel.cuda(), acc.cuda(), mask.cuda()
#             y_pos = y_pos.cuda()

#             output = model(pos, vel, acc, mask)
            
#             mask = mask.unsqueeze(-1).expand_as(output)
#             output = output.where(~torch.isnan(output), torch.zeros_like(output))

#             y_pos = y_pos.where(~torch.isnan(y_pos), torch.zeros_like(y_pos))

#             masked_output = output * mask
#             masked_y_pos = y_pos * mask

#             val_loss = criterion(masked_output, masked_y_pos)
#             val_running_loss += val_loss.item()

#     val_epoch_loss = val_running_loss / len(val_dataloader)
#     val_losses.append(val_epoch_loss)
#     print(f'Epoch {epoch+1}, Validation Loss: {val_epoch_loss}')

#     # Check if the validation loss improved
#     if val_epoch_loss < best_loss:
#         best_loss = val_epoch_loss
#         torch.save(model.state_dict(), 'best_model_weights.pth')
#         print(f'Epoch {epoch+1}, New Best Validation Loss: {val_epoch_loss}, Model Saved')

        

# print("Training complete")

# # Plotting the training and validation losses
# plt.figure(figsize=(10, 5))
# plt.plot(train_losses, label='Training Loss')
# plt.plot(val_losses, label='Validation Loss')
# plt.title('Training and Validation Loss per Epoch')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.show()






In [None]:
# Try to restore the saved checkpoint into the current model; a failure is
# reported on stdout instead of stopping the notebook.
try:
    model.load_state_dict(torch.load('best_model_weights.pth'))
except Exception as e:
    print("Failed to load weights:", e)
else:
    print("Weights loaded successfully.")


# Scan every parameter tensor for NaNs and for suspiciously large magnitudes.
EXTREME_LIMIT = 1e6  # Example threshold for "extreme" values
for name, param in model.named_parameters():
    contains_nan = param.isnan().any()
    largest_magnitude = param.abs().max()
    if contains_nan:
        print(f"NaN found in {name}")
    if largest_magnitude > EXTREME_LIMIT:
        print(f"Extreme values found in {name}")
del EXTREME_LIMIT  # keep the notebook namespace unchanged

In [None]:
# Display whether PyTorch can see a CUDA device; the model-loading code below calls .cuda().
torch.cuda.is_available()

In [None]:
class TransformerModel(nn.Module):
    """Embedding -> encoder -> decoder pipeline for skeletal sequence prediction.

    The forward pass embeds the position / velocity / acceleration inputs together
    with their mask via ``SkeletalInputEmbedding``, runs the embeddings through
    ``TransformerEncoder`` and then asks ``TransformerDecoder`` to produce
    ``num_frames_to_predict`` frames. The encoder output is passed to the decoder
    as both of its positional arguments.

    Args:
        num_joints: Forwarded to the input embedding and to the decoder.
        dof: Forwarded to the input embedding and to the decoder.
        embed_dim: Forwarded to the embedding, encoder and decoder.
        num_heads: Forwarded to the encoder and decoder.
        num_encoder_layers: Passed to the encoder as ``num_layers``.
        num_decoder_layers: Passed to the decoder as ``num_layers``.
        dim_feedforward: Forwarded to the encoder and decoder.
        dropout: Forwarded to the encoder and decoder.
        num_frames_to_predict: Default number of frames requested from the decoder
            in ``forward``. Defaults to 60, the value that used to be hard-coded.
            It is a plain attribute, so saved state dicts are unaffected.
    """

    def __init__(self, num_joints=18, dof=3, embed_dim=128, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=512, dropout=0.1, num_frames_to_predict=60):
        super().__init__()
        self.num_frames_to_predict = num_frames_to_predict
        self.input_embedding = SkeletalInputEmbedding(num_joints=num_joints, dof=dof, embed_dim=embed_dim)
        self.encoder = TransformerEncoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_encoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
        self.decoder = TransformerDecoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_decoder_layers, dim_feedforward=dim_feedforward, num_joints=num_joints, dof=dof, dropout=dropout)
        # Note: no custom weight initialisation is applied here; the submodules keep
        # whatever initialisation they set up themselves.

    def forward(self, pos, vel, acc, mask, num_frames_to_predict=None):
        """Run the full pipeline and return the decoder's prediction.

        Args:
            pos, vel, acc, mask: Passed unchanged to the input embedding.
            num_frames_to_predict: Optional per-call override of the horizon.
                When ``None`` the value given to the constructor is used.

        Returns:
            Whatever ``self.decoder`` returns for the requested number of frames.
        """
        if num_frames_to_predict is None:
            num_frames_to_predict = self.num_frames_to_predict

        embeddings = self.input_embedding(pos, vel, acc, mask)
        encoder_output = self.encoder(embeddings)

        # The encoder output is handed to the decoder as both positional arguments.
        predicted_output = self.decoder(encoder_output, encoder_output, num_frames_to_predict=num_frames_to_predict)

        return predicted_output


# Use the GPU when one is available, otherwise fall back to the CPU so this cell
# also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load model
model = TransformerModel()
# map_location remaps tensors that were saved from a GPU run onto the device used here
model.load_state_dict(torch.load('best_model_weights.pth', map_location=device))
model.to(device)
model.eval()  # Set model to evaluation mode

# Build the input sequences used for this inference check.
# NOTE(review): this reuses norm_pos / norm_vel / norm_acc / masks from earlier cells
# rather than a separately loaded test set -- confirm that is intended.
X_pos, X_vel, X_acc, X_mask, Y_pos, Y_vel, Y_acc, Y_mask = generate_sequences(norm_pos, norm_vel, norm_acc, masks)

# Convert test data to PyTorch tensors
X_pos_test_tensor = torch.tensor(X_pos, dtype=torch.float32)
X_vel_test_tensor = torch.tensor(X_vel, dtype=torch.float32)
X_acc_test_tensor = torch.tensor(X_acc, dtype=torch.float32)
X_mask_test_tensor = torch.tensor(X_mask, dtype=torch.bool)

# Create a Dataset and DataLoader for testing
test_dataset = TensorDataset(X_pos_test_tensor, X_vel_test_tensor, X_acc_test_tensor, X_mask_test_tensor)
test_dataloader = DataLoader(test_dataset, batch_size=1)  # Batch size set to 1 for testing one sequence at a time

# Testing loop (only the first batch is evaluated; see the `break` below)
for pos, vel, acc, mask in test_dataloader:
    pos, vel, acc, mask = pos.to(device), vel.to(device), acc.to(device), mask.to(device)

    with torch.no_grad():  # No need to track gradients during inference
        predicted_output = model(pos, vel, acc, mask)

    # `predicted_output` holds the model's prediction for this batch; move it to the
    # CPU and convert to numpy for printing or further analysis
    print(predicted_output.cpu().numpy())

    break

# Add any specific metrics or visualizations you need to evaluate the model's predictions


In [None]:
# Display the first entry of the prediction left over from the testing loop
# (the DataLoader there uses batch_size=1).
predicted_output[0]

In [None]:
# class TransformerModel(nn.Module):
#     def __init__(self, num_joints=18, dof=3, embed_dim=128, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=512, dropout=0.1):
#         super(TransformerModel, self).__init__()
#         self.input_embedding = SkeletalInputEmbedding(num_joints=num_joints, dof=dof, embed_dim=embed_dim)
#         self.encoder = TransformerEncoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_encoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
#         self.decoder = TransformerDecoder(embed_dim=embed_dim, num_heads=num_heads, num_layers=num_decoder_layers, dim_feedforward=dim_feedforward, num_joints=num_joints, dof=dof, dropout=dropout)
#         self._initialize_weights()

#     def _initialize_weights(self):
#         for module in self.modules():
#             if isinstance(module, nn.Linear):
#                 torch.nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
#                 if module.bias is not None:
#                     torch.nn.init.constant_(module.bias, 0)

#     def forward(self, pos, vel, acc, mask):
#         embeddings = self.input_embedding(pos, vel, acc, mask)
#         # if torch.isnan(embeddings).any():
#         #print(embeddings)

#         encoder_output = self.encoder(embeddings)
#         # if torch.isnan(encoder_output).any():
#         #     print("NaN detected in encoder output")

#         predicted_output = self.decoder(encoder_output, encoder_output, num_frames_to_predict=60)
#         # if torch.isnan(predicted_output).any():
#         #     print(predicted_output)
        
#         return predicted_output

# # Convert to PyTorch tensors
# X_pos_tensor = torch.tensor(X_pos, dtype=torch.float32)
# X_vel_tensor = torch.tensor(X_vel, dtype=torch.float32)
# X_acc_tensor = torch.tensor(X_acc, dtype=torch.float32)
# X_mask_tensor = torch.tensor(X_mask, dtype=torch.bool)

# Y_pos_tensor = torch.tensor(Y_pos, dtype=torch.float32)

# dataset = TensorDataset(X_pos_tensor, X_vel_tensor, X_acc_tensor, X_mask_tensor, Y_pos_tensor)
# dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# # Initialize the model
# model = TransformerModel()
# model.cuda()  # Assuming you're using GPU
# #register_hooks(model)

# # Loss and Optimizer
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop

# best_loss = float('inf')
# num_epochs = 50
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for pos, vel, acc, mask, y_pos in dataloader:
#         pos, vel, acc, mask = pos.cuda(), vel.cuda(), acc.cuda(), mask.cuda()
#         y_pos = y_pos.cuda()
    
#         optimizer.zero_grad()
#         output = model(pos, vel, acc, mask)
#         #print(output)
    
#         mask = mask.unsqueeze(-1).expand_as(output)
#         output = output.where(~torch.isnan(output), torch.zeros_like(output))
        
#         y_pos = y_pos.where(~torch.isnan(y_pos), torch.zeros_like(y_pos))
        
#         masked_output = output * mask
#         masked_y_pos = y_pos * mask
#         # print("output: ", output[0][0])
#         # print("mask: ", mask[0][0])
#         # print("masked output: ", masked_output[0][0])
        
#         loss = criterion(masked_output, masked_y_pos)
#         loss.backward()
#         optimizer.step()
    
#         running_loss += loss.item()

#     # for name, param in model.named_parameters():
#     #     if torch.isnan(param).any():
#     #         print(f"Pre-save NaN found in {name}")
#     #     else:
#     #         print(f"{name} - max: {param.max()}, min: {param.min()}, mean: {param.mean()}")

#     epoch_loss = running_loss / len(dataloader)
#     if epoch_loss < best_loss:
#         best_loss = epoch_loss
#         torch.save(model.state_dict(), 'best_model_weights.pth')  # Save the best model weights
#         print(f'Epoch {epoch+1}, Loss: {epoch_loss} (new best, model saved)')
#     else:
#         print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

# print("Training complete")