In [1]:
import numpy as np

def encode_body_pose(body_keypoints):
    # Custom encoding logic for body pose keypoints
    return np.mean(body_keypoints, axis=0)  # Example: Mean of body keypoints

def encode_hand_details(hand_keypoints):
    # Custom encoding logic for hand keypoints
    return np.mean(hand_keypoints, axis=0)  # Example: Mean of hand keypoints

def encode_facial_expression(face_keypoints):
    # Custom encoding logic for face keypoints
    return np.mean(face_keypoints, axis=0)  # Example: Mean of face keypoints

def hierarchical_tokenization(keypoints):
    # Full body keypoints
    body_tokens = encode_body_pose(keypoints['body'])
    
    # Hand keypoints (higher resolution)
    hand_tokens = encode_hand_details(keypoints['hands'])
    
    # Facial expression keypoints
    face_tokens = encode_facial_expression(keypoints['face'])
    
    return {
        'body': body_tokens,
        'hands': hand_tokens,
        'face': face_tokens
    }

# Example input keypoints dictionary
keypoints = {
    'body': np.random.rand(17, 2),    # Assuming 17 keypoints for body
    'hands': np.random.rand(21, 2),   # Assuming 21 keypoints for hands
    'face': np.random.rand(68, 2)     # Assuming 68 keypoints for face
}

# Tokenize keypoints
tokens = hierarchical_tokenization(keypoints)
print(tokens)


{'body': array([0.5133897 , 0.39745952]), 'hands': array([0.53476158, 0.52073028]), 'face': array([0.5347218, 0.4757175])}


In [2]:
import tensorflow as tf

def custom_loss(y_true, y_pred, token_weights):
    # Classification loss
    classification_loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    
    # Temporal continuity loss (dummy function as placeholder)
    continuity_loss = tf.reduce_sum(tf.square(y_pred[:, 1:] - y_pred[:, :-1]))  # Temporal difference

    # Keypoint accuracy loss (dummy function as placeholder)
    keypoint_loss = tf.reduce_mean(tf.square(y_pred - y_true))
    
    return classification_loss + continuity_loss + keypoint_loss

# Example usage
y_true = np.random.rand(10, 50)  # Example true labels
y_pred = np.random.rand(10, 50)  # Example predictions
token_weights = 1.0  # Example token weights

loss_value = custom_loss(y_true, y_pred, token_weights)
print("Custom Loss:", loss_value.numpy())


Custom Loss: [179.70439501 195.45743981 168.69579873 175.56042109 195.9127067
 179.34880646 182.3638538  177.7655953  188.90251865 182.07707673]


In [1]:
import numpy as np
import pickle

# Function to read pickle data
def read_pkl(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

# Load the data
path = '/nas/Dataset/Phoenix/phoenix-2014-keypoints.pkl'
data = read_pkl(path)

# Augmentation function adapted for the new data structure
def augment_keypoints_video(video_keypoints, noise_level=0.01, viewpoint_shift=0.1, speed_variation=0.9):
    num_frames, num_keypoints, _ = video_keypoints.shape
    
    # Separate x, y, and c components
    xy_coords = video_keypoints[:, :, :2]
    confidences = video_keypoints[:, :, 2]
    
    # Add Gaussian noise to x, y coordinates
    noise = np.random.normal(0, noise_level, xy_coords.shape)
    augmented_xy = xy_coords + noise
    
    # Apply viewpoint transformation (scaling)
    scale_factor = 1 + viewpoint_shift * (np.random.rand() * 2 - 1)
    augmented_xy *= scale_factor
    
    # Speed variation by interpolating along the time axis for each keypoint
    new_length = int(num_frames * speed_variation)
    augmented_xy_interpolated = np.zeros((new_length, num_keypoints, 2))
    
    for k in range(num_keypoints):
        for dim in range(2):  # For x and y separately
            augmented_xy_interpolated[:, k, dim] = np.interp(
                np.linspace(0, num_frames - 1, new_length),
                np.arange(num_frames),
                augmented_xy[:, k, dim]
            )
    
    # Combine augmented x, y with original confidences
    augmented_video_keypoints = np.zeros((new_length, num_keypoints, 3))
    augmented_video_keypoints[:, :, :2] = augmented_xy_interpolated
    augmented_video_keypoints[:, :, 2] = np.interp(
        np.linspace(0, num_frames - 1, new_length),
        np.arange(num_frames),
        confidences
    )

    return augmented_video_keypoints

# Apply augmentation to the whole dataset
augmented_data = {}
for video, video_data in data.items():
    video_keypoints = video_data['keypoints']
    augmented_video_keypoints = augment_keypoints_video(video_keypoints)
    augmented_data[video] = {'keypoints': augmented_video_keypoints}

# Example: Check the shape of augmented data for a sample video
sample_video = list(augmented_data.keys())[0]
print("Original shape:", data[sample_video]['keypoints'].shape)
print("Augmented shape:", augmented_data[sample_video]['keypoints'].shape)


ValueError: object too deep for desired array

In [4]:
# Example function to integrate everything
def process_and_train_model(data_input, model, optimizer):
    augmented_data = augment_keypoints(data_input)
    tokens = hierarchical_tokenization(augmented_data)
    
    # Pass tokens through the model
    y_pred = model(tokens)
    
    # Assume y_true is available
    y_true = np.random.rand(*y_pred.shape)  # Placeholder for ground truth
    
    # Compute custom loss
    loss = custom_loss(y_true, y_pred, token_weights=1.0)
    
    # Perform gradient descent
    with tf.GradientTape() as tape:
        gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    print("Training step completed with loss:", loss.numpy())
