## Dataset download

In [1]:
from typing import Callable
import torch
from torch import Tensor

from SLTDataset import SLTDataset
from pose_transforms import (
    norm_positions,
    get_norm_distances,
    get_norm_speed,
    fill_missing,
    get_filter_landmarks,
    get_sample_frames_to_fit_max_len,
    replace_nans_with_zeros,
    use_frames_diffs,
)


# Root folder of the GSL dataset on disk; passed to every SLTDataset split.
DATA_DIR = "/mnt/disk3Tb/slt-datasets/GSL"


# Frame budget given to the frame-sampling transform and to the model (src_max_len).
MAX_FRAMES = 30
# Landmark groups kept by the filtering transform ("face" is left out).
LANDMARKS_USED = ["pose", "lhand", "rhand"]
# When False, IN_FEATURES is computed with 2 coordinates per keypoint instead of 3.
USE_3D = False

# One group label per landmark slot: 33 pose, 468 face, 21 left-hand and 21 right-hand entries.
holistic_landmarks = ["pose"] * 33 + ["face"] * 468 + ["lhand"] * 21 + ["rhand"] * 21

# Boolean mask over the landmark slots: True where the slot's group is in LANDMARKS_USED.
LANDMARKS_MASK = torch.tensor([kp in LANDMARKS_USED for kp in holistic_landmarks])

# Build the enabled steps up front so the pipeline itself reads as a plain list.
filter_landmarks = get_filter_landmarks(LANDMARKS_MASK, use_3d=USE_3D)
sample_frames = get_sample_frames_to_fit_max_len(MAX_FRAMES)

# Disabled steps, kept for reference at their original position in the pipeline:
#   between filter_landmarks and sample_frames:
#       norm_positions,
#       get_norm_distances(indices=(11, 12), distance_factor=0.2),
#       fill_missing,
#       get_norm_speed(max_frames=MAX_FRAMES),
#   after replace_nans_with_zeros:
#       use_frames_diffs,
# Pose preprocessing pipeline handed to every SLTDataset split
# (presumably applied in list order -- confirm in SLTDataset).
transforms: list[Callable[[Tensor], Tensor]] = [
    filter_landmarks,
    sample_frames,
    replace_nans_with_zeros,
]

# Arguments shared by the three splits; only `split` differs between them.
_dataset_kwargs = dict(
    data_dir=DATA_DIR,
    input_mode="pose",
    output_mode="text",
    transforms=transforms,
)

train_dataset = SLTDataset(split="train", **_dataset_kwargs)
val_dataset = SLTDataset(split="val", **_dataset_kwargs)
test_dataset = SLTDataset(split="test", **_dataset_kwargs)

Loaded metadata for dataset: The Greek Sign Language (GSL) Dataset
Loaded train annotations at /mnt/disk3Tb/slt-datasets/GSL/annotations.csv


Validating files: 100%|██████████| 8821/8821 [00:00<00:00, 263736.10it/s]


Dataset loaded correctly

Loaded metadata for dataset: The Greek Sign Language (GSL) Dataset
Loaded val annotations at /mnt/disk3Tb/slt-datasets/GSL/annotations.csv


Validating files: 100%|██████████| 588/588 [00:00<00:00, 217885.92it/s]


Dataset loaded correctly

Loaded metadata for dataset: The Greek Sign Language (GSL) Dataset
Loaded test annotations at /mnt/disk3Tb/slt-datasets/GSL/annotations.csv


Validating files: 100%|██████████| 881/881 [00:00<00:00, 239356.25it/s]

Dataset loaded correctly






In [2]:
# out = train_dataset.visualize_pose(
#     0,
#     use_video=False,
#     h=480,
#     w=648,
#     transforms=transforms,
#     out_path="visualizations/pose_t_2.mp4",
# )

### Text tokenization and analysis for padding and truncation

In [3]:
from transformers import AutoTokenizer
from WordLevelTokenizer import WordLevelTokenizer

# from tokenizers.models import WordLevel, BPE
# from tokenizers.trainers import WordLevelTrainer, BpeTrainer

# Training sentences: the fitting corpus for the word-level tokenizer.
texts = train_dataset.annotations["text"].tolist()


USE_BERT_EMBEDDINGS = False

# TEXT_MODEL names the pretrained checkpoint and stays None when it is not used.
TEXT_MODEL = "nlpaueb/bert-base-greek-uncased-v1" if USE_BERT_EMBEDDINGS else None
tokenizer = (
    AutoTokenizer.from_pretrained(TEXT_MODEL)
    if USE_BERT_EMBEDDINGS
    else WordLevelTokenizer(texts)
)


def _token_id_or_missing(token_id):
    """Return ``token_id`` unchanged, or -1 when the tokenizer reports ``None``."""
    return -1 if token_id is None else token_id


# The CLS / SEP tokens of the tokenizer double as BOS / EOS markers.
BOS_IDX = _token_id_or_missing(tokenizer.cls_token_id)
EOS_IDX = _token_id_or_missing(tokenizer.sep_token_id)
PAD_IDX = _token_id_or_missing(tokenizer.pad_token_id)

print(f"BOS_IDX: {BOS_IDX}, EOS_IDX: {EOS_IDX}, PAD_IDX: {PAD_IDX}")

  from .autonotebook import tqdm as notebook_tqdm


BOS_IDX: 2, EOS_IDX: 3, PAD_IDX: 0


In [4]:
# Tokenize all training sentences, requesting padding to max_length=25
# (the same value as MAX_TOKENS, which is only defined in the dataloader cell).
# NOTE(review): no truncation is requested -- confirm how sentences longer than
# 25 tokens are handled by the tokenizer in use.
tokenized_sequences = tokenizer(texts, padding="max_length", max_length=25)

In [7]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch


USE_CLASS_WEIGHTS = False
# Per-token loss weights handed to the Lightning module in the training cell.
# Stays None when weighting is disabled.
class_weights = None

if USE_CLASS_WEIGHTS:
    # Every non-padding target id of the training set, flattened into one list.
    flattened_tgts = [
        item
        for sublist in tokenized_sequences["input_ids"]
        for item in sublist
        if item != PAD_IDX
    ]
    token_ids = sorted(set(flattened_tgts))
    # "balanced" weights are only computed for ids that actually occur in the data.
    observed_weights = compute_class_weight(
        "balanced", classes=np.array(token_ids), y=flattened_tgts
    )
    # Expand to one weight per vocabulary entry; ids never seen in training keep 1.0.
    class_weights_complete = torch.ones(tokenizer.vocab_size)
    class_weights_complete[token_ids] = torch.from_numpy(observed_weights).float()
    # Downstream code consumes `class_weights`, so it must be the full-vocabulary
    # tensor rather than the shorter NumPy array covering only the observed ids.
    class_weights = class_weights_complete

In [8]:
if USE_CLASS_WEIGHTS:
    # Show the first ten vocabulary entries followed by their loss weights.
    first_ids = list(range(10))
    print(tokenizer.convert_ids_to_tokens(first_ids))
    print(class_weights_complete[:10].tolist())

## Preprocessing and dataloader generation

In [9]:
import torch
from torch import Tensor
import torch.utils.data as utils


# Target length in tokens: used as the tokenizer's max_length in collate_fn
# and as tgt_max_len when the model is built.
MAX_TOKENS = 25
# Batch size shared by the train, validation and test loaders.
BATCH_SIZE = 128


def flatten_landmarks(datum: Tensor):
    """
    Collapse every dimension after the first into a single feature dimension.

    Args:
            datum: Tensor whose first dimension is the sequence (frame) axis S;
                all remaining dimensions (e.g. landmarks and their coordinates)
                are merged together.
    Returns:
            Tensor of shape (S, F), where F is the product of the remaining
            dimension sizes.
    """
    sequence_length = datum.size(0)
    return torch.reshape(datum, (sequence_length, -1))


def collate_fn(batch):
    """
    Turn a list of (pose, text) samples into a pair of batched tensors.

    Each pose is flattened with ``flatten_landmarks`` and the poses are stacked,
    so they must all share the same shape. The texts are tokenized together with
    padding requested up to ``MAX_TOKENS``.

    Args:
            batch: iterable of (pose tensor, text) pairs.
    Returns:
            Tuple (stacked poses, "input_ids" tensor produced by the tokenizer).
    """
    poses = [flatten_landmarks(pose) for pose, _ in batch]
    sentences = [sentence for _, sentence in batch]
    encoded = tokenizer(
        sentences,
        padding="max_length",
        max_length=MAX_TOKENS,
        return_tensors="pt",
    )
    return torch.stack(poses), encoded["input_ids"]


# Settings common to the three loaders.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=4)

# Reshuffle the training samples every epoch so batches are not always drawn in
# dataset order; DataLoader defaults to shuffle=False.
train_loader = utils.DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
# Evaluation loaders keep the dataset order so results are comparable between runs.
validation_loader = utils.DataLoader(val_dataset, **_loader_kwargs)
test_loader = utils.DataLoader(test_dataset, **_loader_kwargs)

In [10]:
# Sanity check: fetch one batch and show the source / target tensor shapes.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    src, tgt = first_batch
    print(src.shape)
    print(tgt.shape)

torch.Size([128, 30, 150])
torch.Size([128, 25])


## Model

### Model definition

In [11]:
from KeypointsTransformer import KeypointsTransformer


# Transformer hyperparameters.
D_MODEL = 16
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 4
DROPOUT = 0.2

# Input width per frame: number of selected keypoints times coordinates per keypoint.
num_keypoints = LANDMARKS_MASK.sum().item()
coords_per_keypoint = 3 if USE_3D else 2
IN_FEATURES = int(num_keypoints * coords_per_keypoint)

model_kwargs = dict(
    src_max_len=MAX_FRAMES,
    tgt_max_len=MAX_TOKENS,
    in_features=IN_FEATURES,
    tgt_vocab_size=tokenizer.vocab_size,
    d_model=D_MODEL,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    dropout=DROPOUT,
    use_bert_embeddings=USE_BERT_EMBEDDINGS,
)
model = KeypointsTransformer(**model_kwargs)

## Model training

In [13]:
import lightning.pytorch.utilities.model_summary.model_summary as model_summary

from Translator import Translator
from LightningKeypointsTransformer import LKeypointsTransformer


LR = 1e-3

# Pick the accelerator in order of preference: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    _device_name = "mps"
elif torch.cuda.is_available():
    _device_name = "cuda"
else:
    _device_name = "cpu"
DEVICE = torch.device(_device_name)

# Dummy single-sample inputs, used by the model summary to report layer in/out sizes.
BATCH_SIZE_TEST = 1
example_keypoints = torch.randn(BATCH_SIZE_TEST, MAX_FRAMES, IN_FEATURES)
example_token_ids = torch.randint(
    0, tokenizer.vocab_size, (BATCH_SIZE_TEST, MAX_TOKENS)
)
example_square_mask = torch.zeros(MAX_TOKENS, MAX_TOKENS)
example_bool_mask = torch.randint(0, 2, (BATCH_SIZE_TEST, MAX_TOKENS)).bool()
example_input_array = (
    example_keypoints,
    example_token_ids,
    example_square_mask,
    example_bool_mask,
)

translator = Translator(DEVICE, MAX_TOKENS)
l_model = LKeypointsTransformer(
    model, DEVICE, tokenizer, translator, LR, example_input_array, class_weights
)
model_summary.summarize(l_model, max_depth=10)



   | Name                                                       | Type                            | Params | In sizes                                                             | Out sizes         
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0  | model                                                      | KeypointsTransformer            | 454 K  | [[1, 30, 150], [1, 25], [25, 25], [1, 25]]                           | [1, 25, 477]      
1  | model.batch_norm                                           | BatchNorm1d                     | 300    | [1, 150, 30]                                                         | [1, 150, 30]      
2  | model.src_keyp_emb                                         | Conv1DEmbedder                  | 21.4 K | [1, 30, 150]                                                         | [1, 30, 16]       

In [14]:
import lightning.pytorch as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger


PRECISION = 32

# Log every checkpoint to W&B and record the run's hyperparameters in its config.
wandb_logger = WandbLogger(project="gsl", log_model="all")
wandb_logger.experiment.config.update(
    {
        # System hyperparameters
        "DEVICE": DEVICE,
        "PRECISION": PRECISION,
        # Data hyperparameters
        "BATCH_SIZE": BATCH_SIZE,
        "MAX_FRAMES": MAX_FRAMES,
        "MAX_TOKENS": MAX_TOKENS,
        "TEXT_MODEL": TEXT_MODEL,
        "LANDMARKS_USED": str(LANDMARKS_USED),
        # Fall back to repr() for callables without __name__ (e.g. functools.partial).
        "TRANSFORMS": [getattr(t, "__name__", repr(t)) for t in transforms],
        # Model hyperparameters
        "D_MODEL": D_MODEL,
        "DROPOUT": DROPOUT,
        "USE_BERT_EMBEDDINGS": USE_BERT_EMBEDDINGS,
        "NUM_ENCODER_LAYERS": NUM_ENCODER_LAYERS,
        # Logged alongside the encoder depth so runs can be compared on both.
        "NUM_DECODER_LAYERS": NUM_DECODER_LAYERS,
        # Training hyperparameters
        "USE_CLASS_WEIGHTS": USE_CLASS_WEIGHTS,
        "LR": LR,
    }
)

# Keep the checkpoint with the lowest val_loss plus the most recent one.
# NOTE(review): the "rwth-" prefix looks inherited from another dataset's notebook
# (this one trains on GSL); the test cell globs on the same prefix, so rename both together.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    dirpath="checkpoints/",
    filename=f"rwth-{wandb_logger.experiment.name}-best-{{epoch:02d}}-{{step:02d}}-{{val_loss:.2f}}",
    mode="min",
    save_last=True,
)
checkpoint_callback.CHECKPOINT_NAME_LAST = f"rwth-{wandb_logger.experiment.name}-last"  # type: ignore

# Stop when val_loss has not improved for 30 validation checks.
trainer = L.Trainer(
    logger=wandb_logger,
    default_root_dir="./checkpoint",
    precision=PRECISION,
    callbacks=[
        EarlyStopping(monitor="val_loss", mode="min", patience=30),
        checkpoint_callback,
    ],
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpedroodb[0m ([33mlidiaa[0m). Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# Train the Lightning module on train_loader, validating on validation_loader.
# The EarlyStopping and ModelCheckpoint callbacks configured on the trainer
# both monitor `val_loss`.
trainer.fit(
    model=l_model,
    train_dataloaders=train_loader,
    val_dataloaders=validation_loader,
)

In [16]:
import glob


# Locate the best checkpoint written for the current W&B run.
best_checkpoint_pattern = f"checkpoints/rwth-{wandb_logger.experiment.name}-best*"
best_checkpoints = glob.glob(best_checkpoint_pattern)
if not best_checkpoints:
    # Fail with an actionable message instead of a bare IndexError on `[0]`.
    raise FileNotFoundError(
        f"No checkpoint matches {best_checkpoint_pattern!r}; run training before testing."
    )
CHKP = best_checkpoints[0]

# NOTE(review): `num_classes` is not among the arguments used to build
# LKeypointsTransformer in the training cell -- confirm the constructor accepts it.
l_model = LKeypointsTransformer.load_from_checkpoint(
    CHKP, model=model, num_classes=tokenizer.vocab_size
)

# Evaluate on the test split with the weights stored in the selected checkpoint.
trainer.test(
    model=l_model,
    dataloaders=test_loader,
    ckpt_path=CHKP,
)

In [18]:
# import glob


# # CHKP = glob.glob(f"checkpoints/rwth-{wandb_logger.experiment.name}-best*")[0]
# CHKP = glob.glob(f"checkpoints/rwth-eager-breeze-113-best*")[0]
# l_model = LKeypointsTransformer.load_from_checkpoint(
#     CHKP, model=model, num_classes=tokenizer.vocab_size
# )

# debug_loader = utils.DataLoader(
#     [test_dataset[i] for i in range(1)], batch_size=BATCH_SIZE, collate_fn=collate_fn
# )

In [22]:
# trainer.test(
#     model=l_model,
#     dataloaders=debug_loader,
#     ckpt_path=CHKP,
# )