In [25]:
import spacy
import torch
import torch.nn as nn
import lightning as L
import numpy as np
import tqdm

from src.preprocessing.dataset import get_datasets
from src.preprocessing.tokenizer_vocab import tokenize_data, build_vocab, numericalize_data, set_data_format
from src.preprocessing.dataloader import get_data_loader

from src.model.seq2vid import Encoder, Decoder, Vid2Seq
from src.model.lightning import LightVid2Seq

from src.constants import (
    VIDEO_IDS,
    SENTENCE,
    SENTENCE_IDS,
)

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
seed = 1234
SEED = 42

# random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.backends.cudnn.deterministic = True

In [4]:
# Constants
DATA_DIR = "./data"

FILE_PATHS = {
    "train": f"{DATA_DIR}/how2sign/how2sign_realigned_train.csv",
    "val": f"{DATA_DIR}/how2sign/how2sign_realigned_val.csv",
    "test": f"{DATA_DIR}/how2sign/how2sign_realigned_test.csv",
}

VIDEO_DIRS = {
    "train": f"{DATA_DIR}/how2sign/train/compressed_videos",
    "val": f"{DATA_DIR}/how2sign/val/compressed_videos",
    "test": f"{DATA_DIR}/how2sign/test/compressed_videos",
}

TRAIN_SIZES = {
    "train": 0.00228,  # 70 samples
    "val": 0.0088,     # 15 samples
    "test": 0.0067,    # 15 samples
}

In [5]:
datasets_dict = get_datasets(FILE_PATHS, VIDEO_DIRS, TRAIN_SIZES, SEED)

train_data = datasets_dict["train"]
test_data = datasets_dict["test"]
val_data = datasets_dict["val"]

In [6]:
# Constants
MAX_LENGTH = 1_000
IS_LOWER = True
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
MIN_FREQ = 1

SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, SOS_TOKEN, EOS_TOKEN]

In [7]:
# Load the spaCy model
en_nlp = spacy.load("en_core_web_sm")

train_data = tokenize_data(train_data, en_nlp, MAX_LENGTH, IS_LOWER, SOS_TOKEN, EOS_TOKEN)
test_data = tokenize_data(test_data, en_nlp, MAX_LENGTH, IS_LOWER, SOS_TOKEN, EOS_TOKEN)
val_data = tokenize_data(val_data, en_nlp, MAX_LENGTH, IS_LOWER, SOS_TOKEN, EOS_TOKEN)


# Build the vocabulary
vocab = build_vocab(train_data, SPECIAL_TOKENS, MIN_FREQ)

# Assert that special tokens are correctly indexed
assert vocab[UNK_TOKEN] == 0
assert vocab[PAD_TOKEN] == 1

# Set the default index for unknown tokens
vocab.set_default_index(vocab[UNK_TOKEN])

train_data = numericalize_data(train_data, vocab)
test_data = numericalize_data(test_data, vocab)
val_data = numericalize_data(val_data, vocab)

train_data = set_data_format(train_data)
test_data = set_data_format(test_data)
val_data = set_data_format(val_data)

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [8]:
BATCH_SIZE = 32
pad_index = vocab[PAD_TOKEN]
sampling_rate = 10

train_data_loader = get_data_loader(train_data, BATCH_SIZE, pad_index, VIDEO_DIRS["train"], sampling_rate, shuffle=True, num_worker=2)
test_data_loader = get_data_loader(test_data, BATCH_SIZE, pad_index, VIDEO_DIRS["test"], sampling_rate, shuffle=True, num_worker=2)
val_data_loader = get_data_loader(val_data, BATCH_SIZE, pad_index, VIDEO_DIRS["val"], sampling_rate, shuffle=True, num_worker=2)

In [9]:
input_dim = 10
assert input_dim == sampling_rate
output_dim = len(vocab)
decoder_embedding_dim = 64
hidden_dim = 64
n_lstm_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

encoder = Encoder(
    input_dim,
    hidden_dim,
    n_lstm_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_lstm_layers,
    decoder_dropout,
)

pad_index = pad_index
teacher_forcing_ratio = 0.9
clip = 1.0

model = LightVid2Seq(encoder, decoder, pad_index, teacher_forcing_ratio, clip)

In [10]:
def init_weights(m: nn.Module):
    for _, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model = model.apply(init_weights)

In [11]:
trainer = L.Trainer(max_epochs=1, log_every_n_steps=1, default_root_dir="./weight")

best_model_path = trainer.checkpoint_callback.best_model_path
if best_model_path == "":
    best_model_path = None

# trainer.fit(model, train_data_loader, val_data_loader, ckpt_path=best_model_path)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


## Evaluating the Model

The first thing to do is to test the model's performance on the test set.

We'll load the parameters (`state_dict`) that gave our model the best validation loss and run it on the test set to get our test loss and perplexity.


In [12]:
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path
assert path_to_best_checkpoint != "", "require path to best model"

test_loss = trainer.test(model, test_data_loader, ckpt_path=path_to_best_checkpoint)[0]["test_loss"]

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

AssertionError: require path to best model

In [None]:
# NOTE: It's giving an error because of the new changes.

# def translate_video(
#     video,
#     encoder: Encoder,
#     decoder: Decoder,
#     vocab,
#     sos_token,
#     eos_token,
#     device,
#     max_output_length=25,
# ):
#     model.eval()
#     with torch.no_grad():
#         hidden, cell = encoder(video)
#         print(hidden.size(), cell.size())
#         inputs = vocab.lookup_indices([sos_token])
#         for _ in range(max_output_length):
#             inputs_tensor = torch.LongTensor([inputs[-1]])#.to(device)
#             output, hidden, cell = decoder(inputs_tensor, hidden[0], cell[0])
#             predicted_token = output.argmax(-1).item()
#             inputs.append(predicted_token)
#             if predicted_token == vocab[eos_token]:
#                 break
#         tokens = vocab.lookup_tokens(inputs)
#     return tokens

In [43]:
def translate_video(
    batch_video,
    model: Vid2Seq,
    vocab,
    sos_token,
    eos_token,
    device="cpu",
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        batch_size = batch_video.size(1)
        # trg = torch.LongTensor(max_output_length, batch_size)
        trg = torch.zeros((max_output_length, batch_size), dtype=torch.long, device=device)
        trg = torch.full_like(trg, vocab[sos_token])
        
        output = model(batch_video, trg, 0)
        output_ids = output.argmax(-1)

        tokens = [vocab.lookup_tokens(output_ids[:, i].tolist()) for i in range(batch_size)]
    return tokens

In [40]:
batch = next(iter(test_data_loader))
batch_size = batch[VIDEO_IDS].size(1)

expected_translation = [vocab.lookup_tokens(batch[SENTENCE_IDS][:, i].tolist()) for i in range(batch_size)]

batch[VIDEO_IDS].size(), batch[SENTENCE_IDS].size()

(torch.Size([1, 15, 10, 3, 224, 224]), torch.Size([55, 15]))

In [41]:
translation = translate_video(
    batch[VIDEO_IDS],
    model.model,
    vocab,
    SOS_TOKEN,
    EOS_TOKEN,
    # device,
)

In [None]:
len(translation), translation

(15,
 [['chest',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants'],
  ['chest',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants',
   'antioxidants'],
  ['chest',
   'antioxidants',
   'antioxidants',
   'ant

In [44]:
translations = [
    translate_video(
        example[VIDEO_IDS],#.to(device),
        model,
        en_nlp,
        vocab,
        SOS_TOKEN,
        EOS_TOKEN,
        device,
    )
    for example in tqdm.tqdm(train_data_loader) ### raplace with test_data_loader
]

  0%|          | 0/3 [00:02<?, ?it/s]


TypeError: zeros() received an invalid combination of arguments - got (tuple, device=str, dtype=torch.dtype), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [33]:
for i, translate in enumerate(translations):
    print(translate)
    if 5 < i:
        break

NameError: name 'translations' is not defined

In [None]:
bleu = evaluate.load("bleu")

In [None]:
predictions = [" ".join(translation[1:-1]) for translation in translations]

references = [[example["en"]] for example in train_data] ### raplace with test_data

predictions[0], references[0]

('now now the , , , , , , , , , , , , , , , , , , , ,',
 ["You also want to be sure that you have very comfortable socks because you're on your feet a lot."])

In [None]:
def get_tokenizer_fn(nlp, lower):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp.tokenizer(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens

    return tokenizer_fn

In [None]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [None]:
print(
    tokenizer_fn(predictions[0]),
    tokenizer_fn(references[0][0]),
    sep="\n",
)

['now', 'now', 'the', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']
['you', 'also', 'want', 'to', 'be', 'sure', 'that', 'you', 'have', 'very', 'comfortable', 'socks', 'because', 'you', "'re", 'on', 'your', 'feet', 'a', 'lot', '.']


In [None]:
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

In [None]:
results

{'bleu': 0.0,
 'precisions': [0.07080745341614907, 0.0, 0.0, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0274409700063816,
 'translation_length': 1610,
 'reference_length': 1567}