In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

In [2]:
os.chdir(os.path.dirname(os.path.abspath(os.getcwd())))

In [3]:
import torch

from src.mslm.utils.setup_train import build_model
from src.mslm.utils.config_loader import cfg

from src.mslm.inference import MultimodalSignLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_parameters = cfg.model
model_parameters.update({
    "input_size": 133 * 2,
    "output_size": 3072,
    #"use_checkpoint": False
})

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
version = 107
checkpoint = 1
epoch = 9

model_parameters

{'input_size': 266,
 'output_size': 3072,
 'hidden_size': 1792,
 'nhead': 16,
 'ff_dim': 2816,
 'n_layers': 10,
 'encoder_dropout': 0.45,
 'multihead_dropout': 0.4,
 'sequential_dropout': 0.6,
 'pool_dim': 256}

In [5]:
def load_model():
    model = build_model(**model_parameters)    

    model_location = f"../outputs/checkpoints/{version}/{checkpoint}/{epoch}/checkpoint.pth" 
    if not os.path.exists(model_location):
        raise FileNotFoundError(
            f"Model not found {model_location}")

    state_dict = torch.load(model_location)

    model.load_state_dict(state_dict["model_state"])

    return model

In [19]:
model = load_model()
model.to(device)

Model Parameters:  {'input_size': 266, 'hidden_size': 1792, 'output_size': 3072, 'nhead': 16, 'ff_dim': 2816, 'n_layers': 10, 'max_seq_length': 301, 'pool_dim': 256, 'encoder_dropout': 0.45, 'multihead_dropout': 0.4, 'sequential_dropout': 0.6}
MHARoPE kwargs {'device': None, 'dtype': None}
dim: 1792 num_heads: 16 dim rope 112




Imitator(
  (linear_feat): Sequential(
    (0): Linear(in_features=266, out_features=1792, bias=True)
    (1): GELU(approximate='none')
    (2): LayerNorm((1792,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=1792, out_features=896, bias=True)
    (4): GELU(approximate='none')
    (5): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
  )
  (conv1): Conv1d(896, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (act1): GELU(approximate='none')
  (conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
  (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (act2): GELU(approximate='none')
  (linear_hidden): Linear(in_features=256, out_features=1792, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-9): 10 x TransformerEncoderLayerRoPE(
        (self_attn): MultiheadAttentionRoPE(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1792, out_feat

Imitator(
  (linear_feat): Sequential(
    (0): Linear(in_features=266, out_features=1792, bias=True)
    (1): GELU(approximate='none')
    (2): LayerNorm((1792,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=1792, out_features=896, bias=True)
    (4): GELU(approximate='none')
    (5): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
  )
  (conv1): Conv1d(896, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (act1): GELU(approximate='none')
  (conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
  (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (act2): GELU(approximate='none')
  (linear_hidden): Linear(in_features=256, out_features=1792, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-9): 10 x TransformerEncoderLayerRoPE(
        (self_attn): MultiheadAttentionRoPE(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1792, out_feat

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id="unsloth/Llama-3.2-3B-Instruct"
llama_model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
embeddings = llama_model.get_input_embeddings()

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s]


In [21]:
llama_model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e

In [7]:
from src.mslm.utils import create_dataloaders, prepare_datasets
from torch.utils.data import Subset

In [8]:
h5_file = "../data/processed/dataset_v4_unsloth.hdf5"
train_ratio = 0.8
key_points = 133
batch_size = 64

In [11]:
tr_ds, val_ds, tr_len, val_len = prepare_datasets(h5_file, train_ratio, key_points)
#get sample dataset
sample_tr = Subset(tr_ds, range(0, 1))
sample_val = Subset(val_ds, range(0, 1))

sample_tr_len = len(sample_tr)
sample_val_len = len(sample_val)

_, test_dataloader = create_dataloaders(
    sample_tr,
    sample_val,
    batch_size=2,  # asegúrate que ≤ len(sample)
    num_workers=10
)


Videos:  12577
Train size:	10062
Validation size:	2515


In [33]:
for a in test_dataloader:
    print(a[0])

tensor([[[[0.1932, 0.1563],
          [0.0630, 0.1526],
          [0.0192, 0.3261],
          ...,
          [0.2650, 0.2704],
          [0.2549, 0.2659],
          [0.2448, 0.2659]],

         [[0.1933, 0.1562],
          [0.0633, 0.1526],
          [0.0205, 0.3266],
          ...,
          [0.2614, 0.2670],
          [0.2524, 0.2634],
          [0.2434, 0.2643]],

         [[0.1931, 0.1564],
          [0.0624, 0.1528],
          [0.0212, 0.3265],
          ...,
          [0.2626, 0.2668],
          [0.2537, 0.2631],
          [0.2448, 0.2631]],

         ...,

         [[0.1803, 0.1527],
          [0.0645, 0.1471],
          [0.0338, 0.3183],
          ...,
          [0.3447, 0.3295],
          [0.3595, 0.3312],
          [0.3722, 0.3321]],

         [[0.1805, 0.1524],
          [0.0634, 0.1470],
          [0.0261, 0.3178],
          ...,
          [0.3327, 0.3419],
          [0.3459, 0.3392],
          [0.3580, 0.3383]],

         [[0.1815, 0.1539],
          [0.0621, 0.1486],
    

In [18]:
import torch.nn.functional as F

In [22]:
# llama_model.eval()
# mslm = MultimodalSignLM(llama_model, tokenizer, "cuda")
all_embeddings = llama_model.get_input_embeddings().weight.data
del llama_model
# text = "Enumera los pasos descritos:"

In [23]:
def embeddings_to_text(embeddings: torch.Tensor, all_embeddings: torch.Tensor, tokenizer) -> str:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    embeddings = embeddings.to(device)

    embedding_matrix = all_embeddings.to(device)  # [V, D]

    embedding_matrix_norm = F.normalize(embedding_matrix, p=2, dim=1)  # [V, D]
    print(embedding_matrix_norm.shape)

    embeddings_norm = F.normalize(embeddings, p=2, dim=1)  # [T, D]

    similarities = torch.matmul(embeddings_norm, embedding_matrix_norm.T)  # [T, V]

    token_ids = torch.argmax(similarities, dim=1).tolist()
    print(f"Token IDs: {token_ids}")
    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [48]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [46]:
del mask_data, data, embeds, sign_embed


In [26]:
true_embeds = None
pred_embeds = None

In [49]:

with torch.no_grad():
    for data, mask_data, embeds, mask_embds in test_dataloader:
        data = data.to(device).to(torch.float32)
        mask_data = mask_data.to(device)

        print(f"Data shape: {data.shape}, Mask shape: {mask_data.shape}")
        sign_embed = model(data, mask_data).to("cuda")
        sign_embed = sign_embed.to(dtype=torch.bfloat16)

        sign_embed = sign_embed.to("cuda").to(dtype=torch.float32)
        embeds = embeds.to("cuda").to(dtype=torch.float32)
        print(f"Sign embed shape: {sign_embed.shape}, Embeds shape: {embeds.shape}")
        
        true_embeds = embeds
        pred_embeds = sign_embed
        pred_embeds = pred_embeds[:, :23, :][0]
        print(embeddings_to_text(pred_embeds, all_embeddings, tokenizer))
        print(embeddings_to_text(embeds[0], all_embeddings, tokenizer))
        
        del mask_data, data, embeds, sign_embed

Data shape: torch.Size([1, 100, 133, 2]), Mask shape: torch.Size([1, 100])
Sign embed shape: torch.Size([1, 301, 3072]), Embeds shape: torch.Size([1, 23, 3072])
torch.Size([128256, 3072])
Token IDs: [128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000]

torch.Size([128256, 3072])
Token IDs: [128000, 3462, 346, 1744, 29571, 4247, 1645, 1995, 323, 12328, 40608, 281, 521, 4988, 13, 29386, 64591, 8374, 384, 1007, 84, 10610, 30]
parece que también por acá anduvieron paseando. ¿qué tal epecuén?


In [None]:
pred_embeds[:, :23, :].shape

torch.Size([1, 23, 3072])

In [None]:
true_embeds.shape

torch.Size([23, 3072])