In [1]:
import av
import numpy as np

from transformers import Trainer, TrainingArguments, Seq2SeqTrainingArguments, DataCollatorForLanguageModeling
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, DataCollatorWithPadding 
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

import torch
from huggingface_hub import  hf_hub_download
import av
import torch
import numpy as np
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor, BitsAndBytesConfig
from huggingface_hub import hf_hub_download
import numpy as np
import evaluate
from activity_dataset import get_dataset_splits



MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"
MAX_LENGTH = 256
USE_LORA = False
USE_QLORA = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import os

# # Set the environment variable before importing torch
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load the model in half-precision
processor = LlavaNextVideoProcessor.from_pretrained(MODEL_ID)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map='auto',
)
# processor.tokenizer.padding_side = "left"
# processor.image_processor.do_rescale = False
# processor.tokenizer.pad_token = processor.tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


In [4]:
metric = evaluate.load("accuracy")

In [5]:
from torch.nn.utils.rnn import pad_sequence
from activity_dataset import num_lab
MAX_VIDEO_FRAMES = 300

def collate_fn_df(batch):
    # Extract video tensors and labels
    video_tensors = [video for video, _ in batch]
    labels = [label for _, label in batch]
    # labels = torch.tensor(labels)

    # Pad video tensors to have the same length
    video_tensors = pad_sequence(video_tensors, batch_first=True)  # Shape: (batch_size, seq_len, ...)

    # print(f"{labels=}")
    # print(f"{video_tensors.shape=}")

    # Prepare processor outputs
    outputs = []
    for video, label in zip(video_tensors, labels):
      
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "You are an assistant that watches videos of robotic arms that do different activities, you have to say with only the activity you are seeing in this video"},
                    {"type": "video"},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": num_lab[label]},
                ],
            },
        ]
        video = read_video_pyav(video, np.arange(0, len(video), len(video) / 8).astype(int))

        # Apply the processor's chat template
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=False)

        print("LEN VIDEO ", len(video))

        # Process the inputs
        processed = processor(
            text=prompt,
            videos=video,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt"
        )

        outputs.append(processed)

    padded_inputs = processor.tokenizer.pad(
            {
                "input_ids": [feat['input_ids'][0] for feat in outputs], # each element is one batch only so we slice [0]
                "attention_mask": [feat['attention_mask'][0] for feat in outputs],
            },
            padding=True,
            return_tensors="pt",
        )
    labels = padded_inputs["input_ids"].clone()
    

    labels[labels == processor.tokenizer.pad_token_id] = -100
    padded_inputs["labels"] = labels
    padded_inputs["pixel_values_videos"] = torch.cat([feat['pixel_values_videos'] for feat in outputs], dim=0)
    print({k:v.shape for k, v in padded_inputs.items()})
    return padded_inputs

In [6]:
def collate_fn(batch):
    videos = []
    texts = []

    for video, label in batch:
        videos.append(video)
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "You are an assistant that watches videos of robotic arms that do different activities, you have to say with only the activity you are seeing in this video"},
                    {"type": "video"},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": num_lab[label]},
                ],
            },
        ]

        # Apply the processor's chat template
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=False)
        texts.append(prompt)

    batch = processor(text=texts, videos=videos, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    batch["labels"] = labels

    print({k:v.shape for k, v in batch.items()})

    return batch

In [7]:
train_dataset, test_dataset = get_dataset_splits()

In [8]:
if USE_QLORA or USE_LORA:
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
    )

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


In [9]:
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    multimodal_keywords = ['multi_modal_projector', 'vision_model']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=find_all_linear_names(model),
    init_lora_weights="gaussian",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
OUTPUT_DIR = "output_llava"
BATCH_SIZE = 1
REPO_ID = "cams01/LLaVa-robot-activity-recognition"

args = TrainingArguments(
    # args related to training
    output_dir = OUTPUT_DIR,
    eval_strategy = 'steps',
    eval_steps=20,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = 8,
    learning_rate = 2e-05,
    max_steps = 10, # adjust this depending on your dataset size
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.1,

    # args related to eval/save
    logging_steps = 20,
    save_strategy = 'steps',
    save_steps=20,
    save_total_limit = 1,
    fp16 = True, # we have the model train and eval with fp16 precision
    fp16_full_eval = True,
    optim = 'adamw_bnb_8bit', # adam in lower-bits to save memory, consider changing to 'adamw_torch' if model is not converging
    # report_to = "wandb", # install wand to use this
    # hub_model_id = REPO_ID,
    # push_to_hub = True, # wel'll push the model to hub after each epoch

    # model that was wrapped for QLORA training with peft will not have arguments listed in its signature
    # so we need to pass lable names explicitly to calculate val loss
    label_names=["labels"],
    # dataloader_num_workers=4, # let's get more workers since iterating on video datasets might be slower in general
)

In [12]:
class LlavaNextVideoDataCollatorWithPadding:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        padded_inputs = self.processor.tokenizer.pad(
            {
                "input_ids": [feat['input_ids'][0] for feat in features], # each element is one batch only so we slice [0]
                "attention_mask": [feat['attention_mask'][0] for feat in features],
            },
            padding=False,
            return_tensors="pt",
        )

        labels = padded_inputs["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        padded_inputs["labels"] = labels
        padded_inputs["pixel_values_videos"] = torch.cat([feat['pixel_values_videos'] for feat in features], dim=0)
        
        print({k:v.shape for k, v in padded_inputs.items()})

        return padded_inputs

In [None]:
conversation = [
			{

				"role": "user",
				"content": [
					{"type": "text", "text": "Why is this video funny?"},
					{"type": "video"},
				],
			},
		]
print(model.config.video_token_index, model.config.image_token_index)
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
print(prompt, 'cyan')

In [13]:
trainer = Trainer(
    model = model,
    processing_class = processor,
    data_collator = LlavaNextVideoDataCollatorWithPadding(processor=processor),
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    args=args,
)

In [14]:
trainer.train()

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


torch.Size([136, 3, 224, 224])


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([1, 256]), 'attention_mask': torch.Size([1, 256]), 'labels': torch.Size([1, 256]), 'pixel_values_videos': torch.Size([1, 136, 3, 336, 336])}
torch.Size([147, 3, 224, 224])
{'input_ids': torch.Size([1, 256]), 'attention_mask': torch.Size([1, 256]), 'labels': torch.Size([1, 256]), 'pixel_values_videos': torch.Size([1, 147, 3, 336, 336])}
torch.Size([139, 3, 224, 224])
{'input_ids': torch.Size([1, 256]), 'attention_mask': torch.Size([1, 256]), 'labels': torch.Size([1, 256]), 'pixel_values_videos': torch.Size([1, 139, 3, 336, 336])}
torch.Size([215, 3, 224, 224])
{'input_ids': torch.Size([1, 256]), 'attention_mask': torch.Size([1, 256]), 'labels': torch.Size([1, 256]), 'pixel_values_videos': torch.Size([1, 215, 3, 336, 336])}
torch.Size([233, 3, 224, 224])
{'input_ids': torch.Size([1, 256]), 'attention_mask': torch.Size([1, 256]), 'labels': torch.Size([1, 256]), 'pixel_values_videos': torch.Size([1, 233, 3, 336, 336])}
torch.Size([125, 3, 224, 224])
{'input_ids': t

  return fn(*args, **kwargs)


ValueError: Video features and video tokens do not match: tokens: 251, features 19584

In [15]:
# trainer.model.push_to_hub(REPO_ID) # let's push to hub the last ckpt
for a in train_dataset:
    print(a)
    break

{'input_ids': tensor([[    1,  3148,  1001, 29901, 29871, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 3

In [None]:
# Assuming you are using GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the correct device
model = model.to(device)

for video, label in train_dataset:
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Why is this video funny?"},
                {"type": "video"},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    
    # Explicitly move tensors to the correct device
    inputs = processor(text=prompt, videos=video, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model

    # Generate output
    out = model.generate(**inputs, max_new_tokens=60)

    # Decode the output
    result = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(result)
    break


ValueError: too many values to unpack (expected 2)