In [None]:
import numpy as np

from transformers import Trainer, TrainingArguments
from transformers import  BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration 
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

import torch
import numpy as np
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor, BitsAndBytesConfig
import numpy as np
import evaluate
from activity_dataset import get_dataset_splits
from decord import VideoReader, cpu



MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"
OUTPUT_DIR = "output_llava"
USE_LORA = False
USE_QLORA = True
BATCH_SIZE = 1

NUM_FRAMES = 200
MAX_LENGTH = 30_000

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    print(f"GPU ID: {torch.cuda.current_device()}")
else:
    print("Using CPU")


Using GPU: NVIDIA GeForce RTX 4090
GPU ID: 0


In [3]:
print("CUDA Available:", torch.cuda.is_available())  # Verifica se PyTorch vede la GPU
print("Current Device:", torch.cuda.current_device())  # Mostra l'ID della GPU
print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))  # Nome della GPU

CUDA Available: True
Current Device: 0
GPU Name: NVIDIA GeForce RTX 4090


In [4]:
print(torch.__version__)  # Controlla la versione di PyTorch
print(torch.version.cuda)  # Controlla la versione di CUDA
print(torch.backends.cudnn.version())  # Controlla la versione di cuDNN


2.5.1+cu124
12.4
90100


In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load the model in half-precision
processor = LlavaNextVideoProcessor.from_pretrained(MODEL_ID)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map=device,
)
processor.tokenizer.padding_side = "left"
processor.image_processor.do_rescale = False
model.config.use_cache = True

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]


In [6]:
train_dataset, test_dataset = get_dataset_splits()

In [7]:
for i in train_dataset:
    print(i)
    break

(WindowsPath('atlas_dione_objectdetection/ATLAS_Dione_ObjectDetection/ATLAS_Dione_ObjectDetection_Study_ActionClips/ATLAS_Dione_ObjectDetection_Study_ActionClips/set11/set11V004.mkv'), 'UVA Tie')


In [8]:
if USE_QLORA or USE_LORA:
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
    )

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


In [9]:
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    multimodal_keywords = ['multi_modal_projector', 'vision_model']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=find_all_linear_names(model),
    init_lora_weights="gaussian",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [10]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
def read_video_decord(video_path, num_frames=NUM_FRAMES):
    '''
    Decode the video with Decord decoder.

    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample uniformly. Defaults to NUM_FRAMES

    Returns:
        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    vr = VideoReader(uri=video_path, ctx=cpu(0)) # you need to install from source to use gpu ctx
    indices = np.arange(0, len(vr), len(vr) / num_frames).astype(int)
    frames = vr.get_batch(indices).asnumpy()
    return frames

In [12]:
def process_video(video_path, label):
    # Let's use chat template to format the prompt correctly
    video  = read_video_decord(str(video_path))
    conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "You are working in an industrial setting where robotic arms perform various activities. Your task is to analyze videos of these robotic arms in action and accurately classify the specific activity being performed in each video. Answer only with the activity detected."},
                    {"type": "video"},
                    ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": label},
                    ],
            },
        ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=False)

    batch = processor(
        text=prompt,
        videos=video,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    return batch

In [13]:
class LlavaNextVideoDataCollatorWithPadding:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        # print("collator padding:\t",{k:v.shape for k, v in features[0].items()})
        features = process_video(features[0][0], features[0][1])
        print("new features:\t", {k:v.shape for k, v in features.items()})
        padded_inputs = self.processor.tokenizer.pad(
            {
                "input_ids": [feat for feat in features["input_ids"]], # each element is one batch only so we slice [0]
                "attention_mask": [feat for  feat in features["attention_mask"] ],
            },
            padding=True,
            return_tensors="pt",
        )
        print("mid padded:\t", {k:v.shape for k, v in padded_inputs.items()})


        labels = padded_inputs["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        padded_inputs["labels"] = labels
        padded_inputs["pixel_values_videos"] = features['pixel_values_videos'].clone()
        
        print("final padded:\t", {k:v.shape for k, v in padded_inputs.items()})

        return padded_inputs

In [14]:
args = TrainingArguments(
    # args related to training
    output_dir = OUTPUT_DIR,
    eval_strategy = 'steps',
    eval_steps=20,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = 8,
    learning_rate = 2e-05,
    max_steps = 10, # adjust this depending on your dataset size
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.1,

    # args related to eval/save
    logging_steps = 20,
    save_strategy = 'steps',
    save_steps=20,
    save_total_limit = 1,
    fp16 = True, # we have the model train and eval with fp16 precision
    fp16_full_eval = True,

    # model that was wrapped for QLORA training with peft will not have arguments listed in its signature
    # so we need to pass lable names explicitly to calculate val loss
    label_names=["labels"],
)

In [15]:
trainer = Trainer(
    model = model,
    processing_class = processor,
    data_collator = LlavaNextVideoDataCollatorWithPadding(processor=processor),
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics=compute_metrics,
    args=args,
)

In [None]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


new features:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865]), 'pixel_values_videos': torch.Size([1, 200, 3, 336, 336])}
mid padded:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865])}
final padded:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865]), 'labels': torch.Size([1, 28865]), 'pixel_values_videos': torch.Size([1, 200, 3, 336, 336])}
new features:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865]), 'pixel_values_videos': torch.Size([1, 200, 3, 336, 336])}
mid padded:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865])}
final padded:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865]), 'labels': torch.Size([1, 28865]), 'pixel_values_videos': torch.Size([1, 200, 3, 336, 336])}
new features:	 {'input_ids': torch.Size([1, 28865]), 'attention_mask': torch.Size([1, 28865]), 'pixel_values_videos': torch.Size([1, 2

  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
