In [1]:
from copy import deepcopy

import av
import numpy as np
import pandas as pd
import torch
from transformers import LlavaNextVideoForConditionalGeneration, BitsAndBytesConfig, AutoProcessor
from peft import get_peft_model, LoraConfig

from src.data.preference_datamodule import PreferenceDataModule

# data_module = PreferenceDataModule(
#     preference_data_path="data/video_description_preference_lava_gt_sorted_fixed.csv",
#     video_dir = "data/MUStARD/videos/utterances_final",
#     batch_size=1,
#     num_input_tokens=1170,
#     num_output_tokens=100,
#     num_workers=4,
#     pin_memory=True,
# )

# trainloader = data_module.train_dataloader()

  from .autonotebook import tqdm as notebook_tqdm


[2025-05-02 16:34:29,439] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/manish/miniconda3/envs/textbridge/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/manish/miniconda3/envs/textbridge/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/manish/miniconda3/envs/textbridge/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/manish/miniconda3/envs/textbridge/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/manish/miniconda3/envs/textbridge/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/manish/miniconda3/envs/textbridge/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@G

In [2]:
def read_video_pyav(container, indices):
    """
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    """
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [3]:
lora_cofig = LoraConfig(
    r=1,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.0,
    bias="none",
)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

ref_model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

policy_model = deepcopy(ref_model)

policy_model = get_peft_model(policy_model, lora_cofig)

ref_model.eval()
for params in ref_model.parameters():
    params.requires_grad = False

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.21it/s]


In [4]:

# Sample random row from the preference data
preference_data_path = "data/video_description_preference_lava_gt_sorted_fixed.csv"
preference_data = pd.read_csv(preference_data_path)
video_dir = "data/MUStARD/videos/utterances_final"
num_frames = 1
num_input_tokens = 162
num_output_tokens = 200

row = preference_data.iloc[0]
video_id = row["video_id"]

video_container = av.open(f"{video_dir}/{video_id}.mp4")
total_frames = video_container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
clip = read_video_pyav(video_container, indices)

processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
query = "Describe the video in details"
prompt = f"USER: <video>\n{query} ASSISTANT:"
        
inputs_video = processor(
    text=prompt,
    videos=clip,
    return_tensors="pt",
    max_length=num_input_tokens,
    truncation=True,
).to(policy_model.device)

eos_token = processor.tokenizer.eos_token
preferred_desc, dispreferred_desc = (
    row["preferred_description"] + eos_token,
    row["dispreferred_description"] + eos_token,
)

gt_desc = processor.tokenizer(
    [preferred_desc, dispreferred_desc],
    return_tensors="pt",
    max_length=num_output_tokens,
    truncation=True,
    add_special_tokens=True,
    padding="max_length",
    padding_side="right",
).to(policy_model.device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
def update_positional_and_cache_ids(inputs_video, first_input=True):
    """
    Update the positional ids of the video.
    Args:
        inputs_video (`Dict`): Dictionary containing the input tensors.
        first_input (`bool`): Whether this is the first input or not in auto-regressive generation.
    Returns:
        inputs_video (`Dict`): Updated dictionary with new positional ids.
    """
    device = inputs_video["input_ids"].device
    inputs_video = dict(inputs_video)
    if first_input:
        batch_size, num_tokens = inputs_video["input_ids"].shape[:2]
        ids = torch.arange(num_tokens, device=device)
        inputs_video["cache_position"] = ids
        inputs_video["postional_ids"] = ids.expand(batch_size, num_tokens)
    else:
        batch_size = inputs_video["input_ids"].shape[0]
        ids = torch.max(inputs_video["postional_ids"]) + 1
        inputs_video["cache_position"] = ids.expand(batch_size)
        inputs_video["postional_ids"] = ids.expand(batch_size, 1)
    
    return inputs_video


def prepare_inputs_for_generation(inputs_video, predicted_outputs=None):
    """
    Prepare the inputs for generation.
    Args:
        inputs_video (`Dict`): Dictionary containing the input tensors.
        predicted_outpur (`Dict | None`): The predicted outputs from the model with `inputs_video`. 
            Contains `logits` and `past_key_values`. None if this is the first input.
    Returns:
        inputs_video (`Dict`): Updated dictionary with new input tensors.
    """
    device = inputs_video["input_ids"].device
    inputs_video = dict(inputs_video)
    
    if predicted_outputs is None:
        inputs_video = update_positional_and_cache_ids(inputs_video, first_input=True)
        inputs_video["past_key_values"] = None
        inputs_video["logits_to_keep"] = 1
        inputs_video["use_cache"] = True
    else:
        inputs_video["input_ids"] = predicted_outputs["logits"].argmax(dim=-1)
        inputs_video["attention_mask"] = torch.cat(
            [
                torch.ones(
                    (inputs_video["attention_mask"].shape[0], 1), device=device
                ),
                inputs_video["attention_mask"],
            ],
            dim=1,
        )
        inputs_video = update_positional_and_cache_ids(inputs_video, first_input=False)
        inputs_video["past_key_values"] = predicted_outputs["past_key_values"]
        inputs_video["logits_to_keep"] = 1
        inputs_video["use_cache"] = True
        inputs_video["pixel_values_videos"] = None
    return inputs_video

In [6]:
ref_outputs = None
policy_outputs = None

device = ref_model.device
print(device)

ref_inputs = inputs_video.copy()
policy_inputs = inputs_video.copy()

compute_logps = torch.nn.LogSoftmax(dim=-1)

ref_logits = []
policy_logits = []

for i in range(num_output_tokens):
    
    print(f"Generating token {i}")
    
    ref_inputs = prepare_inputs_for_generation(ref_inputs, ref_outputs)
    with torch.no_grad():
        ref_outputs = ref_model(
            **inputs_video,
            return_dict=True,
        )
        ref_logits.append(ref_outputs["logits"])

    policy_inputs = prepare_inputs_for_generation(policy_inputs, policy_outputs)
    policy_outputs = policy_model(
        **inputs_video,
        return_dict=True,
    )
    policy_logits.append(policy_outputs["logits"])

ref_logits = torch.cat(ref_logits, dim=1)
policy_logits = torch.cat(policy_logits, dim=1)
    

cuda:0
Generating token 0
Generating token 1
Generating token 2
Generating token 3
Generating token 4
Generating token 5
Generating token 6
Generating token 7
Generating token 8
Generating token 9
Generating token 10
Generating token 11
Generating token 12
Generating token 13
Generating token 14
Generating token 15
Generating token 16
Generating token 17
Generating token 18
Generating token 19
Generating token 20
Generating token 21
Generating token 22
Generating token 23
Generating token 24
Generating token 25
Generating token 26
Generating token 27
Generating token 28
Generating token 29
Generating token 30
Generating token 31
Generating token 32
Generating token 33
Generating token 34
Generating token 35
Generating token 36
Generating token 37
Generating token 38
Generating token 39
Generating token 40
Generating token 41


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 23.55 GiB of which 1.69 MiB is free. Including non-PyTorch memory, this process has 23.54 GiB memory in use. Of the allocated memory 23.02 GiB is allocated by PyTorch, and 66.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [19]:
class LogProbability(torch.nn.Module):
    def __init__(self, dim: int = -1):
        super().__init__()
        self.dim = dim

    def forward(self, logits: torch.Tensor, labels: torch.LongTensor) -> torch.Tensor:
        """
        Calculate the log probability of the labels given the logits.
        Args:
            logits (`torch.Tensor`): The logits from the model.
            labels (`torch.Tensor`): The labels for which to calculate the log probability.
        Returns:
            `torch.Tensor`: The log probabilities of the labels.
        """
        # Dimensions check, labels must one dim less than logits
        if labels.dim() != logits.dim() - 1:
            raise ValueError(
                f"Expected labels of dimension {logits.dim() - 1} which is one dim less than logits, but got {labels.dim()}"
            )

        # Apply softmax to get probabilities
        probs = torch.nn.functional.softmax(logits, dim=self.dim)

        # Gather the probabilities for the true labels
        gathered_probs = probs.gather(self.dim, labels.unsqueeze(self.dim)).squeeze(
            self.dim
        )

        # Calculate log probabilities
        log_probs = torch.log(gathered_probs)

        return log_probs


# Test the LogProbability class
log_prob = LogProbability(dim=-1)
inputs = torch.randn(2, 3, 5)  # Example logits
labels = torch.tensor([[0, 1, 2], [1, 2, 4]])  # Example labels
log_ps = log_prob(inputs, labels)