In [10]:
import numpy as np
from PIL import Image
import requests
import av
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, LlavaNextVideoForConditionalGeneration




def read_video_pyav(container, indices):
    """
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    """
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf", device_map="auto", torch_dtype=torch.float16
)
processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")

prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
video_path = hf_hub_download(
    repo_id="raushan-testing-hf/videos-test",
    filename="sample_demo_1.mp4",
    repo_type="dataset",
)
container = av.open(video_path)

# sample uniformly 8 frames from the video (model was trained with 32 frames per video, but this video is short)
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
clip = read_video_pyav(container, indices)
inputs_video = processor(text=prompt, videos=clip, return_tensors="pt").to(model.device)

inputs_video = dict(inputs_video)
# print(inputs_video.input_ids[:, -10:])
# load an image to generate from an image
# prompt = "USER:<image>\nWhat is shown in this image? ASSISTANT:"
# url = "https://www.ilankelman.org/stopsigns/australia.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# inputs_image = processor(text=prompt, images=image, return_tensors="pt").to(
#     model.device
# )


def update_positional_and_cache_ids(inputs_video, first_input=True):
    """
    Update the positional ids of the video.
    Args:
        inputs_video (`Dict`): Dictionary containing the input tensors.
        first_input (`bool`): Whether this is the first input or not in auto-regressive generation.
    Returns:
        inputs_video (`Dict`): Updated dictionary with new positional ids.
    """
    device = inputs_video["input_ids"].device
    inputs_video = dict(inputs_video)
    if first_input:
        batch_size, num_tokens = inputs_video["input_ids"].shape[:2]
        ids = torch.arange(num_tokens, device=device)
        inputs_video["cache_position"] = ids
        inputs_video["postional_ids"] = ids.expand(batch_size, num_tokens)
    else:
        batch_size = inputs_video["input_ids"].shape[0]
        ids = torch.max(inputs_video["postional_ids"]) + 1
        inputs_video["cache_position"] = ids.expand(batch_size)
        inputs_video["postional_ids"] = ids.expand(batch_size, 1)
    
    return inputs_video


def prepare_inputs_for_generation(inputs_video, predicted_outputs=None):
    """
    Prepare the inputs for generation.
    Args:
        inputs_video (`Dict`): Dictionary containing the input tensors.
        predicted_outpur (`Dict | None`): The predicted outputs from the model with `inputs_video`. 
            Contains `logits` and `past_key_values`. None if this is the first input.
    Returns:
        inputs_video (`Dict`): Updated dictionary with new input tensors.
    """
    device = inputs_video["input_ids"].device
    inputs_video = dict(inputs_video)
    
    if predicted_outputs is None:
        inputs_video = update_positional_and_cache_ids(inputs_video, first_input=True)
        inputs_video["past_key_values"] = None
        inputs_video["logits_to_keep"] = 1
        inputs_video["use_cache"] = True
    else:
        inputs_video["input_ids"] = predicted_outputs["logits"].argmax(dim=-1)
        inputs_video["attention_mask"] = torch.cat(
            [
                torch.ones(
                    (inputs_video["attention_mask"].shape[0], 1), device=device
                ),
                inputs_video["attention_mask"],
            ],
            dim=1,
        )
        inputs_video = update_positional_and_cache_ids(inputs_video, first_input=False)
        inputs_video["past_key_values"] = predicted_outputs["past_key_values"]
        inputs_video["logits_to_keep"] = 1
        inputs_video["use_cache"] = True
        inputs_video["pixel_values_videos"] = None
    return inputs_video

# print(inputs_video)
# Generate from video

# generated_output = model.generate(
#     **inputs_video, max_new_tokens=500, output_logits=True, return_dict_in_generate=True, output_hidden_states=True
# )

# print(generated_output.logits[0].requires_grad)
# processor.batch_decode(
#     generated_output.sequences,
#     skip_special_tokens=True,
#     clean_up_tokenization_spaces=False,
# )[0]


# max_new_tokens = 50
# with torch.no_grad():
#     # inputs_video = processor(text=prompt, videos=clip, return_tensors="pt").to(model.device)
#     # inputs_video.input_ids = inputs_video.input_ids[:, :1]
#     # inputs_video.attention_mask = inputs_video.attention_mask[:, :1]
#     # inputs_video.video_mask = inputs_video.video_mask[:, :1]
#     # inputs_video.video_attention_mask = inputs_video.video_attention_mask[:, :1]
#     # inputs_video.video_position_ids = inputs_video.video_position_ids[:, :1]
#     for _ in range(max_new_tokens):
#         # print(inputs_video.input_ids.shape)
#         output = model(
#             input_ids=inputs_video.input_ids,
#             attention_mask=inputs_video.attention_mask,
#             pixel_values_videos=inputs_video.pixel_values_videos,
#             past_key_values=inputs_video.past_key_values,
#             logits_to_keep=1,
#         )
#         # output = model(logits_to_keep=1, **inputs_video)
#         predicted_ids = output.logits.argmax(-1)
#         print(predicted_ids)
#         # inputs_video.input_ids = torch.cat([inputs_video.input_ids, predicted_ids], dim=1)

#         # inputs_video = inputs_video
#         inputs_video.input_ids = predicted_ids
#         inputs_video.attention_mask = torch.cat(
#             [
#                 torch.ones(
#                     (inputs_video.attention_mask.shape[0], 1), device=model.device
#                 ),
#                 inputs_video.attention_mask,
#             ],
#             dim=1,
#         )
#         inputs_video.past_key_values = output.past_key_values
#         inputs_video.pixel_values_videos = None
#         positional_ids =
# inputs_video.

# # Generate from image
# generate_ids = model.generate(**inputs_image, max_new_tokens=50)
# processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
print(generated_output.logits[0].requires_grad)

False


In [12]:
max_new_tokens = 200
model.train()
predicted_outputs = None
inputs_video_clone = inputs_video.copy()
# with torch.no_grad():
for i in range(max_new_tokens):
    print("Generating new token", i)
    inputs_video_clone = prepare_inputs_for_generation(inputs_video_clone, predicted_outputs)
    predicted_outputs = model(**inputs_video_clone)
    # print(predicted_outputs["logits"].shape)

Generating new token 0
Generating new token 1
Generating new token 2
Generating new token 3
Generating new token 4
Generating new token 5
Generating new token 6
Generating new token 7
Generating new token 8
Generating new token 9
Generating new token 10
Generating new token 11
Generating new token 12
Generating new token 13
Generating new token 14
Generating new token 15
Generating new token 16
Generating new token 17
Generating new token 18
Generating new token 19
Generating new token 20
Generating new token 21
Generating new token 22
Generating new token 23
Generating new token 24
Generating new token 25
Generating new token 26
Generating new token 27
Generating new token 28
Generating new token 29
Generating new token 30
Generating new token 31
Generating new token 32
Generating new token 33
Generating new token 34
Generating new token 35
Generating new token 36
Generating new token 37
Generating new token 38
Generating new token 39
Generating new token 40
Generating new token 41
Ge

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 1 has a total capacity of 23.55 GiB of which 7.69 MiB is free. Including non-PyTorch memory, this process has 23.53 GiB memory in use. Of the allocated memory 21.94 GiB is allocated by PyTorch, and 1.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
processor.tokenizer(["USER: <video>\nWhy is this video funny? ASSISTANT:</s>", "Hello there!"],
    max_length=50,
    truncation=True,
    padding="max_length",
    add_special_tokens=True,
)

{'input_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3148, 1001, 29901, 29871, 32000, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 15043, 727, 29991]], 'attention_mask': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]}

In [5]:
processor.tokenizer.eos_token

'</s>'

In [6]:
from torch import nn
import torch

# Example of target with class indices
loss = nn.CrossEntropyLoss(reduction="none")
input = torch.randn(3, 5,  100, requires_grad=True)
target = torch.empty(3, 100, dtype=torch.long).random_(5)
output = loss(input, target)
print(output.shape)
output.mean().backward()

# Example of target with class probabilities
input = torch.randn(3,  5, 100, requires_grad=True)
target = torch.randn(3, 5, 100).softmax(dim=1)
output = loss(input, target)
print(output.shape)
output.mean().backward()

torch.Size([3, 100])
torch.Size([3, 100])


In [7]:
# "USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and endearing situation of a young child, who appears to be a baby or toddler, attempting to read a book. The child's small size and the fact that they are reading a book"

In [8]:
import torch

dict1 = {'a': torch.tensor([1,2]), 'b': 2}
dict2 = dict1.copy()

dict2['a'] = torch.tensor([2,1])  # {'a': 1, 'b': 2}

print(dict1, dict2)  # {'a': 1, 'b': 2}

{'a': tensor([1, 2]), 'b': 2} {'a': tensor([2, 1]), 'b': 2}
