In [1]:
# from transformers import LlavaNextVideoProcessor

# processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")

# conversation = [
#     {
#         "role": "system",
#         "content": [
#             {"type": "text", "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."},
#             ],
#     },
#     {
#         "role": "user",
#         "content": [
#             {"type": "text", "text": "What’s shown in this image?"},
#             {"type": "image"},
#             ],
#     },
#     {
#         "role": "assistant",
#         "content": [{"type": "text", "text": "This image shows a red stop sign."},]
#     },
#     {

#         "role": "user",
#         "content": [
#             {"type": "text", "text": "Why is this video funny?"},
#             {"type": "video"},
#             ],
#     },
# ]

# text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# # Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your visuals
# print(text_prompt)

In [2]:
import os

# Set the environment variable before importing torch
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
import av
import torch
import numpy as np
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor, BitsAndBytesConfig
from huggingface_hub import hf_hub_download

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load the model in half-precision
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    device_map='auto'
)

# Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos)
video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
container = av.open(video_path)
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
video = read_video_pyav(container, indices)

conversation = [
    {

        "role": "user",
        "content": [
            {"type": "text", "text": "Why is this video funny?"},
            {"type": "video"},
            ],
    },
]

In [None]:
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=prompt, videos=video, return_tensors="pt").to(device)

out = model.generate(**inputs, max_new_tokens=60)
processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)

['USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and playful interaction between the child and the person holding the camera. The child is seen reading a book, and then suddenly starts to interact with the camera, making a face as if to say "cheese" for the photo. The person holding the']

In [None]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][50]

{'label': 0,
 'text': "The delivery driver mistakenly rang my doorbell, having confused 133 and 113.\\n\\nRather than taking a step back and analyzing the situation, he begins to accuse my wife and I of ordering and refusing to pay for this pizza.\\n\\nThe driver then gets on his cell phone and rather than calling the number than was given when the order was placed, begins to call his bosses and starts threatening me with felony charges. \\n\\nSo I take the initiative and ask the fine upstanding gentleman what the phone number of the order-er was, phone my neighbor and discover the mistake. Rather than a thank you or a sorry, he just speeds off (breaking the speed limit on our block) to reach his destination 50 feet away.\\n\\nI would call to complain, but based on the other reviews, its clear the owners do not care about Carnegie or it's residents, and its pretty well known around town just how awful their food is, so it would be pointless to boycott a place I'd never order from again

In [20]:
from transformers import AutoTokenizer
#CUDA_LAUNCH_BLOCKING=1
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(80))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(20))
print(len(small_train_dataset))
print(len(small_eval_dataset))
print(small_train_dataset[59])  # Inspect a sample


80
20
{'label': 3, 'text': "After 24 hours on the Strip, you need to get away. A 15-minute drive out to an anonymous, deserted strip mall on W. Sahara you'll find Rosemary's, whose food can compete bite-for-bite w/ the Michelin 1- and 2-star restaurants at the resorts. We deferred to the prix fixe menus and were not disappointed!", 'input_ids': [101, 1258, 1572, 2005, 1113, 1103, 18534, 117, 1128, 1444, 1106, 1243, 1283, 119, 138, 1405, 118, 2517, 2797, 1149, 1106, 1126, 12631, 117, 13180, 6322, 8796, 1113, 160, 119, 21465, 1128, 112, 1325, 1525, 20983, 112, 188, 117, 2133, 2094, 1169, 4845, 6513, 118, 1111, 118, 6513, 192, 120, 1103, 8372, 1394, 122, 118, 1105, 123, 118, 2851, 7724, 1120, 1103, 23574, 119, 1284, 19353, 1200, 4359, 1106, 1103, 185, 14799, 8239, 1162, 13171, 1116, 1105, 1127, 1136, 9333, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [22]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [23]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [24]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch", max_steps=1000)

In [26]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
        ]
)
model.add_adapter(lora_config)

ValueError: Adapter with name default already exists. Please use a different name.

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [28]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.04 GiB is allocated by PyTorch, and 397.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)