In [6]:
import torch
from IPython.display import Video
from transformers import AutoTokenizer

from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration
from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor

In [7]:
# Load the mPLUG-Owl video checkpoint in bfloat16.
# This cell takes a long time to run (roughly five to ten minutes).
pretrained_ckpt = 'MAGAer13/mplug-owl-llama-7b-video'
model = MplugOwlForConditionalGeneration.from_pretrained(
    pretrained_ckpt,
    torch_dtype=torch.bfloat16,
    device_map={'': 0},  # '' is the root module, i.e. place the whole model on device 0
)
# GPU VRAM usage should be at about 15 GB once this cell has finished.

In [8]:
# Build the text/video processor from the same checkpoint name used for the model.
# This cell is slow as well (about five to ten minutes).
tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt)
image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)

# The combined processor wraps the image processor and the tokenizer.
processor = MplugOwlProcessor(image_processor, tokenizer)

In [9]:
# The <|video|> token is a placeholder for a video in the conversation.
# One prompt per entry; the text ends with "AI: " so the model continues as the assistant.
prompts = [
    (
        "The following is a conversation between a curious human and AI assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        "Human: <|video|>\n"
        "Human: What is happening in this video?\n"
        "AI: "
    ),
]

In [19]:
# Videos fed to the processor; one file per <|video|> placeholder in the prompt.
video_list = ['schneeeman-Scene-010.mp4']

# Preview the clip inline. The path is read from video_list so the preview
# cannot drift from the file that is actually sent to the model.
Video(video_list[0], height=320)

In [16]:
# Generation options passed straight through to model.generate(); these are the
# standard transformers generation keyword arguments.
generate_kwargs = dict(
    do_sample=True,   # sample instead of greedy decoding
    top_k=5,          # restrict sampling to the 5 most likely tokens
    max_length=512,
)

In [20]:
# Tokenize the prompts and sample 4 frames per video, returning PyTorch tensors.
inputs = processor(
    text=prompts,
    videos=video_list,
    num_frames=4,
    return_tensors='pt',
)

# In a single pass: cast float32 tensors to bfloat16 (other dtypes are left
# as they are), then move every tensor to the model's device.
inputs = {
    key: (value.bfloat16() if value.dtype == torch.float else value).to(model.device)
    for key, value in inputs.items()
}

In [21]:
# generate_kwargs enables sampling, so seed PyTorch's RNGs immediately before
# generating; re-running this cell on the same setup then repeats the same draw
# instead of producing a different answer every time.
torch.manual_seed(0)

# Inference only: no gradients are needed.
with torch.no_grad():
    res = model.generate(**inputs, **generate_kwargs)

# Decode the first generated sequence, dropping special tokens.
sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
print(sentence)

A close-up image of a leather dog collar is being filmed on a background with colorful strings. The collar is hanging from the string, and the string appears to be tied in a knot, creating an interesting and unique visual effect. The overall scene gives the impression that the leather collar is an artistic piece, with the strings and knot adding depth and dimension to the image.
