#### A large amount of RAM is required to load the larger models. Running on a GPU speeds up inference.

In [1]:
import sys
import os
import torch
import numpy as np
from lavis.models import load_model_and_preprocess

import decord
from decord import VideoReader
from decord import cpu, gpu
# Select decord's 'torch' bridge: load_video calls torch tensor methods
# (.permute, .to) directly on the result of VideoReader.get_batch.
decord.bridge.set_bridge('torch')

  from .autonotebook import tqdm as notebook_tqdm
  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


#### Load an example video

In [2]:
def load_video(vr, start_time, end_time, fps, num_frames=20):
    """Sample ``num_frames`` evenly spaced frames from a time window of a video.

    Args:
        vr: Frame reader that supports ``len(vr)`` and ``vr.get_batch(indices)``.
            ``get_batch`` must return a torch tensor, because the result is
            permuted and cast with torch tensor methods.
        start_time: Start of the window, in seconds.
        end_time: End of the window, in seconds (exclusive).
        fps: Frame rate used to convert seconds into frame indices.
        num_frames: How many frames to sample from the window.

    Returns:
        A float32 tensor with the sampled frames, where the last axis of the
        batch returned by the reader has been moved to the front
        (``permute(3, 0, 1, 2)``).

    Raises:
        ValueError: If the reader contains no frames.
    """
    last_valid = len(vr) - 1
    if last_valid < 0:
        raise ValueError("video has no frames")

    # Clamp the window to the frames that actually exist so that an end_time
    # past the end of the video (or a negative start_time) cannot produce an
    # out-of-range index.
    start_index = min(max(int(round(start_time * fps)), 0), last_valid)
    end_index = int(round(end_time * fps))
    # end_index is exclusive; never let the last sampled frame fall before the
    # first one (empty window) or beyond the last frame of the video.
    last_index = min(max(end_index - 1, start_index), last_valid)

    select_frame_index = (
        np.rint(np.linspace(start_index, last_index, num_frames)).astype(int).tolist()
    )
    frames = vr.get_batch(select_frame_index).permute(3, 0, 1, 2).to(torch.float32)
    return frames

# Open the example clip with decord, decoding on the CPU.
file_path = "example/video.mp4"
vr = VideoReader(file_path, ctx=cpu(0))

# Clip length in seconds = number of frames / average frame rate.
fps = vr.get_avg_fps()
total_frames = len(vr)
duration = total_frames / fps

print(
    "video_duration: {:.1f}, fps: {:.1f}".format(duration, fps)
)

video_duration: 70.3, fps: 24.0


In [3]:
# Set up the device to use: the GPU when CUDA is available, otherwise the CPU.
# `device` is a torch.device on both branches, so downstream code always sees
# the same type.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


#### Visualize the full video

In [4]:
from IPython.display import HTML

# Embed the video using HTML. The source is taken from `file_path` (the file
# opened with VideoReader) so the player always shows the clip that is
# actually fed to the model.
HTML(f"""
<video width="640" height="480" controls>
  <source src="{file_path}" type="video/mp4">
</video>
""")

#### Load pre-trained InstructBlip model weights

In [None]:
# We associate a model with its preprocessors to make it easier for inference.
# You can specify the memory_bank_length and num_frames here.
import torch.distributed as dist

# Single-process settings consumed by the env:// rendezvous below.
os.environ['RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '65533'

# Initialize the distributed process group.
# - Guarded with is_initialized() so this cell can be re-run in the same kernel
#   (initializing the default group twice raises an error).
# - NCCL requires CUDA; fall back to gloo when running on the CPU.
if not dist.is_initialized():
    dist.init_process_group(
        backend='nccl' if torch.cuda.is_available() else 'gloo',
        init_method='env://',
    )

model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_vicuna_instruct_clip2_malmm", model_type="vicuna7b", is_eval=True, device=device, memory_bank_length=10, num_frames=20,
)

# The default checkpoint path is specific to the machine this notebook was
# authored on; set the MALMM_CHECKPOINT environment variable to point to your
# own checkpoint without editing the cell.
checkpoint = os.environ.get(
    "MALMM_CHECKPOINT",
    "/t-ng/Deng/py/MMA-LLM/lavis/output/msvd_qa/blip2_vicuna_instruct_clip2_vicuna7b/train/b16_e7_lr0.0001_wd0.05_q32_f20_fb10_freezevit/checkpoint_best.pth",
)
model.load_checkpoint(checkpoint)
model.eval()
# Alternative (13B variant):
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_vicuna_instruct_malmm", model_type="vicuna13b", is_eval=True, device=device, memory_bank_length=10, num_frames=20,
# )

[rank0]:[W127 00:16:58.848512581 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
  state_dict = torch.load(cached_file, map_location="cpu")
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.00it/s]
  pretrained_dict = torch.load(pretrained_path, map_location='cpu')


model key: query_tokens
model key: ln_vision.weight
model key: ln_vision.bias
model key: clip_text_embedding.position_ids
model key: clip_text_embedding.word_embeddings.weight
model key: clip_text_embedding.position_embeddings.weight
model key: clip_text_embedding.LayerNorm.weight
model key: clip_text_embedding.LayerNorm.bias
model key: T1.text_projection
model key: T1.qformer_projection
model key: T1.logit_scale
model key: T1.text_embedding.position_ids
model key: T1.text_embedding.word_embeddings.weight
model key: T1.text_embedding.position_embeddings.weight
model key: T1.text_embedding.LayerNorm.weight
model key: T1.text_embedding.LayerNorm.bias
model key: Qformer.bert.embeddings.position_ids
model key: Qformer.bert.embeddings.word_embeddings.weight
model key: Qformer.bert.embeddings.position_embeddings.weight
model key: Qformer.bert.embeddings.LayerNorm.weight
model key: Qformer.bert.embeddings.LayerNorm.bias
model key: Qformer.bert.encoder.layer.0.attention.self.query.weight
model

  checkpoint = torch.load(cached_file, map_location="cpu")
  checkpoint = torch.load(url_or_filename, map_location="cpu")


_IncompatibleKeys(missing_keys=['visual_encoder.cls_token', 'visual_encoder.pos_embed', 'visual_encoder.patch_embed.proj.weight', 'visual_encoder.patch_embed.proj.bias', 'visual_encoder.blocks.0.norm1.weight', 'visual_encoder.blocks.0.norm1.bias', 'visual_encoder.blocks.0.attn.q_bias', 'visual_encoder.blocks.0.attn.v_bias', 'visual_encoder.blocks.0.attn.qkv.weight', 'visual_encoder.blocks.0.attn.proj.weight', 'visual_encoder.blocks.0.attn.proj.bias', 'visual_encoder.blocks.0.norm2.weight', 'visual_encoder.blocks.0.norm2.bias', 'visual_encoder.blocks.0.mlp.fc1.weight', 'visual_encoder.blocks.0.mlp.fc1.bias', 'visual_encoder.blocks.0.mlp.fc2.weight', 'visual_encoder.blocks.0.mlp.fc2.bias', 'visual_encoder.blocks.1.norm1.weight', 'visual_encoder.blocks.1.norm1.bias', 'visual_encoder.blocks.1.attn.q_bias', 'visual_encoder.blocks.1.attn.v_bias', 'visual_encoder.blocks.1.attn.qkv.weight', 'visual_encoder.blocks.1.attn.proj.weight', 'visual_encoder.blocks.1.attn.proj.bias', 'visual_encoder.bl

#### Load finetuned model weights

In [6]:
# The model loads its default config from lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml.
# If you want to load a finetuned checkpoint, such as the model weights finetuned on the ActivityNet-QA dataset,
# you first need to set load_finetuned=True, specify the finetuned checkpoint path, and then reload the model.

# load_finetuned: True
# finetuned: "saved_model/ActivityNet_qa/checkpoint_best.pth"

# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_vicuna_instruct_malmm", model_type="vicuna7b", is_eval=True, device=device, memory_bank_length=10, num_frames=20,
# )

#### Test on long videos

In [7]:
# The default max_num_frames is set to 120 in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml.
# To test the model on long videos, set max_num_frames to a larger value and then reload the model.

#### Instructed zero-shot video-to-language generation

In [31]:
# Sample frames from the requested time window (here: the whole clip).
video = load_video(
    vr,
    start_time=0,
    end_time=duration,
    fps=fps,
    num_frames=120,
)
# Turn the frames into model input with the associated "eval" processor,
# move them to the target device and add a leading dimension of size 1.
video = vis_processors["eval"](video)
video = video.to(device).unsqueeze(0)
model.generate(
    {
        "image": video,
        "prompt": "Question: what is the recipe of this video? Answer:",
    }
)

['omelette']

#### Online off-the-shelf setting with custom questions

In [29]:
# Only the first 37 seconds of the clip are sampled for this question.
video = load_video(
    vr,
    start_time=0,
    end_time=37,
    fps=fps,
    num_frames=120,
)
# Apply the "eval" processor, move to the device, add a leading dimension.
video = vis_processors["eval"](video)
video = video.to(device).unsqueeze(0)
model.generate(
    {
        "image": video,
        "prompt": "Question: what will happen for the next 5 seconds? Answer:",
    }
)

['egg']

#### Generate multiple answers

In [None]:
# Sample 20 frames across the whole clip.
video = load_video(
    vr,
    start_time=0,
    end_time=duration,
    fps=fps,
    num_frames=20,
)
# Apply the "eval" processor, move to the device, add a leading dimension.
video = vis_processors["eval"](video)
video = video.to(device).unsqueeze(0)
# num_captions=5 requests five answers for the same prompt.
model.generate(
    {
        "image": video,
        "prompt": "Question: what does this video show? Answer:",
    },
    num_captions=5,
)

['cooked', 'grill', 'grilled', 'no', 'yes']