In [1]:
import numpy as np
import os
import io
import json
import cv2

import torch

from demo.config import (Config,
                         eval_dict_leaf)

from demo.utils import (retrieve_text,
                        _frame_from_video,
                        setup_internvideo2)

  from .autonotebook import tqdm as notebook_tqdm


[2024-05-07 00:20:23,026] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Decode the demo clip into an in-memory list of frames.
video_path = 'demo/example1.mp4'
video = cv2.VideoCapture(video_path)
# cv2.VideoCapture does not raise on a bad path or unsupported codec; it just
# reports isOpened() == False, so fail here with a message that names the file
# instead of hitting an opaque error at frames[0] further down.
if not video.isOpened():
    raise IOError(f"Could not open video file: {video_path!r}")
try:
    frames = list(_frame_from_video(video))
finally:
    # Always hand the decoder/file handle back, even if decoding fails midway.
    video.release()
print(len(frames))
print(frames[0].shape)

40
(480, 640, 3)


In [3]:
# Candidate captions that the retrieval step ranks against the video.
# The first ten describe snowy outdoor scenes; the last three are
# basketball captions with no snow or dog in them.
text_candidates = [
    "A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.",
    "A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.",
    "A person dressed in a blue jacket shovels the snow-covered pavement outside their house.",
    "A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.",
    "A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.",
    "A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.",
    "A playful dog slides down a snowy hill, wagging its tail with delight.",
    "A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.",
    "A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.",
    "A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.",
    "Lebron James dribbles the basketball down the court, fakes out his defender, and scores a layup.",
    # "slam dunk" was previously misspelled as "slamp dunk" in the next two captions.
    "Lebron James makes a slam dunk in a fast break.",
    "Lebron James of the Lakers makes a slam dunk in a fast break.",
]

In [4]:
# Load the stage-2 demo config and pass it through eval_dict_leaf.
config = eval_dict_leaf(Config.from_file('demo/internvideo2_stage2_config.py'))

# Dump the config for inspection: 'available_corpus' is skipped, and the
# 'model' entry is pretty-printed as indented JSON.
for key, value in config.items():
    if key == 'available_corpus':
        continue
    rendered = json.dumps(value, indent=4) if key == "model" else value
    print(f"{key}: {rendered}")

VisionEncoders: {}
TextEncoders: {'bert': {'name': 'bert_base', 'pretrained': 'bert-base-uncased', 'config': 'configs/config_bert.json', 'd_model': 768, 'fusion_layer': 9}, 'bert_large': {'name': 'bert_large', 'pretrained': 'bert-large-uncased', 'config': 'configs/config_bert_large.json', 'd_model': 1024, 'fusion_layer': 19}, 'med_bert': {'name': 'med_bert_base', 'pretrained': 'bert-base-uncased', 'config': 'configs/med_config.json', 'd_model': 768}, 'med_bert_large': {'name': 'med_bert_large', 'pretrained': 'bert-base-uncased', 'config': 'configs/med_large_config.json', 'd_model': 768}}
num_workers: 6
num_frames: 4
num_frames_test: 4
batch_size: 8
batch_size_test: 4
size_t: 224
max_txt_l: 40
origin_num_frames: 4
use_half_precision: False
use_bf16: False
inputs: {'image_res': 224, 'video_input': {'num_frames': 4, 'sample_type': 'rand', 'num_frames_test': 4, 'sample_type_test': 'middle', 'random_aug': False}, 'max_txt_l': {'image': 40, 'video': 40}, 'batch_size': {'image': 8, 'video': 8

In [5]:
# Path to the InternVideo2 stage-2 checkpoint. The literal default is only a
# placeholder: either edit it, or set the INTERNVIDEO2_CKPT_PATH environment
# variable before starting the kernel so the notebook itself stays unmodified.
model_pth = os.environ.get(
    'INTERNVIDEO2_CKPT_PATH',
    'your_model_path/InternVideo2-stage2_1b-224p-f4.pt',
)
config['pretrained_path'] = model_pth

In [6]:
# Build the InternVideo2 model and its tokenizer from `config`
# (presumably using config['pretrained_path'] for the weights — confirm in
# demo.utils.setup_internvideo2).
# In the recorded run, load_state_dict reported no missing keys; the only
# unexpected checkpoint keys were 'temp', 'itm_head.weight' and 'itm_head.bias'.
intern_model, tokenizer = setup_internvideo2(config)

vision_encoder: using pretrain_internvideo2_1b_patch14_224




text_encoder: using bert_large
load_state_dict: _IncompatibleKeys(missing_keys=[], unexpected_keys=['temp', 'itm_head.weight', 'itm_head.bias'])


In [7]:
# Rank the candidate captions against the decoded frames, keeping the top 10.
texts, probs = retrieve_text(
    frames,
    text_candidates,
    model=intern_model,
    topk=10,
    config=config,
)

# One line per returned caption, with its probability to four decimals.
for caption, score in zip(texts, probs):
    line = 'text: {} ~ prob: {:.4f}'.format(caption, score)
    print(line)



text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.7927
text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.1769
text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0291
text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.0006
text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0003
text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0002
text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0001
text: A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees. ~ prob: 0.0001
text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loa