# TC-CLIP Inference Demo for Custom Videos

## Set model path, custom video path, class names.

In [1]:
### Change to your settings ###
output="workspace/inference"
tc_clip_model_path = "workspace/weights/zero_shot_k400_llm_tc_clip.pth"   # Your pretrained model saved path
class_names = ['swinging baseball bat', 'cutting apple', 'moon walking', 'changing gear in car']  # Class names
video_path = "/131_data/datasets/k600/test/uveYxu3T2Fc_000009_000019.mp4" # Custom video path

## No need to change below codes, just run the cells.

In [2]:
# Import libraries
import os
from hydra import initialize, compose
from omegaconf import OmegaConf
from pathlib import Path
import torch

from datasets.pipeline import Compose
from trainers.build_trainer import returnCLIP
from utils.logger import create_logger
from utils.print_utils import colorstr
from utils.tools import load_checkpoint

### Init configs and logger

In [3]:
# Initialize Hydra configs
overrides = [
    f"output={output}",
    "eval=test",
    "trainer=tc_clip",
    f"resume={tc_clip_model_path}"
]

# Initialize Hydra with config path
with initialize(version_base=None, config_path="configs"):
    config = compose(config_name="zero_shot.yaml", overrides=overrides)

In [4]:
# Init settings
OmegaConf.set_struct(config, False)  # Needed to add fields at runtime below

# Define working dir
Path(config.output).mkdir(parents=True, exist_ok=True)

# Logger
logger = create_logger(output_dir=config.output, dist_rank=0, name=f"{config.trainer_name}")
logger.info(f"working dir: {config.output}")

# Whether to use pytorch or apex amp
major, minor = int(torch.__version__.split('.')[0]), int(torch.__version__.split('.')[1])
config.use_torch_amp = (major >= 2)

[32m[09-25 13:51:28 TCCLIP][0m[33m(3979588942.py 9)[0m: INFO working dir: workspace/inference


### Build model & load checkpoint

In [5]:
# Build model
model = returnCLIP(config, logger, class_names)
model.cuda()

[32m[09-25 13:51:32 TCCLIP][0m[33m(build_trainer.py 56)[0m: INFO Loading CLIP (backbone: ViT-B/16)
Using spatial positional embedding
Weights not found for some missing keys:  ['visual.transformer.resblocks.1.attn.local_global_bias_table', 'visual.transformer.resblocks.2.attn.local_global_bias_table', 'visual.transformer.resblocks.3.attn.local_global_bias_table', 'visual.transformer.resblocks.4.attn.local_global_bias_table', 'visual.transformer.resblocks.5.attn.local_global_bias_table', 'visual.transformer.resblocks.6.attn.local_global_bias_table', 'visual.transformer.resblocks.7.attn.local_global_bias_table', 'visual.transformer.resblocks.8.attn.local_global_bias_table', 'visual.transformer.resblocks.9.attn.local_global_bias_table', 'visual.transformer.resblocks.10.attn.local_global_bias_table', 'visual.transformer.resblocks.11.attn.local_global_bias_table']
[32m[09-25 13:51:36 TCCLIP][0m[33m(build_trainer.py 59)[0m: INFO [34m[1mBuilding TCCLIP[0m
[32m[09-25 13:51:36 TCCLI

TCCLIP(
  (prompt_learner): VPPromptLearner()
  (image_encoder): TCVisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): TCAttentionBlock(
          (attn): TCAttention(
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,

In [6]:
# Load checkpoint
if config.resume:
    epoch_loaded, max_accuray_loaded = load_checkpoint(config, model, None, None, logger, model_only=True)
    logger.info(
            f"Loaded checkpoint at epoch {epoch_loaded} with max accuracy {max_accuray_loaded:.1f}")

[32m[09-25 13:51:40 TCCLIP][0m[33m(tools.py 208)[0m: INFO resume model: _IncompatibleKeys(missing_keys=['prompt_learner.token_prefix', 'prompt_learner.token_suffix'], unexpected_keys=[])
[32m[09-25 13:51:40 TCCLIP][0m[33m(tools.py 218)[0m: INFO => loaded successfully 'workspace/weights/zero_shot_k400_llm_tc_clip.pth' (epoch 9)
[32m[09-25 13:51:40 TCCLIP][0m[33m(1028797808.py 4)[0m: INFO Loaded checkpoint at epoch 10 with max accuracy 82.1


### Video preprocessing pipeline

In [7]:
# Video preprocessing pipeline

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

scale_resize = int(256 / 224 * config.input_size)
collect_keys = ['imgs']

val_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.num_frames, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, scale_resize)),
    dict(type='CenterCrop', crop_size=config.input_size),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=collect_keys, meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
if config.num_crop == 3:
    val_pipeline[3] = dict(type='Resize', scale=(-1, config.input_size))
    val_pipeline[4] = dict(type='ThreeCrop', crop_size=config.input_size)
if config.num_clip > 1:
    val_pipeline[1] = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.num_frames,
                           multiview=config.num_clip)
val_pipeline = [p for p in val_pipeline if p is not None]

pipeline = Compose(val_pipeline)

dict_file = {'filename': video_path, 'tar': False, 'modality': 'RGB', 'start_index': 0}

### TC-CLIP inference

In [8]:
video = pipeline(dict_file)
video_tensor = video['imgs'].unsqueeze(0).cuda().float() # Size: [1, T, 3, H, W]

# Inference with TC-CLIP
with torch.no_grad():
    if config.use_torch_amp:
        with torch.cuda.amp.autocast():
            output = model(video_tensor)
    else:
        output = model(video_tensor)
    
    logits = output['logits']

pred_index = logits.argmax(-1)

In [9]:
print(f'Logits: {logits}')
print(f'Predicted action category is "{class_names[pred_index]}"')

Logits: tensor([[30.6250, 23.0625, 23.5000, 24.1094]], device='cuda:0',
       dtype=torch.float16)
Predicted action category is "swinging baseball bat"


Acknowledgements: [ViFi-CLIP's repository](https://github.com/muzairkhattak/ViFi-CLIP).