# Code for Performing Inference with ViFi-CLIP on Custom Videos

This section provides code to run inference with the **ViFi-CLIP** model on custom videos, scoring how well each video matches a given text instruction. It includes model initialization, the preprocessing pipeline, and functions for computing similarity scores between videos and instructions.

- **Model Initialization and Preprocessing Pipeline**: The code sets up the ViFi-CLIP model and processes videos through a pipeline designed for the task.
  
- **Similarity Score Computation**: Given a video and an instruction, the code computes a similarity score that indicates how well the video aligns with the provided instruction.
  
- **Custom Configuration and Checkpoints**: Users can specify the configuration and checkpoint of the fine-tuned ViFi-CLIP model for inference.

- **Scalability for DataFrames**: The code handles multiple videos by iterating over the rows of a CSV file loaded into a pandas DataFrame, so it scales to large datasets.


Install dependencies and import libraries.

In [6]:
!pip install yacs ftfy timm regex -q
!pip install mmcv-full -q
!pip install decord -q
!pip install opencv-python -q
!pip install numpy==1.22.4 -q
!pip install torch==2.6 -q

# depending on possible errors
#!pip install mmcv -q
#!pip install mmengine -q
#!pip install torch==1.11.0 -q
#!pip install torchvision==0.12.0 -q

In [9]:
import torch
import torch.nn as nn
from utils.config import get_config
from utils.logger import create_logger
import time
import numpy as np
from utils.config import get_config
from trainers import vificlip
from datasets.pipeline import *
import warnings, logging
from collections import OrderedDict
import pandas as pd
from tqdm import tqdm



In [10]:
# Keep the notebook output readable: ignore every FutureWarning ...
warnings.simplefilter("ignore", category=FutureWarning)
# ... and disable all logging calls of severity CRITICAL and below, which
# silences the standard logging levels for the rest of the session.
logging.disable(logging.CRITICAL)

### Setting up configuration
Initializing the model requires specifying a configuration file and the path to the pretrained model checkpoint.

In [11]:
# Step 1:
# Configuration class
class parse_option:
    """Argument container handed to ``get_config`` in place of parsed CLI args.

    Only the configuration file and the checkpoint to resume from are
    caller-supplied; every other attribute is a fixed inference default.

    Args:
        config: Path to the YAML configuration file.
        pretrained_model_path: Path to the checkpoint, stored as ``resume``.
    """

    def __init__(self, config, pretrained_model_path):
        self.config = config
        # Name of output folder to store logs and save weights
        self.output = "exp"
        self.resume = pretrained_model_path

        # The remaining arguments do not need to be changed.
        self.only_test = True
        for unset_option in ("opts", "batch_size", "pretrained", "accumulation_steps"):
            setattr(self, unset_option, None)
        self.local_rank = 0

### Loading ViFi-CLIP and its pretrained weights

In [12]:
def init_model(config, logger):
    """Build the ViFi-CLIP model on the GPU and load its fine-tuned weights.

    Args:
        config: Configuration object; ``config.MODEL.RESUME`` is the path of
            the checkpoint file to load.
        logger: Logger passed to the model constructor and used to report the
            result of loading the state dict.

    Returns:
        The model in float32 on the CUDA device with the checkpoint weights
        applied (non-strict loading).
    """
    model = vificlip.returnCLIP(config, logger=logger)
    model = model.float().cuda()

    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoints from a trusted source.
    checkpoint = torch.load(config.MODEL.RESUME, map_location="cpu", weights_only=False)

    # Keys are stored with a leading `module.` (presumably from a
    # DataParallel/DDP wrapper at training time - confirm). Strip it only when
    # it is actually present so un-prefixed checkpoints are not mangled.
    prefix = "module."
    unwanted_keys = {
        "prompt_learner.token_prefix",
        "prompt_learner.token_suffix",
        "prompt_learner.complete_text_embeddings",
    }

    new_state_dict = OrderedDict()
    for key, value in checkpoint["model"].items():
        name = key[len(prefix):] if key.startswith(prefix) else key
        if name in unwanted_keys:
            # These prompt-learner entries are deliberately not restored.
            continue
        new_state_dict[name] = value

    # strict=False tolerates the keys that were dropped above.
    msg = model.load_state_dict(new_state_dict, strict=False)
    logger.info(f"resume model: {msg}")
    return model

### Preprocessing input video 

In [13]:
def init_preprocessing_pipeline(config):
    """Assemble the video preprocessing pipeline used at inference time.

    The frame sampler and the resize/crop steps depend on
    ``config.TEST.NUM_CLIP`` and ``config.TEST.NUM_CROP``; all other steps
    are fixed.

    Args:
        config: Configuration object providing ``DATA.INPUT_SIZE``,
            ``DATA.NUM_FRAMES``, ``TEST.NUM_CROP`` and ``TEST.NUM_CLIP``.

    Returns:
        A ``Compose`` object wrapping the list of pipeline step dicts.
    """
    input_size = config.DATA.INPUT_SIZE
    num_frames = config.DATA.NUM_FRAMES
    # Default resize target: short side scaled by 256/224 relative to the crop.
    scale_resize = int(256 / 224 * input_size)

    # Spatial steps: three-crop evaluation resizes directly to the input size.
    if config.TEST.NUM_CROP == 3:
        resize_step = dict(type="Resize", scale=(-1, input_size))
        crop_step = dict(type="ThreeCrop", crop_size=input_size)
    else:
        resize_step = dict(type="Resize", scale=(-1, scale_resize))
        crop_step = dict(type="CenterCrop", crop_size=input_size)

    # Temporal step: multi-clip evaluation swaps `test_mode` for `multiview`.
    sampling_step = dict(
        type="SampleFrames",
        clip_len=1,
        frame_interval=1,
        num_clips=num_frames,
    )
    if config.TEST.NUM_CLIP > 1:
        sampling_step["multiview"] = config.TEST.NUM_CLIP
    else:
        sampling_step["test_mode"] = True

    normalize_step = dict(
        type="Normalize",
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_bgr=False,
    )

    steps = [
        dict(type="DecordInit"),
        sampling_step,
        dict(type="DecordDecode"),
        resize_step,
        crop_step,
        normalize_step,
        dict(type="FormatShape", input_format="NCHW"),
        dict(type="Collect", keys=["imgs"], meta_keys=[]),
        dict(type="ToTensor", keys=["imgs"]),
    ]
    return Compose(steps)

### ViFi-CLIP inference with given video

In [14]:
def predict_one(video_path, instruction, model, pipeline):
    """Score one video against one or more text instructions.

    Args:
        video_path: Path of the video file to load.
        instruction: Sequence of instruction strings (callers in this
            notebook pass a list, also for a single instruction).
        model: Callable model taking ``(video_tensor, instruction)`` and
            returning logits.
        pipeline: Preprocessing callable that maps the video description dict
            to a dict containing the frame tensor under ``"imgs"``.

    Returns:
        The raw logits tensor when more than one instruction is given,
        otherwise the single logit as a Python number.
    """
    sample = pipeline(
        {
            "filename": video_path,
            "tar": False,
            "modality": "RGB",
            "start_index": 0,
        }
    )
    # Add a leading batch dimension and move the frames to the GPU.
    video_tensor = sample["imgs"].unsqueeze(0).cuda().float()

    # `torch.autocast(device_type="cuda")` replaces the deprecated
    # `torch.cuda.amp.autocast()`; both run the forward pass in mixed precision.
    with torch.no_grad(), torch.autocast(device_type="cuda"):
        logits = model(video_tensor, instruction)

    if len(instruction) > 1:
        return logits
    return logits.item()

In [15]:
def predict_dev(
    config,
    model,
    saving_path,
    pipeline,
    *,
    dev_csv_path="/home/jovyan/BA/Github/thesis-edit-evaluation/data/magicbrush/dev_data_with_mask.csv",
    magicbrush_dir="/home/jovyan/BA/Github/MagicBrush",
):
    """Score every row of the MagicBrush dev CSV with ``predict_one``.

    For each row the columns ``img_id``, ``turn_index`` and ``instruction``
    are read, and the video
    ``{magicbrush_dir}/vifi_format/<subdir>/{img_id}_{turn_index}.mp4`` is
    scored against the lower-cased instruction with all periods removed.

    Args:
        config: Configuration object; ``config.DATA.NUM_FRAMES`` selects the
            video sub-folder (2 -> ``videos``, 8 -> ``videos_8_frames``).
        model: Model forwarded to ``predict_one``.
        saving_path: CSV path for the results; nothing is written when falsy.
        pipeline: Preprocessing pipeline forwarded to ``predict_one``.
        dev_csv_path: Location of the dev-split CSV (adjust to your setup).
        magicbrush_dir: Root folder of the MagicBrush data (adjust to your
            setup).

    Returns:
        A DataFrame with the columns ``id``, ``turn`` and ``score`` (rounded
        to three decimals).

    Raises:
        ValueError: If ``config.DATA.NUM_FRAMES`` has no matching video folder.
    """
    video_subdirs = {2: "videos", 8: "videos_8_frames"}

    # The frame count is constant for the whole run: resolve it once and fail
    # fast instead of hitting an unbound `video_path` inside the loop.
    frames = config.DATA.NUM_FRAMES
    if frames not in video_subdirs:
        raise ValueError(
            f"Unsupported number of frames: {frames!r} "
            f"(expected one of {sorted(video_subdirs)})"
        )
    video_dir = f"{magicbrush_dir}/vifi_format/{video_subdirs[frames]}"

    dev = pd.read_csv(dev_csv_path)

    results = []
    for _, row in tqdm(dev.iterrows(), "Predicting...", total=len(dev)):
        current_id = int(row["img_id"])
        current_turn = int(row["turn_index"])
        instruction = row["instruction"].lower().replace(".", "")

        video_path = f"{video_dir}/{current_id}_{current_turn}.mp4"

        # A one-element list makes predict_one return a plain number.
        similarity = predict_one(video_path, [instruction], model, pipeline)

        results.append(
            {
                "id": current_id,
                "turn": current_turn,
                "score": round(similarity, 3),
            }
        )

    # Explicit columns keep the schema stable even for an empty dev set.
    vifi_scores = pd.DataFrame(results, columns=["id", "turn", "score"])
    if saving_path:
        vifi_scores.to_csv(saving_path, index=False)
    return vifi_scores

In [16]:
def compute_correlation(
    directory,
    save_dir,
    *,
    config_name="16_32_vifi_clip_all_shot.yaml",
    checkpoint_name="ckpt_epoch_10.pth",
):
    """Load a fine-tuned model from ``directory`` and score the dev split.

    Despite its name, this function computes no correlation itself; it
    returns the per-sample scores produced by ``predict_dev``.

    Args:
        directory: Folder containing the configuration file and checkpoint.
            A trailing slash is optional.
        save_dir: Path of the CSV file the scores are written to (forwarded
            to ``predict_dev`` as its saving path).
        config_name: File name of the YAML configuration inside ``directory``.
        checkpoint_name: File name of the checkpoint inside ``directory``.

    Returns:
        The DataFrame of scores returned by ``predict_dev``.
    """
    import os

    # os.path.join works with or without a trailing separator on `directory`.
    config_path = os.path.join(directory, config_name)
    pretrained_model_path = os.path.join(directory, checkpoint_name)

    args = parse_option(config_path, pretrained_model_path)
    config = get_config(args)
    logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")

    model = init_model(config, logger)
    pipeline = init_preprocessing_pipeline(config)

    return predict_dev(config, model, save_dir, pipeline)

## Predict at Scale
To perform predictions at scale for the entire validation split of MagicBrush, specify the folder containing the fine-tuned model checkpoint and its configuration file, as well as the path of the CSV file where the predictions should be saved.

In [14]:
# Root of the local project checkout; adjust to your environment.
base = "/home/jovyan/BA/Github/thesis-edit-evaluation/"

# First argument: folder holding the config file and checkpoint.
# Second argument: CSV file the resulting scores are written to.
df = compute_correlation(
    base + "ViFi-CLIP/output/crossvalidation/vitb16_2_humanedit_freeze_none/fold5/",
    base + "labeling/analysis/cv2/vitb16_2_humanedit.csv",
)

=> merge config from /home/jovyan/BA/Github/thesis-edit-evaluation/ViFi-CLIP-og/output/few_shot/vitb16_2_frames/humanedit/2.5k_train_data/16_32_vifi_clip_all_shot.yaml


Predicting...: 528it [00:08, 60.56it/s]


## Single Inference
The setup above is slightly modified for single inference: a folder containing the checkpoint and config file must still be specified, but a path for saving results is no longer needed.

In [17]:
def get_configs(
    directory,
    *,
    config_name="16_32_vifi_clip_all_shot.yaml",
    checkpoint_name="ckpt_epoch_10.pth",
):
    """Load the model and preprocessing pipeline for single-video inference.

    Args:
        directory: Folder containing the configuration file and checkpoint.
            A trailing slash is optional.
        config_name: File name of the YAML configuration inside ``directory``.
        checkpoint_name: File name of the checkpoint inside ``directory``.

    Returns:
        A ``(model, pipeline)`` tuple ready to be passed to ``predict_one``.
    """
    import os

    # os.path.join works with or without a trailing separator on `directory`.
    config_path = os.path.join(directory, config_name)
    pretrained_model_path = os.path.join(directory, checkpoint_name)

    args = parse_option(config_path, pretrained_model_path)
    config = get_config(args)
    logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")

    model = init_model(config, logger)
    pipeline = init_preprocessing_pipeline(config)
    return model, pipeline

In [19]:
# Root of the local project checkout; adjust to your environment.
base = "/home/jovyan/BA/Github/thesis-edit-evaluation/"

# Load the model and preprocessing pipeline once; both are reused for every
# subsequent predict_one call.
model, pipeline = get_configs(
    base + "ViFi-CLIP/output/crossvalidation/vitb16_2_humanedit_freeze_none/fold1/",
)

=> merge config from /home/jovyan/BA/Github/thesis-edit-evaluation/ViFi-CLIP/output/crossvalidation/vitb16_2_humanedit_freeze_none/fold1/16_32_vifi_clip_all_shot.yaml


In [25]:
# Score a single video against several candidate instructions at once.
video_path = "/home/jovyan/BA/Github/MagicBrush/vifi_format/videos/100626_1.mp4"
instruction = [
    "put a glass of soda on the table",
    "let the bears sit on a leather couch",
    "let's add a man in the kitchen",
    "let there be a cup of yogurt",
    "spill milk onto the floor",
]
# With more than one instruction, predict_one returns the raw logits tensor
# rather than a single number.
predict_one(video_path, instruction, model, pipeline)

tensor([[23.8594, 18.6250, 21.5156, 19.2500, 21.0469]], device='cuda:0',
       dtype=torch.float16)