## Code for performing inference with ViFi-CLIP on custom videos

In [None]:
# install in environment
"""
!pip install torch==1.11.0 -q
!pip install torchvision==0.12.0 -q
!pip install yacs ftfy timm regex -q
!pip install mmcv-full -q
!pip install decord
"""
# depending on possible errors
#!pip install mmcv -q
#!pip install mmengine -q

In [8]:
import pandas as pd
import json
import re
import cv2
import torch
import torch.nn as nn
import time
import numpy as np
import warnings, logging
from tqdm import tqdm

from scipy import stats
from utils.config import get_config
from utils.logger import create_logger
from trainers import vificlip
from datasets.pipeline import *
from collections import OrderedDict
#from mmcv import Compose

In [9]:
# suppress warnings
warnings.simplefilter("ignore", category=FutureWarning)
logging.disable(logging.CRITICAL)

In [10]:
# set up config and logger
def setup_config(config):
    class parse_option():
        def __init__(self):
            self.config = config
            self.output =  "exp" 
            self.resume = pretrained_model_path
            self.only_test = True
            self.opts = None
            self.batch_size = None
            self.pretrained = None
            self.accumulation_steps = None
            self.local_rank = 0

    args = parse_option()
    config = get_config(args)
    logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
    
    return config, logger

In [11]:
# preprocess pipeline for the video frames
def get_pipeline(config):
    img_norm_cfg = dict(
        mean=[123.675, 116.28, 103.53], 
        std=[58.395, 57.12, 57.375], 
        to_bgr=False
    )

    scale_resize = int(256 / 224 * config.DATA.INPUT_SIZE)

    val_pipeline = [
        dict(type='DecordInit'),
        dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, test_mode=True),
        dict(type='DecordDecode'),
        dict(type='Resize', scale=(-1, scale_resize)),
        dict(type='CenterCrop', crop_size=config.DATA.INPUT_SIZE),
        dict(type='Normalize', **img_norm_cfg),
        dict(type='FormatShape', input_format='NCHW'),
        dict(type='Collect', keys=['imgs'], meta_keys=[]),
        dict(type='ToTensor', keys=['imgs'])
    ]

    if config.TEST.NUM_CROP == 3:
        val_pipeline[3] = dict(type='Resize', scale=(-1, config.DATA.INPUT_SIZE))
        val_pipeline[4] = dict(type='ThreeCrop', crop_size=config.DATA.INPUT_SIZE)

    if config.TEST.NUM_CLIP > 1:
        val_pipeline[1] = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, multiview=config.TEST.NUM_CLIP)

    pipeline = Compose(val_pipeline)
    
    return pipeline

In [12]:
# predict similarity score for video + instruction
def predict(video_path, instruction):
    model = vificlip.returnCLIP(
        config,
        logger=logger,
        class_names=instruction,
    )

    model = model.float().cuda()

    dict_file = {'filename': video_path, 
                 'tar': False, 
                 'modality': 'RGB', 
                 'start_index': 0}
    
    video = pipeline(dict_file)
    video_tensor = video['imgs'].unsqueeze(0).cuda().float()

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            logits = model(video_tensor)

    if len(instruction)>1: # for multiple instrutions given 
        return logits[0].tolist()
    else: # single instruction
        return logits[0].item()


In [13]:
# get correlation (spearman, person) scores for columns of two datafranes
def get_correlation_df(df1, df2):
    merged_df = pd.merge(df1, df2, on=['turn', 'id'], suffixes=('_df1', '_df2'))

    correlation_results = []

    for df1_col in df1.columns:
        if df1_col in ['turn', 'id']:
            continue 

        for df2_col in df2.columns:
            if df2_col in ['turn', 'id']:
                continue 
            
            spearman_corr, spearman_p_value = stats.spearmanr(merged_df[df1_col], merged_df[df2_col])
            pearson_corr, pearson_p_value = stats.pearsonr(merged_df[df1_col], merged_df[df2_col])

            correlation_results.append({
                'df1': df1_col,
                'df2': df2_col,
                'spearman_corr': spearman_corr, 
                'spearman_p_value': spearman_p_value,
                'pearson_corr': pearson_corr, 
                'pearson_p_value': pearson_p_value
            })

    correlation_results_df = pd.DataFrame(correlation_results)
    return correlation_results_df

In [14]:
# read required data (huamn judgement, edit turns, data), analyse and save in df
def compute_correlation(save_dir, save_pred_scores=False, save_pred_path=None):
    if save_pred_scores:
        assert save_pred_path
        
    samples = pd.read_csv("/home/jovyan/BA/Github/thesis-edit-evaluation/open_clip/src/open_clip_inference/human_scores.csv", sep=";")
    pattern = r'(\d+)-output(\d+)'
    
    with open('/home/jovyan/BA/Github/thesis-edit-evaluation/open_clip/src/open_clip_inference/edit_turns.json') as f:
        turns = json.load(f)

    dev = pd.read_csv("/home/jovyan/BA/Github/MagicBrush/dev_data.csv", sep=",")

    results = []
    for index, row in tqdm(samples.iterrows(), "Predicting..."):
        current_id = int(row["id"])
        current_turn = int(row["turn"])

        for entry in turns:
            output = entry["output"]
            match = re.search(pattern, output)

            if match:
                found_id = int(match.group(1)) # get id of sample
                found_turn = int(match.group(2)) # get turn of sample

                if int(found_id) == current_id and int(found_turn) == current_turn:
                    instruction = entry["instruction"].lower()
                    
                    frames = config.DATA.NUM_FRAMES
                    magicbrush_dir = "/home/jovyan/BA/Github/MagicBrush"
                    
                    if frames==2:
                        video_path = f"{magicbrush_dir}/vifi_format/videos/{current_id}_{current_turn}.mp4"
                    elif frames==8:
                        video_path = f"{magicbrush_dir}/videos_8_frames/{current_id}_{current_turn}.mp4"

                    similarity = predict(video_path, instruction)

                    row = {
                        "id": current_id,
                        "turn": current_turn,
                        "vificlip_score": similarity,
                    }
                    results.append(row)

    vifi_scores = pd.DataFrame(results)
    if save_pred_scores:
        vifi_scores.to_csv(save_pred_path, index=False)

    #vifi_scores = vifi_scores.drop_duplicates()
    correlation_df = get_correlation_df(samples, vifi_scores)
    correlation_df.to_csv(f"{save_dir}", sep=",")

In [15]:
config = "output/base2novel/humanedit/vitb32_2_80_20/16_32_vifi_clip_all_shot.yaml"
pretrained_model_path = "output/base2novel/humanedit/vitb32_2_80_20/ckpt_epoch_10.pth"

config, logger = setup_config(config)
pipeline = get_pipeline(config)

=> merge config from output/base2novel/humanedit/vitb32_2_80_20/16_32_vifi_clip_all_shot.yaml


In [16]:
# option 1: perform single inference

video_path = "/home/jovyan/BA/Github/MagicBrush/vifi_format/videos/100081_1.mp4"
instruction = ["remove the dog"]
similarity = predict(video_path, instruction)
print(f"Similarity score: {similarity}")


Similarity score: 20.421875


In [17]:
# option 2: compute correlation between human scores and vifi-clip's predictions
compute_correlation(save_dir="correlation/base2novel/humanedit/test.csv")


Predicting...: 77it [02:14,  1.74s/it]


AttributeError: 'bool' object has no attribute 'all'