## Code for performing inference with ViFi-CLIP on custom videos

### Please set the corresponding values in the cell below. Afterwards, just run the cells for inference with ViFi-CLIP model

In [2]:
### Set values here ###
config = "output/few_shot_vitb32_32_8/16_32_vifi_clip_all_shot.yaml"
#'configs/few_shot/finetuning_few_shot/magicbrush/16_32_vifi_clip_all_shot.yaml'
output_folder_name = "exp"
pretrained_model_path = "output/few_shot_vitb32_32_8/ckpt_epoch_49.pth"
#"/home/jovyan/BA/Github/ViFi-CLIP/output/ckpt_epoch_49.pth"
# List the action names for which ViFi-CLIP will perform action recognition
class_names = ['add a dog', 'make the bag red', 'turn the zebra in a horse']
# Load your video example:
video_path = '/home/jovyan/BA/Github/ViFi-CLIP/example_video.mp4'

### Import libraries 

In [22]:
!pip install --upgrade pip

[0m

In [31]:
!pip install torch torchvision -q
!pip install yacs -q
!pip install ftfy -q
!pip install regex -q
!pip install mmcv -q
!pip install timm -q

[0m

In [24]:
!pip install --upgrade mmengine -q
#!pip install mmcv --upgrade -q

[0m

In [25]:
!pip install mmcv-full -q

[0m

In [3]:
import torch
import torch.nn as nn
from utils.config import get_config
from utils.logger import create_logger
import time
import numpy as np
from utils.config import get_config
from trainers import vificlip
from datasets.pipeline import *

### Setting up configuration, no need to change anything.

In [4]:
# Step 1:
# Configuration class 
class parse_option():
    def __init__(self):
        self.config = config
        self.output =  output_folder_name   # Name of output folder to store logs and save weights
        self.resume = pretrained_model_path
        # No need to change below args.
        self.only_test = True
        self.opts = None
        self.batch_size = None
        self.pretrained = None
        self.accumulation_steps = None
        self.local_rank = 0
args = parse_option()
config = get_config(args)
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
logger.info(f"working dir: {config.OUTPUT}")

=> merge config from output/few_shot_vitb32_32_8/16_32_vifi_clip_all_shot.yaml
[32m[2024-12-06 10:50:11 ViT-B/32][0m[33m(1883247791.py 18)[0m: INFO working dir: exp


### Loading ViFi-CLIP and its pretrained weights

In [5]:
# Step 2:
# Create the ViFi-CLIP models and load pretrained weights
model = vificlip.returnCLIP(config,
                            logger=logger,
                            class_names=class_names,)
model = model.float().cuda()  # changing to cuda here


[32m[2024-12-06 10:50:12 ViT-B/32][0m[33m(vificlip.py 219)[0m: INFO Loading CLIP (backbone: ViT-B/32)
[32m[2024-12-06 10:50:13 ViT-B/32][0m[33m(vificlip.py 222)[0m: INFO Building ViFi-CLIP CLIP
[32m[2024-12-06 10:50:13 ViT-B/32][0m[33m(vificlip.py 239)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2024-12-06 10:50:13 ViT-B/32][0m[33m(vificlip.py 263)[0m: INFO Total learnable items: 301


In [6]:
logger.info(f"==============> Resuming form {config.MODEL.RESUME}....................")
checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
load_state_dict = checkpoint['model']
# now remove the unwanted keys:
if "module.prompt_learner.token_prefix" in load_state_dict:
    del load_state_dict["module.prompt_learner.token_prefix"]

if "module.prompt_learner.token_suffix" in load_state_dict:
    del load_state_dict["module.prompt_learner.token_suffix"]

if "module.prompt_learner.complete_text_embeddings" in load_state_dict:
    del load_state_dict["module.prompt_learner.complete_text_embeddings"]
# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in load_state_dict.items():
    name = k[7:] # remove `module.`
    new_state_dict[name] = v



  checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')


In [7]:
# load params
msg = model.load_state_dict(new_state_dict, strict=False)

### Preprocessing input video 

In [8]:
#from torchvision.transforms import Compose
#!pip install decord==0.6.0
from mmcv import Compose

In [1]:
# Step 3: 
# Preprocessing for video
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
scale_resize = int(256 / 224 * config.DATA.INPUT_SIZE)
val_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, scale_resize)),
    dict(type='CenterCrop', crop_size=config.DATA.INPUT_SIZE),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
if config.TEST.NUM_CROP == 3:
    val_pipeline[3] = dict(type='Resize', scale=(-1, config.DATA.INPUT_SIZE))
    val_pipeline[4] = dict(type='ThreeCrop', crop_size=config.DATA.INPUT_SIZE)
if config.TEST.NUM_CLIP > 1:
    val_pipeline[1] = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, multiview=config.TEST.NUM_CLIP)
pipeline = Compose(val_pipeline)

NameError: name 'config' is not defined

In [10]:
dict_file = {'filename': video_path, 'tar': False, 'modality': 'RGB', 'start_index': 0}

### ViFi-CLIP inference with given video

In [None]:
!pip install decord==0.6.0

In [None]:
video = pipeline(dict_file)
video_tensor = video['imgs'].unsqueeze(0).cuda().float()
# Inference through ViFi-CLIP
with torch.no_grad():
    with torch.cuda.amp.autocast():
        logits = model(video_tensor)
pred_index = logits.argmax(-1)

In [None]:
print(f'logits: {logits}')
print(f'predicted action category is : {class_names[pred_index]}')

In [10]:
# Function for alpha blending
def alpha_blend(frame1, frame2, alpha):
    return (1 - alpha) * frame1 + alpha * frame2

# Function to generate 8 frames from 2 input frames
def generate_8_frames_from_video(video_imgs):
    # Extract first and last frame
    first_frame = np.float32(video_imgs[0])
    last_frame = np.float32(video_imgs[1])
    
    # Generate 6 intermediate frames using alpha blending
    frames = [first_frame]  # First frame
    for i in range(1, 7):
        alpha = i / 8.0
        blended_frame = alpha_blend(first_frame, last_frame, alpha)
        frames.append(blended_frame)
    frames.append(last_frame)  # Last frame
    
    # Convert back to uint8
    frames = [np.uint8(frame) for frame in frames]
    return frames

## Inference & Correlation

In [5]:
"""
!pip install torch==1.11.0 -q
!pip install torchvision==0.12.0 -q
!pip install yacs ftfy timm regex -q
#!pip install mmcv -q
!pip install mmcv-full -q
!pip install decord
"""
#!pip install mmengine -q

'\n!pip install torch==1.11.0 -q\n!pip install torchvision==0.12.0 -q\n!pip install yacs ftfy timm regex -q\n#!pip install mmcv -q\n!pip install mmcv-full -q\n!pip install decord\n'

In [1]:
import pandas as pd
import json
import re
import cv2
from scipy import stats
import torch
import torch.nn as nn
from utils.config import get_config
from utils.logger import create_logger
import time
import numpy as np
from trainers import vificlip
from datasets.pipeline import *

  warn(f"Failed to load image Python extension: {e}")
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
#from mmcv import Compose

In [14]:
#!export CUDA_VISIBLE_DEVICES=1

In [18]:
config = "output/base2novel/humanedit/vitb32_2_80_20/16_32_vifi_clip_all_shot.yaml"
output_folder_name = "exp"
pretrained_model_path = "output/base2novel/humanedit/vitb32_2_80_20/ckpt_epoch_10.pth"


In [25]:
class parse_option():
    def __init__(self):
        self.config = config
        self.output =  output_folder_name 
        self.resume = pretrained_model_path
        self.only_test = True
        self.opts = None
        self.batch_size = None
        self.pretrained = None
        self.accumulation_steps = None
        self.local_rank = 0
args = parse_option()
config = get_config(args)

logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")

=> merge config from output/base2novel/humanedit/vitb32_2_80_20/16_32_vifi_clip_all_shot.yaml


In [26]:
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

scale_resize = int(256 / 224 * config.DATA.INPUT_SIZE)

val_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, scale_resize)),
    dict(type='CenterCrop', crop_size=config.DATA.INPUT_SIZE),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]

if config.TEST.NUM_CROP == 3:
    val_pipeline[3] = dict(type='Resize', scale=(-1, config.DATA.INPUT_SIZE))
    val_pipeline[4] = dict(type='ThreeCrop', crop_size=config.DATA.INPUT_SIZE)
    
if config.TEST.NUM_CLIP > 1:
    val_pipeline[1] = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, multiview=config.TEST.NUM_CLIP)

pipeline = Compose(val_pipeline)

In [27]:
#classes = pd.read_csv("/home/jovyan/BA/Github/ViFi-CLIP/labels/magicbrush_labels.csv")

In [28]:
def predict(turn, id, instruction, frames):
    if frames==2:
        video_path = f"/home/jovyan/BA/Github/ViFi-CLIP/datasets_splits/magicbrush_splits/videos/{id}_{turn}.mp4"
    elif frames==8:
        video_path = f"/home/jovyan/BA/Github/ViFi-CLIP/datasets_splits/magicbrush_splits/videos_8_frames/{id}_{turn}.mp4"

    model = vificlip.returnCLIP(config,
                            logger=logger,
                            class_names=[instruction],)

    model = model.float().cuda()  # changing to cuda here
    
    checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
    load_state_dict = checkpoint['model']

    if "module.prompt_learner.token_prefix" in load_state_dict:
        del load_state_dict["module.prompt_learner.token_prefix"]

    if "module.prompt_learner.token_suffix" in load_state_dict:
        del load_state_dict["module.prompt_learner.token_suffix"]

    if "module.prompt_learner.complete_text_embeddings" in load_state_dict:
        del load_state_dict["module.prompt_learner.complete_text_embeddings"]

    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in load_state_dict.items():
        name = k[7:] # remove `module.`
        new_state_dict[name] = v

    msg = model.load_state_dict(new_state_dict, strict=False)

    
    dict_file = {'filename': video_path, 'tar': False, 'modality': 'RGB', 'start_index': 0}
    
    video = pipeline(dict_file)
    video_tensor = video['imgs'].unsqueeze(0).cuda().float()

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            #logits = model(video_tensor, label_id)
            logits = model(video_tensor)

    return logits[0].item()
    """
    image_features, text_features, logit_scale = logits
    similarity = image_features.view(1, -1).softmax(dim=-1)
    return similarity
    """
    

In [29]:
def get_correlation_df_with_columns(df1, df2):
    merged_df = pd.merge(df1, df2, on=['turn', 'id'], suffixes=('_df1', '_df2'))

    correlation_results = []

    for df1_col in df1.columns:
        if df1_col in ['turn', 'id']:
            continue 

        for df2_col in df2.columns:
            if df2_col in ['turn', 'id']:
                continue 
            
            spearman_corr, spearman_p_value = stats.spearmanr(merged_df[df1_col], merged_df[df2_col])
            pearson_corr, pearson_p_value = stats.pearsonr(merged_df[df1_col], merged_df[df2_col])

            correlation_results.append({
                'df1': df1_col,
                'df2': df2_col,
                'spearman_corr': spearman_corr, #round(spearman_corr, 3),
                'spearman_p_value': spearman_p_value, #round(spearman_p_value, 5),
                'pearson_corr': pearson_corr, #round(pearson_corr, 3),
                'pearson_p_value': pearson_p_value #round(pearson_p_value, 5)
            })

    correlation_results_df = pd.DataFrame(correlation_results)
    return correlation_results_df

In [30]:
def get_correlation(rater, name, own, frames):
    if own:
        samples = pd.read_csv("/home/jovyan/BA/Github/open_clip/src/open_clip_inference/human_scores.csv", sep=";")
    else:
        samples = pd.read_csv(f"/home/jovyan/BA/Github/open_clip/src/open_clip_inference/rater{rater}.csv", sep=",")
    pattern = r'(\d+)-output(\d+)'
    with open('/home/jovyan/BA/Github/open_clip/src/open_clip_inference/edit_turns.json') as f:
        turns = json.load(f)

    dev = pd.read_csv("/home/jovyan/BA/Github/MagicBrush/dev_data.csv", sep=",")

    results = []
    for index, row in samples.iterrows():
        id = int(row["id"])
        turn = int(row["turn"])

        for entry in turns:
            output = entry["output"]
            match = re.search(pattern, output)

            if match:
                found_id = int(match.group(1)) # get id of sample
                found_turn = int(match.group(2)) # get turn of sample

                if int(found_id) == id and int(found_turn) == turn:
                    instruction = entry["instruction"].lower()

                    similarity = predict(found_turn, found_id, instruction, frames)
                    #print(similarity)

                    row = {
                        "id": id,
                        "turn": turn,
                        "vificlip_score": similarity #round(similarity, 3),
                    }
                    results.append(row)

    clip_scores = pd.DataFrame(results)
    #clip_scores.to_csv("cor/values/of_vitb16_2_frames_11_epochs_contr_loss_precise.csv", index=False) # save CLIP's predictions

    #clip_scores = clip_scores.drop_duplicates()
    correlation_df = get_correlation_df_with_columns(samples, clip_scores)
    correlation_df.to_csv(f"correlation/base2novel/humanedit/{name}.csv", sep=",")

In [31]:
get_correlation(rater=1, name="vitb32_2_10_epochs_80_20", own=True, frames=2)

[32m[2025-01-05 13:48:57 ViT-B/32][0m[33m(vificlip.py 217)[0m: INFO Loading CLIP (backbone: ViT-B/32)
[32m[2025-01-05 13:48:59 ViT-B/32][0m[33m(vificlip.py 220)[0m: INFO Building ViFi-CLIP CLIP
[32m[2025-01-05 13:48:59 ViT-B/32][0m[33m(vificlip.py 237)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2025-01-05 13:48:59 ViT-B/32][0m[33m(vificlip.py 261)[0m: INFO Total learnable items: 301


  checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')


[32m[2025-01-05 13:49:00 ViT-B/32][0m[33m(vificlip.py 217)[0m: INFO Loading CLIP (backbone: ViT-B/32)


  with torch.cuda.amp.autocast():


[32m[2025-01-05 13:49:01 ViT-B/32][0m[33m(vificlip.py 220)[0m: INFO Building ViFi-CLIP CLIP
[32m[2025-01-05 13:49:01 ViT-B/32][0m[33m(vificlip.py 237)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2025-01-05 13:49:01 ViT-B/32][0m[33m(vificlip.py 261)[0m: INFO Total learnable items: 301
[32m[2025-01-05 13:49:02 ViT-B/32][0m[33m(vificlip.py 217)[0m: INFO Loading CLIP (backbone: ViT-B/32)
[32m[2025-01-05 13:49:03 ViT-B/32][0m[33m(vificlip.py 220)[0m: INFO Building ViFi-CLIP CLIP
[32m[2025-01-05 13:49:03 ViT-B/32][0m[33m(vificlip.py 237)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2025-01-05 13:49:03 ViT-B/32][0m[33m(vificlip.py 261)[0m: INFO Total learnable items: 301
[32m[2025-01-05 13:49:04 ViT-B/32][0m[33m(vificlip.py 217)[0m: INFO Loading CLIP (backbone: ViT-B/32)
[32m[2025-01-05 13:49:06 ViT-B/32][0m[33m(vificlip.py 220)[0m: INFO Building ViFi-CLIP CLIP
[32m[2025-01-05 13:49:06 ViT-B/32][0m[33m(vificlip.py 2

In [38]:
import torch
torch.cuda.device_count()

1

In [4]:
print(1)

1


In [3]:
with open("output/few_shot_vitb32_32_8/ckpt_epoch_49.pth", 'rb') as f:
    checkpoint = torch.load(f, map_location='cpu')

  checkpoint = torch.load(f, map_location='cpu')


In [4]:
checkpoint

{'model': OrderedDict([('logit_scale', tensor(4.6042)),
              ('prompt_learner.complete_text_embeddings',
               tensor([[[-1.6737e-03,  7.3075e-05, -4.2000e-03,  ..., -3.4103e-03,
                         -3.9291e-03, -5.5313e-05],
                        [-7.1259e-03, -8.8959e-03, -7.0877e-03,  ..., -9.4748e-04,
                          8.7585e-03, -9.7809e-03],
                        [ 1.8005e-02,  1.5732e-02, -1.3107e-02,  ..., -1.3054e-02,
                          2.0889e-02, -1.8280e-02],
                        ...,
                        [-3.9062e-03, -6.3248e-03,  7.3509e-03,  ..., -1.0658e-02,
                         -2.2766e-02, -1.0910e-02],
                        [-3.9062e-03, -6.3248e-03,  7.3509e-03,  ..., -1.0658e-02,
                         -2.2766e-02, -1.0910e-02],
                        [-3.9062e-03, -6.3248e-03,  7.3509e-03,  ..., -1.0658e-02,
                         -2.2766e-02, -1.0910e-02]],
               
                       [[-1.67