In [None]:
import os
import torch
import clip
from PIL import Image
from tqdm import tqdm
import pandas as pd
from PIL import Image, ImageSequence
import re
import lpips
import torchvision.transforms as transforms

In [None]:
# CLIP SCORE
#
# For every generated GIF under ./videos/<model>/, score how well each frame
# matches the edit prompt selected by the "[key]" tag in the file name.
# A video's score is the mean cosine similarity between the CLIP image
# embedding of each frame and the CLIP text embedding of its prompt.

# Load CLIP
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# Prompts dictionary (keyed by the tag found between square brackets in the file name)
prompts = {
    "car": "a futuristic car is driving on the road",
    "cat": "a dog with a sombrero is sitting in front of a tree",
    "girl_dance": "a man is dancing",
    "motorbike": "a Spider-Man is driving a bicycle in the forest",
    "rabbit_jump": "a origami rabbit is jumping on the grass",
    "squirrel_carrot": "a rabbit with sunglasses is eating a carrot"
}

base_folder = "./videos"
results = []

# A text embedding depends only on the prompt, so encode each prompt once and
# reuse it for every model folder instead of re-encoding it per video.
text_feature_cache = {}

# Loop through all model folders (sorted: os.listdir order is arbitrary)
for model_folder in sorted(folder for folder in os.listdir(base_folder) if folder != "Original"):
    model_path = os.path.join(base_folder, model_folder)
    if not os.path.isdir(model_path):
        continue

    print(f"\nEvaluating model: {model_folder}")

    for video_name in sorted(os.listdir(model_path)):
        if not video_name.endswith(".gif"):
            continue

        match = re.search(r'\[(.*?)\]', video_name)
        if not match:
            print(f"Skipping {video_name}: no prompt tag found.")
            continue

        video_key = match.group(1)
        prompt = prompts.get(video_key)
        if not prompt:
            print(f"Skipping {video_name}: unknown prompt key '{video_key}'")
            continue

        # Tokenize and encode text (only the first time this prompt key is seen)
        text_features = text_feature_cache.get(video_key)
        if text_features is None:
            text = clip.tokenize([prompt]).to(device)
            with torch.no_grad():
                text_features = model.encode_text(text)
                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            text_feature_cache[video_key] = text_features

        similarities = []
        video_path = os.path.join(model_path, video_name)
        video_basename = os.path.splitext(video_name)[0]

        with Image.open(video_path) as gif:
            # Each frame is consumed immediately: the iterator re-seeks and
            # yields the same image object, so frames must not be stored as-is.
            for frame in tqdm(ImageSequence.Iterator(gif), desc=f"Processing {video_basename}"):
                frame_rgb = frame.convert("RGB")
                image_tensor = preprocess(frame_rgb).unsqueeze(0).to(device)

                with torch.no_grad():
                    image_features = model.encode_image(image_tensor)
                    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                    similarity = torch.cosine_similarity(image_features, text_features).item()
                    similarities.append(similarity)

        # Guard against a GIF that produced no frames (would divide by zero below)
        if not similarities:
            print(f"Skipping {video_name}: no frames could be read.")
            continue

        avg_similarity = sum(similarities) / len(similarities)
        results.append({
            "model": model_folder,
            "video": video_basename,
            "prompt_key": video_key,
            "clip_avg": avg_similarity
        })

# Save to DataFrame
df_clip = pd.DataFrame(results)
print(df_clip)

# Group by model
df_summary = df_clip.groupby("model")["clip_avg"].mean().sort_values(ascending=False)
print("\nAverage CLIP score per model:")
print(df_summary)


Evaluating model: Tune-A-Video


Processing [rabbit_jump]_rabbit_jump_origami: 8it [00:00, 73.48it/s]
Processing [motorbike]_motorbike_spiderman: 24it [00:00, 87.72it/s]
Processing [car]_car_futuristic: 8it [00:00, 69.26it/s]
Processing [squirrel_carrot]_squirrel_carrot_rabbit: 24it [00:00, 86.79it/s]
Processing [cat]_cat_dog: 24it [00:00, 88.23it/s]
Processing [girl_dance]_girl_dance_man: 24it [00:00, 81.56it/s]



Evaluating model: Video-P2P


Processing [rabbit_jump]_rabbit_jump_origami: 8it [00:00, 56.89it/s]
Processing [motorbike]_motorbike_spiderman: 8it [00:00, 49.92it/s]
Processing [car]_car_futuristic: 8it [00:00, 62.13it/s]
Processing [squirrel_carrot]_squirrel_carrot_rabbit: 8it [00:00, 88.12it/s]
Processing [cat]_cat_dog: 8it [00:00, 61.25it/s]
Processing [girl_dance]_girl_dance_man: 8it [00:00, 70.27it/s]



Evaluating model: Video-P2P-EI-Plus


Processing [rabbit_jump]_rabbit_jump_origami: 8it [00:00, 61.19it/s]
Processing [motorbike]_motorbike_spiderman: 8it [00:00, 69.82it/s]
Processing [car]_car_futuristic: 8it [00:00, 48.94it/s]
Processing [squirrel_carrot]_squirrel_carrot_rabbit: 8it [00:00, 87.12it/s]
Processing [cat]_cat_dog: 8it [00:00, 69.80it/s]
Processing [girl_dance]_girl_dance_man: 8it [00:00, 84.30it/s]



Evaluating model: Video-P2P-EI


Processing [rabbit_jump]_rabbit_jump_origami: 8it [00:00, 54.30it/s]
Processing [motorbike]_motorbike_spiderman: 8it [00:00, 51.72it/s]
Processing [car]_car_futuristic: 8it [00:00, 51.49it/s]
Processing [squirrel_carrot]_squirrel_carrot_rabbit: 8it [00:00, 85.44it/s]
Processing [cat]_cat_dog: 8it [00:00, 64.98it/s]
Processing [girl_dance]_girl_dance_man: 8it [00:00, 59.81it/s]

                model                                     video  \
0        Tune-A-Video         [rabbit_jump]_rabbit_jump_origami   
1        Tune-A-Video           [motorbike]_motorbike_spiderman   
2        Tune-A-Video                      [car]_car_futuristic   
3        Tune-A-Video  [squirrel_carrot]_squirrel_carrot_rabbit   
4        Tune-A-Video                             [cat]_cat_dog   
5        Tune-A-Video               [girl_dance]_girl_dance_man   
6           Video-P2P         [rabbit_jump]_rabbit_jump_origami   
7           Video-P2P           [motorbike]_motorbike_spiderman   
8           Video-P2P                      [car]_car_futuristic   
9           Video-P2P  [squirrel_carrot]_squirrel_carrot_rabbit   
10          Video-P2P                             [cat]_cat_dog   
11          Video-P2P               [girl_dance]_girl_dance_man   
12  Video-P2P-EI-Plus         [rabbit_jump]_rabbit_jump_origami   
13  Video-P2P-EI-Plus           [motorbike]_motorbike_spiderma




In [None]:
# LPIPS evaluation
#
# For every generated GIF under ./videos/<model>/, compute the LPIPS distance
# between each generated frame and the frame at the same index in the source
# video ./videos/Original/<key>.gif, then average those distances per video.

# Setup LPIPS model
loss_fn = lpips.LPIPS(net='alex')
device = "cuda" if torch.cuda.is_available() else "cpu"
loss_fn = loss_fn.to(device)

# Image preprocessing: resize both videos to a common size and rescale the
# ToTensor() output from [0, 1] to [-1, 1]
transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Define paths
base_dir = "./videos"
original_dir = os.path.join(base_dir, "Original")
model_dirs = ["Tune-A-Video", "Video-P2P", "Video-P2P-EI", "Video-P2P-EI-Plus"]

results = []


def _load_rgb_frames(gif_path):
    """Decode every frame of the GIF at ``gif_path`` into its own RGB image.

    ImageSequence.Iterator re-seeks and yields the *same* image object on each
    step, so collecting its items directly gives a list whose entries all show
    the last frame. Converting each frame while it is current produces an
    independent image per frame. The file is closed before returning.
    """
    with Image.open(gif_path) as gif:
        return [frame.convert("RGB") for frame in ImageSequence.Iterator(gif)]


# Process each model folder
# (loop variable is `model_name` so it does not overwrite a `model` object
# defined by another cell)
for model_name in model_dirs:
    input_dir = os.path.join(base_dir, model_name)
    if not os.path.isdir(input_dir):
        print(f"Model folder missing: {input_dir}")
        continue

    # Sorted because os.listdir returns entries in arbitrary order
    for video_file in sorted(os.listdir(input_dir)):
        if not video_file.endswith(".gif"):
            continue

        match = re.search(r'\[(.*?)\]', video_file)
        if not match:
            print(f"Skipping {video_file} (missing tag)")
            continue

        prompt_key = match.group(1)
        gen_path = os.path.join(input_dir, video_file)
        orig_path = os.path.join(original_dir, f"{prompt_key}.gif")

        if not os.path.isfile(orig_path):
            print(f"Original missing: {orig_path}")
            continue

        try:
            gen_frames = _load_rgb_frames(gen_path)
            orig_frames = _load_rgb_frames(orig_path)
        except Exception as e:
            print(f"Error opening GIFs for {video_file}: {e}")
            continue

        # zip() stops at the shorter sequence; make that truncation visible
        # and give tqdm the number of pairs that will actually be compared.
        n_pairs = min(len(gen_frames), len(orig_frames))
        if len(gen_frames) != len(orig_frames):
            print(
                f"Frame count mismatch for {video_file}: "
                f"generated={len(gen_frames)}, original={len(orig_frames)}; "
                f"comparing the first {n_pairs} frames"
            )

        distances = []
        for g_frame, o_frame in tqdm(zip(gen_frames, orig_frames), total=n_pairs, desc=f"{model_name} - {video_file}"):
            g = transform(g_frame).unsqueeze(0).to(device)
            o = transform(o_frame).unsqueeze(0).to(device)
            with torch.no_grad():
                dist = loss_fn(g, o).item()
            distances.append(dist)

        # None marks a video for which no frame pair could be compared
        avg_lpips = sum(distances) / len(distances) if distances else None
        results.append({
            "model": model_name,
            "video": video_file,
            "prompt_key": prompt_key,
            "lpips": avg_lpips
        })

# Build DataFrame
df_lpips = pd.DataFrame(results)
print("\nAverage LPIPS per model:")
print(df_lpips.groupby("model")["lpips"].mean())

df_lpips

Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /usr/local/lib/python3.10/dist-packages/lpips/weights/v0.1/alex.pth


  self.load_state_dict(torch.load(model_path, map_location='cpu'), strict=False)
Tune-A-Video - [rabbit_jump]_rabbit_jump_origami.gif: 100%|██████████| 8/8 [00:00<00:00, 136.61it/s]
Tune-A-Video - [motorbike]_motorbike_spiderman.gif:  33%|███▎      | 8/24 [00:00<00:00, 136.91it/s]
Tune-A-Video - [car]_car_futuristic.gif: 100%|██████████| 8/8 [00:00<00:00, 114.72it/s]
Tune-A-Video - [squirrel_carrot]_squirrel_carrot_rabbit.gif:  33%|███▎      | 8/24 [00:00<00:00, 119.94it/s]
Tune-A-Video - [cat]_cat_dog.gif:  33%|███▎      | 8/24 [00:00<00:00, 117.64it/s]
Tune-A-Video - [girl_dance]_girl_dance_man.gif:  33%|███▎      | 8/24 [00:00<00:00, 121.95it/s]
Video-P2P - [rabbit_jump]_rabbit_jump_origami.gif: 100%|██████████| 8/8 [00:00<00:00, 127.13it/s]
Video-P2P - [motorbike]_motorbike_spiderman.gif: 100%|██████████| 8/8 [00:00<00:00, 126.38it/s]
Video-P2P - [car]_car_futuristic.gif: 100%|██████████| 8/8 [00:00<00:00, 128.07it/s]
Video-P2P - [squirrel_carrot]_squirrel_carrot_rabbit.gif: 100%|█


📊 Average LPIPS per model:
model
Tune-A-Video         0.535085
Video-P2P            0.266016
Video-P2P-EI         0.249953
Video-P2P-EI-Plus    0.277495
Name: lpips, dtype: float64





Unnamed: 0,model,video,prompt_key,lpips
0,Tune-A-Video,[rabbit_jump]_rabbit_jump_origami.gif,rabbit_jump,0.524327
1,Tune-A-Video,[motorbike]_motorbike_spiderman.gif,motorbike,0.620513
2,Tune-A-Video,[car]_car_futuristic.gif,car,0.474106
3,Tune-A-Video,[squirrel_carrot]_squirrel_carrot_rabbit.gif,squirrel_carrot,0.474364
4,Tune-A-Video,[cat]_cat_dog.gif,cat,0.63342
5,Tune-A-Video,[girl_dance]_girl_dance_man.gif,girl_dance,0.483778
6,Video-P2P,[rabbit_jump]_rabbit_jump_origami.gif,rabbit_jump,0.185608
7,Video-P2P,[motorbike]_motorbike_spiderman.gif,motorbike,0.376606
8,Video-P2P,[car]_car_futuristic.gif,car,0.197736
9,Video-P2P,[squirrel_carrot]_squirrel_carrot_rabbit.gif,squirrel_carrot,0.236948
