### Load in packages and model

In [1]:
%cd ../../..

/home/nils/NILS/Master/DL2/DL2-ZeroVis


In [2]:
from src.fromage_inf.inf_utils import PromptParser
import pickle
import itertools
import torch
from collections import Counter
from PIL import Image
import torchvision.transforms as transforms
import nltk.translate.bleu_score as BLEU
import torch.nn.functional as F
from transformers import CLIPTextModel, AutoTokenizer

In [3]:
# Load in the parser.
parser = PromptParser("src/fromage_inf/fromage_model/")

# Load the relations dictionary to make the relations.
relations = pickle.load(open("src/code/relations_dict.pkl", "rb"))

Using facebook/opt-6.7b for the language model.
Freezing the LM.
Initializing embedding for the retrieval token [RET] (id = 50266).


# Image-to-Text Visual Arithmetics via Zero-Shot prompting

### By using the greedy sampling Image Captioning pipeline

In [4]:
def recall(generated, ground_truth):
    # Split the generated sentence into words.
    words = generated.split()
    
    # Count occurrences of each word in the inputs.
    words_counter = Counter(words)
    truth_counter = Counter(ground_truth)

    true_positives = 0

    # For each unique word in the sentence, get the minimum count in the inputs.
    for word in words_counter:
        if word in truth_counter:
            true_positives += min(words_counter[word], truth_counter[word])
    
    # Calculate the recall.
    recall = true_positives / sum(truth_counter.values())

    return recall

In [5]:
# Initialize the CLIP text encoder with the same clip model as used in FROMAGe.
clip_text = CLIPTextModel.from_pretrained('openai/clip-vit-large-patch14')
tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-large-patch14')

# Set smoothing function for BLEU.
chencherry = BLEU.SmoothingFunction()

total_scores = {}

# Loop all relations to retrieve all tuples that represent the relation.
for relation, values in relations.items():
    print(relation)
    print('=' * 120)

    amount_combi = 0
    recall5 = 0.
    bleu1 = 0.
    clip_s = 0.

    # Create all possible combinations of the relation tuples.
    combinations = itertools.combinations(values, 2)

    # Loop all combinations.
    for combo in combinations:
        tuple1, tuple2 = combo
         
        # Describe the task and prepare the prompt.
        prompt = ["Task description: Finish the analogy.", 
            [tuple1[0]], " is to ", [tuple1[1]], ", as", [tuple2[0]], " is to "]
        
        print('='*60)
        print('Prompt:')
        print("Task description: Finish the analogy.")
        print("[{}] is to [{}], as [{}] is to ".format(tuple1[0], tuple1[1], tuple2[0]))
        print("Expected output: [{}]".format(tuple2[1]))

        # Perform the greedy Image-text retrieval pipeline.
        print('=' * 30)
        # num_words is set to 5 as zerocap uses beam 5 for its experiments.
        model_outputs = parser.model.generate_for_images_and_texts(prompt, ret_scale_factor=0, num_words=5)

        print('Model generated outputs:')
        parser.display(model_outputs)

        # Create the ground truth, eg. 'leaders/xi_jingping' -> ['xi', 'jingping'].
        amount_combi += 1
        ground_truth = tuple1[1].split("/")[1].replace("_"," ").split()

        # Calculate Recall @ 5.
        recall5 += recall(model_outputs[0], ground_truth)

        # Calculate BLEU-1.
        bleu1 += BLEU.sentence_bleu([ground_truth], model_outputs[0].split(),  weights=(1.,0.), smoothing_function=chencherry.method1)

        # Calculate the CLIP-score described in the ZeroCap paper.
        # This is different from the well known CLIP-score metric.
        x1 = tokenizer(['Image of a {}'.format(ground_truth)], padding=False, return_tensors='pt')
        x2 = tokenizer(model_outputs[0], padding=False, return_tensors='pt')
        x1_tensor = clip_text(**x1).last_hidden_state.squeeze()
        x2_tensor = clip_text(**x2).last_hidden_state.squeeze()

        cos = F.normalize(x1_tensor) @ F.normalize(x2_tensor).T
        clip_s += torch.mean(cos.squeeze()).item()
        
        # Flip the ordering of the tuples and caption again.
        prompt = ["Task description: Finish the analogy.", 
            [tuple2[0]], " is to ", [tuple2[1]], ", as", [tuple1[0]], " is to "]

        print('='*45)
        print('Prompt:')
        print("Task description: Finish the analogy.")
        print("[{}] is to [{}], as [{}] is to ".format(tuple2[0], tuple2[1], tuple1[0]))
        print("Expected output: [{}]".format(tuple1[1]))

        print('=' * 30)
        model_outputs = parser.model.generate_for_images_and_texts(prompt, ret_scale_factor=0, num_words=5)

        print('Model generated outputs:')
        parser.display(model_outputs)

        amount_combi += 1
        ground_truth = tuple2[1].split("/")[1].replace("_"," ").split()

        recall5 += recall(model_outputs[0], ground_truth)

        bleu1 += BLEU.sentence_bleu([ground_truth], model_outputs[0].split(),  weights=(1.,0.), smoothing_function=chencherry.method1)

        x1 = tokenizer(['Image of a {}'.format(ground_truth)], padding=False, return_tensors='pt')
        x2 = tokenizer(model_outputs[0], padding=False, return_tensors='pt')
        x1_tensor = clip_text(**x1).last_hidden_state.squeeze()
        x2_tensor = clip_text(**x2).last_hidden_state.squeeze()

        cos = F.normalize(x1_tensor) @ F.normalize(x2_tensor).T
        clip_s += torch.mean(cos.squeeze()).item()

    # Finalize the relation scores and save them.
    relation_scores = {"CLIP-s":clip_s/amount_combi,
                       "Recall@5": recall5/amount_combi,
                       "BLEU-1": bleu1/amount_combi}
    total_scores[relation] = relation_scores

Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.12.self_attn.v_proj.weight', 'vision_model.encoder.layers.2.mlp.fc1.weight', 'vision_model.encoder.layers.3.self_attn.k_proj.weight', 'vision_model.encoder.layers.22.layer_norm2.bias', 'vision_model.encoder.layers.9.layer_norm1.weight', 'vision_model.encoder.layers.1.mlp.fc1.weight', 'vision_model.encoder.layers.9.mlp.fc1.bias', 'vision_model.encoder.layers.6.mlp.fc1.bias', 'vision_model.encoder.layers.8.mlp.fc1.weight', 'vision_model.encoder.layers.13.self_attn.v_proj.bias', 'vision_model.encoder.layers.2.self_attn.v_proj.weight', 'vision_model.encoder.layers.21.self_attn.out_proj.bias', 'vision_model.encoder.layers.11.self_attn.v_proj.bias', 'vision_model.encoder.layers.16.mlp.fc1.weight', 'vision_model.encoder.layers.17.self_attn.out_proj.bias', 'vision_model.encoder.layers.10.mlp.fc2.bias', 'vision_model.encoder.layers.4.self_attn.q_proj

CEOs -> companies
Prompt:
Task description: Finish the analogy.
[CEOs/mark_zuckerberg] is to [companies/facebook], as [CEOs/bill_gates] is to 
Expected output: [companies/microsoft]
Model generated outputs:
iphone app, the
Prompt:
Task description: Finish the analogy.
[CEOs/bill_gates] is to [companies/microsoft], as [CEOs/mark_zuckerberg] is to 
Expected output: [companies/facebook]
Model generated outputs:
___________?
Prompt:
Task description: Finish the analogy.
[CEOs/mark_zuckerberg] is to [companies/facebook], as [CEOs/elon_musk] is to 
Expected output: [companies/tesla]
Model generated outputs:
ive been a lot of
Prompt:
Task description: Finish the analogy.
[CEOs/elon_musk] is to [companies/tesla], as [CEOs/mark_zuckerberg] is to 
Expected output: [companies/facebook]
Model generated outputs:
ive been a long time
Prompt:
Task description: Finish the analogy.
[CEOs/mark_zuckerberg] is to [companies/facebook], as [CEOs/jeff_bezos] is to 
Expected output: [companies/amazon]
Model g

In [6]:
# Print the results of the metrics for each relation.
for relation, values in total_scores.items():
    print('='*30)
    print("Relationship: ")
    print(relation)
    print("Scores: \n")

    for key, val in values.items():
        print(key, ": ", val)

Relationship: 
CEOs -> companies
Scores: 

CLIP-s :  0.1623511239886284
Recall@5 :  0.05
BLEU-1 :  0.0125
Relationship: 
flags -> capital
Scores: 

CLIP-s :  0.14926920430452534
Recall@5 :  0.0
BLEU-1 :  0.0
Relationship: 
food -> countries
Scores: 

CLIP-s :  0.16675872107346854
Recall@5 :  0.0
BLEU-1 :  0.0
Relationship: 
building -> countries
Scores: 

CLIP-s :  0.14957103558949061
Recall@5 :  0.0
BLEU-1 :  0.0
Relationship: 
flags -> leaders
Scores: 

CLIP-s :  0.14836884654230542
Recall@5 :  0.0
BLEU-1 :  0.0
