# Import

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
from PIL import Image

import clip
import torch

# Define variables

In [2]:
model_names = ["Vilt", "Blip_large", "GiT_base", "GiT_large"] # skip , "Blip_base"

image_names = ["Gaussian_5_2", "Gaussian_5_3", "Gaussian_5_4", "Gaussian_5_5",
               "white_house_brown_cat", "white_house_brown_cow", "white_house_brown_dog", "white_house_brown_horse",
               "white_house_green_cat", "white_house_green_unknown",
               "white_house_pink_cow", "white_house_pink_unknown",
               "white_house_purple_dog", "white_house_purple_horse",
               "white_house_red_duck", "white_house_red_unknown",
               "white_house_yellow_unknown"]

# image_difficulty = {"Gaussian_5_2":1, "Gaussian_5_3":1, "Gaussian_5_4":1, "Gaussian_5_5":1,
#                     "white_house_brown_cat":0, "white_house_brown_cow":0, "white_house_brown_dog":0, "white_house_brown_horse":0,
#                     "white_house_green_cat":0.5, "white_house_green_unknown":0.5,
#                     "white_house_pink_cow":0.5, "white_house_pink_unknown":0.5,
#                     "white_house_purple_dog":0.5, "white_house_purple_horse":0.5,
#                     "white_house_red_duck":0.5, "white_house_red_unknown":0.5,
#                     "white_house_yellow_unknown":0.5}
# image_names = list(image_difficulty.keys())

model_type = "VQA"

criteria = ["image_type", "used_language", "specificity", "question_length", "complexity", "image_relatedness"]
major_criteria = ["image_difficulty", "used_language", "image_relatedness"]
fixed_cols = ["question", "answer_gt"] + criteria

In [3]:
fixed_cols

['question',
 'answer_gt',
 'image_type',
 'used_language',
 'specificity',
 'question_length',
 'complexity',
 'image_relatedness']

# Define functions

## General

In [4]:
def T_1(logits):
  N = len(logits)
  x_max, x_min = np.max(logits), np.min(logits)
  return (x_max-x_min)/np.log(N)

def T_2(logits_list, epsilon):
  t_min_list = list()
  for logits in logits_list:
    t_min = list()
    for logit in logits:
      x_max, x_min = np.max(logit), np.min(logit)
      t_temp = (x_max-x_min)/np.log(1+epsilon)
      t_min.append(t_temp)
    t_min_list.append(np.min(t_min)) 
  return np.min(t_min_list)

def softmax_stable(x, T=1):
  return (np.exp((x - np.max(x))/T) / np.exp((x - np.max(x))/T).sum())

def D_to_T(D=0, D_min=0, D_max=1, T_min=1, T_max=1000):
  return ((D-D_min)*(T_max-T_min)/(D_max-D_min))+T_min

def Diff_score(question_scores=dict(), image_scores=dict(), relatedness_scores=dict(), major=list(), type_=1):
  D_all, D_i, D_t, D_r = 0, 0, 0, 0
  Ds = list()

  D_i = criteria_score(image_scores, major)
  Ds.append(D_i)
  if question_scores and relatedness_scores:
    D_t = criteria_score(question_scores, major)
    D_r = criteria_score(relatedness_scores, major)
    Ds.extend([D_t, D_r])
  if (type_==2) and (np.max(Ds)==1):
    D_all = 1
  elif type_==1:
    D_all = np.mean(Ds)
  return D_all

def criteria_score(criteria_dict=dict(), major=list()):
  sum = 0
  for item, score in criteria_dict.items():
    if (item in major) and (score==1): return 1
    sum += score
  return sum/(len(criteria_dict))

## Difficulty automatic calculation

In [5]:
def encoded_length(image):
    return len(cv2.imencode('.png',image, [int(cv2.IMWRITE_PNG_COMPRESSION),9])[1])

def D_text(question):
    pass

def qst_length(question):
    return len(question.split())

def d_spec(question, nouns):
    c = 2 # max number of nouns/adjectives in our case
    for n in nouns:
        if n in question: c-=1
    return c

import en_core_web_sm
en_nlp = en_core_web_sm.load()
def walk_tree(node, depth):
    if node.n_lefts + node.n_rights > 0:
        return max(walk_tree(child, depth + 1) for child in node.children)
    else:
        return depth

def map_range(value, in_min, in_max, out_min, out_max):
    if (in_min==0) and (in_max==0): return value
    out_range = out_max - out_min
    in_range = in_max - in_min
    in_val = value - in_min
    val=(float(in_val)/in_range)*out_range
    out_val = out_min + val
    return out_val

def D_relatedness(image, question):
    pass

In [6]:
# Clip similarity functions
clip_model, preprocess = clip.load("ViT-B/32")

def measure_similarity(item_1_features, item_2_features):
    # item_1_features = extract_features(item_1)
    # item_2_features = extract_features(item_2)
    item_1_features /= item_1_features.norm(dim=-1, keepdim=True)
    item_2_features /= item_2_features.norm(dim=-1, keepdim=True)
    similarity = item_1_features.cpu().numpy() @ item_2_features.cpu().numpy().T
    return 1-similarity[0][0] # less score for more similar

def extract_features(item):
    if isinstance(item, str): return extract_features_text(item)
    return extract_features_image(item)
 
def extract_features_image(image):
    # image = (image*255).astype(np.uint8)    
    image = Image.fromarray(image)
    image = preprocess(image)
    image = torch.tensor(np.expand_dims(image, axis=0))
    with torch.no_grad():
        image_features = clip_model.encode_image(image).float()
    return image_features

def extract_features_text(text):
    text = clip.tokenize([text])
    with torch.no_grad():
        text_features = clip_model.encode_text(text).float()
    return text_features

# Create logits file

In [7]:
# logits_dict = dict()

# for model_name in tqdm(model_names):
#   model_dict = dict()
#   for image_name in image_names:
#     results_path = f"./{model_name}/"
#     file_name = f"{model_type}_{model_name}_{image_name}"
#     results_path = results_path + file_name + ".npy"
#     if not os.path.isfile(results_path): f"!!! Warning: No file !!!\n{file_name} file is not available!"
#     l = np.load(results_path, allow_pickle=True)
#     model_dict[image_name] = l.copy()
#   logits_dict[model_name] = model_dict.copy()

In [8]:
# np_logits_file_path = f"./{model_type}_test_logits.npy"
# np.save(np_logits_file_path, logits_dict)

# Read logits file

In [9]:
np_logits_file_path = f"./{model_type}_test_logits.npy"
logits_dict = np.load(np_logits_file_path, allow_pickle=True).item()
logits_dict.keys()

dict_keys(['Vilt', 'Blip_base', 'Blip_large', 'GiT_base', 'GiT_large'])

In [10]:
N_dict = dict()
for m in model_names:
    N_dict[m] = len(logits_dict[m]["Gaussian_5_2"][0][0])
del m

print(N_dict)

{'Vilt': 3129, 'Blip_large': 30524, 'GiT_base': 30522, 'GiT_large': 30522}


# Create results df and file

## Results df

In [11]:
# model_type = "VQA"

# all_results_df = pd.DataFrame()

# for image_name in tqdm(image_names):
#   image_df = pd.DataFrame()
#   new_image = True
  
#   for model_name in model_names:
#     results_path = f".//{model_name}/"
#     file_name = f"{model_type}_{model_name}_{image_name}"
#     results_path = results_path + file_name + ".xlsx"
#     if not os.path.isfile(results_path): f"!!! Warning: No file !!!\n{file_name} sheet is not available!"
#     usecols="B:M" if new_image else "J:M"
#     df = pd.read_excel(results_path, sheet_name='answers', usecols=usecols)
#     for c in df.columns:
#       # if ("question" in c) or ("gt" in c): new_col_name = c
#       if c in set(fixed_cols): new_col_name = c
#       else: new_col_name = f"{model_name}_{c}"
#       image_df[new_col_name] = df[c]
#     new_image = False
#     # columns = [f"{model_name}_{c}" for c in df.columns]
#     # df.columns = columns.copy()
#     # image_df.concat([image_df, df], axis=1)
#   image_name_col = [image_name for _ in range(len(image_df))]
#   image_df.insert(loc=0, column='image_name', value=image_name_col)
#   all_results_df = pd.concat([all_results_df, image_df], ignore_index = True)

# all_results_df.to_excel(f"./{model_type}_test_answers.xlsx", sheet_name=f"{model_type}_answers")

In [12]:
all_results_df = pd.read_excel(f"./all_excels/{model_type}_test_answers.xlsx", sheet_name=f"{model_type}_answers")

In [13]:
len(all_results_df)

3569

In [14]:
print(len(all_results_df))
all_results_df.head()

3569


Unnamed: 0.1,Unnamed: 0,image_name,image_type,used_language,specificity,question_length,complexity,image_relatedness,question,answer_gt,...,Blip_large_correct,Blip_large_no_tokens,GiT_base_full_answer,GiT_base_clean_answer,GiT_base_correct,GiT_base_no_tokens,GiT_large_full_answer,GiT_large_clean_answer,GiT_large_correct,GiT_large_no_tokens
0,0,Gaussian_5_2,1,0,1,0.0,0.0,0.0,what is in the image?,random+noise+nothing+t know+not sure+unknown+c...,...,0,2,[CLS] wallpaper [SEP],wallpaper,0,3,[CLS] nothing [SEP],nothing,1,2
1,1,Gaussian_5_2,1,0,1,0.0,0.0,0.0,what is the dominant color of the image?,color+gray+grey+nothing+t know+not sure+unknow...,...,1,2,[CLS] gray [SEP],gray,1,2,[CLS] gray [SEP],gray,1,2
2,2,Gaussian_5_2,1,0,1,0.0,0.0,0.0,what does the image represent?,random+noise+nothing+t know+not sure+unknown+c...,...,0,2,[CLS] wallpaper [SEP],wallpaper,0,3,[CLS] no [SEP],no,0,2
3,3,Gaussian_5_2,1,0,1,0.0,0.0,0.0,why is the image random?,random+t know+not sure+unknown+can't tell+none...,...,0,5,[CLS] it's not [SEP],it's not,0,5,[CLS] it's not [SEP],it's not,0,5
4,4,Gaussian_5_2,1,0,1,0.0,0.0,0.0,why aren't there any objects in the image?,random+noise+t know+not sure+unknown+can't tel...,...,0,3,[CLS] they are in this picture [SEP],they are in this picture,0,6,[CLS] they are not [SEP],they are not,0,4


## Original questions

In [15]:
original_qsts = pd.read_excel("./qsts_original.xlsx", sheet_name="qsts_original")
original_qsts.head()

Unnamed: 0,image_type,used_language,specificity,question_length,complexity,image_relatedness,question,answer_gt
0,0,0,0,0.0,0.0,0.0,what is the color1 object?,object1
1,0,0,0,0.0,0.0,0.0,what is in the image with the object1?,object2+sky
2,0,0,0,0.0,0.0,0.0,what is in the image with the object2?,object1+sky
3,0,0,0,0.0,0.0,0.0,what is the object1's color in the image?,color1
4,0,0,0,0.0,0.0,0.0,what is the object2's color in the image?,color2


# Automatic difficulty

## D_I

In [16]:
# images = [cv2.cvtColor(cv2.imread(f"images\{image_name}.png"),cv2.COLOR_BGR2RGB)/255 for image_name in image_names]
# encoding_complexity = list(map(encoded_length, images))
# rel_complexities = encoding_complexity/np.max(encoding_complexity)

# # D_images = D_image(images_complexity)

# image_difficulty = dict(map(lambda i,j : (i,j), image_names, rel_complexities))

# image_difficulty

In [17]:
images = [cv2.resize(cv2.cvtColor(cv2.imread(f"images\{image_name}.png"),cv2.COLOR_BGR2RGB)/255, (28, 28)) for image_name in image_names]
encoding_complexity = list(map(encoded_length, images))
rel_complexities = encoding_complexity/np.max(encoding_complexity)

# D_images = D_image(images_complexity)

image_difficulty = dict(map(lambda i,j : (i,j), image_names, rel_complexities))

image_difficulty


{'Gaussian_5_2': 0.995475113122172,
 'Gaussian_5_3': 0.9291101055806938,
 'Gaussian_5_4': 0.9381598793363499,
 'Gaussian_5_5': 1.0,
 'white_house_brown_cat': 0.5731523378582202,
 'white_house_brown_cow': 0.6199095022624435,
 'white_house_brown_dog': 0.5746606334841629,
 'white_house_brown_horse': 0.5972850678733032,
 'white_house_green_cat': 0.5761689291101055,
 'white_house_green_unknown': 0.579185520361991,
 'white_house_pink_cow': 0.5942684766214178,
 'white_house_pink_unknown': 0.5852187028657617,
 'white_house_purple_dog': 0.583710407239819,
 'white_house_purple_horse': 0.6244343891402715,
 'white_house_red_duck': 0.5686274509803921,
 'white_house_red_unknown': 0.5822021116138764,
 'white_house_yellow_unknown': 0.5761689291101055}

## D_T

In [18]:
main_nouns = ["object", "color"]
nouns = list()

for n in main_nouns:
    for i in range(1, 5):
        nouns.append(f"{n}{i}")

nouns.extend(["foreground object", "background object"])

nouns

['object1',
 'object2',
 'object3',
 'object4',
 'color1',
 'color2',
 'color3',
 'color4',
 'foreground object',
 'background object']

In [19]:
# import spacy.cli
# spacy.cli.download("en_core_web_sm")

In [20]:
# Creat Text difficulty for both image types

D_T = dict()

In [21]:
qsts_length, qsts_spec, qsts_complexity = list(), list(), list()
question_length, specificity, complexity = list(), list(), list()

for qst in original_qsts["question"]:
    qsts_length.append(qst_length(qst))
    qsts_spec.append(d_spec(qst, nouns))
    qsts_complexity.append(list(walk_tree(sent.root, 0) for sent in en_nlp(qst).sents)[0])

question_length = [map_range(value, min(qsts_length), max(qsts_length), 0, 1) for value in qsts_length]
specificity = [map_range(value, min(qsts_spec), max(qsts_spec), 0, 1) for value in qsts_spec]
complexity = [map_range(value, min(qsts_complexity), max(qsts_complexity), 0, 1) for value in qsts_complexity]

max(question_length), min(question_length), max(specificity), min(specificity), max(complexity), min(complexity)


(1.0, 0.0, 1.0, 0.0, 1.0, 0.0)

In [22]:
max(qsts_length), min(qsts_length)

(32, 4)

In [23]:
max(qsts_complexity), min(qsts_complexity)

(10, 1)

In [24]:
question_length = dict(map(lambda i,j : (i,j), original_qsts["question"].to_list(), question_length))
specificity = dict(map(lambda i,j : (i,j), original_qsts["question"].to_list(), specificity))
complexity = dict(map(lambda i,j : (i,j), original_qsts["question"].to_list(), complexity))

In [25]:
# image_type = 0

# for image_type in [0,1]:
#     idxs = original_qsts.index[original_qsts["image_type"]==image_type].tolist()
#     D_T[image_type] = {
#         "question_length":[question_length[i] for i in idxs],
#         "specificity":[specificity[i] for i in idxs],
#         "complexity":[complexity[i] for i in idxs]
#     }

In [26]:
np.unique(list(question_length.values()), return_counts=True)

(array([0.        , 0.03571429, 0.07142857, 0.10714286, 0.14285714,
        0.17857143, 0.21428571, 0.25      , 0.28571429, 0.35714286,
        0.39285714, 0.42857143, 0.46428571, 0.5       , 0.53571429,
        0.57142857, 0.60714286, 0.64285714, 0.67857143, 0.71428571,
        0.75      , 0.78571429, 0.82142857, 0.85714286, 0.89285714,
        0.92857143, 1.        ]),
 array([ 2, 10, 10,  9, 23, 28, 10,  2,  1,  2,  8, 10, 10, 21, 28, 10,  2,
         1,  1,  6,  6,  8, 17, 25, 20,  6,  1], dtype=int64))

In [27]:
np.unique(list(specificity.values()), return_counts=True)

(array([0. , 0.5, 1. ]), array([ 33, 111, 133], dtype=int64))

In [28]:
np.unique(list(complexity.values()), return_counts=True)

(array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
        0.55555556, 0.66666667, 0.77777778, 1.        ]),
 array([ 2, 13, 43, 77, 44, 38, 48, 10,  2], dtype=int64))

## D_R (No need, we use manual score for "semi" experiments)

In [30]:
# # images features
# image_features = dict()
# for image_name in tqdm(image_names):
#     image = (plt.imread(f"images\{image_name}.png")[:,:,:3]*255).astype(np.uint8)
#     features = extract_features(image)
#     image_features[image_name] = features.detach().clone()

# # questions features
# question_features = dict()
# for qst in tqdm(all_results_df["question"]):
#     features = extract_features(qst)
#     question_features[qst] = features.detach().clone()


# # Saving the features
# features = {"images":image_features.copy(), "questions":question_features.copy()}
# np.save("./CLIP_features/CLIP_features.npy", features)

In [31]:
# # Loading CLIP features
# features = np.load("./CLIP_features/CLIP_features.npy", allow_pickle=True)

# image_features = features.item().get('images')
# question_features = features.item().get('questions')

In [32]:
# im_name = "white_house_brown_cat"
# # im_name = "white_house_green_cat"
# im_name = "Gaussian_5_2"
# im = (plt.imread(f"images\{im_name}.png")[:,:,:3]*255).astype(np.uint8)

# qst = "what is the brown object?"

# im_feat = extract_features_image(im)
# qst_feat = extract_features_text(qst)

# s = measure_similarity(im_feat, qst_feat)
# print(s)

In [21]:
# Experiment for the appendix

im_names = ["Gaussian_5_5", "white_house_brown_cat", "white_house_pink_unknown"]
qst = "what is the brown object?"

for im_name in im_names:
    print(im_name)
    im = (plt.imread(f"images\{im_name}.png")[:,:,:3]*255).astype(np.uint8)
    im_feat = extract_features_image(im)
    qst_feat = extract_features_text(qst)
    CLIP_similarity = 1-measure_similarity(im_feat, qst_feat)
    print(round(CLIP_similarity,3))

Gaussian_5_5
0.247
white_house_brown_cat
0.214
white_house_pink_unknown
0.197


# Create T_max_2 dict

In [33]:
# # T_max_2 per model, do them outside the loop
# epsilons = [i/100 for i in range(1, 201)]
# T_max_2_dict = dict()
# for model_name in tqdm(model_names):
#     T_max_2_dict[model_name] = list()
#     T_max_2 = list()
#     model_epsilons = epsilons.copy()
#     model_epsilons.append(N_dict[model_name])
#     for epsilon in model_epsilons:
#         for image_name in image_names:
#             logits_list = logits_dict[model_name][image_name]
#             T_max_2.append(T_2(logits_list, epsilon))
#         T_max_2_dict[model_name].append(np.min(T_max_2))

# np.save("./T_max_2_dict.npy", T_max_2_dict)

# Read T_max_2 dict

In [34]:
epsilons = [i/100 for i in range(1, 201)]
epsilons.append("N")
###########################
epsilons = epsilons[-1] # only epsilon=N
###########################

T_max_2_dict = np.load("./T_max_2_dict.npy", allow_pickle=True).item()
T_max_2_dict.keys()

dict_keys(['Vilt', 'Blip_base', 'Blip_large', 'GiT_base', 'GiT_large'])

In [35]:
for v in T_max_2_dict.values():
    print(np.min(v))

1.7380787111952216
1.2383381355547283
1.2383381355547283
1.1705838358703073
1.2085129044555991


# Create extended df

In [36]:
full_df_columns = ["image_name"] + fixed_cols
all_results_df_sub = all_results_df[full_df_columns]
all_results_df_sub

Unnamed: 0,image_name,question,answer_gt,image_type,used_language,specificity,question_length,complexity,image_relatedness
0,Gaussian_5_2,what is in the image?,random+noise+nothing+t know+not sure+unknown+c...,1,0,1,0.0,0.0,0.0
1,Gaussian_5_2,what is the dominant color of the image?,color+gray+grey+nothing+t know+not sure+unknow...,1,0,1,0.0,0.0,0.0
2,Gaussian_5_2,what does the image represent?,random+noise+nothing+t know+not sure+unknown+c...,1,0,1,0.0,0.0,0.0
3,Gaussian_5_2,why is the image random?,random+t know+not sure+unknown+can't tell+none...,1,0,1,0.0,0.0,0.0
4,Gaussian_5_2,why aren't there any objects in the image?,random+noise+t know+not sure+unknown+can't tel...,1,0,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
3564,white_house_yellow_unknown,"i generated this image with an online tool, i ...",no dragon+no talk+t know+not sure+unknown+can'...,0,0,1,1.0,1.0,1.0
3565,white_house_yellow_unknown,"i generated this image with an online tool, i ...",no books+t know+not sure+unknown+can't tell+no...,0,0,1,1.0,1.0,1.0
3566,white_house_yellow_unknown,mi van a kepen?,t know+not sure+unknown+can't tell+none+no ide...,0,1,0,0.0,0.0,0.0
3567,white_house_yellow_unknown,"tekintettel arra, hogy ilyen képet kap, kérem,...",t know+not sure+unknown+can't tell+none+no ide...,0,1,0,0.5,0.0,0.0


In [37]:
df = all_results_df_sub.loc[(all_results_df_sub['image_name']=="Gaussian_5_2")]
for idx, row in df.iterrows():
    print(f"{idx} > {row.tolist()}")

0 > ['Gaussian_5_2', 'what is in the image?', "random+noise+nothing+t know+not sure+unknown+can't tell+none+no idea+can't say", 1, 0, 1, 0.0, 0.0, 0.0]
1 > ['Gaussian_5_2', 'what is the dominant color of the image?', "color+gray+grey+nothing+t know+not sure+unknown+can't tell+none+no idea+can't say", 1, 0, 1, 0.0, 0.0, 0.0]
2 > ['Gaussian_5_2', 'what does the image represent?', "random+noise+nothing+t know+not sure+unknown+can't tell+none+no idea+can't say", 1, 0, 1, 0.0, 0.0, 0.0]
3 > ['Gaussian_5_2', 'why is the image random?', "random+t know+not sure+unknown+can't tell+none+no idea+can't say", 1, 0, 1, 0.0, 0.0, 0.0]
4 > ['Gaussian_5_2', "why aren't there any objects in the image?", "random+noise+t know+not sure+unknown+can't tell+none+no idea+can't say", 1, 0, 1, 0.0, 0.0, 0.0]


In [38]:
all_results_df_sub.loc[(all_results_df_sub['image_name']=="Gaussian_5_2")]

Unnamed: 0,image_name,question,answer_gt,image_type,used_language,specificity,question_length,complexity,image_relatedness
0,Gaussian_5_2,what is in the image?,random+noise+nothing+t know+not sure+unknown+c...,1,0,1,0.0,0.0,0.0
1,Gaussian_5_2,what is the dominant color of the image?,color+gray+grey+nothing+t know+not sure+unknow...,1,0,1,0.0,0.0,0.0
2,Gaussian_5_2,what does the image represent?,random+noise+nothing+t know+not sure+unknown+c...,1,0,1,0.0,0.0,0.0
3,Gaussian_5_2,why is the image random?,random+t know+not sure+unknown+can't tell+none...,1,0,1,0.0,0.0,0.0
4,Gaussian_5_2,why aren't there any objects in the image?,random+noise+t know+not sure+unknown+can't tel...,1,0,1,0.0,0.0,0.0


In [39]:
str_cols = ["x_max_str", "x_min_str", "Prob_str", "T_1_max_str", "T_1_str", "P_T_1_str"]
num_cols = ["Prob", "P_T_1"]
str_cols_epsilon, num_cols_epsilon = list(), list()
for epsilon in epsilons:
    str_cols_epsilon.append(f"T_2_max_{epsilon}_str")
    str_cols_epsilon.append(f"T_2_{epsilon}_str")
    str_cols_epsilon.append(f"P_T_2_{epsilon}_str")
    num_cols_epsilon.append(f"P_T_2_{epsilon}")
len(str_cols_epsilon), len(num_cols_epsilon)

(3, 1)

In [40]:
num_cols_epsilon

['P_T_2_N']

In [41]:
fixed_cols

['question',
 'answer_gt',
 'image_type',
 'used_language',
 'specificity',
 'question_length',
 'complexity',
 'image_relatedness']

In [42]:
# | image_name | example_question | question | answer_gt | used_language | specificity | question_length | complexity | image_relatedness | image_difficulty |
full_df_columns = ["image_name", "example_question", "question", "answer_gt", "used_language", "specificity", \
                   "question_length", "complexity", "image_relatedness", "image_difficulty", "difficulty"]
# | model_name | full_answer | clean_answer | no_tokens | correct |
full_df_columns = full_df_columns + ["model_name", "full_answer", "clean_answer", "no_tokens", "correct", "N"]
# | Prob | P_T_1 | P_T_2_0.01 | P_T_2_0.02 | ...
full_df_columns = full_df_columns + num_cols + num_cols_epsilon
# | x_max_str | x_min_str | Prob_str | T_1_max_str | T_1_str | P_T_1_str | T_2_max_0.01_str | T_2_0.01_str | P_T_2_0.01_str | ...
full_df_columns = full_df_columns + str_cols + str_cols_epsilon
print(len(full_df_columns))
full_df_columns

29


['image_name',
 'example_question',
 'question',
 'answer_gt',
 'used_language',
 'specificity',
 'question_length',
 'complexity',
 'image_relatedness',
 'image_difficulty',
 'difficulty',
 'model_name',
 'full_answer',
 'clean_answer',
 'no_tokens',
 'correct',
 'N',
 'Prob',
 'P_T_1',
 'P_T_2_N',
 'x_max_str',
 'x_min_str',
 'Prob_str',
 'T_1_max_str',
 'T_1_str',
 'P_T_1_str',
 'T_2_max_N_str',
 'T_2_N_str',
 'P_T_2_N_str']

# Create full results df and file

In [43]:
# original_qsts = pd.read_excel("./qsts_original.xlsx", sheet_name="qsts_original")
# original_qsts.head()

In [44]:
D_type = 1
all_rows = list()
for image_name in tqdm(image_names):
    df = all_results_df_sub.loc[(all_results_df_sub['image_name']==image_name)]
    image_type = 0
    image_diff = image_difficulty[image_name]
    if "Gaussian" in image_name: image_type=1
    for model_name in model_names:
        # one_row.append()
        logits_list = logits_dict[model_name][image_name]
        # print(f"len logits_list: ", len(logits_list), " len df: ", len(df))
        for idx in range(len(df)):
            one_row = list()
            row = df.iloc[idx].values.tolist()
            # print(f"{idx} > {row.tolist()}")
            example_question = original_qsts.loc[(original_qsts['image_type']==image_type)]["question"].values[idx]
            question, answer_gt = row[1:3]
            # Use automatic scores
            # used_lang, spec, question_len, comp, image_relatedness = row[4:] # skip image_type; replaced by image_difficulty
            used_lang = row[4] # same as manual
            image_relatedness = row[8] # same as manual
            question_len = question_length[example_question]
            spec = specificity[example_question]
            comp = complexity[example_question]

            question_scores = {"used_language":used_lang, "specificity":spec, "question_length":question_len, "complexity":comp}
            image_scores = {"image_difficulty":image_diff}
            relatedness_scores = {"image_relatedness":image_relatedness}
            diff = Diff_score(question_scores, image_scores, relatedness_scores, major=major_criteria, type_=D_type)

            full_answer = all_results_df.loc[(all_results_df['image_name']==image_name) & (all_results_df['question']==question)][f"{model_name}_full_answer"].values[0]
            clean_answer = all_results_df.loc[(all_results_df['image_name']==image_name) & (all_results_df['question']==question)][f"{model_name}_clean_answer"].values[0]
            no_tokens = all_results_df.loc[(all_results_df['image_name']==image_name) & (all_results_df['question']==question)][f"{model_name}_no_tokens"].values[0]
            if "Blip" in model_name: no_tokens=no_tokens-1 # exclude [SEP]
            if "GiT" in model_name: no_tokens=no_tokens-1 # exclude [SEP]
            correct = all_results_df.loc[(all_results_df['image_name']==image_name) & (all_results_df['question']==question)][f"{model_name}_correct"].values[0]
            # step 1
            # add the following column's valeus
            # | image_name | example_question | question | answer_gt | used_language | specificity | question_length | complexity | image_relatedness | ...
            #   image_difficulty | difficulty | model_name | full_answer | clean_answer | no_tokens | correct |
            one_row = [image_name, example_question, question, answer_gt, used_lang, spec, question_len, comp, image_relatedness]
            one_row.extend([image_diff, diff, model_name, full_answer, clean_answer, no_tokens, correct, N_dict[model_name]])
                        
            # step 2
            # add the following column's valeus
            # | x_max_str | x_min_str | Prob_str | T_1_max_str | T_1_str | P_T_1_str |
            # | Prob | P_T_1 |
            logits = logits_list[idx]
            if ("Blip" in model_name) and (no_tokens!=len(logits)-1): print(f"{model_name} -> {image_name} -> {idx}")
            if ("GiT" in model_name) and (no_tokens!=len(logits)-1): print(f"{model_name} -> {image_name} -> {idx}")
            num_list, str_list = list(), list()

            # x_maxs_str, x_mins_str, probs_str, t_1s_str, t_1_maxs_str, p_T_1s_str  = str(), str(), str(), str(), str(), str()
            x_maxs, x_mins, probs, t_1s, t_1_maxs, p_T_1s = list(), list(), list(), list(), list(), list()
            for logit in logits:
                # decoded token (do not ignore stopwords)
                x_maxs.append(np.max(logit))
                x_mins.append(np.min(logit))
                probs.append(np.max(softmax_stable(logit, T=1)))
                T_1_max = T_1(logit)
                t_1_maxs.append(T_1_max)
                t_1 = D_to_T(D=diff, T_max=T_1_max)
                t_1s.append(t_1)
                p_T_1s.append(np.max(softmax_stable(logit, T=t_1)))

                # prob = np.max(softmax_stable(logit, T=1))
                # seq_P = seq_P * prob
                # prob_str = prob_str + f"{str(prob)}+"
                # t_1 = T_1(logit)
                # p_T_1 = np.max(softmax_stable(logit, T=t_1))
                # seq_P_T_1 = seq_P_T_1 * p_T_1
                # t_1_str = t_1_str + f"{str(t_1)}+"
                # p_T_1_str = p_T_1_str + f"{str(p_T_1)}+"
            # one_row.extend([prob_str, seq_P]) #, t_1_str, p_T_1_str, seq_P_T_1])

            for item in [x_maxs, x_mins, probs, t_1_maxs, t_1s, p_T_1s]:
                str_list.append("+".join(list(map(str, item))))

            for item in [probs, p_T_1s]:
                temp_item = item.copy()
                if "Blip" in model_name: temp_item=temp_item[:-1] # exclude [SEP]
                if "GiT" in model_name: temp_item=temp_item[:-1] # exclude [SEP]
                num_list.append(np.prod(temp_item))       

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
            # TODO:
            # what happens when the model has no logits (empty answer)? np.prod(list()) is equal to 1 !!!
            # in case of GiT [CLS] is already not int he logits maybe??? or [SEP]
            # Stretch check: does CLS represent GiT confidnece??
            # Rubber Ducky in GiT is a good fauiler example, the qsts have same D one right and one wrong, but exact same confidence
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>              
            # step 3
            # add the following column's valeus
            # | T_2_max_0.01 | T_2_0.01 | P_T_2_0.01_str | P_T_2_0.01 | ... |
            t_2_maxs = T_max_2_dict[model_name]
            t_2_maxs = [t_2_maxs[-1]] # only epsioln=N
            for T_2_max in t_2_maxs:
                t_2 = D_to_T(D=diff, T_max=T_2_max)
                str_list.append(str(T_2_max))
                str_list.append(str(t_2))
                # p_T_2_str = str()
                # seq_P_T_2 = 1
                p_T_2s = list()
                for logit in logits:
                    p_T_2s.append(np.max(softmax_stable(logit, T=t_2)))
                str_list.append("+".join(list(map(str, p_T_2s))))
                if "Blip" in model_name: p_T_2s=p_T_2s[:-1] # exclude [SEP]
                if "GiT" in model_name: p_T_2s=p_T_2s[:-1] # exclude [SEP]
                num_list.append(np.prod(p_T_2s))
                # one_row.extend([p_T_2_str, seq_P_T_2])
            # print(len(one_row), one_row)
            one_row.extend(num_list)
            one_row.extend(str_list)
            all_rows.append(one_row)

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [01:01<00:00,  3.61s/it]


In [45]:
len(full_df_columns)

29

In [46]:
full_df = pd.DataFrame(np.array(all_rows), columns=full_df_columns)
full_df.head()

Unnamed: 0,image_name,example_question,question,answer_gt,used_language,specificity,question_length,complexity,image_relatedness,image_difficulty,...,P_T_2_N,x_max_str,x_min_str,Prob_str,T_1_max_str,T_1_str,P_T_1_str,T_2_max_N_str,T_2_N_str,P_T_2_N_str
0,Gaussian_5_2,what is in the image?,what is in the image?,random+noise+nothing+t know+not sure+unknown+c...,0,1.0,0.0357142857142857,0.2222222222222222,0.0,0.995475113122172,...,0.03421847,-2.3610375,-16.713715,0.07764137,1.7832804974941396,1.3420218417625025,0.032721758,1.7380787111952216,1.3222843425519764,0.03421847
1,Gaussian_5_2,what is the dominant color of the image?,what is the dominant color of the image?,color+gray+grey+nothing+t know+not sure+unknow...,0,1.0,0.1428571428571428,0.3333333333333333,0.0,0.995475113122172,...,0.16320807,-0.49326575,-27.210875,0.20900321,3.319589136322892,2.055044035268893,0.079705656,1.7380787111952216,1.3357083931854872,0.16320807
2,Gaussian_5_2,what does the image represent?,what does the image represent?,random+noise+nothing+t know+not sure+unknown+c...,0,1.0,0.0357142857142857,0.1111111111111111,0.0,0.995475113122172,...,0.03863878,-2.180196,-17.993324,0.08220834,1.964737514053651,1.4123228521774975,0.031281423,1.7380787111952216,1.3154502804112798,0.03863878
3,Gaussian_5_2,why is the image random?,why is the image random?,random+t know+not sure+unknown+can't tell+none...,0,1.0,0.0357142857142857,0.1111111111111111,0.0,0.995475113122172,...,0.05039108,-2.5192337,-19.845095,0.12248334,2.152690347564782,1.492652731817528,0.03310744,1.7380787111952216,1.3154502804112798,0.05039108
4,Gaussian_5_2,why aren't there any objects in the image?,why aren't there any objects in the image?,random+noise+t know+not sure+unknown+can't tel...,0,1.0,0.1428571428571428,0.3333333333333333,0.0,0.995475113122172,...,0.027359627,-3.1362438,-18.810205,0.06400901,1.9474464197595112,1.4309373924249116,0.022342704,1.7380787111952216,1.3357083931854872,0.027359627


In [47]:
numeric_cols = ["used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty", "difficulty", \
                "no_tokens", "correct", "N"]
numeric_cols = numeric_cols + num_cols + num_cols_epsilon
numeric_cols

['used_language',
 'specificity',
 'question_length',
 'complexity',
 'image_relatedness',
 'image_difficulty',
 'difficulty',
 'no_tokens',
 'correct',
 'N',
 'Prob',
 'P_T_1',
 'P_T_2_N']

In [48]:
rel = full_df["image_relatedness"]
max(rel), min(rel)

('1.0', '0.0')

In [49]:
full_df[numeric_cols]

Unnamed: 0,used_language,specificity,question_length,complexity,image_relatedness,image_difficulty,difficulty,no_tokens,correct,N,Prob,P_T_1,P_T_2_N
0,0,1.0,0.03571428571428571,0.2222222222222222,0.0,0.995475113122172,0.43665308003543296,1,0,3129,0.07764137,0.032721758,0.03421847
1,0,1.0,0.14285714285714285,0.3333333333333333,0.0,0.995475113122172,0.45484091072326366,1,0,3129,0.20900321,0.079705656,0.16320807
2,0,1.0,0.03571428571428571,0.1111111111111111,0.0,0.995475113122172,0.42739382077617377,1,0,3129,0.08220834,0.031281423,0.03863878
3,0,1.0,0.03571428571428571,0.1111111111111111,0.0,0.995475113122172,0.42739382077617377,1,0,3129,0.12248334,0.03310744,0.05039108
4,0,1.0,0.14285714285714285,0.3333333333333333,0.0,0.995475113122172,0.45484091072326366,1,0,3129,0.06400901,0.022342704,0.027359627
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14271,0,0.5,0.8928571428571429,0.7777777777777778,1.0,0.5761689291101055,0.7062758864229451,1,0,30522,0.39408246,0.11134465,0.2185995
14272,0,1.0,0.8571428571428571,0.7777777777777778,1.0,0.5761689291101055,0.7449663626134214,1,0,30522,0.33916935,0.08689355,0.18697995
14273,1,1.0,0.0,0.1111111111111111,0.0,0.5761689291101055,0.5253896430367019,1,0,30522,0.8140675,0.42738912,0.61511827
14274,1,1.0,0.35714285714285715,0.3333333333333333,0.0,0.5761689291101055,0.5253896430367019,1,0,30522,0.5139403,0.211758,0.32532677


In [50]:
full_df[numeric_cols] = full_df[numeric_cols].apply(pd.to_numeric)

In [54]:
# full_df.to_excel(f"./{model_type}_full_results_D_type_{D_type}_semi.xlsx", sheet_name=f"{model_type}_D_type_{D_type}_results", index_label='ID')

# For latex visualization (score distribution)

In [55]:
        #   ($\mathrm{D_{R}}$, 1229)
        #   ($\mathrm{D_{I}}$, 1068) 
        #   ($\mathrm{d_{Tu}}$, 3452)
        #   ($\mathrm{d_{Ts}}$, 1794) 
        #   ($\mathrm{d_{Tl}}$, 1177) 
        #   ($\mathrm{d_{Tc}}$, 1229)

In [56]:
D_type = 1
full_df = pd.read_excel(f"./{model_type}_full_results_D_type_{D_type}_semi.xlsx", sheet_name=f"{model_type}_D_type_{D_type}_results")
full_df.head()

Unnamed: 0,ID,image_name,example_question,question,answer_gt,used_language,specificity,question_length,complexity,image_relatedness,...,P_T_2_N,x_max_str,x_min_str,Prob_str,T_1_max_str,T_1_str,P_T_1_str,T_2_max_N_str,T_2_N_str,P_T_2_N_str
0,0,Gaussian_5_2,what is in the image?,what is in the image?,random+noise+nothing+t know+not sure+unknown+c...,0,1.0,0.035714,0.222222,0.0,...,0.034218,-2.3610375,-16.713715,0.07764137,1.7832804974941396,1.3420218417625025,0.032721758,1.738079,1.322284,0.03421847
1,1,Gaussian_5_2,what is the dominant color of the image?,what is the dominant color of the image?,color+gray+grey+nothing+t know+not sure+unknow...,0,1.0,0.142857,0.333333,0.0,...,0.163208,-0.49326575,-27.210875,0.20900321,3.319589136322892,2.055044035268893,0.079705656,1.738079,1.335708,0.16320807
2,2,Gaussian_5_2,what does the image represent?,what does the image represent?,random+noise+nothing+t know+not sure+unknown+c...,0,1.0,0.035714,0.111111,0.0,...,0.038639,-2.180196,-17.993324,0.08220834,1.964737514053651,1.4123228521774975,0.031281423,1.738079,1.31545,0.03863878
3,3,Gaussian_5_2,why is the image random?,why is the image random?,random+t know+not sure+unknown+can't tell+none...,0,1.0,0.035714,0.111111,0.0,...,0.050391,-2.5192337,-19.845095,0.12248334,2.152690347564782,1.492652731817528,0.03310744,1.738079,1.31545,0.05039108
4,4,Gaussian_5_2,why aren't there any objects in the image?,why aren't there any objects in the image?,random+noise+t know+not sure+unknown+can't tel...,0,1.0,0.142857,0.333333,0.0,...,0.02736,-3.1362438,-18.810205,0.06400901,1.9474464197595112,1.4309373924249116,0.022342704,1.738079,1.335708,0.027359627


In [57]:
min(full_df["image_relatedness"].to_list()), max(full_df["image_relatedness"].to_list())

(0.0, 1.0)

In [58]:
score_ranges = [(i/100, (i+20)/100) for i in range(0,100,20)]
score_ranges

[(0.0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)]

In [59]:
sub_df = full_df[full_df["model_name"]=="Vilt"]
len(sub_df)

3569

In [64]:
np.unique(sub_df["image_relatedness"].to_list(), return_counts=True)

(array([0. , 0.5, 1. ]), array([1229, 1170, 1170], dtype=int64))

In [60]:
D_R = sub_df["image_relatedness"].to_list()
D_I = sub_df["image_difficulty"].to_list()
d_Tu = sub_df["used_language"].to_list()
d_Ts = sub_df["specificity"].to_list()
d_Tl = sub_df["question_length"].to_list()
d_Tc = sub_df["complexity"].to_list()

D_lists = {
    "{D_{R}}": D_R,
    "{D_{I}}": D_I,
    "{d_{Tu}}": d_Tu,
    "{d_{Ts}}": d_Ts,
    "{d_{Tl}}": d_Tl,
    "{d_{Tc}}": d_Tc
}

In [61]:
sum((i>=0 and i<0.2) for i in D_R)

1229

In [62]:
# r_idx = 0


for r_idx in range(len(score_ranges)-1):
    a, b = score_ranges[r_idx]
    for name, D in D_lists.items():
        s = sum((i>=a and i<b) for i in D)
        line = f"($\mathrm{name}$, {s})"
        print(line)
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")

($\mathrm{D_{R}}$, 1229)
($\mathrm{D_{I}}$, 0)
($\mathrm{d_{Tu}}$, 3530)
($\mathrm{d_{Ts}}$, 429)
($\mathrm{d_{Tl}}$, 1034)
($\mathrm{d_{Tc}}$, 177)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
($\mathrm{D_{R}}$, 0)
($\mathrm{D_{I}}$, 0)
($\mathrm{d_{Tu}}$, 0)
($\mathrm{d_{Ts}}$, 0)
($\mathrm{d_{Tl}}$, 299)
($\mathrm{d_{Tc}}$, 1546)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
($\mathrm{D_{R}}$, 1170)
($\mathrm{D_{I}}$, 3003)
($\mathrm{d_{Tu}}$, 0)
($\mathrm{d_{Ts}}$, 1443)
($\mathrm{d_{Tl}}$, 1027)
($\mathrm{d_{Tc}}$, 1066)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
($\mathrm{D_{R}}$, 0)
($\mathrm{D_{I}}$, 546)
($\mathrm{d_{Tu}}$, 0)
($\mathrm{d_{Ts}}$, 0)
($\mathrm{d_{Tl}}$, 312)
($\mathrm{d_{Tc}}$, 754)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [65]:
r_idx = 4
a, b = score_ranges[r_idx]
for name, D in D_lists.items():
    s = sum((i>=a) for i in D)
    line = f"($\mathrm{name}$, {s})"
    print(line)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")

($\mathrm{D_{R}}$, 1170)
($\mathrm{D_{I}}$, 20)
($\mathrm{d_{Tu}}$, 39)
($\mathrm{d_{Ts}}$, 1697)
($\mathrm{d_{Tl}}$, 897)
($\mathrm{d_{Tc}}$, 26)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [66]:
Diff = sub_df["difficulty"].to_list()

vs, cs = np.unique(Diff, return_counts=True)

vs, cs

(array([0.22294195, 0.22445025, 0.22495302, ..., 0.76105485, 0.76252274,
        0.76403104]),
 array([2, 2, 2, ..., 2, 2, 2], dtype=int64))

In [67]:
min(vs), max(vs)

(0.2229419545596016, 0.7640310397663339)

In [68]:
cont_D_ranges = [(i/100, (i+4)/100) for i in range(0,86,4)]
len(cont_D_ranges), cont_D_ranges

(22,
 [(0.0, 0.04),
  (0.04, 0.08),
  (0.08, 0.12),
  (0.12, 0.16),
  (0.16, 0.2),
  (0.2, 0.24),
  (0.24, 0.28),
  (0.28, 0.32),
  (0.32, 0.36),
  (0.36, 0.4),
  (0.4, 0.44),
  (0.44, 0.48),
  (0.48, 0.52),
  (0.52, 0.56),
  (0.56, 0.6),
  (0.6, 0.64),
  (0.64, 0.68),
  (0.68, 0.72),
  (0.72, 0.76),
  (0.76, 0.8),
  (0.8, 0.84),
  (0.84, 0.88)])

In [69]:
for (a, b) in cont_D_ranges:
    interval = f"{a}--{b}"
    interval = f"{a}"
    print(interval, end=",")

0.0,0.04,0.08,0.12,0.16,0.2,0.24,0.28,0.32,0.36,0.4,0.44,0.48,0.52,0.56,0.6,0.64,0.68,0.72,0.76,0.8,0.84,

In [70]:
line = "Dscore\t&\tDcount\t\\\\"
print(line)
for (a, b) in cont_D_ranges:
    interval = f"{a}--{b}"
    interval = f"{a}"
    s = sum((i>=a and i<b) for i in Diff)
    line = f"{interval}\t&\t{s}\t\\\\"
    print(line)

Dscore	&	Dcount	\\
0.0	&	0	\\
0.04	&	0	\\
0.08	&	0	\\
0.12	&	0	\\
0.16	&	0	\\
0.2	&	33	\\
0.24	&	175	\\
0.28	&	292	\\
0.32	&	284	\\
0.36	&	314	\\
0.4	&	287	\\
0.44	&	323	\\
0.48	&	258	\\
0.52	&	317	\\
0.56	&	220	\\
0.6	&	285	\\
0.64	&	324	\\
0.68	&	293	\\
0.72	&	158	\\
0.76	&	6	\\
0.8	&	0	\\
0.84	&	0	\\


In [71]:
x_coords, coords = str(), str()

for v, c in zip(vs, cs):
    x = round(v, 2)
    x_coords += f"{x},"
    coords += f"({x},{c}) "

In [72]:
# raw = np.column_stack((vs, cs))
# np.savetxt(f"./rawdata.dat", raw)