# Evaluation 

## Load Config

In [24]:
import yaml
import torch
import numpy as np
import random
import os
from PIL import Image

def load_config(config_path, config_name):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Directory that contains the configuration file.
        config_name: File name of the YAML file inside ``config_path``.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(os.path.join(config_path, config_name), encoding="utf-8") as file:
        return yaml.safe_load(file)

# Parse config.yaml from the directory one level above the working directory.
config = load_config(config_path="../", config_name="config.yaml")

## Constants

In [25]:
# Evaluation settings come from the "eval" -> "llava" section of the config;
# the image path is a top-level key.
_llava_eval_cfg = config["eval"]["llava"]
EVAL_MODEL = _llava_eval_cfg["model"]  # checkpoint identifier given to from_pretrained
IMG_PATH = config["image_path"]
RESULT_JSON_PATH = _llava_eval_cfg["json_path"]  # used as a prefix for result file names

In [26]:
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and its processor; both are created from the same
# checkpoint identifier (EVAL_MODEL) read from the config.
model = CLIPModel.from_pretrained(EVAL_MODEL)
processor = CLIPProcessor.from_pretrained(EVAL_MODEL)

config.json: 100%|██████████| 4.19k/4.19k [00:00<00:00, 16.1MB/s]
pytorch_model.bin: 100%|██████████| 605M/605M [01:15<00:00, 7.99MB/s] 
preprocessor_config.json: 100%|██████████| 316/316 [00:00<00:00, 1.38MB/s]
tokenizer_config.json: 100%|██████████| 568/568 [00:00<00:00, 2.52MB/s]
vocab.json: 100%|██████████| 862k/862k [00:00<00:00, 3.33MB/s]
merges.txt: 100%|██████████| 525k/525k [00:00<00:00, 34.2MB/s]
tokenizer.json: 100%|██████████| 2.22M/2.22M [00:01<00:00, 1.75MB/s]
special_tokens_map.json: 100%|██████████| 389/389 [00:00<00:00, 2.46MB/s]


In [32]:
import json

def unpack_json(json_file_path):
    """Load and return the parsed contents of a JSON file.

    This is a best-effort loader: failures are reported on stdout instead of
    being raised, and ``None`` is returned in that case, so callers must check
    the result before using it.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` if the file is missing, does not
        contain valid JSON, or any other error occurs while reading it.
    """
    try:
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(json_file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File '{json_file_path}' not found.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in '{json_file_path}': {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    # Every failure path ends up here; make the "no data" result explicit.
    return None

In [33]:
# Load the LLaVA results for one split. The file name is appended directly to
# RESULT_JSON_PATH, so the configured value is used as a plain string prefix.
result_json_file = RESULT_JSON_PATH + "LLaVa_3W_split_003.json"
unpacked_data = unpack_json(result_json_file)

# unpack_json prints a message and returns None instead of raising; stop here
# with a clear error rather than failing on the iteration below.
if unpacked_data is None:
    raise RuntimeError(f"Could not load results from '{result_json_file}'")

# Join every description into the single "<SEP>"-delimited string that
# calculate_score splits apart again.
descriptions = "<SEP>".join(entry["description"] for entry in unpacked_data)

In [34]:
def calculate_score(image_path, text):
    """Score how well each description matches an image using CLIP.

    Relies on the notebook-level ``model`` and ``processor`` objects.

    Args:
        image_path: Path of the image file to score against.
        text: Candidate descriptions joined by the literal separator
            ``"<SEP>"``. Surrounding whitespace is stripped and empty entries
            are dropped.

    Returns:
        A dict mapping each description to its score (a plain ``float``),
        ordered from highest to lowest score. The dict is empty when ``text``
        contains no non-empty description. Identical descriptions share one
        key, so duplicates collapse into a single entry.
    """
    labels = [label.strip() for label in text.split("<SEP>")]
    labels = [label for label in labels if label]
    if not labels:
        return dict()

    # Inference only: skip autograd bookkeeping, and close the image file as
    # soon as the processor has consumed it.
    with Image.open(image_path) as image, torch.no_grad():
        inputs = processor(
            text=labels,
            images=image,
            return_tensors="pt",
            padding=True,
            # Descriptions longer than the text encoder's context window are
            # cut off instead of making the forward pass fail.
            truncation=True,
        )
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image.numpy()

    # NOTE(review): dividing by 100 presumably undoes CLIP's logit scale so the
    # values read as cosine similarities - confirm for the configured checkpoint.
    # float() keeps the result a plain Python float regardless of NumPy version.
    scores = {
        label: float(score) / 100.0
        for label, score in zip(labels, logits_per_image[0])
    }
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

In [35]:
# Score every loaded description against the configured image; the returned
# dict is the cell's displayed output.
calculate_score(image_path=IMG_PATH, text=descriptions)

{'The food truck is white and red, and it is parked in front of a monument.': 0.34172752380371096,
 'A large group of people is gathered around a food truck, with some of them standing in line to order food.': 0.27247621536254885,
 'There are several traffic lights in the area, with one close to the food truck and others scattered around the scene.': 0.2581420135498047,
 'A bus is visible in the background, adding to the busy atmosphere of the scene.': 0.25417074203491213,
 'The monument is a large stone structure with a statue on top.': 0.2522931671142578,
 'A truck is parked further back in the scene, adding to the overall sense of a bustling urban environment.': 0.24343490600585938,
 'A bench is located near the monument, providing a place for people to sit and enjoy the view.': 0.22047126770019532,
 'The people in the image are of various ages and genders, reflecting the diverse nature of the city.': 0.21911945343017578,
 'A person is holding a cell phone, possibly taking a picture