In [2]:
import openai
import json
from openai import OpenAI
import os
from tqdm import tqdm
import numpy as np

# Take the OpenAI API key from the environment rather than hardcoding it in the
# notebook; prompt for it (without echo) only when the variable is not set.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
# The client is created without arguments and relies on the variable set above.
client = OpenAI()

def check_match_with_gpt(question, ground_truth, predicted):
    """Ask the chat model whether a predicted answer matches the ground truth.

    Args:
        question: Question text, included in the prompt for context.
        ground_truth: Reference answer.
        predicted: Answer to be judged against the reference.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for '1' (match) or '0' (no match), but the reply is not validated
        here, so callers should still guard against other text. An empty
        string is returned when the response carries no message content.
    """
    # Construct the prompt for the judge model
    prompt = f"Question: {question}\nGround Truth Answer: {ground_truth}\nPredicted Answer: {predicted}\nDoes the predicted answer match the ground truth? Answer 1 for match and 0 for not match. Use semantic meaning not exact match. Synonyms are also treated as a match, e.g., football and soccer, playground and ground track field, building and rooftop, pond and swimming pool. Do not explain the reason.\n"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ]
            }
        ],
        max_tokens=100,
        # Reduce sampling randomness so repeated judgments of the same pair agree as far as the API allows.
        temperature=0,
    )

    answer = response.choices[0].message.content

    # Callers compare the result against exactly '1' / '0'; stray whitespace
    # (e.g. "1\n") would otherwise be treated as an invalid judgment.
    return (answer or '').strip()



# VRSBench

In [None]:
# Resume point: only records from this index onward are judged in this run.
START_INDEX = 37342
INPUT_PATH = 'Results/outputs_RSBench_new/rsbench_vqa_v2.json'
OUTPUT_PATH = 'Results/outputs_RSBench_new/rsbench_vqa_v2_gpt.json'

# The input is JSON Lines: one QA record per line (blank lines are ignored).
with open(INPUT_PATH, 'r') as qa_file:
    qa_list = [json.loads(line) for line in qa_file if line.strip()]

# Ground truths in this closed set are never sent to GPT. Built once, not per record.
closed_set_answers = {'yes', 'no'} | {str(n) for n in range(100)}

# Iterate over the list and check matches
results = []
# Append when resuming so results already written for earlier records are not
# truncated; start a fresh file only when processing from the beginning.
with open(OUTPUT_PATH, 'a' if START_INDEX > 0 else 'w') as f:
    for qa in tqdm(qa_list[START_INDEX:]):
        question = qa['question']
        ground_truth = qa['ground_truth'].lower()
        predicted = qa['answer'].lower()
        if ground_truth in predicted:
            match_result = '1'
        elif ground_truth in closed_set_answers:
            # NOTE(review): this branch is only reached when ground_truth is not a
            # substring of predicted, so the equality below can never hold and the
            # result is always '0'. The substring test above also accepts e.g. 'no'
            # inside 'not' or '1' inside '10' -- confirm this is the intended metric.
            match_result = '1' if ground_truth == predicted else '0'
        elif qa.get('correct') not in ('1', '0'):
            # No usable cached judgment on the record: ask the GPT judge.
            match_result = check_match_with_gpt(question, ground_truth, predicted)
        else:
            match_result = qa['correct']

        result = {
            'question_id': qa['question_id'],
            'image_id': qa['image_id'],
            "type": qa['type'],
            "question": question,
            "ground_truth": ground_truth,
            "predicted": predicted,
            "correct": match_result,
        }
        results.append(result)

        # Write and flush per record so an interrupted run keeps its progress.
        f.write(json.dumps(result) + '\n')
        f.flush()

# Preview the first few results of this run.
for result in results[:6]:
    print(result)


In [None]:
# Reload the judged results (JSON Lines) so the score covers everything in the file.
with open('Results/outputs_RSBench_new/rsbench_vqa_v2_gpt.json', 'r') as f:
    results = [json.loads(line) for line in f if line.strip()]

# Only well-formed '1' / '0' judgments add to the numerator; any other value
# counts as wrong because the denominator is the total number of records.
correct = sum(int(result['correct']) for result in results if result['correct'] in ['1', '0'])
# Avoid ZeroDivisionError when the results file is empty.
accuracy = correct / len(results) if results else float('nan')
print(f"Correct: {correct}/{len(results)}:", accuracy)

# Metrics per question type

In [None]:
import inflect

# Create an engine instance
convert = inflect.engine()

data_path = 'Results/outputs_MGM-7B_RSBench-new/rsbench_vqa_v2_gpt.json'

correct = 0
total = 0

all_types = ['object category', 'object existence', 'object quantity', 'object color', 'object shape', 'object size', 'object position', 'object direction', 'image', 'scene type', 'reasoning', 'rural or urban']

print('number of question types:', len(all_types))

# Spelled-out numbers for 0..99; not used further in this cell.
all_numbers = [convert.number_to_words(x) for x in range(100)]

# Per-type counters, keyed by question type and initialised to zero.
correct_per_type = {k: 0 for k in all_types}
total_per_type = {k: 0 for k in all_types}
invalid_type = 0
skip_qas = 0  # never incremented in this cell
# 'image' and 'rural or urban' questions are counted under 'scene type'.
type_aliases = {'image': 'scene type', 'rural or urban': 'scene type'}

with open(data_path, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            continue
        # Convert JSON string to Python dictionary
        item = json.loads(line)

        q_type = item['type'].lower()
        q_type = type_aliases.get(q_type, q_type)

        known_type = q_type in total_per_type
        if known_type:
            total_per_type[q_type] += 1
        else:
            print('unknown type:', q_type)
            invalid_type += 1

        # Unknown-type questions still count toward the overall totals.
        if item['correct'] == '1':
            correct += 1
            if known_type:
                correct_per_type[q_type] += 1

        total += 1

print('number of questions:', total, 'invalid_type:', invalid_type, 'valid', sum(total_per_type.values()))
# Guard against an empty results file.
overall_acc = correct / total * 100 if total else float('nan')
print('Overall acc:', overall_acc)
# divide by the number of questions of that type
print('##############')
acc_list = []
for k in all_types:
    # Types with no questions (including the aliased ones) are left out of the report.
    if total_per_type[k] == 0:
        continue
    acc = correct_per_type[k] / total_per_type[k] * 100
    print(f'{k} accuracy: {acc}, out of {total_per_type[k]}')
    acc_list.append(acc)

In [None]:
# Emit one LaTeX table row: every per-type accuracy followed by their mean,
# each rendered with one decimal place and separated by " & ".
print(' & '.join(format(value, '.1f') for value in [*acc_list, np.mean(acc_list)]))

In [None]:
# Display the skipped-QA counter. The metrics cell sets it to 0 and no visible
# code increments it, so this is expected to show 0.
skip_qas