# Load data

In [33]:
import json
from statistics import mean

# Load the raw judge outputs. JSON text is UTF-8 by specification, so the
# encoding is pinned rather than left to the platform's locale default.
file_path = 'batch.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Models whose answers are rated, and the models acting as judges.
candidate_model = ["llama_vid", "gpt4v", "video_chatgpt", "mplug_owl_Video", "video_llava"]
judge_model = ['gpt4v']  # Assuming 'mplug_owl_Video' is not a judge

# Filled in below as: dataset name -> judge -> candidate -> list of ratings.
dataset2data = {}

# Function to extract ratings from JSON-formatted strings
import re
import json

def extract_rating(rating_string):
    """
    Extracts the rating number from a judge response, tolerating several formats.

    Recognised shapes (surrounding whitespace is ignored, and the ``rating``
    key is matched case-insensitively):
      * key/value text such as ``"rating": 4``, ``rating: "4"``,
        ``rating: [4]`` or ``rating: [[4]]``
      * a leading number followed by a parenthesised label, e.g. ``4 (Excellent)``
      * a bare number, e.g. ``4``

    Args:
    rating_string (str | int | None): The raw rating. An ``int`` is returned
        unchanged; any other non-string value is treated as "no rating".

    Returns:
    int: The extracted rating number or None if not found.
    """
    # Already-numeric ratings need no parsing. bool is excluded because it is
    # an int subclass and True/False are not meaningful ratings.
    if isinstance(rating_string, int) and not isinstance(rating_string, bool):
        return rating_string
    if not isinstance(rating_string, str):
        return None

    # Strip so that the ^...$ anchored patterns accept padded input like " 4 ".
    text = rating_string.strip()

    # Tried in order; the first pattern that matches wins.
    patterns = [
        r"rating['\"]?\s*:\s*['\"]?(\d+)",          # 'rating' key followed by a plain number
        r"rating['\"]?\s*:\s*['\"]?\[\[(\d+)\]\]",  # 'rating' key with the number in double brackets
        r"rating['\"]?\s*:\s*['\"]?\[(\d+)\]",      # 'rating' key with the number in single brackets
        r"^(\d+)\s*\(",                             # formats like "4 (Excellent)"
        r"^(\d+)$",                                 # plain number formats like "4"
    ]

    for pattern in patterns:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            return int(match.group(1))

    return None

# Datasets included in this analysis; any other dataset in the input is skipped.
selected_dataset = (
    "cvrr_time_order_understanding",
    "cvrr_continuity_and_object_instance_count",
    "cvrr_fine_grained_action_understanding",
    "cvrr_interpretation_of_social_context",
    "cvrr_interpretation_of_visual_context",
    "cvrr_multiple_actions_in_a_single_video",
)

# Gather, per dataset, every rating each judge gave to each candidate.
# NOTE(review): a sample without the judge key, or a judge entry lacking a
# candidate, contributes nothing, so lists could end up with different lengths
# — confirm the input is complete.
for dataset, samples in data.items():
    if dataset in selected_dataset:
        results = {}
        for judge in judge_model:
            results[judge] = {candidate: [] for candidate in candidate_model}

        for sample_id, sample_data in samples.items():
            for judge in judge_model:
                if judge not in sample_data:
                    continue
                for candidate, rating_str in sample_data[judge].items():
                    rating = extract_rating(rating_str)
                    if rating is None:
                        # Log the unparseable text; 3 is stored in its place
                        # (presumably the scale midpoint — TODO confirm).
                        print(f'rating_str:{rating_str}')
                        print(f'rating:{rating}')
                    results[judge][candidate].append(3 if rating is None else rating)

        dataset2data[dataset] = results


rating_str:
rating:None
rating_str:
rating:None
rating_str:
rating:None


In [34]:
# Number of datasets for which ratings were collected.
len(dataset2data)

6

In [35]:
import json
# gpt_debate score
# NOTE: this rebinds `data`; the contents of 'batch.json' are no longer
# reachable through that name after this cell runs.
input_file = "../logs_phase1_processed/correct/batch_with_score.json"
# JSON text is UTF-8 by specification; do not depend on the locale default.
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [36]:
# Attach the 'gpt_debate' ratings to every selected dataset, alongside the
# ratings already stored in dataset2data.
for dataset, samples in data.items():
    if dataset in selected_dataset:
        # Element [1] of each candidate entry is used as the rating
        # (presumably a (text, score) pair — verify against the file schema).
        debate_ratings = {
            candidate: [sample[candidate][1] for sample in samples.values()]
            for candidate in candidate_model
        }
        dataset2data[dataset]['gpt_debate'] = debate_ratings

# Rating statistics per judge

In [37]:
# Judges to summarise: the configured judges plus the 'gpt_debate' baseline.
# dict.fromkeys removes duplicates while keeping order, so the baseline is
# counted only once even when judge_model already contains 'gpt_debate'.
judge_model_all = list(dict.fromkeys(judge_model + ['gpt_debate']))

# Histogram of ratings (keys 0..5) per judge, pooled over every dataset and
# candidate. A rating outside 0..5 raises KeyError.
judge2statistics = {judge: {i: 0 for i in range(6)} for judge in judge_model_all}
# Iterate over values only: binding the per-dataset dict to `data` would
# overwrite the JSON loaded earlier under that name.
for judge2ratings in dataset2data.values():
    for judge in judge_model_all:
        for candidate in candidate_model:
            for rating in judge2ratings[judge][candidate]:
                judge2statistics[judge][rating] += 1
judge2statistics

{'gpt4v': {0: 0, 1: 2784, 2: 1275, 3: 708, 4: 1854, 5: 529},
 'gpt_debate': {0: 0, 1: 3561, 2: 1427, 3: 551, 4: 1197, 5: 414}}

In [38]:
# Configured judges plus the 'gpt_debate' baseline, de-duplicated in order so
# the baseline appears once even when judge_model already contains it.
judge_model_all = list(dict.fromkeys(judge_model + ['gpt_debate']))

# Mean rating as dataset -> candidate -> judge. Each value is computed directly
# from the stored ratings, so no entry is ever a list that later becomes a float.
# An empty rating list raises ZeroDivisionError.
candidate2statistics = {}
for dataset, judge2ratings in dataset2data.items():
    if dataset not in selected_dataset:
        continue
    candidate2statistics[dataset] = {
        candidate: {
            judge: sum(judge2ratings[judge][candidate]) / len(judge2ratings[judge][candidate])
            for judge in judge_model_all
        }
        for candidate in candidate_model
    }
candidate2statistics

{'cvrr_continuity_and_object_instance_count': {'llama_vid': {'gpt4v': 2.07909604519774,
   'gpt_debate': 1.9435028248587571},
  'gpt4v': {'gpt4v': 2.6271186440677967, 'gpt_debate': 2.4745762711864407},
  'video_chatgpt': {'gpt4v': 2.073446327683616,
   'gpt_debate': 2.135593220338983},
  'mplug_owl_Video': {'gpt4v': 2.1129943502824857,
   'gpt_debate': 1.8531073446327684},
  'video_llava': {'gpt4v': 2.1129943502824857,
   'gpt_debate': 1.9661016949152543}},
 'cvrr_fine_grained_action_understanding': {'llama_vid': {'gpt4v': 2.3086956521739133,
   'gpt_debate': 1.934782608695652},
  'gpt4v': {'gpt4v': 3.108695652173913, 'gpt_debate': 3.082608695652174},
  'video_chatgpt': {'gpt4v': 2.1826086956521737,
   'gpt_debate': 1.9652173913043478},
  'mplug_owl_Video': {'gpt4v': 2.391304347826087,
   'gpt_debate': 2.0695652173913044},
  'video_llava': {'gpt4v': 2.3869565217391306,
   'gpt_debate': 2.0652173913043477}},
 'cvrr_interpretation_of_social_context': {'llama_vid': {'gpt4v': 1.87142857142

# Agreement

In [39]:
from sklearn.metrics import cohen_kappa_score

def get_agreement_with_baseline(dataset2data, dataset, judge, candidate,
                                baseline='gpt_debate', weights='quadratic'):
    """
    Computes Cohen's kappa between a judge's ratings and a baseline judge's ratings.

    Args:
    dataset2data (dict): Nested mapping dataset -> judge -> candidate -> list of ratings.
    dataset (str): Dataset key to look up.
    judge (str): Judge whose ratings are compared against the baseline.
    candidate (str): Candidate model whose ratings are compared.
    baseline (str): Judge key used as the reference. Defaults to 'gpt_debate'.
    weights (str | None): Passed through to ``cohen_kappa_score``
        ('linear', 'quadratic' or None). Defaults to 'quadratic'.

    Returns:
    float: The kappa statistic returned by ``cohen_kappa_score``.

    Raises:
    KeyError: If the dataset, judge, baseline or candidate key is missing.
    ValueError: If the two rating lists differ in length.
    """
    ratings1 = dataset2data[dataset][judge][candidate]
    ratings2 = dataset2data[dataset][baseline][candidate]

    # Kappa pairs ratings position by position, so unequal lengths mean the
    # two judges did not rate the same set of samples.
    if len(ratings1) != len(ratings2):
        raise ValueError(
            f"Rating count mismatch for {dataset}/{candidate}: "
            f"{judge} has {len(ratings1)}, {baseline} has {len(ratings2)}"
        )

    return cohen_kappa_score(ratings1, ratings2, weights=weights)

# Agreement of every judge with the 'gpt_debate' baseline, stored as
# dataset -> judge -> {candidate: kappa, ..., 'average': mean kappa}.
dataset2agreement = {}
# Iterate over keys only, so the global `data` is not overwritten.
for dataset in dataset2data:
    dataset2agreement[dataset] = {}
    for judge in judge_model:
        if judge == 'gpt_debate':
            # Comparing the baseline with itself carries no information.
            continue
        per_candidate = {
            candidate: get_agreement_with_baseline(dataset2data, dataset, judge, candidate)
            for candidate in candidate_model
        }
        # The right-hand side is evaluated before 'average' is inserted, so
        # only the candidate scores are summed; the divisor is the candidate
        # count rather than the size of the dict being built.
        per_candidate['average'] = sum(per_candidate.values()) / len(candidate_model)
        dataset2agreement[dataset][judge] = per_candidate

In [40]:
# Show the agreement scores per dataset, judge and candidate, including each judge's 'average'.
dataset2agreement

{'cvrr_continuity_and_object_instance_count': {'gpt4v': {'llama_vid': 0.29907670153741905,
   'gpt4v': 0.41423291542865426,
   'video_chatgpt': 0.3889085224955824,
   'mplug_owl_Video': 0.2515637483193358,
   'video_llava': 0.30288946130585725,
   'average': 0.33133426981736974}},
 'cvrr_fine_grained_action_understanding': {'gpt4v': {'llama_vid': 0.48335431881138247,
   'gpt4v': 0.3307940538538857,
   'video_chatgpt': 0.6069815696326133,
   'mplug_owl_Video': 0.5221222521101938,
   'video_llava': 0.5766174291743367,
   'average': 0.5039739247164825}},
 'cvrr_interpretation_of_social_context': {'gpt4v': {'llama_vid': 0.5458612975391499,
   'gpt4v': 0.45141065830721006,
   'video_chatgpt': 0.6193746131855825,
   'mplug_owl_Video': 0.5943039550836522,
   'video_llava': 0.6005217674873635,
   'average': 0.5622944583205917}},
 'cvrr_interpretation_of_visual_context': {'gpt4v': {'llama_vid': 0.2916566134246972,
   'gpt4v': 0.400340758186966,
   'video_chatgpt': 0.41180602939742994,
   'mplug

In [41]:
# Mean rating per dataset and judge (including the 'gpt_debate' baseline),
# computed as the unweighted mean of the per-candidate mean ratings.
#
# A separate list is built instead of appending to judge_model: mutating the
# shared config would add another 'gpt_debate' on every re-run and change what
# any other cell iterating over judge_model sees.
judges_with_baseline = list(dict.fromkeys(judge_model + ['gpt_debate']))

dataset2average = {}
for dataset, judge2ratings in dataset2data.items():
    dataset2average[dataset] = {}
    for judge in judges_with_baseline:
        candidate_means = [
            sum(judge2ratings[judge][candidate]) / len(judge2ratings[judge][candidate])
            for candidate in candidate_model
        ]
        dataset2average[dataset][judge] = sum(candidate_means) / len(candidate_model)

In [42]:
# Show the mean rating per dataset and judge.
dataset2average

{'cvrr_continuity_and_object_instance_count': {'gpt4v': 2.201129943502825,
  'gpt_debate': 2.074576271186441},
 'cvrr_fine_grained_action_understanding': {'gpt4v': 2.4756521739130433,
  'gpt_debate': 2.223478260869565},
 'cvrr_interpretation_of_social_context': {'gpt4v': 2.322857142857143,
  'gpt_debate': 2.137857142857143},
 'cvrr_interpretation_of_visual_context': {'gpt4v': 2.7282051282051283,
  'gpt_debate': 2.183882783882784},
 'cvrr_multiple_actions_in_a_single_video': {'gpt4v': 2.3150943396226413,
  'gpt_debate': 1.89937106918239},
 'cvrr_time_order_understanding': {'gpt4v': 2.7197368421052635,
  'gpt_debate': 2.025}}

In [43]:
# Print each judge's average agreement with the baseline, one dataset at a time.
# The loop walks the judges actually stored in dataset2agreement, so it cannot
# hit a KeyError, and it no longer rebinds the global `data`.
for dataset, judge2scores in dataset2agreement.items():
    for judge, scores in judge2scores.items():
        if judge == 'gpt_debate':
            continue
        print(f'{dataset}-{judge}')
        print(scores['average'])

cvrr_continuity_and_object_instance_count-gpt4v
0.33133426981736974
cvrr_fine_grained_action_understanding-gpt4v
0.5039739247164825
cvrr_interpretation_of_social_context-gpt4v
0.5622944583205917
cvrr_interpretation_of_visual_context-gpt4v
0.39723295571751116
cvrr_multiple_actions_in_a_single_video-gpt4v
0.3907170734051716
cvrr_time_order_understanding-gpt4v
0.34094229000150933


In [44]:
# Inspect one dataset by explicit key instead of relying on whatever value the
# loop variable `dataset` was left holding by a previously executed cell.
dataset2agreement['cvrr_time_order_understanding']

{'gpt4v': {'llama_vid': 0.23355831109715908,
  'gpt4v': 0.37167349271359185,
  'video_chatgpt': 0.35144749290444655,
  'mplug_owl_Video': 0.39161189034384003,
  'video_llava': 0.35642026294850915,
  'average': 0.34094229000150933}}