In [None]:
import json

input_file = "./batch.json"
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# result does not depend on the platform's default locale encoding.
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
import json
from statistics import mean

# Load the JSON data from the file
file_path = 'batch.json'
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# result does not depend on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Models that receive ratings (candidates) and models that give them (judges).
candidate_model = [
    "llama_vid",
    "gpt4v",
    "video_chatgpt",
    "mplug_owl_Video",
    "video_llava",
]
# Assuming 'mplug_owl_Video' is not a judge
judge_model = [
    'video_llava',
    'llama_vid',
    'gpt4v',
]

# Per-dataset container for the collected scores; starts out empty.
dataset2data = dict()

# Function to extract ratings from JSON-formatted strings
import re
import json

def extract_rating(rating_string):
    """
    Extracts the rating number from a judge response.

    Recognised string formats, tried in this order (matching of the
    ``rating`` key is case-insensitive, surrounding whitespace is ignored):
      * ``rating`` key followed by a bare number, e.g. ``{"rating": 4}``
      * ``rating`` key with double brackets, e.g. ``rating: [[4]]``
      * ``rating`` key with single brackets, e.g. ``rating: [4]``
      * leading number followed by a parenthesis, e.g. ``4 (Excellent)``
      * plain number, e.g. ``4``

    Args:
    rating_string (str | int | None): The raw rating. An int is returned
        as-is; any other non-string value (None, dict, ...) yields None
        instead of raising inside ``re.search``.

    Returns:
    int: The extracted rating number or None if not found.
    """
    # Values decoded from JSON may already be numbers. bool is a subclass of
    # int, so exclude it explicitly.
    if isinstance(rating_string, int) and not isinstance(rating_string, bool):
        return rating_string
    if not isinstance(rating_string, str):
        return None

    # The two anchored patterns below only match when the number is the very
    # first character, so drop leading/trailing whitespace first.
    text = rating_string.strip()

    patterns = (
        r"rating['\"]?\s*:\s*['\"]?(\d+)",   # Matches JSON-like formats with 'rating' key
        r"rating['\"]?\s*:\s*['\"]?\[\[(\d+)\]\]",  # Matches JSON-like formats with rating enclosed in double brackets
        r"rating['\"]?\s*:\s*['\"]?\[(\d+)\]",  # Matches JSON-like formats with rating enclosed in single brackets
        r"^(\d+)\s*\(",                      # Matches formats like "4 (Excellent)"
        r"^(\d+)$",                          # Matches plain number formats like "4"
    )

    for pattern in patterns:
        # IGNORECASE so that "Rating: 4" is accepted as well as "rating: 4".
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            return int(match.group(1))

    return None

# Walk every dataset and sample, bucketing each judge's parsed rating per candidate.
for dataset, samples in data.items():
    results = {}
    for judge in judge_model:
        results[judge] = {candidate: [] for candidate in candidate_model}

    for sample_data in samples.values():
        for judge in judge_model:
            # Samples without an entry for this judge contribute nothing.
            if judge not in sample_data:
                continue
            for candidate, raw in sample_data[judge].items():
                parsed = extract_rating(raw)
                if parsed is None:
                    # Report the string that could not be parsed; the None is
                    # still stored below as a placeholder for this sample.
                    print(f'rating_str:{raw}')
                    print(f'rating:{parsed}')
                results[judge][candidate].append(parsed)

    dataset2data[dataset] = results

    # # Calculate average ratings for each candidate model under each judge for the dataset
    # for judge, candidates in results.items():
    #     for candidate, ratings in candidates.items():
    #         if ratings:
    #             average = round(mean(ratings), 2)
    #         else:
    #             average = None
    #         if dataset not in dataset2data:
    #             dataset2data[dataset] = {}
    #         if judge not in dataset2data[dataset]:
    #             dataset2data[dataset][judge] = {}
    #             dataset2data[dataset][judge][candidate] = results[judge][candidate]
                


In [None]:
import json

input_file = "../logs_phase1_processed/batch_with_score.json"
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# result does not depend on the platform's default locale encoding.
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Attach the 'gpt3.5' scores to each dataset entry that dataset2data already holds.
for dataset, samples in data.items():
    baseline_scores = {name: [] for name in candidate_model}
    for sample in samples.values():
        for name, scores in baseline_scores.items():
            # NOTE(review): element 1 of each candidate entry is presumably the
            # numeric score -- confirm against the batch_with_score.json schema.
            scores.append(sample[name][1])
    dataset2data[dataset]['gpt3.5'] = baseline_scores

In [None]:
from sklearn.metrics import cohen_kappa_score

def get_agreement_with_baseline(dataset2data, dataset, judge, candidate):
    """
    Quadratic-weighted Cohen's kappa between one judge and the 'gpt3.5' baseline.

    Ratings are paired by position. A position where either side is None
    (the collection step stores None for ratings it could not parse) is
    dropped from both lists before the score is computed.

    Args:
    dataset2data (dict): Nested mapping dataset -> judge -> candidate -> list of ratings.
    dataset (str): Dataset key to look up.
    judge (str): Judge whose ratings are compared with the baseline.
    candidate (str): Candidate model whose ratings are compared.

    Returns:
    float: The quadratic-weighted kappa, or NaN when no position has a
        rating from both the judge and the baseline.

    Raises:
    ValueError: If the judge and baseline lists differ in length, since
        positional pairing would then be meaningless.
    """
    ratings1 = dataset2data[dataset][judge][candidate]
    ratings2 = dataset2data[dataset]['gpt3.5'][candidate]

    # Check explicitly: zip() below would otherwise silently truncate.
    if len(ratings1) != len(ratings2):
        raise ValueError(
            f"Inconsistent number of ratings for {dataset}/{judge}/{candidate}: "
            f"{len(ratings1)} vs {len(ratings2)} (baseline)"
        )

    paired = [
        (own, base)
        for own, base in zip(ratings1, ratings2)
        if own is not None and base is not None
    ]
    if not paired:
        return float('nan')

    judge_ratings, baseline_ratings = (list(column) for column in zip(*paired))

    # Quadratic weights are used here; unweighted (weights=None) and linear
    # (weights='linear') kappa are the alternatives.
    weighted_kappa_quadratic = cohen_kappa_score(judge_ratings, baseline_ratings, weights='quadratic')
    return weighted_kappa_quadratic

# Agreement of every judge with the 'gpt3.5' baseline, per dataset and
# candidate, plus each judge's mean agreement over all candidates.
dataset2agreement = {}
# Iterate over the keys only: binding the values to a loop variable named
# `data` would overwrite the `data` dict loaded from JSON that the
# collection cells read, breaking them on re-run.
for dataset in dataset2data:
    dataset2agreement[dataset] = {}
    for judge in judge_model:
        if judge == 'gpt3.5':
            # The baseline is appended to judge_model by the averaging cell;
            # if this cell is re-run afterwards, do not compare it with itself.
            continue
        per_candidate = {
            candidate: get_agreement_with_baseline(dataset2data, dataset, judge, candidate)
            for candidate in candidate_model
        }
        # Divide by the number of candidates explicitly rather than by the
        # dict size, which changes as soon as the 'average' key is stored.
        per_candidate['average'] = sum(per_candidate.values()) / len(candidate_model)
        dataset2agreement[dataset][judge] = per_candidate

In [24]:
# Mean rating per dataset and judge: the average, over candidates, of each
# candidate's mean rating.
dataset2average = {}
# Include the baseline among the judges. Guard the append so that re-running
# this cell does not add 'gpt3.5' to judge_model more than once.
if 'gpt3.5' not in judge_model:
    judge_model.append('gpt3.5')
# Iterate over the keys only, so the global `data` dict loaded from JSON is
# not overwritten by a loop variable of the same name.
for dataset in dataset2data:
    dataset2average[dataset] = {}
    for judge in judge_model:
        candidate_means = []
        for candidate in candidate_model:
            # Ratings that could not be parsed are stored as None; leave them
            # out so sum() does not fail on them.
            valid = [r for r in dataset2data[dataset][judge][candidate] if r is not None]
            # A candidate with no usable rating is excluded from the judge's
            # average instead of raising ZeroDivisionError.
            if valid:
                candidate_means.append(sum(valid) / len(valid))
        # None signals that this judge has no usable rating at all.
        dataset2average[dataset][judge] = (
            sum(candidate_means) / len(candidate_means) if candidate_means else None
        )

In [25]:
# Display the per-dataset, per-judge mean ratings (averaged over candidates).
dataset2average

{'cvrr_continuity_and_object_instance_count': {'video_llava': 3.9570621468926555,
  'llama_vid': 3.8237288135593226,
  'gpt4v': 2.464406779661017,
  'gpt3.5': 2.6203389830508472},
 'cvrr_fine_grained_action_understanding': {'video_llava': 3.9678260869565216,
  'llama_vid': 3.8521739130434782,
  'gpt4v': 3.058260869565218,
  'gpt3.5': 2.677391304347826},
 'cvrr_interpretation_of_social_context': {'video_llava': 3.9685714285714284,
  'llama_vid': 3.888571428571429,
  'gpt4v': 2.6592857142857147,
  'gpt3.5': 2.6014285714285714},
 'cvrr_interpretation_of_visual_context': {'video_llava': 3.970695970695971,
  'llama_vid': 3.892307692307692,
  'gpt4v': 2.726739926739927,
  'gpt3.5': 2.58974358974359},
 'cvrr_multiple_actions_in_a_single_video': {'video_llava': 3.942767295597484,
  'llama_vid': 3.752830188679245,
  'gpt4v': 2.5628930817610067,
  'gpt3.5': 2.3465408805031442},
 'cvrr_non_existent_actions_with_existent_scene_depictions': {'video_llava': 3.9,
  'llama_vid': 3.8594202898550725,
  

In [None]:
# for dataset, data in dataset2agreement.items():
#     for judge in judge_model:
#         print(f'{dataset}-{judge}')
#         print(dataset2agreement[dataset][judge]['average'])

In [None]:
# def weighted_average_ratings(ratings, weight_dict):
#     import numpy as np
    
#     # Extract weights for the judges in the order they appear in the ratings dictionary
#     weights = [weight_dict[judge] for judge in ratings.keys()]
    
#     # Normalize weights to sum to 1
#     total_weight = sum(weights)
#     normalized_weights = [w / total_weight for w in weights]

#     # Initialize list to store weighted ratings
#     aggregated_ratings = []

#     # Convert dictionary values to a list of lists for easier manipulation
#     ratings_list = list(ratings.values())

#     # Calculate the weighted average for each rating position
#     for rating_tuple in zip(*ratings_list):
#         weighted_avg = sum(r * w for r, w in zip(rating_tuple, normalized_weights))
#         aggregated_ratings.append(round(weighted_avg, 2))

#     return aggregated_ratings

# Ratings from the judges
# judge_ratings = {
#     'video_llava': [4, 4, 4, 4, 4],
#     'llama_vid': [4, 4, 4, 4, 4],
#     'gpt4v': [2, 1, 1, 1, 2],
# }

# Dictionary of weights, assigning the highest weight to gpt4v
# weights_dict = {
#     'video_llava': 0.05,  # Lowest weight
#     'llama_vid': 0.05,    # Lowest weight (same as video_llava)
#     'gpt4v': 0.9         # Highest weight
# }

In [None]:
# for dataset, data in dataset2data.items():
#     dataset2data[dataset]['interpolated_judge'] = {}
#     for judge in judge_model:
#         for candidate in candidate_model:
#             if candidate not in dataset2data[dataset]['interpolated_judge']:
#                 dataset2data[dataset]['interpolated_judge'][candidate] = {}
#             dataset2data[dataset]['interpolated_judge'][candidate][judge] = dataset2data[dataset][judge][candidate]

# for dataset, data in dataset2data.items():
#     for candidate in candidate_model:
#         dataset2data[dataset]['interpolated_judge'][candidate] = weighted_average_ratings(dataset2data[dataset]['interpolated_judge'][candidate], weights_dict)