In [None]:
import openai
import time
from tqdm import tqdm
import json
from openai import AzureOpenAI
import requests
import concurrent.futures
from functools import partial

# Azure OpenAI client that get_completion_with_retry uses for its chat-completion calls.
# NOTE(review): the endpoint and key are placeholder literals; presumably they are
# replaced with real values before running — confirm, and prefer loading real
# credentials from the environment rather than committing them.
client = AzureOpenAI(
    azure_endpoint="YOUR-API-ENDPOINT",
    api_key="YOUR-API-KEY",
    api_version="2024-05-01-preview"
)

def initialize_openai(api_key, api_base, api_version="2024-05-01-preview"):
    """Configure the module-level ``openai`` settings for an Azure resource.

    Args:
        api_key: Azure OpenAI API key.
        api_base: Base URL (endpoint) of the Azure OpenAI resource.
        api_version: Azure OpenAI REST API version string.

    Note:
        This only touches the ``openai`` module globals. The explicit
        ``client`` object created at module level carries its own settings
        and is not affected by this call.
    """
    openai.api_type = "azure"
    openai.api_key = api_key
    # Legacy (pre-1.0) attribute name; kept so anything reading it still works.
    openai.api_base = api_base
    # openai>=1.0 (required for AzureOpenAI) reads the endpoint from this name.
    openai.azure_endpoint = api_base
    openai.api_version = api_version

def get_completion_with_retry(output_text, deployment_name, max_retries=5, delay=10):
    """Ask the chat deployment to extract the final answer letter from a text.

    Args:
        output_text: Free-form text expected to contain a final answer letter.
        deployment_name: Deployment name passed as ``model`` to the client.
        max_retries: Maximum number of API attempts.
        delay: Seconds to wait between two failed attempts.

    Returns:
        A single uppercase letter ``"A"``-``"Z"``, or ``"Invalid"`` when the
        reply is not exactly one such letter or every attempt failed.
    """
    from string import ascii_uppercase

    prompt = (
        "Extract only the final answer letter (A, B, C, etc.) from the following text. "
        "Provide only the single uppercase letter without any additional text.\n\n"
        f"Text: {output_text}\n\nAnswer:"
    )

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=deployment_name,
                messages=[
                    {"role": "system", "content": "You are an AI assistant that helps people find information."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=2,
                temperature=0,
                top_p=1,
                n=1,
                stop=None
            )
            message = response.to_dict()["choices"][0]["message"]
        except Exception as e:
            # Deliberately broad: any failure of the call is treated as retryable.
            if attempt + 1 >= max_retries:
                # No retry follows, so do not sleep or announce one.
                print(f"Error during OpenAI API call: {e}. Giving up after {max_retries} attempts.")
                break
            print(f"Error during OpenAI API call: {e}. Retrying {attempt + 1}/{max_retries} in {delay} seconds...")
            time.sleep(delay)
            continue

        # A successful call may still carry no text (content missing or None);
        # that is an invalid answer, not a reason to retry the request.
        answer = (message.get("content") or "").strip()
        if len(answer) == 1 and answer in ascii_uppercase:
            return answer
        return "Invalid"

    return "Invalid"

def validate_prediction(prediction, choices):
    """Return ``prediction`` if it labels one of ``choices``, else ``"Invalid"``.

    Choices are labelled consecutively starting at ``"A"``, so a list of
    three choices accepts ``"A"``, ``"B"`` and ``"C"``.
    """
    first_label = ord("A")
    allowed_labels = set(map(chr, range(first_label, first_label + len(choices))))
    return prediction if prediction in allowed_labels else "Invalid"

def process_sample(sample, deployment_name):
    """Re-score one sample and return ``(sample, exact_match)``.

    A sample whose stored ``exact_match`` equals 1 reuses its reference as the
    prediction; any other sample has its answer letter re-extracted from
    ``output_text`` via ``get_completion_with_retry``. The prediction is then
    checked against the sample's choices and compared with the upper-cased
    reference. ``sample`` is updated in place with the new ``prediction`` and
    ``exact_match`` values.
    """
    output_text = sample.get("output_text", "")
    choices = sample.get("choices", [])
    reference = sample.get("reference", "").upper()

    already_correct = sample.get("exact_match", "") == 1
    prediction = (
        reference
        if already_correct
        else get_completion_with_retry(output_text, deployment_name)
    )

    valid_prediction = validate_prediction(prediction, choices)
    exact_match = float(valid_prediction == reference)

    sample.update(prediction=valid_prediction, exact_match=exact_match)
    return sample, exact_match

def correct_evaluation_results(input_file, output_file, deployment_name):
    """Re-score every sample in ``input_file`` and write the result to ``output_file``.

    The input JSON is walked as model -> section -> distract type, where each
    distract-type entry holds a ``"samples"`` list. Each group's samples are
    re-scored with ``process_sample`` on a 10-worker thread pool, and the
    output stores the mean exact-match score, the sample count and the
    updated samples per group.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    process_func = partial(process_sample, deployment_name=deployment_name)
    corrected_data = {}

    for model_name, sections in tqdm(data.items(), desc="Models"):
        model_out = corrected_data[model_name] = {}

        for section, distract_types in sections.items():
            section_out = model_out[section] = {}

            for distract_type, metrics in distract_types.items():
                print(f"distract_type:{distract_type}")
                samples = metrics["samples"]

                with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                    progress = tqdm(
                        executor.map(process_func, samples),
                        total=len(samples),
                        desc=f"Section: {section} | Distract Type: {distract_type}",
                    )
                    results = list(progress)

                count = len(results)
                matched = sum(score for _, score in results)
                section_out[distract_type] = {
                    # Mean of the per-sample scores; 0.0 for an empty group.
                    "exact_match_score": matched / count if count else 0.0,
                    "total_samples": count,
                    "samples": [scored for scored, _ in results],
                }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(corrected_data, f, indent=4, ensure_ascii=False)

    print(f"\nCorrected evaluation results have been saved to '{output_file}'.")

# Main execution
if __name__ == "__main__":
    api_key = "YOUR-API-KEY"
    # The base must be the Azure resource endpoint. The API-version string was
    # previously passed here by mistake; the version is supplied by
    # initialize_openai's own default.
    api_base = "YOUR-API-ENDPOINT"
    initialize_openai(api_key, api_base)

    input_file = "evaluation_results_by_model_and_dataset_mixture.json"
    output_file = "evaluation_results_by_model_and_dataset_mixture_corrected.json"
    deployment_name = "gpt35"

    correct_evaluation_results(input_file, output_file, deployment_name)

# Results by section

In [None]:
import json 
input_file = "evaluation_results_corrected_2B.json"
# Read the corrected results written by the correction step
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file is keyed by model name; only the first model is reported here.
key, data = next(iter(data.items()))
print(key)
# Print the 'overall' exact-match score of every section of that model.
for key, section_metrics in data.items():
    print(f"key:{key}")
    res = section_metrics['overall']['exact_match_score']
    print(f"res:{res}")

# Results by distract_type

In [None]:
import json 

input_file = "evaluation_results_corrected_19B.json"

# Read the corrected results
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file is keyed by model name; only the first model is reported here.
model, data = next(iter(data.items()))

for section, section_data in data.items():
    print(model)
    print(f"Section: {section}")
    for distract_type, distract_data in section_data.items():
        # The aggregated 'overall' entry is skipped; only per-type scores are listed.
        if distract_type == 'overall':
            continue
        exact_match_score = distract_data['exact_match_score']
        print(f"  Distract Type: {distract_type}")
        print(f"  Exact Match Score: {exact_match_score}")
    print()  # Blank line separating sections

# Mixture

In [3]:
import openai
import time
from tqdm import tqdm
import json
from openai import AzureOpenAI
import requests
import concurrent.futures
from functools import partial
import os
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)

# Initialize Azure OpenAI Client
# The key comes from AZURE_OPENAI_API_KEY (the variable this cell's main block
# already reads), so the credential is not tied to a literal in the source.
# The placeholder literal remains only as the fallback when the variable is unset.
client = AzureOpenAI(
    azure_endpoint="YOUR-API-ENDPOINT",
    api_key=os.getenv("AZURE_OPENAI_API_KEY", "YOUR-API-KEY"),
    api_version="2024-05-01-preview"
)

def initialize_openai(api_key, api_base, api_version="2024-05-01-preview"):
    """Configure the module-level ``openai`` settings for an Azure resource.

    Args:
        api_key: Azure OpenAI API key.
        api_base: Base URL (endpoint) of the Azure OpenAI resource.
        api_version: Azure OpenAI REST API version string.

    Note:
        This only touches the ``openai`` module globals. The explicit
        ``client`` object created at module level carries its own settings
        and is not affected by this call.
    """
    openai.api_type = "azure"
    openai.api_key = api_key
    # Legacy (pre-1.0) attribute name; kept so anything reading it still works.
    openai.api_base = api_base
    # openai>=1.0 (required for AzureOpenAI) reads the endpoint from this name.
    openai.azure_endpoint = api_base
    openai.api_version = api_version

def get_completion_with_retry(output_text, deployment_name, max_retries=5, delay=10):
    """Ask the chat deployment to extract the final answer letter from a text.

    Args:
        output_text: Free-form text expected to contain a final answer letter.
        deployment_name: Deployment name passed as ``model`` to the client.
        max_retries: Maximum number of API attempts.
        delay: Seconds to wait between two failed attempts.

    Returns:
        A single uppercase letter ``"A"``-``"Z"``, or ``"Invalid"`` when the
        reply is not exactly one such letter or every attempt failed.
    """
    from string import ascii_uppercase

    prompt = (
        "Extract only the final answer letter (A, B, C, etc.) from the following text. "
        "Provide only the single uppercase letter without any additional text.\n\n"
        f"Text: {output_text}\n\nAnswer:"
    )

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=deployment_name,
                messages=[
                    {"role": "system", "content": "You are an AI assistant that helps people find information."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=2,
                temperature=0,
                top_p=1,
                n=1,
                stop=None
            )
            message = response.to_dict()["choices"][0]["message"]
        except Exception as e:
            # Deliberately broad: any failure of the call is treated as retryable.
            if attempt + 1 >= max_retries:
                # No retry follows, so do not sleep or announce one.
                logging.error(f"Error during OpenAI API call: {e}. Giving up after {max_retries} attempts.")
                break
            logging.error(f"Error during OpenAI API call: {e}. Retrying {attempt + 1}/{max_retries} in {delay} seconds...")
            time.sleep(delay)
            continue

        # A successful call may still carry no text (content missing or None);
        # that is an invalid answer, not a reason to retry the request.
        answer = (message.get("content") or "").strip()
        if len(answer) == 1 and answer in ascii_uppercase:
            return answer
        logging.warning(f"Invalid answer received: '{answer}'.")
        return "Invalid"

    return "Invalid"

def validate_prediction(prediction, choices):
    """Accept ``prediction`` only when it is the label of an existing choice.

    Labels run consecutively from ``"A"``, one per entry in ``choices``.
    Anything outside that range is reported as ``"Invalid"``.
    """
    labels = {chr(code) for code in range(ord("A"), ord("A") + len(choices))}
    if prediction not in labels:
        return "Invalid"
    return prediction

def process_sample(sample, deployment_name):
    """Re-score one sample and return ``(sample, exact_match)``.

    A sample whose stored ``exact_match`` equals 1 reuses its reference as the
    prediction; any other sample has its answer letter re-extracted from
    ``output_text`` via ``get_completion_with_retry``. The prediction is then
    checked against the sample's choices and compared with the upper-cased
    reference. ``sample`` is updated in place with the new ``prediction`` and
    ``exact_match`` values.
    """
    output_text = sample.get("output_text", "")
    choices = sample.get("choices", [])
    reference = sample.get("reference", "").upper()

    previously_matched = sample.get("exact_match", 0.0) == 1
    if not previously_matched:
        prediction = get_completion_with_retry(output_text, deployment_name)
    else:
        prediction = reference

    valid_prediction = validate_prediction(prediction, choices)
    exact_match = float(valid_prediction == reference)

    sample.update(prediction=valid_prediction, exact_match=exact_match)
    return sample, exact_match

def correct_evaluation_results(input_file, output_file, deployment_name):
    """Re-score every sample in ``input_file`` and write the result to ``output_file``.

    The input JSON is walked as model -> section, where each section entry may
    hold a ``"samples"`` list (a missing list is treated as empty). Each
    section's samples are re-scored with ``process_sample`` on a 10-worker
    thread pool, and the output stores the mean exact-match score, the sample
    count and the updated samples per section.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    process_func = partial(process_sample, deployment_name=deployment_name)
    corrected_data = {}

    for model_name, sections in tqdm(data.items(), desc="Models"):
        model_out = corrected_data[model_name] = {}

        for section, metrics in tqdm(sections.items(), desc=f"Processing sections for model {model_name}"):
            samples = metrics.get("samples", [])

            with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                progress = tqdm(
                    executor.map(process_func, samples),
                    total=len(samples),
                    desc=f"Section: {section}",
                )
                results = list(progress)

            count = len(results)
            matched = sum(score for _, score in results)
            model_out[section] = {
                # Mean of the per-sample scores; 0.0 for an empty section.
                "exact_match_score": matched / count if count else 0.0,
                "total_samples": count,
                "samples": [scored for scored, _ in results],
            }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(corrected_data, f, indent=4, ensure_ascii=False)

    logging.info(f"\nCorrected evaluation results have been saved to '{output_file}'.")

# Main execution
if __name__ == "__main__":
    # The key is expected in AZURE_OPENAI_API_KEY. Warn rather than fail when
    # it is missing, so the problem is visible in the log without aborting
    # the run.
    api_key = os.getenv("AZURE_OPENAI_API_KEY")
    if not api_key:
        logging.warning("AZURE_OPENAI_API_KEY is not set; initialize_openai will receive no API key.")

    api_base = "YOUR-API-ENDPOINT"  # Your Azure OpenAI base URL
    initialize_openai(api_key, api_base)

    input_file = "evaluation_results_by_model_and_dataset_mixture.json"  # Update with your actual input file path
    output_file = "evaluation_results_by_model_and_dataset_mixture_corrected.json"  # Desired output file path
    deployment_name = "gpt35"  # Update with your actual deployment name

    correct_evaluation_results(input_file, output_file, deployment_name)


Models:   0%|          | 0/2 [00:00<?, ?it/s]
[A2024-09-24 17:25:23,627 [INFO] HTTP Request: POST YOUR-API-ENDPOINT/openai/deployments/gpt35/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
2024-09-24 17:25:23,633 [INFO] HTTP Request: POST YOUR-API-ENDPOINT/openai/deployments/gpt35/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
2024-09-24 17:25:23,639 [INFO] HTTP Request: POST YOUR-API-ENDPOINT/openai/deployments/gpt35/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
2024-09-24 17:25:23,648 [INFO] HTTP Request: POST YOUR-API-ENDPOINT/openai/deployments/gpt35/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
2024-09-24 17:25:23,661 [INFO] HTTP Request: POST YOUR-API-ENDPOINT/openai/deployments/gpt35/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
2024-09-24 17:25:23,667 [INFO] HTTP Request: POST YOUR-API-ENDPOINT/openai/deployments/gpt35/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 O

In [5]:
import json 
input_file = "evaluation_results_by_model_and_dataset_mixture_cog_corrected.json"
# Read the corrected mixture results
with open(input_file, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Report the exact-match score of every section for every model in the file.
for model, data in raw_data.items():
    print(f"model:{model}")
    for key, section_metrics in data.items():
        print(f"key:{key}")
        res = section_metrics['exact_match_score']
        print(f"res:{res}")

model:THUDM/cogvlm2-llama3-chat-19B
key:add_hint_no_image
res:0.5745007680491552
key:insert_hint_no_image
res:0.7831168831168831
key:add_hint_with_image
res:0.5691244239631337
key:insert_hint_with_image
res:0.7935064935064935


In [None]:
# import openai
# import time
# from tqdm import tqdm
# import json
# from openai import AzureOpenAI
# import requests
# import concurrent.futures
# from functools import partial
# import os
# import logging


# # Configure logging
# logging.basicConfig(
#     level=logging.INFO,
#     format='%(asctime)s [%(levelname)s] %(message)s',
#     handlers=[
#         logging.StreamHandler()
#     ]
# )

# # Initialize Azure OpenAI Client
# client = AzureOpenAI(
#     azure_endpoint="YOUR-API-ENDPOINT",
#     api_key="YOUR-API-KEY",
#     api_version="2024-05-01-preview"
# )


# # Initialize Azure OpenAI Client
# def initialize_openai(api_key, api_base, api_version="2024-05-01-preview"):
#     openai.api_type = "azure"
#     openai.api_key = api_key
#     openai.api_base = api_base
#     openai.api_version = api_version

# def get_completion_with_retry(output_text, deployment_name, max_retries=5, delay=10):
#     prompt = (
#         "Extract only the final answer letter (A, B, C, etc.) from the following text. "
#         "Provide only the single uppercase letter without any additional text.\n\n"
#         f"Text: {output_text}\n\nAnswer:"
#     )
    
#     for attempt in range(max_retries):
#         try:
#             response = client.chat.completions.create(
#                 model=deployment_name,
#                 messages=[
#                     {"role": "system", "content": "You are an AI assistant that helps people find information."},
#                     {"role": "user", "content": prompt}
#                 ],
#                 max_tokens=2,
#                 temperature=0,
#                 top_p=1,
#                 n=1,
#                 stop=None
#             )
#             response_data = response.to_dict()
#             answer = response_data["choices"][0]["message"]["content"].strip()
            
#             if answer in {chr(i) for i in range(ord('A'), ord('Z')+1)}:
#                 return answer
#             else:
#                 logging.warning(f"Invalid answer received: '{answer}'.")
#                 return "Invalid"
#         except Exception as e:
#             logging.error(f"Error during OpenAI API call: {e}. Retrying {attempt + 1}/{max_retries} in {delay} seconds...")
#             time.sleep(delay)
    
#     return "Invalid"

# def validate_prediction(prediction, choices):
#     valid_options = {chr(ord("A") + i) for i in range(len(choices))}
#     if prediction in valid_options:
#         return prediction
#     else:
#         return "Invalid"

# def process_sample(sample, deployment_name):
#     output_text = sample.get("output_text", "")
#     choices = sample.get("choices", [])
#     reference = sample.get("reference", "").upper()
    
#     prediction = get_completion_with_retry(output_text, deployment_name)
#     valid_prediction = validate_prediction(prediction, choices)
#     exact_match = 1.0 if valid_prediction == reference else 0.0
    
#     sample["prediction"] = valid_prediction
#     sample["exact_match"] = exact_match
    
#     return sample, exact_match

# def correct_evaluation_results(input_file, output_file, deployment_name):
#     with open(input_file, "r", encoding="utf-8") as f:
#         data = json.load(f)
    
#     corrected_data = {}
    
#     for model_name, sections in tqdm(data.items(), desc="Models"):
#         corrected_data[model_name] = {}
#         for section, metrics in tqdm(sections.items(), desc=f"Processing sections for model {model_name}"):
#             corrected_data[model_name][section] = {
#                 "exact_match_score": 0.0,
#                 "total_samples": 0,
#                 "samples": []
#             }
            
#             samples = metrics.get("samples", [])
#             process_func = partial(process_sample, deployment_name=deployment_name)
            
#             with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#                 results = list(tqdm(executor.map(process_func, samples), total=len(samples), desc=f"Section: {section}"))
            
#             for sample, exact_match in results:
#                 corrected_data[model_name][section]["samples"].append(sample)
#                 corrected_data[model_name][section]["exact_match_score"] += exact_match
#                 corrected_data[model_name][section]["total_samples"] += 1
            
#             if corrected_data[model_name][section]["total_samples"] > 0:
#                 corrected_data[model_name][section]["exact_match_score"] /= corrected_data[model_name][section]["total_samples"]
#             else:
#                 corrected_data[model_name][section]["exact_match_score"] = 0.0
    
#     with open(output_file, "w", encoding="utf-8") as f:
#         json.dump(corrected_data, f, indent=4, ensure_ascii=False)
    
#     logging.info(f"\nCorrected evaluation results have been saved to '{output_file}'.")

# # Main execution
# if __name__ == "__main__":
#     # Ensure the environment variable is set
#     api_key = os.getenv("AZURE_OPENAI_API_KEY")
#     api_base = "YOUR-API-ENDPOINT"  # Your Azure OpenAI base URL
#     initialize_openai(api_key, api_base)
    
#     input_file = "evaluation_results_by_model_and_dataset_mixture_cog.json"  # Update with your actual input file path
#     output_file = "evaluation_results_by_model_and_dataset_mixture_cog_corrected.json"  # Desired output file path
#     deployment_name = "gpt35"  # Update with your actual deployment name
    
#     correct_evaluation_results(input_file, output_file, deployment_name)