In [1]:
import json
import os
import time

from llments.lm.base.api import APIBasedLM
from tqdm import tqdm

In [None]:
# Upper bound on API retry attempts.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
MAX_API_RETRY = 10000
# Seconds to sleep after each evaluated question (see get_results) to stay under the API rate limit.
REQ_TIME_GAP = 4

# Placeholder only: replace with a real key before running. This assignment
# overwrites any OPENAI_API_KEY already present in the process environment.
os.environ["OPENAI_API_KEY"] = "<openai-api-key>"

### Positional Bias of the LLM Evaluator
An evaluation template $T(Q, R_1, R_2)$ with three placeholders is used to query the LLM for evaluation. For each test question $q$, given two responses $r_1$ and $r_2$ from Assistant 1 and Assistant 2, respectively, the researchers populate these responses into the corresponding slots of the evaluation template to form a prompt: $T(Q = q, R_1 = r_1, R_2 = r_2)$. The prompt is then used to query the LLM in order to obtain the comparison result.

In [3]:
def gen_prompt(ques, ans1, ans2):
    """Build the system and user messages for a pairwise answer comparison.

    Args:
        ques (str): The question being asked.
        ans1 (str): The answer shown in the "Assistant 1" slot.
        ans2 (str): The answer shown in the "Assistant 2" slot.

    Returns:
        tuple: ``(system_prompt, user_prompt)``. The user prompt contains the
        question, both answers, and the scoring instructions with the expected
        output format.
    """
    # Scoring instructions, one entry per output line. Leading/trailing spaces
    # and the spelling are part of the prompt sent to the model; keep them as is.
    instruction_lines = (
        "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.",
        "    Please rate the helpfulness, relevance, accuracy, level of details of their responses. ",
        "",
        "    Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.",
        "    Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment. ",
        "    Then, output two lines indicating the scores for Assistant 1 and 2, respectively.",
        "",
        "    Output with the following format:",
        "    Evaluation evidence: <your evluation explanation here>",
        "    Score of the Assistant 1: <score>",
        "    Score of the Assistant 2: <score>",
    )
    instructions = "\n".join(instruction_lines)

    user_message = (
        f"[Question]\n{ques}\n\n"
        f"[The Start of Assistant 1's Answer]\n{ans1}\n[The End of Assistant 1's Answer]\n\n"
        f"[The Start of Assistant 2's Answer]\n{ans2}\n[The End of Assistant 2's Answer]\n\n"
        f"[System]\n{instructions}\n"
    )
    system_message = 'You are a helpful and precise assistant for checking the quality of the answer.'
    return system_message, user_message

def query_gpt(system_prompt, user_prompt, eval_model, num_sequences):
    """Query the evaluator model with one system/user message pair.

    Args:
        system_prompt (str): The system-level prompt setting the context for the responses.
        user_prompt (str): The user message containing the question, answers and instructions.
        eval_model (str): The name of the model to be queried; it is prefixed with ``"openai/"``.
        num_sequences (int): The number of response sequences to request.

    Returns:
        Whatever ``APIBasedLM.chat_generate`` returns. NOTE(review): callers in
        this notebook iterate it as a list of message lists (dicts with
        ``"role"``/``"content"``) — confirm against the llments API.

    Raises:
        RuntimeError: If constructing the client or generating fails. The
        original exception is attached as ``__cause__``.
    """
    base_url = "https://cmu.litellm.ai"
    try:
        judge = APIBasedLM("openai/" + eval_model, base_url)
        return judge.chat_generate(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=1,
            max_new_tokens=512,
            num_return_sequences=num_sequences
        )
    except Exception as e:
        print(f'Error: {e}')
        # Chain the underlying error so the traceback shows the real failure.
        raise RuntimeError("Failed during query processing.") from e
    
def _collect_scored_reviews(responses, swapped=False):
    """Return (scores, review texts) parsed from the assistant messages in `responses`.

    When `swapped` is True the parsed pair is reversed, so that index 0 always
    refers to the caller's first answer even though it was shown second.
    """
    scores, reviews = [], []
    for response in responses:
        for message in response:
            if message["role"] != "assistant":
                continue
            content = message['content']
            first, second = parse_score_from_review(content)
            # -1 marks a review whose scores could not be parsed; skip it.
            if first == -1 or second == -1:
                continue
            scores.append([second, first] if swapped else [first, second])
            reviews.append(content)
    return scores, reviews


def get_eval(ques, ans1, ans2, eval_model, k, bpc=1):
    """Score two answers to one question with the evaluator model.

    Args:
        ques (str): The question being asked.
        ans1 (str): Answer of the first model.
        ans2 (str): Answer of the second model.
        eval_model (str): Name of the evaluator model passed to ``query_gpt``.
        k (int): Number of response sequences requested per query.
        bpc (int): When equal to 1, the pair is additionally evaluated with the
            two answers in swapped positions and those scores are mapped back
            to (ans1, ans2) order before averaging.

    Returns:
        tuple: ``(contents, contents_bpc, [score1, score2])`` where ``contents``
        are the parsed reviews in original order, ``contents_bpc`` the parsed
        reviews in swapped order, and the scores are the means over all parsed
        reviews (``[-1, -1]`` if none could be parsed).
    """
    system_prompt, user_prompt = gen_prompt(ques, ans1, ans2)
    responses = query_gpt(system_prompt, user_prompt, eval_model, k)
    all_scores, contents = _collect_scored_reviews(responses)

    contents_bpc = []
    if bpc == 1:
        system_prompt, user_prompt_bpc = gen_prompt(ques, ans2, ans1)
        responses_bpc = query_gpt(system_prompt, user_prompt_bpc, eval_model, k)
        # In this query "Assistant 1" is ans2, so the parsed scores are swapped back.
        scores_bpc, contents_bpc = _collect_scored_reviews(responses_bpc, swapped=True)
        all_scores.extend(scores_bpc)

    if all_scores:
        score1 = sum(pair[0] for pair in all_scores) / len(all_scores)
        score2 = sum(pair[1] for pair in all_scores) / len(all_scores)
    else:
        score1, score2 = -1, -1
    return contents, contents_bpc, [score1, score2]

def parse_score_from_review(review):
    """Parses the score for two assistants from the review text.

    The scores are read from the last two non-empty lines of the review; on
    each line the text after the final ``:`` is converted to a float.

    Args:
        review (str): The review text that includes the scores.

    Returns:
        list: A list containing the scores for Assistant 1 and Assistant 2.
        If parsing fails, returns [-1, -1].
    """
    try:
        # Ignore blank lines so a trailing newline does not hide the scores.
        lines = [line for line in review.splitlines() if line.strip()]
        score1 = lines[-2].split(":")[-1].strip()
        score2 = lines[-1].split(":")[-1].strip()
        return [float(score1), float(score2)]
    except (AttributeError, IndexError, TypeError, ValueError):
        # Non-string input, fewer than two lines, or non-numeric score text.
        return [-1, -1]
    
def get_json_list(file_path):
    """Reads a JSON lines file and returns a list of JSON objects.

    Blank or whitespace-only lines are skipped. The file is read as UTF-8.

    Args:
        file_path (str): Path to the JSONL file; a leading ``~`` is expanded.

    Returns:
        list: A list of JSON objects from the file, in file order.
    """
    file_path = os.path.expanduser(file_path)
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
    
def get_results(m1, m2, eval_model, bpc=0, k=1):
    """Evaluates every question for two models and writes the reviews to disk.

    Reads ``question.jsonl`` and ``answer/answer_{m1|m2}.jsonl``, scores each
    answer pair with ``get_eval``, writes one JSON object per question to
    ``review/review_{m1}_vs_{m2}_eval=..._mec={k}_bpc={bpc}.json`` and prints
    the win/tie/loss tally of ``m1`` against ``m2``.

    Args:
        m1 (str): Identifier of the model shown as Assistant 1.
        m2 (str): Identifier of the model shown as Assistant 2.
        eval_model (str): The evaluation model to be used.
        bpc (int): Forwarded to ``get_eval``; when 1, each pair is also
            evaluated with the answers in swapped positions.
        k (int): Number of response sequences to generate per query.

    Returns:
        None
    """
    question_jsons = get_json_list("question.jsonl")
    answer1_jsons = get_json_list(f"answer/answer_{m1}.jsonl")
    answer2_jsons = get_json_list(f"answer/answer_{m2}.jsonl")
    output = f"review/review_{m1}_vs_{m2}_eval={eval_model.split('/')[-1]}_mec={k}_bpc={bpc}.json"

    assert len(question_jsons) == len(answer1_jsons) == len(answer2_jsons)

    # Make sure the output directory exists *before* the slow API loop, so a
    # missing folder cannot discard a finished run at write time.
    os.makedirs(os.path.dirname(output), exist_ok=True)

    reviews = []
    for i in tqdm(range(len(question_jsons))):
        # The three files must list the same questions in the same order.
        assert (
            answer1_jsons[i]["question_id"]
            == question_jsons[i]["question_id"]
            == answer2_jsons[i]["question_id"]
        )

        ques = question_jsons[i]["text"]
        ans1 = answer1_jsons[i]["text"]
        ans2 = answer2_jsons[i]["text"]

        reviews.append(get_eval(ques, ans1, ans2, eval_model, k, bpc))

        # To avoid the rate limit set by OpenAI
        time.sleep(REQ_TIME_GAP)

    model1_vs_model2 = {
        'win': 0,
        'tie': 0,
        'loss': 0
    }
    with open(output, "w") as output_review_file:
        for idx, (contents, contents_bpc, [score1, score2]) in enumerate(reviews):
            results = {
                "question_id": question_jsons[idx]["question_id"],
                "question": question_jsons[idx]["text"],
                "review": contents,
                "review_bpc": contents_bpc,
                "score": [score1, score2],
            }
            output_review_file.write(json.dumps(results) + "\n")

            # Note: a question whose reviews could not be parsed has scores
            # [-1, -1] (see get_eval) and is therefore counted as a tie.
            if score1 == score2:
                model1_vs_model2['tie'] += 1
            elif score1 > score2:
                model1_vs_model2['win'] += 1
            else:
                model1_vs_model2['loss'] += 1

    print(f'Evaluation results (model1_vs_model2):\n{model1_vs_model2}')

The win rate of Vicuna-13B fluctuates significantly depending on whether it is positioned as Assistant 1 or Assistant 2.
The conflict rate is the proportion of questions on which the same evaluator gives conflicting results when only the positions of the two models are swapped.

### Vicuna-13B v.s. ChatGPT

In [4]:
# gpt35 is shown as Assistant 1 and vicuna-13b as Assistant 2.
# get_results runs with its defaults (bpc=0, k=1) and writes the reviews under review/.
m1="gpt35"
m2="vicuna-13b"
eval_model="neulab/gpt-4o-mini-2024-07-18"

get_results(m1, m2, eval_model)

100%|██████████| 80/80 [09:52<00:00,  7.41s/it]

Evaluation results (model1_vs_model2):
{'win': 41, 'tie': 0, 'loss': 39}





In [5]:
# Same pair with positions swapped: vicuna-13b is Assistant 1, gpt35 is Assistant 2.
# get_results runs with its defaults (bpc=0, k=1) and writes the reviews under review/.
m1="vicuna-13b"
m2="gpt35"
eval_model="neulab/gpt-4o-mini-2024-07-18"

get_results(m1, m2, eval_model)

100%|██████████| 80/80 [09:39<00:00,  7.25s/it]

Evaluation results (model1_vs_model2):
{'win': 36, 'tie': 1, 'loss': 43}





In [6]:
# Load the per-question reviews for both orderings (one JSON object per line).
with open('review/review_gpt35_vs_vicuna-13b_eval=gpt-4o-mini-2024-07-18_mec=1_bpc=0.json', 'r') as file:
    gpt35_vs_vicuna13b_results = [json.loads(line) for line in file]

with open('review/review_vicuna-13b_vs_gpt35_eval=gpt-4o-mini-2024-07-18_mec=1_bpc=0.json', 'r') as file:
    vicuna13b_vs_gpt35_results = [json.loads(line) for line in file]

# Both files must cover the same questions; the rates below divide by this count.
num_questions = len(gpt35_vs_vicuna13b_results)
assert num_questions == len(vicuna13b_vs_gpt35_results), "review files differ in length"
assert num_questions > 0, "no reviews loaded"

vicuna13b_win_rate_first = 0
vicuna13b_win_rate_second = 0
conflict = 0

for as_second, as_first in zip(gpt35_vs_vicuna13b_results, vicuna13b_vs_gpt35_results):
    # Vicuna is Assistant 2 in the first file and Assistant 1 in the second.
    vicuna_as_second_winner = as_second['score'][0] < as_second['score'][1]
    vicuna_as_first_winner = as_first['score'][0] > as_first['score'][1]
    if vicuna_as_second_winner:
        vicuna13b_win_rate_second += 1
    if vicuna_as_first_winner:
        vicuna13b_win_rate_first += 1
    # A conflict is counted when Vicuna wins in exactly one of the two orderings.
    if vicuna_as_first_winner != vicuna_as_second_winner:
        conflict += 1


print("Vicuna-13B v.s. ChatGPT | Evaluator: ChatGPT")
print(f"Vicuna-13b win rate as assistant 1: {vicuna13b_win_rate_first / num_questions * 100}%")
print(f"Vicuna-13b win rate as assistant 2: {vicuna13b_win_rate_second / num_questions * 100}%")
print(f"Conflict rate: {conflict}/{num_questions} ({conflict / num_questions * 100}%)")
    

Vicuna-13B v.s. ChatGPT | Evaluator: ChatGPT
Vicuna-13b win rate as assistant 1: 45.0%
Vicuna-13b win rate as assistant 2: 48.75%
Conflict rate: 13/80 (16.25%)


### Vicuna-13B v.s. Alpaca-13B

In [7]:
# alpaca-13b is shown as Assistant 1 and vicuna-13b as Assistant 2.
# get_results runs with its defaults (bpc=0, k=1) and writes the reviews under review/.
m1="alpaca-13b"
m2="vicuna-13b"
eval_model="neulab/gpt-4o-mini-2024-07-18"

get_results(m1, m2, eval_model)

100%|██████████| 80/80 [16:30<00:00, 12.38s/it]   

Evaluation results (model1_vs_model2):
{'win': 4, 'tie': 0, 'loss': 76}





In [8]:
# Same pair with positions swapped: vicuna-13b is Assistant 1, alpaca-13b is Assistant 2.
# get_results runs with its defaults (bpc=0, k=1) and writes the reviews under review/.
m1="vicuna-13b"
m2="alpaca-13b"
eval_model="neulab/gpt-4o-mini-2024-07-18"

get_results(m1, m2, eval_model)

100%|██████████| 80/80 [16:17<00:00, 12.21s/it]   

Evaluation results (model1_vs_model2):
{'win': 77, 'tie': 0, 'loss': 3}





In [11]:
# Load the per-question reviews for both orderings (one JSON object per line).
with open('review/review_alpaca-13b_vs_vicuna-13b_eval=gpt-4o-mini-2024-07-18_mec=1_bpc=0.json', 'r') as file:
    alpaca13b_vs_vicuna13b_results = [json.loads(line) for line in file]

with open('review/review_vicuna-13b_vs_alpaca-13b_eval=gpt-4o-mini-2024-07-18_mec=1_bpc=0.json', 'r') as file:
    vicuna13b_vs_alpaca13b_results = [json.loads(line) for line in file]

# Size the loop from this cell's own data rather than a list left over from
# another cell, and use the same count as the divisor for the rates below.
num_questions = len(alpaca13b_vs_vicuna13b_results)
assert num_questions == len(vicuna13b_vs_alpaca13b_results), "review files differ in length"
assert num_questions > 0, "no reviews loaded"

vicuna13b_win_rate_first = 0
vicuna13b_win_rate_second = 0
conflict = 0

for as_second, as_first in zip(alpaca13b_vs_vicuna13b_results, vicuna13b_vs_alpaca13b_results):
    # Vicuna is Assistant 2 in the first file and Assistant 1 in the second.
    vicuna_as_second_winner = as_second['score'][0] < as_second['score'][1]
    vicuna_as_first_winner = as_first['score'][0] > as_first['score'][1]
    if vicuna_as_second_winner:
        vicuna13b_win_rate_second += 1
    if vicuna_as_first_winner:
        vicuna13b_win_rate_first += 1
    # A conflict is counted when Vicuna wins in exactly one of the two orderings.
    if vicuna_as_first_winner != vicuna_as_second_winner:
        conflict += 1


print("Vicuna-13B v.s. Alpaca-13B | Evaluator: ChatGPT")
print(f"Vicuna-13b win rate as assistant 1: {vicuna13b_win_rate_first / num_questions * 100}%")
print(f"Vicuna-13b win rate as assistant 2: {vicuna13b_win_rate_second / num_questions * 100}%")
print(f"Conflict rate: {conflict}/{num_questions} ({conflict / num_questions * 100}%)")
    

Vicuna-13B v.s. Alpaca-13B | Evaluator: ChatGPT
Vicuna-13b win rate as assistant 1: 96.25%
Vicuna-13b win rate as assistant 2: 95.0%
Conflict rate: 3/80 (3.75%)
