In [1]:
import json
import os
import time

from llments.lm.base.api import APIBasedLM
from tqdm import tqdm

In [None]:
# Presumably the retry budget for API calls.
# NOTE(review): nothing visible in this notebook reads this constant yet.
MAX_API_RETRY = 10000
# Seconds to sleep between consecutive evaluation requests (rate limiting).
REQ_TIME_GAP = 4

# Fall back to the placeholder only when no key is configured, so that a real
# key already exported in the environment is never overwritten by this cell.
os.environ.setdefault("OPENAI_API_KEY", "<openai-api-key>")

### Positional Bias of the LLM Evaluator

In [5]:
def gen_prompt(ques, ans1, ans2):
    """Build the (system, user) prompt pair for one pairwise comparison.

    Args:
        ques: The prompt both assistants responded to.
        ans1: Response placed in the "Assistant 1" slot.
        ans2: Response placed in the "Assistant 2" slot.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple of strings.
    """
    system_message = 'You are a helpful and precise assistant for checking the quality of the answer.'

    # Grading rubric, written as explicit pieces so the embedded indentation
    # and trailing spaces are visible. The wording (including the "evluation"
    # typo) is reproduced verbatim so the evaluator sees an unchanged prompt.
    rubric = (
        "We would like to request your feedback on the performance of two AI assistants in response to the prompt displayed above.\n"
        "    Please rate the helpfulness, relevance, accuracy, level of details of their responses. \n"
        "\n"
        "    Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\n"
        "    Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment. \n"
        "    Then, output two lines indicating the scores for Assistant 1 and 2, respectively.\n"
        "\n"
        "    Output with the following format:\n"
        "    Evaluation evidence: <your evluation explanation here>\n"
        "    Score of the Assistant 1: <score>\n"
        "    Score of the Assistant 2: <score>"
    )

    # Question first, then both answers in their slots, then the rubric.
    user_message = (
        f"[Prompt]\n{ques}\n\n"
        f"[The Start of Assistant 1's Answer]\n{ans1}\n[The End of Assistant 1's Answer]\n\n"
        f"[The Start of Assistant 2's Answer]\n{ans2}\n[The End of Assistant 2's Answer]\n\n"
        f"[System]\n{rubric}\n"
    )
    return system_message, user_message

def query_gpt(system_prompt, user_prompt, eval_model, num_sequences):
    """Request chat completions from the evaluator model.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        eval_model: Model identifier; it is prefixed with ``"openai/"`` before
            being handed to ``APIBasedLM``.
        num_sequences: Passed as ``num_return_sequences`` to ``chat_generate``.

    Returns:
        Whatever ``APIBasedLM.chat_generate`` returns. ``get_eval`` iterates it
        as a collection of message lists whose items have ``"role"`` and
        ``"content"`` keys.

    Raises:
        RuntimeError: If the request fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    base_url = "https://cmu.litellm.ai"
    try:
        return APIBasedLM("openai/" + eval_model, base_url).chat_generate(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=1,
            max_new_tokens=512,
            num_return_sequences=num_sequences,
        )
    except Exception as e:
        # Surface the underlying error in the cell output, then re-raise with
        # explicit chaining so the traceback keeps the root cause.
        print(f'Error: {e}')
        raise RuntimeError("Failed during query processing.") from e
    
def _scored_reviews(responses):
    """Return ``(content, first, second)`` for each parsable assistant message.

    ``first`` / ``second`` are the scores in the order the review lists them,
    i.e. for the answers shown as "Assistant 1" and "Assistant 2". Messages
    whose scores cannot be parsed are dropped.
    """
    parsed = []
    for response in responses:
        for message in response:
            if message["role"] != "assistant":
                continue
            content = message['content']
            first, second = parse_score_from_review(content)
            if first == -1 or second == -1:
                continue
            parsed.append((content, first, second))
    return parsed


def get_eval(ques, ans1, ans2, eval_model, k, bpc=1):
    """Score two answers with the evaluator model, optionally in both orders.

    Args:
        ques: The prompt both answers respond to.
        ans1: First model's answer.
        ans2: Second model's answer.
        eval_model: Evaluator model identifier forwarded to ``query_gpt``.
        k: Number of reviews requested per presentation order.
        bpc: When ``1``, the evaluator is queried a second time with the two
            answers swapped and those scores are averaged in as well.

    Returns:
        ``(contents, contents_bpc, [score1, score2])`` where ``contents`` holds
        the parsable reviews from the original order, ``contents_bpc`` those
        from the swapped order, and the scores are the means for ``ans1`` and
        ``ans2``. Both scores are ``-1`` when no review could be parsed.
    """
    all_scores = []
    contents = []
    contents_bpc = []

    system_prompt, user_prompt = gen_prompt(ques, ans1, ans2)
    responses = query_gpt(system_prompt, user_prompt, eval_model, k)
    for content, first, second in _scored_reviews(responses):
        all_scores.append([first, second])
        contents.append(content)

    if bpc == 1:
        system_prompt, user_prompt_bpc = gen_prompt(ques, ans2, ans1)
        responses_bpc = query_gpt(system_prompt, user_prompt_bpc, eval_model, k)
        for content, first, second in _scored_reviews(responses_bpc):
            # ans2 occupied the "Assistant 1" slot in this pass, so the first
            # parsed score belongs to ans2: swap back before aggregating.
            all_scores.append([second, first])
            contents_bpc.append(content)

    if all_scores:
        score1 = sum(pair[0] for pair in all_scores) / len(all_scores)
        score2 = sum(pair[1] for pair in all_scores) / len(all_scores)
    else:
        score1, score2 = -1, -1
    return contents, contents_bpc, [score1, score2]

def parse_score_from_review(review):
    """Extract the two scores from the last two non-blank lines of a review.

    Each of those lines is expected to end with ``: <number>``; the text after
    the last colon is converted with ``float``. Blank lines (including trailing
    newlines and ``\\r\\n`` endings) are ignored so they do not shift the score
    lines out of position.

    Args:
        review: Raw review text returned by the evaluator.

    Returns:
        ``[score1, score2]`` as floats, or ``[-1, -1]`` when the review does
        not contain two parsable score lines.
    """
    try:
        lines = [line for line in review.split("\n") if line.strip()]
        first_line, second_line = lines[-2], lines[-1]
        return [
            float(first_line.split(":")[-1].strip()),
            float(second_line.split(":")[-1].strip()),
        ]
    except (AttributeError, IndexError, ValueError):
        # AttributeError: review is not a string; IndexError: fewer than two
        # non-blank lines; ValueError: the text after the colon is not a number.
        return [-1, -1]
    
def get_json_list(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to the file; a leading ``~`` is expanded.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_path = os.path.expanduser(file_path)
    # Explicit UTF-8 keeps decoding independent of the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at EOF) carry no record.
        return [json.loads(line) for line in f if line.strip()]
    
def get_text_list(file_path):
    """Read a text file into a list of whitespace-stripped lines.

    Blank lines are kept (as empty strings) so that list indices keep matching
    line numbers in the file.

    Args:
        file_path: Path to the file; a leading ``~`` is expanded, matching
            ``get_json_list``.

    Returns:
        One stripped string per line, in file order.
    """
    file_path = os.path.expanduser(file_path)
    # Explicit UTF-8 keeps decoding independent of the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
    
def get_results(m1, m2, eval_model, bpc=0, k=1):
    """Compare two models' responses with the evaluator and tally the outcome.

    Reads ``responses_communityLM/prompts.txt`` and the two
    ``responses_communityLM/{model}_responses.jsonl`` files, evaluates every
    prompt with ``get_eval``, writes one JSON record per prompt to
    ``review/review_{m1}_vs_{m2}_eval=..._mec={k}_bpc={bpc}.json`` and prints
    the win/tie/loss counts from ``m1``'s perspective.

    Args:
        m1: Name of the model whose answers are passed as ``ans1``.
        m2: Name of the model whose answers are passed as ``ans2``.
        eval_model: Evaluator model identifier; only the part after the last
            ``/`` is used in the output file name.
        bpc: Forwarded to ``get_eval`` (``1`` also evaluates the swapped order).
        k: Number of reviews requested per presentation order.

    Raises:
        AssertionError: If the three input files differ in length, or the two
            response files disagree on a prompt.
    """
    prompts_list = get_text_list("responses_communityLM/prompts.txt")
    answer1_jsons = get_json_list(f"responses_communityLM/{m1}_responses.jsonl")
    answer2_jsons = get_json_list(f"responses_communityLM/{m2}_responses.jsonl")
    output = f"review/review_{m1}_vs_{m2}_eval={eval_model.split('/')[-1]}_mec={k}_bpc={bpc}.json"

    assert len(prompts_list) == len(answer1_jsons) == len(answer2_jsons)

    # Create the output directory before spending any API calls; otherwise a
    # missing directory would only surface after every evaluation has run.
    os.makedirs(os.path.dirname(output), exist_ok=True)

    reviews = []
    for i in tqdm(range(len(prompts_list))):
        assert (
            answer1_jsons[i]["prompt"]
            == answer2_jsons[i]["prompt"]
        )

        ques = prompts_list[i]
        ans1 = answer1_jsons[i]["response"]
        ans2 = answer2_jsons[i]["response"]

        reviews.append(get_eval(ques, ans1, ans2, eval_model, k, bpc))

        # To avoid the rate limit set by OpenAI
        time.sleep(REQ_TIME_GAP)

    model1_vs_model2 = {
        'win': 0,
        'tie': 0,
        'loss': 0
    }
    # Prompts for which get_eval returned its -1 sentinel (no parsable review).
    unscored = 0
    with open(output, "w", encoding="utf-8") as output_review_file:
        for idx, (contents, contents_bpc, (score1, score2)) in enumerate(reviews):
            results = {
                "prompt": prompts_list[idx],
                "review": contents,
                "review_bpc": contents_bpc,
                "score": [score1, score2],
            }
            output_review_file.write(json.dumps(results) + "\n")

            if score1 == -1 or score2 == -1:
                # A failed evaluation is not a tie; keep it out of the tally.
                unscored += 1
            elif score1 == score2:
                model1_vs_model2['tie'] += 1
            elif score1 > score2:
                model1_vs_model2['win'] += 1
            else:
                model1_vs_model2['loss'] += 1

    print(f'Evaluation results (model1_vs_model2):\n{model1_vs_model2}')
    if unscored:
        print(f'Prompts without a parsable evaluation (excluded from the tally): {unscored}')

### Democratic LM vs Republican LM

In [6]:
# Democratic LM answers fill the "Assistant 1" slot, Republican LM answers the
# "Assistant 2" slot; 'win' counts prompts where the Democratic LM scored higher.
m1, m2 = "democratic", "republican"
eval_model = "neulab/gpt-4o-mini-2024-07-18"

get_results(m1=m1, m2=m2, eval_model=eval_model)

100%|██████████| 60/60 [06:44<00:00,  6.74s/it]

Evaluation results (model1_vs_model2):
{'win': 20, 'tie': 17, 'loss': 23}





### Republican LM vs Democratic LM

In [7]:
# Same pair as the previous run with the slots swapped: Republican LM answers
# are now "Assistant 1", so 'win' counts prompts where the Republican LM scored higher.
m1, m2 = "republican", "democratic"
eval_model = "neulab/gpt-4o-mini-2024-07-18"

get_results(m1=m1, m2=m2, eval_model=eval_model)

100%|██████████| 60/60 [13:33<00:00, 13.55s/it]   

Evaluation results (model1_vs_model2):
{'win': 17, 'tie': 15, 'loss': 28}





### GPT-2 (pre-trained LM) vs Democratic LM (fine-tuned LM)

In [8]:
# GPT-2 is model 1 here, so 'win' counts prompts where GPT-2 outscored the
# Democratic LM.
m1, m2 = "gpt2", "democratic"
eval_model = "neulab/gpt-4o-mini-2024-07-18"

get_results(m1=m1, m2=m2, eval_model=eval_model)

100%|██████████| 60/60 [07:29<00:00,  7.49s/it]

Evaluation results (model1_vs_model2):
{'win': 46, 'tie': 1, 'loss': 13}





In [9]:
# GPT-2 is model 1 here, so 'win' counts prompts where GPT-2 outscored the
# Republican LM.
m1, m2 = "gpt2", "republican"
eval_model = "neulab/gpt-4o-mini-2024-07-18"

get_results(m1=m1, m2=m2, eval_model=eval_model)

100%|██████████| 60/60 [06:49<00:00,  6.83s/it]

Evaluation results (model1_vs_model2):
{'win': 54, 'tie': 1, 'loss': 5}



