In [None]:
#  eval long answer generation metrics
import sys
sys.path.append("../")
import json
from tqdm import tqdm
import evaluate
# Load the ROUGE metric through `evaluate` from the relative path
# ../evaluate_utils/rouge/; the resulting object is used by select_candidate
# and by the corpus-level scoring further down.
rouge=evaluate.load("../evaluate_utils/rouge/")

def select_candidate(generated_answer, gold_answers):
    """Pick the gold answer that best matches ``generated_answer`` by ROUGE-L.

    Args:
        generated_answer: The model output to compare against the references.
        gold_answers: Either a single reference string or a sequence of
            reference strings.

    Returns:
        ``gold_answers`` itself when it is a string, its only element when it
        holds exactly one reference, otherwise the reference with the highest
        "rougeL" score (the earliest one wins ties).
    """
    # A bare string or a single reference needs no scoring at all.
    if isinstance(gold_answers, str):
        return gold_answers
    if len(gold_answers) == 1:
        return gold_answers[0]

    # Index -1 is the starting choice, so the last reference is returned when
    # no candidate scores strictly above zero.
    best_idx, best_score = -1, 0
    for idx, reference in enumerate(gold_answers):
        scores = rouge.compute(predictions=[generated_answer], references=[reference])
        rouge_l = scores["rougeL"]
        if rouge_l > best_score:
            best_idx, best_score = idx, rouge_l
    return gold_answers[best_idx]

def mean_rouge(rouge_results):
    """Average per-example ROUGE scores and report them as percentages.

    Args:
        rouge_results: Mapping from metric name (e.g. "rouge1", "rougeL") to
            a list of per-example scores.

    Returns:
        A dict with the same keys whose values are the mean score of each
        metric multiplied by 100 and rounded to two decimals.

    Raises:
        ZeroDivisionError: If a metric's score list is empty.
    """
    # Each metric is averaged over its own number of samples, so the function
    # does not require a "rouge1" entry and stays correct if lists differ in
    # length.
    return {
        metric: round(sum(scores) / len(scores) * 100, 2)
        for metric, scores in rouge_results.items()
    }


# Experiment configuration for the long-answer evaluation; these values are
# interpolated into the result-file paths used by the rest of this cell.
dataset = "eli5"
split = "test"
chat_model = "qwen72b"
search_engine = "kiltbm25"
rerank_model = "e5base"

# Echo the configuration so the printed metrics can be traced to their setup.
print(f"dataset: {dataset}, split: {split}, chat_model: {chat_model}, search_engine: {search_engine}, rerank_model: {rerank_model}")
#  no search chat results
# Every line of the JSONL file is parsed as one record; the context manager
# closes the file handle as soon as parsing is done.
no_search_chat_results_file=f'../user_intent_data/{dataset}/{chat_model}/without_search/{chat_model}-{dataset}-{split}.jsonl'
with open(no_search_chat_results_file, "r", encoding="utf-8") as jsonl_file:
    no_search_chat_results = [json.loads(line) for line in jsonl_file]
no_search_generated_answers = [
    data_line[f"{chat_model}_without_search_answer"] for data_line in no_search_chat_results
]

# Gold references are read from "answer" when the first record has that key,
# otherwise from "long_answers"; the first record decides for the whole file.
if "answer" in no_search_chat_results[0]:
    gold_answers = [data_line["answer"] for data_line in no_search_chat_results]
else:
    gold_answers = [data_line["long_answers"] for data_line in no_search_chat_results]

# no_search_gold_answers=[]
# 
# for gen, gold in tqdm(zip(no_search_generated_answers, gold_answers), total=len(no_search_generated_answers)):
#     no_search_gold_answers.append(select_candidate(gen,gold))
# print("no search results end to end metrics")
# rouge_result=rouge.compute(predictions=no_search_generated_answers, references=no_search_gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

#  cot chat results
# Parse the chain-of-thought result file (one JSON record per line); the
# context manager closes the file handle after reading.
cot_chat_results_file=f'../user_intent_data/{dataset}/{chat_model}/cot/{chat_model}-{dataset}-{split}.jsonl'
with open(cot_chat_results_file, "r", encoding="utf-8") as jsonl_file:
    cot_chat_results = [json.loads(line) for line in jsonl_file]
cot_generated_answers = [data_line[f"{chat_model}_cot_answer"] for data_line in cot_chat_results]
# Filled only by the (currently commented-out) candidate-selection loop.
cot_gold_answers = []

# for gen, gold in tqdm(zip(cot_generated_answers, gold_answers), total=len(cot_generated_answers)):
#     cot_gold_answers.append(select_candidate(gen,gold))
# print("cot results end to end metrics")
# rouge_result=rouge.compute(predictions=cot_generated_answers, references=cot_gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})
    

#  vanilla search chat results
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    vanilla_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/vanilla_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    vanilla_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/vanilla_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(vanilla_search_results_file, "r", encoding="utf-8") as jsonl_file:
    vanilla_search_results = [json.loads(line) for line in jsonl_file]
vanilla_search_generated_answers = [
    data_line[f"{chat_model}_vanilla_search_answer"] for data_line in vanilla_search_results
]
# vanilla_search_gold_answers=[]
# 
# for gen, gold in tqdm(zip(vanilla_search_generated_answers, gold_answers), total=len(vanilla_search_generated_answers)):
#     vanilla_search_gold_answers.append(select_candidate(gen,gold))
# print("vanilla search results end to end metrics")
# rouge_result=rouge.compute(predictions=vanilla_search_generated_answers, references=vanilla_search_gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

#  judge-gated vanilla search results (the judge only decides whether
#  retrieval is required)
judge_model="v0115"
v0110_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0110_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0110_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in v0110_rewrite_search_results]
# When the judge flags an example as known, keep the no-search answer;
# otherwise take the vanilla-search answer at the same index.
v0110_judge_generated_answers = [
    no_search_chat_results[idx][f"{chat_model}_without_search_answer"]
    if known
    else vanilla_search_results[idx][f"{chat_model}_vanilla_search_answer"]
    for idx, known in enumerate(knowns)
]
# For each example choose the gold reference that select_candidate returns
# for the generated answer.
v0110_judge_gold_answers = [
    select_candidate(gen, gold)
    for gen, gold in tqdm(zip(v0110_judge_generated_answers, gold_answers), total=len(v0110_judge_generated_answers))
]

print(f"{judge_model} judge results end to end metrics")
rouge_result = rouge.compute(predictions=v0110_judge_generated_answers, references=v0110_judge_gold_answers)
# Report every ROUGE value as a percentage rounded to two decimals.
print({k: round(v * 100, 2) for k, v in rouge_result.items()})

#  gpt4 rewrite chat results
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    gpt4_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/gpt4_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    gpt4_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/gpt4_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(gpt4_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    gpt4_rewrite_search_results = [json.loads(line) for line in jsonl_file]
gpt4_rewrite_search_generated_answers = [
    data_line[f"{chat_model}_gpt4_rewrite_search_answer"] for data_line in gpt4_rewrite_search_results
]
# gpt4_rewrite_search_gold_answers=[]
# 
# for gen, gold in tqdm(zip(gpt4_rewrite_search_generated_answers, gold_answers), total=len(gpt4_rewrite_search_generated_answers)):
#     gpt4_rewrite_search_gold_answers.append(select_candidate(gen,gold))
# 
# print("gpt4 rewrite results end to end metrics")
# rouge_result=rouge.compute(predictions=gpt4_rewrite_search_generated_answers, references=gpt4_rewrite_search_gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

#  judge-gated gpt4 rewrite results (the judge only decides whether
#  retrieval is required)
v0110_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0110_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0110_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in v0110_rewrite_search_results]
# When the judge flags an example as known, keep the no-search answer;
# otherwise take the gpt4-rewrite search answer at the same index.
v0110_judge_generated_answers = [
    no_search_chat_results[idx][f"{chat_model}_without_search_answer"]
    if known
    else gpt4_rewrite_search_results[idx][f"{chat_model}_gpt4_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]
# v0110_judge_gold_answers=[]
# for gen, gold in tqdm(zip(v0110_judge_generated_answers, gold_answers), total=len(v0110_judge_generated_answers)):
#     v0110_judge_gold_answers.append(select_candidate(gen,gold))
# 
# print(f"{judge_model} judge results end to end metrics")
# rouge_result=rouge.compute(predictions=v0110_judge_generated_answers, references=gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})


# rewrite-model chat results (rewrite_model selects the result directory and
# the answer key)
rewrite_model="v0118tinyllamav0104"
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    v0105_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    v0105_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0105_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0105_rewrite_search_results = [json.loads(line) for line in jsonl_file]
v0105_rewrite_search_generated_answers = [
    data_line[f"{chat_model}_{rewrite_model}_rewrite_search_answer"] for data_line in v0105_rewrite_search_results
]
# v0105_rewrite_search_gold_answers=[]
# 
# for gen, gold in tqdm(zip(v0105_rewrite_search_generated_answers, gold_answers), total=len(v0105_rewrite_search_generated_answers)):
#     v0105_rewrite_search_gold_answers.append(select_candidate(gen,gold))
# 
# print(f"{rewrite_model} rewrite results end to end metrics")
# rouge_result=rouge.compute(predictions=v0105_rewrite_search_generated_answers, references=v0105_rewrite_search_gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

#  judge-gated rewrite-model results (the judge only decides whether
#  retrieval is required)
v0110_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0110_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0110_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in v0110_rewrite_search_results]
# When the judge flags an example as known, keep the no-search answer;
# otherwise take the current rewrite model's search answer at the same index.
v0110_judge_generated_answers = [
    no_search_chat_results[idx][f"{chat_model}_without_search_answer"]
    if known
    else v0105_rewrite_search_results[idx][f"{chat_model}_{rewrite_model}_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]
# v0110_judge_gold_answers=[]
# for gen, gold in tqdm(zip(v0110_judge_generated_answers, gold_answers), total=len(v0110_judge_generated_answers)):
#     v0110_judge_gold_answers.append(select_candidate(gen,gold))
# 
# print(f"{judge_model} judge results end to end metrics")
# rouge_result=rouge.compute(predictions=v0110_judge_generated_answers, references=gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

# rewrite-model chat results (rewrite_model selects the result directory and
# the answer key)
rewrite_model="v0119"
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    v0105_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    v0105_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0105_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0105_rewrite_search_results = [json.loads(line) for line in jsonl_file]
v0105_rewrite_search_generated_answers = [
    data_line[f"{chat_model}_{rewrite_model}_rewrite_search_answer"] for data_line in v0105_rewrite_search_results
]
# v0105_rewrite_search_gold_answers=[]
# 
# for gen, gold in tqdm(zip(v0105_rewrite_search_generated_answers, gold_answers), total=len(v0105_rewrite_search_generated_answers)):
#     v0105_rewrite_search_gold_answers.append(select_candidate(gen,gold))
# 
# print(f"{rewrite_model} rewrite results end to end metrics")
# rouge_result=rouge.compute(predictions=v0105_rewrite_search_generated_answers, references=v0105_rewrite_search_gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

#  judge-gated rewrite-model results (the judge only decides whether
#  retrieval is required)
v0110_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0110_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0110_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in v0110_rewrite_search_results]
# When the judge flags an example as known, keep the no-search answer;
# otherwise take the current rewrite model's search answer at the same index.
v0110_judge_generated_answers = [
    no_search_chat_results[idx][f"{chat_model}_without_search_answer"]
    if known
    else v0105_rewrite_search_results[idx][f"{chat_model}_{rewrite_model}_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]
# v0110_judge_gold_answers=[]
# for gen, gold in tqdm(zip(v0110_judge_generated_answers, gold_answers), total=len(v0110_judge_generated_answers)):
#     v0110_judge_gold_answers.append(select_candidate(gen,gold))
# 
# print(f"{judge_model} judge results end to end metrics")
# rouge_result=rouge.compute(predictions=v0110_judge_generated_answers, references=gold_answers)
# print({k: round(v * 100, 2) for k, v in rouge_result.items()})

In [None]:
#  eval short answer matching metrics
import string
import re

import sys
sys.path.append("../")
from evaluate_utils.rouge_chinese.rouge import Rouge
import json
from transformers import AutoTokenizer, LlamaTokenizer
import evaluate

def normalize_answer(s):
    """Return a canonical form of ``s`` for answer comparison.

    The text is lowercased, ASCII punctuation is deleted (not replaced by a
    space), the standalone words "a", "an" and "the" are blanked out, and
    runs of whitespace are collapsed to single spaces with the ends trimmed.
    """
    lowered = s.lower()
    # Delete every character listed in string.punctuation in a single pass.
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    # Articles are removed after punctuation so that e.g. "the," is caught.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())

def calculate_short_answer_EM(generated_answer, gold_answers):
    """Score a generated answer against gold short answers by containment.

    Both sides are passed through ``normalize_answer``; a gold answer counts
    as matched when its normalized form is a substring of the normalized
    generated answer.

    Args:
        generated_answer: The model output as a string.
        gold_answers: A sequence of gold short-answer strings.

    Returns:
        A dict with "recall" (fraction of gold answers matched) and
        "exact_match" (True when at least one gold answer matched). For an
        empty ``gold_answers`` the result is recall 0.0 and exact_match False.
    """
    # With no gold answers nothing can match; report zeros rather than
    # dividing by zero below.
    if len(gold_answers) == 0:
        return {"recall": 0.0, "exact_match": False}

    generated_answer = normalize_answer(generated_answer)
    # NOTE: a gold answer that normalizes to "" is contained in every string
    # and therefore always counts as a match.
    match = sum(
        1 for gold_answer in gold_answers
        if normalize_answer(gold_answer) in generated_answer
    )
    return {
        "recall": match / len(gold_answers),
        "exact_match": match > 0,
    }
            

# Experiment configuration for the short-answer evaluation; these values are
# interpolated into the result-file paths used by the rest of this cell.
dataset = "asqa"
split = "test"
chat_model = "qwen72b"
# chat_model="llama70b"
search_engine = "kiltbm25"
rerank_model = "e5base"

# Echo the configuration so the printed metrics can be traced to their setup.
print(f"dataset: {dataset}, split: {split}, chat_model: {chat_model}, search_engine: {search_engine}, rerank_model: {rerank_model}")
#  no search chat results
no_search_chat_results_file=f'../user_intent_data/{dataset}/{chat_model}/without_search/{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(no_search_chat_results_file, "r", encoding="utf-8") as jsonl_file:
    no_search_chat_results = [json.loads(line) for line in jsonl_file]
generated_answers = [data_line[f"{chat_model}_without_search_answer"] for data_line in no_search_chat_results]
# Answers used whenever the judge marks an example as known; may be replaced
# by the chain-of-thought answers later in the cell.
no_search_or_cot_answers = generated_answers

# Gold answers: prefer "short_answers"; otherwise use "answer", wrapping a
# scalar answer in a list so every example has a list of gold answers. The
# first record decides the layout for the whole file.
if "short_answers" in no_search_chat_results[0]:
    gold_answers = [data_line["short_answers"] for data_line in no_search_chat_results]
elif isinstance(no_search_chat_results[0]["answer"], list):
    gold_answers = [data_line["answer"] for data_line in no_search_chat_results]
else:
    gold_answers = [[data_line["answer"]] for data_line in no_search_chat_results]

no_search_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print("no search results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in no_search_em_results) / len(no_search_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in no_search_em_results if item["exact_match"]) / len(no_search_em_results) * 100, 2),
})

#  cot chat results
cot_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/cot/{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(cot_search_results_file, "r", encoding="utf-8") as jsonl_file:
    cot_search_results = [json.loads(line) for line in jsonl_file]
generated_answers = [data_line[f"{chat_model}_cot_answer"] for data_line in cot_search_results]
# For a qwen72b chat model on any dataset whose name lacks "nq", the
# chain-of-thought answers replace the no-search answers as the "known" answers.
if "qwen72b" in chat_model and "nq" not in dataset:
    no_search_or_cot_answers = generated_answers

cot_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print("cot results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in cot_em_results) / len(cot_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in cot_em_results if item["exact_match"]) / len(cot_em_results) * 100, 2),
})


#  vanilla search chat results
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    vanilla_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/vanilla_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    vanilla_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/vanilla_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(vanilla_search_results_file, "r", encoding="utf-8") as jsonl_file:
    vanilla_search_results = [json.loads(line) for line in jsonl_file]

generated_answers = [data_line[f"{chat_model}_vanilla_search_answer"] for data_line in vanilla_search_results]

vanilla_search_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print("vanilla search results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in vanilla_search_em_results) / len(vanilla_search_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in vanilla_search_em_results if item["exact_match"]) / len(vanilla_search_em_results) * 100, 2),
})

#  judge-gated vanilla search results (the judge only decides whether
#  retrieval is required)
judge_model="v0104llama7b"
judge_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(judge_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    judge_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in judge_rewrite_search_results]
# When the judge flags an example as known, keep the no-search/CoT answer;
# otherwise take the vanilla-search answer at the same index.
generated_answers = [
    no_search_or_cot_answers[idx]
    if known
    else vanilla_search_results[idx][f"{chat_model}_vanilla_search_answer"]
    for idx, known in enumerate(knowns)
]

em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{judge_model} judge results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in em_results) / len(em_results) * 100, 2),
    "exact_match": round(sum(1 for item in em_results if item["exact_match"]) / len(em_results) * 100, 2),
})

#  gpt4 rewrite chat results
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    gpt4_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/gpt4_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    gpt4_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/gpt4_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(gpt4_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    gpt4_rewrite_search_results = [json.loads(line) for line in jsonl_file]
generated_answers = [data_line[f"{chat_model}_gpt4_rewrite_search_answer"] for data_line in gpt4_rewrite_search_results]

gpt4_rewrite_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print("gpt4 rewrite results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in gpt4_rewrite_em_results) / len(gpt4_rewrite_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in gpt4_rewrite_em_results if item["exact_match"]) / len(gpt4_rewrite_em_results) * 100, 2),
})

#  judge rewrite chat results(only judge whether the retrieval is required)
judge_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(judge_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    judge_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in judge_rewrite_search_results]
# When the judge flags an example as known, keep the no-search/CoT answer;
# otherwise take the gpt4-rewrite search answer at the same index.
generated_answers = [
    no_search_or_cot_answers[idx]
    if known
    else gpt4_rewrite_search_results[idx][f"{chat_model}_gpt4_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]

em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{judge_model} judge results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in em_results) / len(em_results) * 100, 2),
    "exact_match": round(sum(1 for item in em_results if item["exact_match"]) / len(em_results) * 100, 2),
})


# rewrite-model chat results (rewrite_model selects the result directory and
# the answer key)
rewrite_model="v0119"
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    v0105llama7b_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    v0105llama7b_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0105llama7b_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0105llama7b_rewrite_search_results = [json.loads(line) for line in jsonl_file]
generated_answers = [data_line[f"{chat_model}_{rewrite_model}_rewrite_search_answer"] for data_line in v0105llama7b_rewrite_search_results]

v0105llama7b_rewrite_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{rewrite_model} rewrite results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in v0105llama7b_rewrite_em_results) / len(v0105llama7b_rewrite_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in v0105llama7b_rewrite_em_results if item["exact_match"]) / len(v0105llama7b_rewrite_em_results) * 100, 2),
})

#  judge rewrite chat results(only judge whether the retrieval is required)
judge_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(judge_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    judge_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in judge_rewrite_search_results]
# When the judge flags an example as known, keep the no-search/CoT answer;
# otherwise take the current rewrite model's search answer at the same index.
generated_answers = [
    no_search_or_cot_answers[idx]
    if known
    else v0105llama7b_rewrite_search_results[idx][f"{chat_model}_{rewrite_model}_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]

em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{judge_model} judge results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in em_results) / len(em_results) * 100, 2),
    "exact_match": round(sum(1 for item in em_results if item["exact_match"]) / len(em_results) * 100, 2),
})

# rewrite-model chat results (rewrite_model selects the result directory and
# the answer key)
rewrite_model="v0118llama7b"
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    v0105llama7b_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    v0105llama7b_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0105llama7b_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0105llama7b_rewrite_search_results = [json.loads(line) for line in jsonl_file]
generated_answers = [data_line[f"{chat_model}_{rewrite_model}_rewrite_search_answer"] for data_line in v0105llama7b_rewrite_search_results]

v0105llama7b_rewrite_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{rewrite_model} rewrite results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in v0105llama7b_rewrite_em_results) / len(v0105llama7b_rewrite_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in v0105llama7b_rewrite_em_results if item["exact_match"]) / len(v0105llama7b_rewrite_em_results) * 100, 2),
})

#  judge rewrite chat results(only judge whether the retrieval is required)
judge_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(judge_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    judge_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in judge_rewrite_search_results]
# When the judge flags an example as known, keep the no-search/CoT answer;
# otherwise take the current rewrite model's search answer at the same index.
generated_answers = [
    no_search_or_cot_answers[idx]
    if known
    else v0105llama7b_rewrite_search_results[idx][f"{chat_model}_{rewrite_model}_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]

em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{judge_model} judge results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in em_results) / len(em_results) * 100, 2),
    "exact_match": round(sum(1 for item in em_results if item["exact_match"]) / len(em_results) * 100, 2),
})

# rewrite-model chat results (rewrite_model selects the result directory and
# the answer key)
rewrite_model="v0118llama7bv0104"
# Result files for the "bing" search engine carry no rerank-model prefix in
# their file name; every other engine's file is prefixed with rerank_model.
if search_engine == "bing":
    v0105llama7b_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{chat_model}-{dataset}-{split}.jsonl'
else:
    v0105llama7b_rewrite_search_results_file=f'../user_intent_data/{dataset}/{chat_model}/{search_engine}/{rewrite_model}_rewrite_search/{rerank_model}-{chat_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(v0105llama7b_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    v0105llama7b_rewrite_search_results = [json.loads(line) for line in jsonl_file]
generated_answers = [data_line[f"{chat_model}_{rewrite_model}_rewrite_search_answer"] for data_line in v0105llama7b_rewrite_search_results]

v0105llama7b_rewrite_em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{rewrite_model} rewrite results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in v0105llama7b_rewrite_em_results) / len(v0105llama7b_rewrite_em_results) * 100, 2),
    "exact_match": round(sum(1 for item in v0105llama7b_rewrite_em_results if item["exact_match"]) / len(v0105llama7b_rewrite_em_results) * 100, 2),
})

#  judge rewrite chat results(only judge whether the retrieval is required)
judge_rewrite_search_results_file=f'../user_intent_data/{dataset}/rewrite/{judge_model}/{judge_model}-{dataset}-{split}.jsonl'
# The context manager closes the file handle once all lines are parsed.
with open(judge_rewrite_search_results_file, "r", encoding="utf-8") as jsonl_file:
    judge_rewrite_search_results = [json.loads(line) for line in jsonl_file]
knowns = [data_line[f'{judge_model}_rewrite']["known"] for data_line in judge_rewrite_search_results]
# When the judge flags an example as known, keep the no-search/CoT answer;
# otherwise take the current rewrite model's search answer at the same index.
generated_answers = [
    no_search_or_cot_answers[idx]
    if known
    else v0105llama7b_rewrite_search_results[idx][f"{chat_model}_{rewrite_model}_rewrite_search_answer"]
    for idx, known in enumerate(knowns)
]

em_results = [
    calculate_short_answer_EM(answer, gold_answers[idx])
    for idx, answer in enumerate(generated_answers)
]
print(f"{judge_model} judge results short answer matching metrics")
# Mean recall and share of examples with at least one match, as percentages.
print({
    "recall": round(sum(item["recall"] for item in em_results) / len(em_results) * 100, 2),
    "exact_match": round(sum(1 for item in em_results if item["exact_match"]) / len(em_results) * 100, 2),
})
