In [77]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.callbacks import get_openai_callback
import dotenv
dotenv.load_dotenv()

True

In [57]:
chat = ChatOpenAI(
    temperature=0.1,
    model="gpt-3.5-turbo"
)


In [59]:
from langchain.schema import BaseOutputParser
import re

class OutputParser(BaseOutputParser):
    def parse(self, text):
        reasoning = None
        answer = None

        # Reasoning 파싱
        reasoning_match = re.search(r'Reasoning:\s*(.*?)(?=Answer:)', text, re.DOTALL)
        if reasoning_match:
            reasoning = reasoning_match.group(1).strip()

        # Answer 파싱
        answer_match = re.search(r'Answer:\s*(.*)', text, re.DOTALL)
        if answer_match:
            answer = answer_match.group(1).strip()

        return {
            "reasoning": reasoning,
            "answer": answer
        }

In [60]:
system = "You are a medical expert. You are good at solving medical domain questions."
human = "{Instruction} \n {Question} \n {Context} \n {Options} \n {few_shots}\n" 
# human = "{text}"
prompt_template = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", human)
])

chain = prompt_template | chat | OutputParser()


### MedQA Dataset Load

In [6]:
import jsonlines

with jsonlines.open('./Dataset/MedQA/questions/US/test.jsonl') as reader:
    medqa_dataset = [obj for obj in reader]
medqa_dataset[0]

{'question': 'A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?',
 'answer': 'Tell the attending that he cannot fail to disclose this mistake',
 'options': {'A': 'Disclose the error to the patient but leave it out of the operative report',
  'B': 'Disclose the error to the patient and put it in the operative report',
  'C': 'Tell the attending that he cannot fail to disclose this mistake',
  'D': 'Report the physician to the ethics commi

### MedMCQA Dataset Load


In [88]:
import json

medmcqa_dataset = json.load(open('./Dataset/MedMCQA/train.json'))
medmcqa_dataset[0]

{'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'cop': 3,
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract',
 'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'choice_type': 'single'}

### PumMedQA Dataset Load


In [107]:
pubmedqa_dataset = json.load(open('./Dataset/PubMedQA/Labeled/ori_pqal_array.json'))

pubmedqa_dataset[0]

{'pmid': '21645374',
 'QUESTION': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
 'CONTEXTS': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
  'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early

In [47]:
def q_format_MedQA(item):
    ## context = [Question, Context, Options]
    context = []
    context.append(f"Question: {item["question"]}")
    context.append("")
    # option

    option_string = "Options:\n"
    option_string += f"A: {item['options']['A']}\n"
    option_string += f"B: {item['options']['B']}\n"
    option_string += f"C: {item['options']['C']}\n"
    option_string += f"D: {item['options']['D']}\n"
    context.append(option_string)
    return context
q_format_MedQA(medqa_dataset[0])

['Question: A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?',
 '',
 'Options:\nA: Disclose the error to the patient but leave it out of the operative report\nB: Disclose the error to the patient and put it in the operative report\nC: Tell the attending that he cannot fail to disclose this mistake\nD: Report the physician to the ethics committee\n']

In [48]:
def q_format_MedMCQA(item):
    context = []
    context.append(f"Question: {item["question"]}")
    context.append("")

    #option
    option_string = "Options:\n"
    option_string += f"A: {item['opa']}\n"
    option_string += f"B: {item['opb']}\n"
    option_string += f"C: {item['opc']}\n"
    option_string += f"D: {item['opd']}\n"
    context.append(option_string)
    return context
q_format_MedMCQA(medmcqa_dataset[0])

['Question: Which of the following is derived from fibroblast cells ?',
 '',
 'Options:\nA: TGF-13\nB: MMP2\nC: Collagen\nD: Angiopoietin\n']

In [49]:
def q_format_PubMedQA(item):
    context = []
    context.append(f"Question: {item["QUESTION"]}")
    context.append(f"Context: {item["CONTEXTS"][0]}")
    #option
    option_string = "Options:\n"
    option_string += "A: yes\n"
    option_string += "B: no\n"
    option_string += "C: maybe\n"
    context.append(option_string)
    return context
q_format_PubMedQA(pubmedqa_dataset[0])    

['Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
 'Context: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
 'Options:\nA: yes\nB: no\nC: maybe\n']

In [109]:
# Complete Prompt

instruction = "Instruction: Solve the following medical question and generate your reasoning step by step and answer with the following format, 'Reasoning: your reasoning step \nAnswer: correct_option(correct_option_value). Do NOT make additional response."
question, context, options = q_format_PubMedQA(pubmedqa_dataset[10])
formatted_prompt = prompt_template.format_messages(Instruction=instruction, Question=question, Context= context, Options=options,few_shots="")
reasoning_answer = chain.invoke({"Instruction": instruction, "Question" : question, "Context": context, "Options": options, "few_shots":""})
reasoning_answer["answer"]

'A: yes.'

### Parsing

In [106]:
from tqdm import tqdm
results_collected_medqa = []
pass_collected_medqa = []
with get_openai_callback() as usage:
    for i in tqdm(range(200)):
        cur_question, cur_context, cur_options = q_format_MedQA(medqa_dataset[i])
        cur_correct_answer = medqa_dataset[i]["answer_idx"]
        # Chat
        reasoning_answer = chain.invoke({"Instruction": instruction, "Question" : cur_question, "Context": cur_context, "Options": cur_options, "few_shots":""})
        correctness = ((cur_correct_answer + ":") in reasoning_answer["answer"].split()[0]) or (medqa_dataset[i]["answer_idx"] in reasoning_answer["answer"])
        pass_collected_medqa.append(correctness)
        if (i == 199):
            results_collected_medqa.append(medqa_dataset[i] | {"reasoning" : reasoning_answer["reasoning"], "model chosen" : reasoning_answer["answer"], "correctness" : correctness, "accuracy": sum(pass_collected_medqa)/ len(pass_collected_medqa)})
        else:
            results_collected_medqa.append(medqa_dataset[i] | {"reasoning" : reasoning_answer["reasoning"], "model chosen" : reasoning_answer["answer"], "correctness" : correctness})

        if (i + 1) % 10 == 0:
            with open(f'results_collected_medqa.json', 'a') as f:
                json.dump(results_collected_medqa, f, ensure_ascii=False, indent=4)
            results_collected_medqa = []
        print(f"Acc: {sum(pass_collected_medqa)/ len(pass_collected_medqa)}")
    print(usage)

        

  0%|          | 1/200 [00:01<05:51,  1.77s/it]

Acc: 0.0


  1%|          | 2/200 [00:04<07:52,  2.38s/it]

Acc: 0.0


  2%|▏         | 3/200 [00:09<11:02,  3.36s/it]

Acc: 0.3333333333333333


  2%|▏         | 4/200 [00:11<10:14,  3.13s/it]

Acc: 0.25


  2%|▎         | 5/200 [00:14<10:03,  3.10s/it]

Acc: 0.4


  3%|▎         | 6/200 [00:18<10:43,  3.31s/it]

Acc: 0.3333333333333333


  4%|▎         | 7/200 [00:20<09:11,  2.86s/it]

Acc: 0.42857142857142855


  4%|▍         | 8/200 [00:22<08:12,  2.56s/it]

Acc: 0.5


  4%|▍         | 9/200 [00:24<07:21,  2.31s/it]

Acc: 0.5555555555555556


  5%|▌         | 10/200 [00:26<07:01,  2.22s/it]

Acc: 0.5


  6%|▌         | 11/200 [00:28<06:49,  2.17s/it]

Acc: 0.45454545454545453


  6%|▌         | 12/200 [00:31<07:22,  2.35s/it]

Acc: 0.5


  6%|▋         | 13/200 [00:33<07:06,  2.28s/it]

Acc: 0.5384615384615384


  7%|▋         | 14/200 [00:35<06:37,  2.14s/it]

Acc: 0.5


  8%|▊         | 15/200 [00:36<06:16,  2.03s/it]

Acc: 0.5333333333333333


  8%|▊         | 16/200 [00:39<06:49,  2.22s/it]

Acc: 0.5


  8%|▊         | 17/200 [00:41<06:09,  2.02s/it]

Acc: 0.47058823529411764


  9%|▉         | 18/200 [00:42<05:23,  1.78s/it]

Acc: 0.4444444444444444


 10%|▉         | 19/200 [00:44<05:44,  1.90s/it]

Acc: 0.47368421052631576


 10%|█         | 20/200 [00:46<06:12,  2.07s/it]

Acc: 0.5


 10%|█         | 21/200 [00:48<06:07,  2.05s/it]

Acc: 0.5238095238095238


 11%|█         | 22/200 [00:51<06:30,  2.19s/it]

Acc: 0.5454545454545454


 12%|█▏        | 23/200 [00:54<07:04,  2.40s/it]

Acc: 0.5652173913043478


 12%|█▏        | 24/200 [00:56<07:06,  2.42s/it]

Acc: 0.5416666666666666


 12%|█▎        | 25/200 [00:59<07:16,  2.50s/it]

Acc: 0.56


 13%|█▎        | 26/200 [01:01<06:57,  2.40s/it]

Acc: 0.5384615384615384


 14%|█▎        | 27/200 [01:03<06:40,  2.31s/it]

Acc: 0.5185185185185185


 14%|█▍        | 28/200 [01:05<06:32,  2.28s/it]

Acc: 0.5357142857142857


 14%|█▍        | 29/200 [01:08<06:38,  2.33s/it]

Acc: 0.5517241379310345


 15%|█▌        | 30/200 [01:10<06:12,  2.19s/it]

Acc: 0.5333333333333333


 16%|█▌        | 31/200 [01:13<07:13,  2.56s/it]

Acc: 0.5161290322580645


 16%|█▌        | 32/200 [01:15<06:36,  2.36s/it]

Acc: 0.53125


 16%|█▋        | 33/200 [01:17<06:34,  2.36s/it]

Acc: 0.5151515151515151


 17%|█▋        | 34/200 [01:20<06:27,  2.33s/it]

Acc: 0.5


 18%|█▊        | 35/200 [01:23<06:48,  2.48s/it]

Acc: 0.4857142857142857


 18%|█▊        | 36/200 [01:24<06:10,  2.26s/it]

Acc: 0.5


 18%|█▊        | 37/200 [01:26<06:04,  2.23s/it]

Acc: 0.4864864864864865


 19%|█▉        | 38/200 [01:28<05:51,  2.17s/it]

Acc: 0.5


 20%|█▉        | 39/200 [01:33<07:43,  2.88s/it]

Acc: 0.48717948717948717


 20%|██        | 40/200 [01:35<06:52,  2.58s/it]

Acc: 0.5


 20%|██        | 41/200 [01:37<06:41,  2.53s/it]

Acc: 0.5121951219512195


 21%|██        | 42/200 [01:40<06:28,  2.46s/it]

Acc: 0.5238095238095238


 22%|██▏       | 43/200 [01:42<06:11,  2.37s/it]

Acc: 0.5348837209302325


 22%|██▏       | 44/200 [01:45<06:33,  2.53s/it]

Acc: 0.5227272727272727


 22%|██▎       | 45/200 [01:47<06:04,  2.35s/it]

Acc: 0.5333333333333333


 23%|██▎       | 46/200 [01:49<06:00,  2.34s/it]

Acc: 0.5434782608695652


 24%|██▎       | 47/200 [01:53<07:30,  2.94s/it]

Acc: 0.5531914893617021


 24%|██▍       | 48/200 [01:57<07:52,  3.11s/it]

Acc: 0.5625


 24%|██▍       | 49/200 [01:59<07:14,  2.88s/it]

Acc: 0.5510204081632653


 25%|██▌       | 50/200 [02:01<06:34,  2.63s/it]

Acc: 0.56


 26%|██▌       | 51/200 [02:03<06:05,  2.45s/it]

Acc: 0.5490196078431373


 26%|██▌       | 52/200 [02:05<05:44,  2.33s/it]

Acc: 0.5576923076923077


 26%|██▋       | 53/200 [02:09<06:42,  2.74s/it]

Acc: 0.5660377358490566


 27%|██▋       | 54/200 [02:14<08:15,  3.39s/it]

Acc: 0.5740740740740741


 28%|██▊       | 55/200 [02:17<08:02,  3.33s/it]

Acc: 0.5818181818181818


 28%|██▊       | 56/200 [02:20<07:39,  3.19s/it]

Acc: 0.5892857142857143


 28%|██▊       | 57/200 [02:22<06:30,  2.73s/it]

Acc: 0.5964912280701754


 29%|██▉       | 58/200 [02:24<06:33,  2.77s/it]

Acc: 0.603448275862069


 30%|██▉       | 59/200 [02:28<07:22,  3.14s/it]

Acc: 0.5932203389830508


 30%|███       | 60/200 [02:32<07:54,  3.39s/it]

Acc: 0.5833333333333334


 30%|███       | 61/200 [02:35<07:20,  3.17s/it]

Acc: 0.5901639344262295


 31%|███       | 62/200 [02:37<06:34,  2.86s/it]

Acc: 0.5967741935483871


 32%|███▏      | 63/200 [02:40<06:41,  2.93s/it]

Acc: 0.5873015873015873


 32%|███▏      | 64/200 [02:43<06:48,  3.00s/it]

Acc: 0.59375


 32%|███▎      | 65/200 [02:46<06:17,  2.80s/it]

Acc: 0.5846153846153846


 33%|███▎      | 66/200 [02:48<05:56,  2.66s/it]

Acc: 0.5909090909090909


 34%|███▎      | 67/200 [02:51<05:55,  2.68s/it]

Acc: 0.5970149253731343


 34%|███▍      | 68/200 [02:53<05:28,  2.49s/it]

Acc: 0.6029411764705882


 34%|███▍      | 69/200 [02:55<05:09,  2.36s/it]

Acc: 0.6086956521739131


 35%|███▌      | 70/200 [02:58<05:43,  2.64s/it]

Acc: 0.6142857142857143


 36%|███▌      | 71/200 [03:02<06:35,  3.07s/it]

Acc: 0.6197183098591549


 36%|███▌      | 72/200 [03:05<06:17,  2.95s/it]

Acc: 0.625


 36%|███▋      | 73/200 [03:10<07:19,  3.46s/it]

Acc: 0.6301369863013698


 37%|███▋      | 74/200 [03:13<07:07,  3.39s/it]

Acc: 0.6216216216216216


 38%|███▊      | 75/200 [03:15<06:18,  3.03s/it]

Acc: 0.6266666666666667


 38%|███▊      | 76/200 [03:17<05:44,  2.78s/it]

Acc: 0.631578947368421


 38%|███▊      | 77/200 [03:19<05:20,  2.61s/it]

Acc: 0.6363636363636364


 39%|███▉      | 78/200 [03:23<06:03,  2.98s/it]

Acc: 0.6282051282051282


 40%|███▉      | 79/200 [03:28<07:02,  3.49s/it]

Acc: 0.6329113924050633


 40%|████      | 80/200 [03:32<07:23,  3.70s/it]

Acc: 0.6375


 40%|████      | 81/200 [03:34<06:18,  3.18s/it]

Acc: 0.6419753086419753


 41%|████      | 82/200 [03:36<05:33,  2.82s/it]

Acc: 0.6463414634146342


 42%|████▏     | 83/200 [03:38<05:13,  2.68s/it]

Acc: 0.6506024096385542


 42%|████▏     | 84/200 [03:40<04:41,  2.43s/it]

Acc: 0.6428571428571429


 42%|████▎     | 85/200 [03:44<05:08,  2.68s/it]

Acc: 0.6470588235294118


 43%|████▎     | 86/200 [03:46<04:49,  2.54s/it]

Acc: 0.6395348837209303


 44%|████▎     | 87/200 [03:48<04:49,  2.57s/it]

Acc: 0.6436781609195402


 44%|████▍     | 88/200 [03:51<04:53,  2.62s/it]

Acc: 0.6363636363636364


 44%|████▍     | 89/200 [03:55<05:21,  2.90s/it]

Acc: 0.6404494382022472


 45%|████▌     | 90/200 [04:00<06:29,  3.54s/it]

Acc: 0.6444444444444445


 46%|████▌     | 91/200 [04:03<06:03,  3.34s/it]

Acc: 0.6373626373626373


 46%|████▌     | 92/200 [04:05<05:39,  3.14s/it]

Acc: 0.6304347826086957


 46%|████▋     | 93/200 [04:09<05:46,  3.24s/it]

Acc: 0.6344086021505376


 47%|████▋     | 94/200 [04:11<05:11,  2.93s/it]

Acc: 0.6276595744680851


 48%|████▊     | 95/200 [04:14<05:19,  3.04s/it]

Acc: 0.6210526315789474


 48%|████▊     | 96/200 [04:16<04:42,  2.71s/it]

Acc: 0.625


 48%|████▊     | 97/200 [04:19<04:38,  2.70s/it]

Acc: 0.6185567010309279


 49%|████▉     | 98/200 [04:21<04:15,  2.50s/it]

Acc: 0.6224489795918368


 50%|████▉     | 99/200 [04:23<04:01,  2.39s/it]

Acc: 0.6262626262626263


 50%|█████     | 100/200 [04:26<04:20,  2.61s/it]

Acc: 0.63


 50%|█████     | 101/200 [04:29<04:37,  2.80s/it]

Acc: 0.6237623762376238


 51%|█████     | 102/200 [04:32<04:26,  2.72s/it]

Acc: 0.6176470588235294


 52%|█████▏    | 103/200 [04:35<04:19,  2.68s/it]

Acc: 0.6213592233009708


 52%|█████▏    | 104/200 [04:37<04:11,  2.62s/it]

Acc: 0.625


 52%|█████▎    | 105/200 [04:41<04:42,  2.98s/it]

Acc: 0.6285714285714286


 53%|█████▎    | 106/200 [04:45<05:02,  3.22s/it]

Acc: 0.6320754716981132


 54%|█████▎    | 107/200 [04:47<04:44,  3.06s/it]

Acc: 0.6355140186915887


 54%|█████▍    | 108/200 [04:50<04:20,  2.83s/it]

Acc: 0.6388888888888888


 55%|█████▍    | 109/200 [04:52<04:01,  2.66s/it]

Acc: 0.6330275229357798


 55%|█████▌    | 110/200 [04:55<04:12,  2.80s/it]

Acc: 0.6272727272727273


 56%|█████▌    | 111/200 [04:58<04:10,  2.81s/it]

Acc: 0.6306306306306306


 56%|█████▌    | 112/200 [05:01<04:26,  3.02s/it]

Acc: 0.6339285714285714


 56%|█████▋    | 113/200 [05:04<04:27,  3.07s/it]

Acc: 0.6283185840707964


 57%|█████▋    | 114/200 [05:09<04:50,  3.38s/it]

Acc: 0.631578947368421


 57%|█████▊    | 115/200 [05:11<04:21,  3.08s/it]

Acc: 0.6347826086956522


 58%|█████▊    | 116/200 [05:14<04:06,  2.94s/it]

Acc: 0.6293103448275862


 58%|█████▊    | 117/200 [05:17<04:06,  2.97s/it]

Acc: 0.6239316239316239


 59%|█████▉    | 118/200 [05:19<03:50,  2.81s/it]

Acc: 0.6186440677966102


 60%|█████▉    | 119/200 [05:21<03:31,  2.61s/it]

Acc: 0.6134453781512605


 60%|██████    | 120/200 [05:24<03:21,  2.52s/it]

Acc: 0.6166666666666667


 60%|██████    | 121/200 [05:26<03:08,  2.38s/it]

Acc: 0.6115702479338843


 61%|██████    | 122/200 [05:28<03:06,  2.39s/it]

Acc: 0.6147540983606558


 62%|██████▏   | 123/200 [05:30<03:00,  2.35s/it]

Acc: 0.6097560975609756


 62%|██████▏   | 124/200 [05:33<03:00,  2.38s/it]

Acc: 0.6129032258064516


 62%|██████▎   | 125/200 [05:34<02:40,  2.14s/it]

Acc: 0.616


 63%|██████▎   | 126/200 [05:37<02:44,  2.22s/it]

Acc: 0.6111111111111112


 64%|██████▎   | 127/200 [05:40<02:56,  2.41s/it]

Acc: 0.6062992125984252


 64%|██████▍   | 128/200 [05:42<02:57,  2.47s/it]

Acc: 0.609375


 64%|██████▍   | 129/200 [05:44<02:47,  2.36s/it]

Acc: 0.6046511627906976


 65%|██████▌   | 130/200 [05:46<02:41,  2.31s/it]

Acc: 0.6076923076923076


 66%|██████▌   | 131/200 [05:49<02:46,  2.41s/it]

Acc: 0.6106870229007634


 66%|██████▌   | 132/200 [05:51<02:42,  2.40s/it]

Acc: 0.6136363636363636


 66%|██████▋   | 133/200 [05:54<02:37,  2.35s/it]

Acc: 0.6165413533834586


 67%|██████▋   | 134/200 [05:57<02:56,  2.67s/it]

Acc: 0.6194029850746269


 68%|██████▊   | 135/200 [06:00<02:48,  2.60s/it]

Acc: 0.6222222222222222


 68%|██████▊   | 136/200 [06:02<02:35,  2.43s/it]

Acc: 0.625


 68%|██████▊   | 137/200 [06:05<02:47,  2.65s/it]

Acc: 0.6277372262773723


 69%|██████▉   | 138/200 [06:08<02:54,  2.81s/it]

Acc: 0.6231884057971014


 70%|██████▉   | 139/200 [06:11<02:48,  2.77s/it]

Acc: 0.6258992805755396


 70%|███████   | 140/200 [06:13<02:37,  2.62s/it]

Acc: 0.6285714285714286


 70%|███████   | 141/200 [06:16<02:47,  2.84s/it]

Acc: 0.6312056737588653


 71%|███████   | 142/200 [06:18<02:33,  2.65s/it]

Acc: 0.6338028169014085


 72%|███████▏  | 143/200 [06:22<02:40,  2.81s/it]

Acc: 0.6363636363636364


 72%|███████▏  | 144/200 [06:24<02:26,  2.61s/it]

Acc: 0.6388888888888888


 72%|███████▎  | 145/200 [06:27<02:39,  2.90s/it]

Acc: 0.6413793103448275


 73%|███████▎  | 146/200 [06:30<02:26,  2.71s/it]

Acc: 0.6438356164383562


 74%|███████▎  | 147/200 [06:32<02:12,  2.51s/it]

Acc: 0.6462585034013606


 74%|███████▍  | 148/200 [06:35<02:26,  2.81s/it]

Acc: 0.6486486486486487


 74%|███████▍  | 149/200 [06:37<02:15,  2.65s/it]

Acc: 0.6510067114093959


 75%|███████▌  | 150/200 [06:40<02:04,  2.48s/it]

Acc: 0.6466666666666666


 76%|███████▌  | 151/200 [06:42<02:08,  2.61s/it]

Acc: 0.6490066225165563


 76%|███████▌  | 152/200 [06:46<02:22,  2.96s/it]

Acc: 0.6447368421052632


 76%|███████▋  | 153/200 [06:49<02:18,  2.95s/it]

Acc: 0.6405228758169934


 77%|███████▋  | 154/200 [06:52<02:08,  2.79s/it]

Acc: 0.6428571428571429


 78%|███████▊  | 155/200 [06:56<02:23,  3.18s/it]

Acc: 0.6451612903225806


 78%|███████▊  | 156/200 [06:59<02:28,  3.37s/it]

Acc: 0.6410256410256411


 78%|███████▊  | 157/200 [07:02<02:16,  3.17s/it]

Acc: 0.643312101910828


 79%|███████▉  | 158/200 [07:05<02:08,  3.07s/it]

Acc: 0.6455696202531646


 80%|███████▉  | 159/200 [07:08<02:10,  3.18s/it]

Acc: 0.6477987421383647


 80%|████████  | 160/200 [07:11<01:58,  2.97s/it]

Acc: 0.65


 80%|████████  | 161/200 [07:13<01:43,  2.66s/it]

Acc: 0.6459627329192547


 81%|████████  | 162/200 [07:15<01:36,  2.53s/it]

Acc: 0.6481481481481481


 82%|████████▏ | 163/200 [07:18<01:33,  2.52s/it]

Acc: 0.6503067484662577


 82%|████████▏ | 164/200 [07:20<01:26,  2.39s/it]

Acc: 0.6524390243902439


 82%|████████▎ | 165/200 [07:22<01:27,  2.49s/it]

Acc: 0.6545454545454545


 83%|████████▎ | 166/200 [07:25<01:22,  2.43s/it]

Acc: 0.6506024096385542


 84%|████████▎ | 167/200 [07:28<01:31,  2.76s/it]

Acc: 0.6526946107784432


 84%|████████▍ | 168/200 [07:32<01:34,  2.94s/it]

Acc: 0.6547619047619048


 84%|████████▍ | 169/200 [07:35<01:35,  3.07s/it]

Acc: 0.6568047337278107


 85%|████████▌ | 170/200 [07:37<01:22,  2.76s/it]

Acc: 0.6588235294117647


 86%|████████▌ | 171/200 [07:39<01:15,  2.60s/it]

Acc: 0.6608187134502924


 86%|████████▌ | 172/200 [07:42<01:13,  2.64s/it]

Acc: 0.6569767441860465


 86%|████████▋ | 173/200 [07:47<01:32,  3.43s/it]

Acc: 0.6589595375722543


 87%|████████▋ | 174/200 [07:51<01:33,  3.61s/it]

Acc: 0.6609195402298851


 88%|████████▊ | 175/200 [07:55<01:31,  3.66s/it]

Acc: 0.6571428571428571


 88%|████████▊ | 176/200 [07:57<01:17,  3.21s/it]

Acc: 0.6590909090909091


 88%|████████▊ | 177/200 [08:00<01:08,  2.99s/it]

Acc: 0.655367231638418


 89%|████████▉ | 178/200 [08:02<01:00,  2.77s/it]

Acc: 0.651685393258427


 90%|████████▉ | 179/200 [08:04<00:57,  2.72s/it]

Acc: 0.6480446927374302


 90%|█████████ | 180/200 [08:08<00:57,  2.88s/it]

Acc: 0.65


 90%|█████████ | 181/200 [08:11<00:54,  2.85s/it]

Acc: 0.6519337016574586


 91%|█████████ | 182/200 [08:12<00:44,  2.47s/it]

Acc: 0.6538461538461539


 92%|█████████▏| 183/200 [08:16<00:49,  2.94s/it]

Acc: 0.6557377049180327


 92%|█████████▏| 184/200 [08:18<00:41,  2.57s/it]

Acc: 0.6521739130434783


 92%|█████████▎| 185/200 [08:20<00:37,  2.49s/it]

Acc: 0.6486486486486487


 93%|█████████▎| 186/200 [08:23<00:34,  2.47s/it]

Acc: 0.6451612903225806


 94%|█████████▎| 187/200 [08:26<00:36,  2.83s/it]

Acc: 0.6470588235294118


 94%|█████████▍| 188/200 [08:28<00:31,  2.61s/it]

Acc: 0.648936170212766


 94%|█████████▍| 189/200 [08:31<00:29,  2.66s/it]

Acc: 0.6455026455026455


 95%|█████████▌| 190/200 [08:35<00:29,  2.92s/it]

Acc: 0.6473684210526316


 96%|█████████▌| 191/200 [08:38<00:28,  3.14s/it]

Acc: 0.643979057591623


 96%|█████████▌| 192/200 [08:40<00:22,  2.79s/it]

Acc: 0.6458333333333334


 96%|█████████▋| 193/200 [08:43<00:19,  2.81s/it]

Acc: 0.6424870466321243


 97%|█████████▋| 194/200 [08:46<00:17,  2.92s/it]

Acc: 0.6443298969072165


 98%|█████████▊| 195/200 [08:50<00:15,  3.19s/it]

Acc: 0.6461538461538462


 98%|█████████▊| 196/200 [08:52<00:11,  2.86s/it]

Acc: 0.6479591836734694


 98%|█████████▊| 197/200 [08:55<00:08,  2.88s/it]

Acc: 0.6446700507614214


 99%|█████████▉| 198/200 [08:59<00:06,  3.05s/it]

Acc: 0.6464646464646465


100%|█████████▉| 199/200 [09:01<00:02,  2.73s/it]

Acc: 0.6432160804020101


100%|██████████| 200/200 [09:03<00:00,  2.72s/it]

Acc: 0.645
Tokens Used: 86484
	Prompt Tokens: 55464
	Completion Tokens: 31020
Successful Requests: 200
Total Cost (USD): $0.07426199999999997





In [103]:
pass_collected_medmcqa = []
results_collected_medmcqa = []
value_to_letter = {
    1: 'A',
    2: 'B',
    3: 'C',
    4: 'D'
}
with get_openai_callback() as usage:
    for i in tqdm(range(200)):
        cur_question, cur_context, cur_options = q_format_MedMCQA(medmcqa_dataset[i])
        cur_correct_answer = value_to_letter[medmcqa_dataset[i]["cop"]]
        # Chat
        reasoning_answer = chain.invoke({"Instruction": instruction, "Question" : cur_question, "Context": cur_context, "Options": cur_options, "few_shots":""})
        correctness = ((cur_correct_answer + ":") in reasoning_answer["answer"].split()[0]) or (medmcqa_dataset[i]["op" + cur_correct_answer.lower()] in reasoning_answer["answer"])
        pass_collected_medmcqa.append(correctness)
        if (i == 199):
            results_collected_medmcqa.append(medmcqa_dataset[i] | {"reasoning" : reasoning_answer["reasoning"], "model chosen" : reasoning_answer["answer"], "correctness" : correctness, "accuracy": sum(pass_collected_medmcqa)/ len(pass_collected_medmcqa)})
        else:
            results_collected_medmcqa.append(medmcqa_dataset[i] | {"reasoning" : reasoning_answer["reasoning"], "model chosen" : reasoning_answer["answer"], "correctness" : correctness})

        if (i + 1) % 10 == 0:
            with open(f'results_collected_medmcqa.json', 'a') as f:
                json.dump(results_collected_medmcqa, f, ensure_ascii=False, indent=4)
            results_collected_medmcqa = []
        print(f"Acc: {sum(pass_collected_medmcqa)/ len(pass_collected_medmcqa)}")
    print(usage)

        

  0%|          | 1/200 [00:01<05:28,  1.65s/it]

Acc: 1.0


  1%|          | 2/200 [00:02<04:29,  1.36s/it]

Acc: 1.0


  2%|▏         | 3/200 [00:04<05:31,  1.68s/it]

Acc: 0.6666666666666666


  2%|▏         | 4/200 [00:06<05:53,  1.81s/it]

Acc: 0.5


  2%|▎         | 5/200 [00:08<05:33,  1.71s/it]

Acc: 0.6


  3%|▎         | 6/200 [00:10<05:40,  1.76s/it]

Acc: 0.6666666666666666


  4%|▎         | 7/200 [00:12<06:29,  2.02s/it]

Acc: 0.7142857142857143


  4%|▍         | 8/200 [00:14<06:00,  1.88s/it]

Acc: 0.75


  4%|▍         | 9/200 [00:16<06:12,  1.95s/it]

Acc: 0.6666666666666666


  5%|▌         | 10/200 [00:17<05:34,  1.76s/it]

Acc: 0.7


  6%|▌         | 11/200 [00:20<06:01,  1.91s/it]

Acc: 0.7272727272727273


  6%|▌         | 12/200 [00:25<09:02,  2.89s/it]

Acc: 0.6666666666666666


  6%|▋         | 13/200 [00:26<07:49,  2.51s/it]

Acc: 0.6923076923076923


  7%|▋         | 14/200 [00:28<07:18,  2.36s/it]

Acc: 0.6428571428571429


  8%|▊         | 15/200 [00:30<06:43,  2.18s/it]

Acc: 0.6666666666666666


  8%|▊         | 16/200 [00:32<06:10,  2.01s/it]

Acc: 0.6875


  8%|▊         | 17/200 [00:34<06:18,  2.07s/it]

Acc: 0.7058823529411765


  9%|▉         | 18/200 [00:36<06:01,  1.99s/it]

Acc: 0.7222222222222222


 10%|▉         | 19/200 [00:37<05:20,  1.77s/it]

Acc: 0.6842105263157895


 10%|█         | 20/200 [00:39<05:05,  1.70s/it]

Acc: 0.7


 10%|█         | 21/200 [00:40<04:46,  1.60s/it]

Acc: 0.7142857142857143


 11%|█         | 22/200 [00:41<04:27,  1.50s/it]

Acc: 0.6818181818181818


 12%|█▏        | 23/200 [00:43<04:26,  1.51s/it]

Acc: 0.6521739130434783


 12%|█▏        | 24/200 [00:45<04:56,  1.68s/it]

Acc: 0.625


 12%|█▎        | 25/200 [00:47<05:33,  1.91s/it]

Acc: 0.64


 13%|█▎        | 26/200 [00:49<05:41,  1.96s/it]

Acc: 0.6538461538461539


 14%|█▎        | 27/200 [00:52<06:18,  2.19s/it]

Acc: 0.6666666666666666


 14%|█▍        | 28/200 [00:54<06:01,  2.10s/it]

Acc: 0.6428571428571429


 14%|█▍        | 29/200 [00:56<05:47,  2.03s/it]

Acc: 0.6551724137931034


 15%|█▌        | 30/200 [00:57<05:04,  1.79s/it]

Acc: 0.6333333333333333


 16%|█▌        | 31/200 [00:59<05:12,  1.85s/it]

Acc: 0.6451612903225806


 16%|█▌        | 32/200 [01:01<05:36,  2.00s/it]

Acc: 0.65625


 16%|█▋        | 33/200 [01:03<05:00,  1.80s/it]

Acc: 0.6666666666666666


 17%|█▋        | 34/200 [01:05<05:16,  1.90s/it]

Acc: 0.6764705882352942


 18%|█▊        | 35/200 [01:07<05:11,  1.89s/it]

Acc: 0.6857142857142857


 18%|█▊        | 36/200 [01:08<04:51,  1.78s/it]

Acc: 0.6944444444444444


 18%|█▊        | 37/200 [01:11<05:33,  2.05s/it]

Acc: 0.7027027027027027


 19%|█▉        | 38/200 [01:13<05:40,  2.10s/it]

Acc: 0.7105263157894737


 20%|█▉        | 39/200 [01:20<09:44,  3.63s/it]

Acc: 0.6923076923076923


 20%|██        | 40/200 [01:22<08:24,  3.15s/it]

Acc: 0.7


 20%|██        | 41/200 [01:24<07:16,  2.74s/it]

Acc: 0.6829268292682927


 21%|██        | 42/200 [01:27<07:03,  2.68s/it]

Acc: 0.6904761904761905


 22%|██▏       | 43/200 [01:28<06:02,  2.31s/it]

Acc: 0.6976744186046512


 22%|██▏       | 44/200 [01:29<05:08,  1.98s/it]

Acc: 0.6818181818181818


 22%|██▎       | 45/200 [01:31<05:11,  2.01s/it]

Acc: 0.6666666666666666


 23%|██▎       | 46/200 [01:33<04:42,  1.84s/it]

Acc: 0.6521739130434783


 24%|██▎       | 47/200 [01:35<05:07,  2.01s/it]

Acc: 0.6595744680851063


 24%|██▍       | 48/200 [01:38<05:18,  2.10s/it]

Acc: 0.6458333333333334


 24%|██▍       | 49/200 [01:41<06:08,  2.44s/it]

Acc: 0.6530612244897959


 25%|██▌       | 50/200 [01:42<05:26,  2.17s/it]

Acc: 0.66


 26%|██▌       | 51/200 [01:44<04:54,  1.98s/it]

Acc: 0.6666666666666666


 26%|██▌       | 52/200 [01:45<04:17,  1.74s/it]

Acc: 0.6730769230769231


 26%|██▋       | 53/200 [01:47<04:27,  1.82s/it]

Acc: 0.6792452830188679


 27%|██▋       | 54/200 [01:48<04:07,  1.70s/it]

Acc: 0.6666666666666666


 28%|██▊       | 55/200 [01:52<05:14,  2.17s/it]

Acc: 0.6727272727272727


 28%|██▊       | 56/200 [01:54<05:02,  2.10s/it]

Acc: 0.6785714285714286


 28%|██▊       | 57/200 [01:55<04:25,  1.86s/it]

Acc: 0.6842105263157895


 29%|██▉       | 58/200 [01:57<04:45,  2.01s/it]

Acc: 0.6896551724137931


 30%|██▉       | 59/200 [01:59<04:12,  1.79s/it]

Acc: 0.6949152542372882


 30%|███       | 60/200 [02:00<04:08,  1.77s/it]

Acc: 0.7


 30%|███       | 61/200 [02:02<04:05,  1.77s/it]

Acc: 0.7049180327868853


 31%|███       | 62/200 [02:04<04:08,  1.80s/it]

Acc: 0.6935483870967742


 32%|███▏      | 63/200 [02:06<04:18,  1.89s/it]

Acc: 0.6984126984126984


 32%|███▏      | 64/200 [02:08<04:11,  1.85s/it]

Acc: 0.703125


 32%|███▎      | 65/200 [02:10<04:27,  1.98s/it]

Acc: 0.6923076923076923


 33%|███▎      | 66/200 [02:12<04:06,  1.84s/it]

Acc: 0.696969696969697


 34%|███▎      | 67/200 [02:14<04:37,  2.09s/it]

Acc: 0.6865671641791045


 34%|███▍      | 68/200 [02:16<04:28,  2.04s/it]

Acc: 0.6911764705882353


 34%|███▍      | 69/200 [02:18<04:14,  1.94s/it]

Acc: 0.6811594202898551


 35%|███▌      | 70/200 [02:19<03:57,  1.83s/it]

Acc: 0.6714285714285714


 36%|███▌      | 71/200 [02:21<03:48,  1.77s/it]

Acc: 0.676056338028169


 36%|███▌      | 72/200 [02:23<03:53,  1.82s/it]

Acc: 0.6805555555555556


 36%|███▋      | 73/200 [02:25<04:05,  1.93s/it]

Acc: 0.6712328767123288


 37%|███▋      | 74/200 [02:28<04:16,  2.04s/it]

Acc: 0.6621621621621622


 38%|███▊      | 75/200 [02:28<03:31,  1.69s/it]

Acc: 0.6666666666666666


 38%|███▊      | 76/200 [02:30<03:38,  1.76s/it]

Acc: 0.6578947368421053


 38%|███▊      | 77/200 [02:33<04:19,  2.11s/it]

Acc: 0.6493506493506493


 39%|███▉      | 78/200 [02:36<04:22,  2.15s/it]

Acc: 0.6410256410256411


 40%|███▉      | 79/200 [02:38<04:47,  2.38s/it]

Acc: 0.6455696202531646


 40%|████      | 80/200 [02:41<04:51,  2.43s/it]

Acc: 0.65


 40%|████      | 81/200 [02:43<04:24,  2.22s/it]

Acc: 0.654320987654321


 41%|████      | 82/200 [02:44<03:43,  1.90s/it]

Acc: 0.6463414634146342


 42%|████▏     | 83/200 [02:45<03:25,  1.76s/it]

Acc: 0.6506024096385542


 42%|████▏     | 84/200 [02:47<03:13,  1.67s/it]

Acc: 0.6428571428571429


 42%|████▎     | 85/200 [02:50<03:49,  2.00s/it]

Acc: 0.6352941176470588


 43%|████▎     | 86/200 [02:52<04:10,  2.20s/it]

Acc: 0.6395348837209303


 44%|████▎     | 87/200 [02:54<03:41,  1.96s/it]

Acc: 0.632183908045977


 44%|████▍     | 88/200 [02:55<03:32,  1.90s/it]

Acc: 0.6363636363636364


 44%|████▍     | 89/200 [02:57<03:23,  1.83s/it]

Acc: 0.6404494382022472


 45%|████▌     | 90/200 [03:00<03:43,  2.04s/it]

Acc: 0.6444444444444445


 46%|████▌     | 91/200 [03:02<03:42,  2.04s/it]

Acc: 0.6483516483516484


 46%|████▌     | 92/200 [03:03<03:22,  1.88s/it]

Acc: 0.6521739130434783


 46%|████▋     | 93/200 [03:08<04:56,  2.77s/it]

Acc: 0.6451612903225806


 47%|████▋     | 94/200 [03:09<04:13,  2.39s/it]

Acc: 0.648936170212766


 48%|████▊     | 95/200 [03:11<03:40,  2.10s/it]

Acc: 0.6421052631578947


 48%|████▊     | 96/200 [03:13<03:49,  2.20s/it]

Acc: 0.6354166666666666


 48%|████▊     | 97/200 [03:15<03:24,  1.98s/it]

Acc: 0.6391752577319587


 49%|████▉     | 98/200 [03:17<03:36,  2.13s/it]

Acc: 0.6428571428571429


 50%|████▉     | 99/200 [03:18<03:06,  1.85s/it]

Acc: 0.6363636363636364


 50%|█████     | 100/200 [03:21<03:22,  2.02s/it]

Acc: 0.64


 50%|█████     | 101/200 [03:22<02:58,  1.81s/it]

Acc: 0.6336633663366337


 51%|█████     | 102/200 [03:25<03:33,  2.18s/it]

Acc: 0.6274509803921569


 52%|█████▏    | 103/200 [03:27<03:29,  2.16s/it]

Acc: 0.6213592233009708


 52%|█████▏    | 104/200 [03:29<03:10,  1.99s/it]

Acc: 0.625


 52%|█████▎    | 105/200 [03:32<03:28,  2.20s/it]

Acc: 0.6285714285714286


 53%|█████▎    | 106/200 [03:33<03:03,  1.95s/it]

Acc: 0.6320754716981132


 54%|█████▎    | 107/200 [03:34<02:26,  1.58s/it]

Acc: 0.6355140186915887


 54%|█████▍    | 108/200 [03:36<02:32,  1.65s/it]

Acc: 0.6388888888888888


 55%|█████▍    | 109/200 [03:37<02:23,  1.57s/it]

Acc: 0.6422018348623854


 55%|█████▌    | 110/200 [03:39<02:26,  1.63s/it]

Acc: 0.6363636363636364


 56%|█████▌    | 111/200 [03:40<02:10,  1.47s/it]

Acc: 0.6396396396396397


 56%|█████▌    | 112/200 [03:42<02:24,  1.64s/it]

Acc: 0.6428571428571429


 56%|█████▋    | 113/200 [03:44<02:25,  1.67s/it]

Acc: 0.6371681415929203


 57%|█████▋    | 114/200 [03:46<02:35,  1.80s/it]

Acc: 0.631578947368421


 57%|█████▊    | 115/200 [03:49<03:10,  2.24s/it]

Acc: 0.6347826086956522


 58%|█████▊    | 116/200 [03:50<02:45,  1.97s/it]

Acc: 0.6379310344827587


 58%|█████▊    | 117/200 [03:54<03:25,  2.47s/it]

Acc: 0.6324786324786325


 59%|█████▉    | 118/200 [03:58<03:52,  2.84s/it]

Acc: 0.6271186440677966


 60%|█████▉    | 119/200 [03:59<03:17,  2.44s/it]

Acc: 0.6218487394957983


 60%|██████    | 120/200 [04:01<02:59,  2.24s/it]

Acc: 0.625


 60%|██████    | 121/200 [04:03<03:01,  2.30s/it]

Acc: 0.6198347107438017


 61%|██████    | 122/200 [04:05<02:46,  2.13s/it]

Acc: 0.6147540983606558


 62%|██████▏   | 123/200 [04:10<03:43,  2.90s/it]

Acc: 0.6178861788617886


 62%|██████▏   | 124/200 [04:12<03:31,  2.79s/it]

Acc: 0.6209677419354839


 62%|██████▎   | 125/200 [04:15<03:16,  2.62s/it]

Acc: 0.624


 63%|██████▎   | 126/200 [04:16<02:41,  2.18s/it]

Acc: 0.626984126984127


 64%|██████▎   | 127/200 [04:17<02:30,  2.07s/it]

Acc: 0.6299212598425197


 64%|██████▍   | 128/200 [04:19<02:19,  1.94s/it]

Acc: 0.6328125


 64%|██████▍   | 129/200 [04:21<02:11,  1.85s/it]

Acc: 0.6356589147286822


 65%|██████▌   | 130/200 [04:22<02:07,  1.82s/it]

Acc: 0.6384615384615384


 66%|██████▌   | 131/200 [04:25<02:15,  1.97s/it]

Acc: 0.6412213740458015


 66%|██████▌   | 132/200 [04:26<02:00,  1.77s/it]

Acc: 0.6363636363636364


 66%|██████▋   | 133/200 [04:28<01:59,  1.79s/it]

Acc: 0.631578947368421


 67%|██████▋   | 134/200 [04:30<01:56,  1.76s/it]

Acc: 0.6343283582089553


 68%|██████▊   | 135/200 [04:31<01:48,  1.67s/it]

Acc: 0.6370370370370371


 68%|██████▊   | 136/200 [04:33<01:41,  1.59s/it]

Acc: 0.6323529411764706


 68%|██████▊   | 137/200 [04:35<01:58,  1.88s/it]

Acc: 0.635036496350365


 69%|██████▉   | 138/200 [04:38<02:09,  2.08s/it]

Acc: 0.6304347826086957


 70%|██████▉   | 139/200 [04:40<02:07,  2.09s/it]

Acc: 0.6330935251798561


 70%|███████   | 140/200 [04:42<02:15,  2.26s/it]

Acc: 0.6357142857142857


 70%|███████   | 141/200 [04:44<02:03,  2.10s/it]

Acc: 0.6382978723404256


 71%|███████   | 142/200 [04:47<02:07,  2.21s/it]

Acc: 0.6338028169014085


 72%|███████▏  | 143/200 [04:48<02:00,  2.12s/it]

Acc: 0.6363636363636364


 72%|███████▏  | 144/200 [04:51<01:57,  2.10s/it]

Acc: 0.6388888888888888


 72%|███████▎  | 145/200 [04:53<01:57,  2.14s/it]

Acc: 0.6413793103448275


 73%|███████▎  | 146/200 [04:54<01:45,  1.95s/it]

Acc: 0.636986301369863


 74%|███████▎  | 147/200 [04:56<01:45,  1.98s/it]

Acc: 0.6394557823129252


 74%|███████▍  | 148/200 [04:59<02:00,  2.32s/it]

Acc: 0.6351351351351351


 74%|███████▍  | 149/200 [05:01<01:48,  2.13s/it]

Acc: 0.6375838926174496


 75%|███████▌  | 150/200 [05:04<02:03,  2.47s/it]

Acc: 0.64


 76%|███████▌  | 151/200 [05:06<01:55,  2.35s/it]

Acc: 0.6423841059602649


 76%|███████▌  | 152/200 [05:08<01:37,  2.04s/it]

Acc: 0.6447368421052632


 76%|███████▋  | 153/200 [05:10<01:38,  2.09s/it]

Acc: 0.6405228758169934


 77%|███████▋  | 154/200 [05:12<01:39,  2.15s/it]

Acc: 0.6428571428571429


 78%|███████▊  | 155/200 [05:14<01:31,  2.04s/it]

Acc: 0.6451612903225806


 78%|███████▊  | 156/200 [05:16<01:27,  1.98s/it]

Acc: 0.6410256410256411


 78%|███████▊  | 157/200 [05:17<01:18,  1.82s/it]

Acc: 0.643312101910828


 79%|███████▉  | 158/200 [05:19<01:19,  1.89s/it]

Acc: 0.6392405063291139


 80%|███████▉  | 159/200 [05:22<01:25,  2.09s/it]

Acc: 0.6352201257861635


 80%|████████  | 160/200 [05:23<01:15,  1.89s/it]

Acc: 0.6375


 80%|████████  | 161/200 [05:26<01:16,  1.96s/it]

Acc: 0.639751552795031


 81%|████████  | 162/200 [05:27<01:11,  1.88s/it]

Acc: 0.6358024691358025


 82%|████████▏ | 163/200 [05:29<01:08,  1.86s/it]

Acc: 0.6380368098159509


 82%|████████▏ | 164/200 [05:31<01:04,  1.80s/it]

Acc: 0.6341463414634146


 82%|████████▎ | 165/200 [05:31<00:51,  1.47s/it]

Acc: 0.6303030303030303


 83%|████████▎ | 166/200 [05:33<00:53,  1.58s/it]

Acc: 0.6325301204819277


 84%|████████▎ | 167/200 [05:36<01:07,  2.05s/it]

Acc: 0.6287425149700598


 84%|████████▍ | 168/200 [05:38<00:58,  1.84s/it]

Acc: 0.6309523809523809


 84%|████████▍ | 169/200 [05:39<00:55,  1.79s/it]

Acc: 0.6331360946745562


 85%|████████▌ | 170/200 [05:43<01:14,  2.47s/it]

Acc: 0.6294117647058823


 86%|████████▌ | 171/200 [05:45<01:03,  2.20s/it]

Acc: 0.631578947368421


 86%|████████▌ | 172/200 [05:47<01:01,  2.19s/it]

Acc: 0.627906976744186


 86%|████████▋ | 173/200 [05:48<00:49,  1.85s/it]

Acc: 0.630057803468208


 87%|████████▋ | 174/200 [05:51<00:51,  1.97s/it]

Acc: 0.6264367816091954


 88%|████████▊ | 175/200 [05:52<00:48,  1.94s/it]

Acc: 0.6228571428571429


 88%|████████▊ | 176/200 [05:55<00:48,  2.01s/it]

Acc: 0.625


 88%|████████▊ | 177/200 [05:56<00:40,  1.77s/it]

Acc: 0.6271186440677966


 89%|████████▉ | 178/200 [05:57<00:36,  1.67s/it]

Acc: 0.6292134831460674


 90%|████████▉ | 179/200 [05:59<00:34,  1.64s/it]

Acc: 0.6312849162011173


 90%|█████████ | 180/200 [06:00<00:31,  1.59s/it]

Acc: 0.6333333333333333


 90%|█████████ | 181/200 [06:02<00:28,  1.53s/it]

Acc: 0.6298342541436464


 91%|█████████ | 182/200 [06:03<00:29,  1.62s/it]

Acc: 0.6318681318681318


 92%|█████████▏| 183/200 [06:06<00:32,  1.90s/it]

Acc: 0.6338797814207651


 92%|█████████▏| 184/200 [06:08<00:31,  1.96s/it]

Acc: 0.6358695652173914


 92%|█████████▎| 185/200 [06:10<00:30,  2.00s/it]

Acc: 0.6378378378378379


 93%|█████████▎| 186/200 [06:11<00:24,  1.72s/it]

Acc: 0.6397849462365591


 94%|█████████▎| 187/200 [06:14<00:26,  2.02s/it]

Acc: 0.6363636363636364


 94%|█████████▍| 188/200 [06:21<00:43,  3.63s/it]

Acc: 0.6382978723404256


 94%|█████████▍| 189/200 [06:23<00:35,  3.18s/it]

Acc: 0.6402116402116402


 95%|█████████▌| 190/200 [06:25<00:26,  2.60s/it]

Acc: 0.6368421052631579


 96%|█████████▌| 191/200 [06:26<00:19,  2.17s/it]

Acc: 0.6335078534031413


 96%|█████████▌| 192/200 [06:27<00:15,  1.89s/it]

Acc: 0.6354166666666666


 96%|█████████▋| 193/200 [06:30<00:14,  2.06s/it]

Acc: 0.6373056994818653


 97%|█████████▋| 194/200 [06:32<00:13,  2.22s/it]

Acc: 0.6391752577319587


 98%|█████████▊| 195/200 [06:34<00:10,  2.12s/it]

Acc: 0.6410256410256411


 98%|█████████▊| 196/200 [06:36<00:08,  2.18s/it]

Acc: 0.6428571428571429


 98%|█████████▊| 197/200 [06:38<00:06,  2.00s/it]

Acc: 0.6446700507614214


 99%|█████████▉| 198/200 [06:41<00:04,  2.19s/it]

Acc: 0.6464646464646465


100%|█████████▉| 199/200 [06:43<00:02,  2.16s/it]

Acc: 0.6482412060301508


100%|██████████| 200/200 [06:45<00:00,  2.03s/it]

Acc: 0.645
Tokens Used: 44862
	Prompt Tokens: 24888
	Completion Tokens: 19974
Successful Requests: 200
Total Cost (USD): $0.042405000000000005





### PumMedQA


In [112]:
value_to_letter_pm = {
    "yes": 'A',
    "no": 'B',
    "maybe": 'C'
}
results_collected_pubmedqa = []
pass_collected_pubmedqa = []
with get_openai_callback() as usage:
    for i in tqdm(range(200)):
        cur_question, cur_context, cur_options = q_format_PubMedQA(pubmedqa_dataset[i])
        cur_correct_answer = value_to_letter_pm[pubmedqa_dataset[i]["final_decision"]]

        # Chat
        reasoning_answer = chain.invoke({"Instruction": instruction, "Question" : cur_question, "Context": cur_context, "Options": cur_options, "few_shots":""})
        correctness = ((cur_correct_answer + ":") in reasoning_answer["answer"].split()[0]) or (pubmedqa_dataset[i]["final_decision"] in reasoning_answer["answer"])
        pass_collected_pubmedqa.append(correctness)
        if (i == 199):
            results_collected_pubmedqa.append(pubmedqa_dataset[i] | {"reasoning" : reasoning_answer["reasoning"], "model chosen" : reasoning_answer["answer"], "correctness" : correctness, "accuracy": sum(pass_collected_pubmedqa)/ len(pass_collected_pubmedqa)})
        else:
            results_collected_pubmedqa.append(pubmedqa_dataset[i] | {"reasoning" : reasoning_answer["reasoning"], "model chosen" : reasoning_answer["answer"], "correctness" : correctness})

        if (i + 1) % 10 == 0:
            with open(f'results_collected_pubmedqa.json', 'a') as f:
                json.dump(results_collected_pubmedqa, f, ensure_ascii=False, indent=4)
            results_collected_pubmedqa = []
        print(f"Acc: {sum(pass_collected_pubmedqa)/ len(pass_collected_pubmedqa)}")
    print(usage)

        

  0%|          | 1/200 [00:01<05:47,  1.75s/it]

Acc: 1.0


  1%|          | 2/200 [00:04<07:44,  2.35s/it]

Acc: 0.5


  2%|▏         | 3/200 [00:06<06:38,  2.02s/it]

Acc: 0.3333333333333333


  2%|▏         | 4/200 [00:08<06:46,  2.08s/it]

Acc: 0.5


  2%|▎         | 5/200 [00:09<06:08,  1.89s/it]

Acc: 0.6


  3%|▎         | 6/200 [00:12<06:39,  2.06s/it]

Acc: 0.6666666666666666


  4%|▎         | 7/200 [00:14<07:17,  2.27s/it]

Acc: 0.5714285714285714


  4%|▍         | 8/200 [00:16<06:47,  2.12s/it]

Acc: 0.5


  4%|▍         | 9/200 [00:18<06:34,  2.06s/it]

Acc: 0.5555555555555556


  5%|▌         | 10/200 [00:21<06:53,  2.18s/it]

Acc: 0.6


  6%|▌         | 11/200 [00:23<07:09,  2.27s/it]

Acc: 0.6363636363636364


  6%|▌         | 12/200 [00:25<06:23,  2.04s/it]

Acc: 0.5833333333333334


  6%|▋         | 13/200 [00:27<06:34,  2.11s/it]

Acc: 0.5384615384615384


  7%|▋         | 14/200 [00:29<06:07,  1.97s/it]

Acc: 0.5


  8%|▊         | 15/200 [00:30<05:52,  1.90s/it]

Acc: 0.4666666666666667


  8%|▊         | 16/200 [00:32<05:58,  1.95s/it]

Acc: 0.5


  8%|▊         | 17/200 [00:34<05:55,  1.95s/it]

Acc: 0.5294117647058824


  9%|▉         | 18/200 [00:36<05:16,  1.74s/it]

Acc: 0.5555555555555556


 10%|▉         | 19/200 [00:37<05:09,  1.71s/it]

Acc: 0.5789473684210527


 10%|█         | 20/200 [00:39<05:27,  1.82s/it]

Acc: 0.6


 10%|█         | 21/200 [00:41<05:36,  1.88s/it]

Acc: 0.5714285714285714


 11%|█         | 22/200 [00:43<05:19,  1.80s/it]

Acc: 0.5909090909090909


 12%|█▏        | 23/200 [00:45<05:23,  1.83s/it]

Acc: 0.6086956521739131


 12%|█▏        | 24/200 [00:47<05:32,  1.89s/it]

Acc: 0.5833333333333334


 12%|█▎        | 25/200 [00:49<05:38,  1.93s/it]

Acc: 0.6


 13%|█▎        | 26/200 [00:52<06:25,  2.21s/it]

Acc: 0.5769230769230769


 14%|█▎        | 27/200 [00:54<06:04,  2.10s/it]

Acc: 0.5555555555555556


 14%|█▍        | 28/200 [00:55<05:48,  2.02s/it]

Acc: 0.5357142857142857


 14%|█▍        | 29/200 [00:57<05:47,  2.03s/it]

Acc: 0.5172413793103449


 15%|█▌        | 30/200 [01:00<05:46,  2.04s/it]

Acc: 0.5


 16%|█▌        | 31/200 [01:01<05:18,  1.89s/it]

Acc: 0.4838709677419355


 16%|█▌        | 32/200 [01:04<05:45,  2.06s/it]

Acc: 0.46875


 16%|█▋        | 33/200 [01:06<05:55,  2.13s/it]

Acc: 0.45454545454545453


 17%|█▋        | 34/200 [01:08<05:56,  2.15s/it]

Acc: 0.4411764705882353


 18%|█▊        | 35/200 [01:10<06:10,  2.25s/it]

Acc: 0.42857142857142855


 18%|█▊        | 36/200 [01:12<05:53,  2.16s/it]

Acc: 0.4166666666666667


 18%|█▊        | 37/200 [01:15<05:51,  2.15s/it]

Acc: 0.40540540540540543


 19%|█▉        | 38/200 [01:16<05:26,  2.01s/it]

Acc: 0.39473684210526316


 20%|█▉        | 39/200 [01:18<04:54,  1.83s/it]

Acc: 0.41025641025641024


 20%|██        | 40/200 [01:20<05:10,  1.94s/it]

Acc: 0.4


 20%|██        | 41/200 [01:22<05:03,  1.91s/it]

Acc: 0.4146341463414634


 21%|██        | 42/200 [01:24<05:15,  1.99s/it]

Acc: 0.40476190476190477


 22%|██▏       | 43/200 [01:25<04:46,  1.83s/it]

Acc: 0.3953488372093023


 22%|██▏       | 44/200 [01:28<05:17,  2.03s/it]

Acc: 0.38636363636363635


 22%|██▎       | 45/200 [01:29<04:55,  1.90s/it]

Acc: 0.4


 23%|██▎       | 46/200 [01:31<04:36,  1.80s/it]

Acc: 0.41304347826086957


 24%|██▎       | 47/200 [01:33<04:28,  1.75s/it]

Acc: 0.425531914893617


 24%|██▍       | 48/200 [01:34<04:26,  1.75s/it]

Acc: 0.4166666666666667


 24%|██▍       | 49/200 [01:36<04:24,  1.75s/it]

Acc: 0.42857142857142855


 25%|██▌       | 50/200 [01:38<04:39,  1.87s/it]

Acc: 0.42


 26%|██▌       | 51/200 [01:40<04:16,  1.72s/it]

Acc: 0.4117647058823529


 26%|██▌       | 52/200 [01:42<04:46,  1.93s/it]

Acc: 0.40384615384615385


 26%|██▋       | 53/200 [01:44<04:25,  1.81s/it]

Acc: 0.39622641509433965


 27%|██▋       | 54/200 [01:46<04:32,  1.87s/it]

Acc: 0.4074074074074074


 28%|██▊       | 55/200 [01:47<04:25,  1.83s/it]

Acc: 0.4


 28%|██▊       | 56/200 [01:49<04:37,  1.93s/it]

Acc: 0.39285714285714285


 28%|██▊       | 57/200 [01:51<04:22,  1.84s/it]

Acc: 0.38596491228070173


 29%|██▉       | 58/200 [01:53<04:17,  1.81s/it]

Acc: 0.3793103448275862


 30%|██▉       | 59/200 [01:55<04:16,  1.82s/it]

Acc: 0.3728813559322034


 30%|███       | 60/200 [01:56<04:01,  1.73s/it]

Acc: 0.36666666666666664


 30%|███       | 61/200 [01:58<04:18,  1.86s/it]

Acc: 0.3770491803278688


 31%|███       | 62/200 [02:00<03:59,  1.73s/it]

Acc: 0.3709677419354839


 32%|███▏      | 63/200 [02:02<04:14,  1.86s/it]

Acc: 0.38095238095238093


 32%|███▏      | 64/200 [02:05<04:54,  2.16s/it]

Acc: 0.390625


 32%|███▎      | 65/200 [02:07<04:58,  2.21s/it]

Acc: 0.4


 33%|███▎      | 66/200 [02:09<04:42,  2.11s/it]

Acc: 0.3939393939393939


 34%|███▎      | 67/200 [02:11<04:36,  2.08s/it]

Acc: 0.3880597014925373


 34%|███▍      | 68/200 [02:12<04:09,  1.89s/it]

Acc: 0.39705882352941174


 34%|███▍      | 69/200 [02:15<04:26,  2.03s/it]

Acc: 0.391304347826087


 35%|███▌      | 70/200 [02:17<04:43,  2.18s/it]

Acc: 0.4


 36%|███▌      | 71/200 [02:20<04:41,  2.18s/it]

Acc: 0.4084507042253521


 36%|███▌      | 72/200 [02:22<04:31,  2.12s/it]

Acc: 0.4027777777777778


 36%|███▋      | 73/200 [02:24<04:35,  2.17s/it]

Acc: 0.410958904109589


 37%|███▋      | 74/200 [02:26<04:40,  2.23s/it]

Acc: 0.4189189189189189


 38%|███▊      | 75/200 [02:28<04:24,  2.11s/it]

Acc: 0.4266666666666667


 38%|███▊      | 76/200 [02:30<04:34,  2.21s/it]

Acc: 0.42105263157894735


 38%|███▊      | 77/200 [02:32<04:20,  2.12s/it]

Acc: 0.42857142857142855


 39%|███▉      | 78/200 [02:34<04:12,  2.07s/it]

Acc: 0.4358974358974359


 40%|███▉      | 79/200 [02:38<04:54,  2.43s/it]

Acc: 0.43037974683544306


 40%|████      | 80/200 [02:39<04:30,  2.25s/it]

Acc: 0.4375


 40%|████      | 81/200 [02:41<04:06,  2.07s/it]

Acc: 0.43209876543209874


 41%|████      | 82/200 [02:43<04:03,  2.07s/it]

Acc: 0.43902439024390244


 42%|████▏     | 83/200 [02:51<07:39,  3.93s/it]

Acc: 0.43373493975903615


 42%|████▏     | 84/200 [02:54<06:58,  3.61s/it]

Acc: 0.44047619047619047


 42%|████▎     | 85/200 [02:56<05:54,  3.08s/it]

Acc: 0.43529411764705883


 43%|████▎     | 86/200 [02:58<05:08,  2.71s/it]

Acc: 0.43023255813953487


 44%|████▎     | 87/200 [03:00<04:33,  2.42s/it]

Acc: 0.42528735632183906


 44%|████▍     | 88/200 [03:02<04:09,  2.23s/it]

Acc: 0.4318181818181818


 44%|████▍     | 89/200 [03:03<03:42,  2.00s/it]

Acc: 0.42696629213483145


 45%|████▌     | 90/200 [03:05<03:44,  2.04s/it]

Acc: 0.4222222222222222


 46%|████▌     | 91/200 [03:07<03:36,  1.99s/it]

Acc: 0.42857142857142855


 46%|████▌     | 92/200 [03:09<03:40,  2.04s/it]

Acc: 0.42391304347826086


 46%|████▋     | 93/200 [03:11<03:23,  1.90s/it]

Acc: 0.43010752688172044


 47%|████▋     | 94/200 [03:12<03:11,  1.81s/it]

Acc: 0.43617021276595747


 48%|████▊     | 95/200 [03:14<03:00,  1.71s/it]

Acc: 0.4421052631578947


 48%|████▊     | 96/200 [03:15<02:46,  1.60s/it]

Acc: 0.4375


 48%|████▊     | 97/200 [03:17<02:44,  1.59s/it]

Acc: 0.4329896907216495


 49%|████▉     | 98/200 [03:19<02:56,  1.73s/it]

Acc: 0.4387755102040816


 50%|████▉     | 99/200 [03:20<02:52,  1.70s/it]

Acc: 0.4444444444444444


 50%|█████     | 100/200 [03:24<03:40,  2.21s/it]

Acc: 0.45


 50%|█████     | 101/200 [03:25<03:17,  2.00s/it]

Acc: 0.45544554455445546


 51%|█████     | 102/200 [03:27<03:08,  1.93s/it]

Acc: 0.46078431372549017


 52%|█████▏    | 103/200 [03:29<03:14,  2.00s/it]

Acc: 0.46601941747572817


 52%|█████▏    | 104/200 [03:31<03:12,  2.01s/it]

Acc: 0.47115384615384615


 52%|█████▎    | 105/200 [03:33<03:16,  2.07s/it]

Acc: 0.47619047619047616


 53%|█████▎    | 106/200 [03:35<03:09,  2.01s/it]

Acc: 0.4811320754716981


 54%|█████▎    | 107/200 [03:37<02:45,  1.78s/it]

Acc: 0.48598130841121495


 54%|█████▍    | 108/200 [03:38<02:42,  1.76s/it]

Acc: 0.49074074074074076


 55%|█████▍    | 109/200 [03:40<02:39,  1.76s/it]

Acc: 0.4954128440366973


 55%|█████▌    | 110/200 [03:43<03:02,  2.03s/it]

Acc: 0.4909090909090909


 56%|█████▌    | 111/200 [03:46<03:20,  2.25s/it]

Acc: 0.4864864864864865


 56%|█████▌    | 112/200 [03:47<03:04,  2.10s/it]

Acc: 0.48214285714285715


 56%|█████▋    | 113/200 [03:50<03:17,  2.27s/it]

Acc: 0.4778761061946903


 57%|█████▋    | 114/200 [03:52<03:03,  2.13s/it]

Acc: 0.4824561403508772


 57%|█████▊    | 115/200 [03:54<02:57,  2.09s/it]

Acc: 0.4782608695652174


 58%|█████▊    | 116/200 [03:56<02:59,  2.13s/it]

Acc: 0.47413793103448276


 58%|█████▊    | 117/200 [03:58<03:00,  2.18s/it]

Acc: 0.47863247863247865


 59%|█████▉    | 118/200 [04:00<02:47,  2.04s/it]

Acc: 0.4830508474576271


 60%|█████▉    | 119/200 [04:02<02:40,  1.98s/it]

Acc: 0.4789915966386555


 60%|██████    | 120/200 [04:03<02:28,  1.86s/it]

Acc: 0.48333333333333334


 60%|██████    | 121/200 [04:05<02:28,  1.87s/it]

Acc: 0.4793388429752066


 61%|██████    | 122/200 [04:08<02:49,  2.17s/it]

Acc: 0.48360655737704916


 62%|██████▏   | 123/200 [04:10<02:32,  1.98s/it]

Acc: 0.4878048780487805


 62%|██████▏   | 124/200 [04:12<02:41,  2.12s/it]

Acc: 0.4838709677419355


 62%|██████▎   | 125/200 [04:14<02:44,  2.19s/it]

Acc: 0.48


 63%|██████▎   | 126/200 [04:16<02:33,  2.08s/it]

Acc: 0.48412698412698413


 64%|██████▎   | 127/200 [04:18<02:25,  1.99s/it]

Acc: 0.48031496062992124


 64%|██████▍   | 128/200 [04:20<02:20,  1.95s/it]

Acc: 0.4765625


 64%|██████▍   | 129/200 [04:22<02:25,  2.05s/it]

Acc: 0.4806201550387597


 65%|██████▌   | 130/200 [04:24<02:13,  1.91s/it]

Acc: 0.47692307692307695


 66%|██████▌   | 131/200 [04:26<02:16,  1.98s/it]

Acc: 0.48091603053435117


 66%|██████▌   | 132/200 [04:28<02:19,  2.05s/it]

Acc: 0.48484848484848486


 66%|██████▋   | 133/200 [04:30<02:18,  2.07s/it]

Acc: 0.48120300751879697


 67%|██████▋   | 134/200 [04:32<02:01,  1.84s/it]

Acc: 0.48507462686567165


 68%|██████▊   | 135/200 [04:33<01:56,  1.79s/it]

Acc: 0.4888888888888889


 68%|██████▊   | 136/200 [04:35<01:57,  1.83s/it]

Acc: 0.49264705882352944


 68%|██████▊   | 137/200 [04:38<02:06,  2.01s/it]

Acc: 0.49635036496350365


 69%|██████▉   | 138/200 [04:40<02:06,  2.03s/it]

Acc: 0.4927536231884058


 70%|██████▉   | 139/200 [04:42<02:02,  2.00s/it]

Acc: 0.4892086330935252


 70%|███████   | 140/200 [04:44<02:13,  2.23s/it]

Acc: 0.4928571428571429


 70%|███████   | 141/200 [04:47<02:17,  2.34s/it]

Acc: 0.49645390070921985


 71%|███████   | 142/200 [04:50<02:20,  2.43s/it]

Acc: 0.5


 72%|███████▏  | 143/200 [04:52<02:12,  2.32s/it]

Acc: 0.4965034965034965


 72%|███████▏  | 144/200 [04:54<02:17,  2.45s/it]

Acc: 0.5


 72%|███████▎  | 145/200 [04:56<02:02,  2.23s/it]

Acc: 0.496551724137931


 73%|███████▎  | 146/200 [04:58<01:49,  2.02s/it]

Acc: 0.4931506849315068


 74%|███████▎  | 147/200 [05:00<01:47,  2.03s/it]

Acc: 0.4897959183673469


 74%|███████▍  | 148/200 [05:02<01:47,  2.07s/it]

Acc: 0.4864864864864865


 74%|███████▍  | 149/200 [05:04<01:42,  2.00s/it]

Acc: 0.48322147651006714


 75%|███████▌  | 150/200 [05:06<01:37,  1.95s/it]

Acc: 0.48


 76%|███████▌  | 151/200 [05:08<01:36,  1.98s/it]

Acc: 0.4768211920529801


 76%|███████▌  | 152/200 [05:09<01:27,  1.82s/it]

Acc: 0.48026315789473684


 76%|███████▋  | 153/200 [05:11<01:34,  2.00s/it]

Acc: 0.48366013071895425


 77%|███████▋  | 154/200 [05:13<01:30,  1.96s/it]

Acc: 0.487012987012987


 78%|███████▊  | 155/200 [05:16<01:35,  2.12s/it]

Acc: 0.49032258064516127


 78%|███████▊  | 156/200 [05:18<01:31,  2.09s/it]

Acc: 0.4935897435897436


 78%|███████▊  | 157/200 [05:20<01:34,  2.19s/it]

Acc: 0.4968152866242038


 79%|███████▉  | 158/200 [05:22<01:29,  2.13s/it]

Acc: 0.5


 80%|███████▉  | 159/200 [05:25<01:32,  2.26s/it]

Acc: 0.4968553459119497


 80%|████████  | 160/200 [05:27<01:25,  2.13s/it]

Acc: 0.49375


 80%|████████  | 161/200 [05:30<01:32,  2.38s/it]

Acc: 0.4968944099378882


 81%|████████  | 162/200 [05:32<01:28,  2.32s/it]

Acc: 0.49382716049382713


 82%|████████▏ | 163/200 [05:34<01:24,  2.28s/it]

Acc: 0.49079754601226994


 82%|████████▏ | 164/200 [05:37<01:24,  2.35s/it]

Acc: 0.4878048780487805


 82%|████████▎ | 165/200 [05:38<01:12,  2.07s/it]

Acc: 0.4909090909090909


 83%|████████▎ | 166/200 [05:40<01:10,  2.08s/it]

Acc: 0.4879518072289157


 84%|████████▎ | 167/200 [05:42<01:11,  2.16s/it]

Acc: 0.49101796407185627


 84%|████████▍ | 168/200 [05:44<01:05,  2.05s/it]

Acc: 0.4880952380952381


 84%|████████▍ | 169/200 [05:47<01:06,  2.14s/it]

Acc: 0.4911242603550296


 85%|████████▌ | 170/200 [05:49<01:03,  2.11s/it]

Acc: 0.48823529411764705


 86%|████████▌ | 171/200 [05:50<00:56,  1.94s/it]

Acc: 0.49122807017543857


 86%|████████▌ | 172/200 [05:51<00:49,  1.76s/it]

Acc: 0.4941860465116279


 86%|████████▋ | 173/200 [05:53<00:47,  1.75s/it]

Acc: 0.49710982658959535


 87%|████████▋ | 174/200 [05:55<00:48,  1.87s/it]

Acc: 0.5


 88%|████████▊ | 175/200 [05:57<00:48,  1.93s/it]

Acc: 0.5028571428571429


 88%|████████▊ | 176/200 [05:59<00:47,  1.97s/it]

Acc: 0.5


 88%|████████▊ | 177/200 [06:03<00:55,  2.43s/it]

Acc: 0.5028248587570622


 89%|████████▉ | 178/200 [06:05<00:48,  2.21s/it]

Acc: 0.5


 90%|████████▉ | 179/200 [06:07<00:45,  2.16s/it]

Acc: 0.4972067039106145


 90%|█████████ | 180/200 [06:10<00:48,  2.40s/it]

Acc: 0.49444444444444446


 90%|█████████ | 181/200 [06:12<00:45,  2.42s/it]

Acc: 0.49171270718232046


 91%|█████████ | 182/200 [06:14<00:43,  2.40s/it]

Acc: 0.4945054945054945


 92%|█████████▏| 183/200 [06:17<00:40,  2.38s/it]

Acc: 0.4972677595628415


 92%|█████████▏| 184/200 [06:19<00:36,  2.26s/it]

Acc: 0.5


 92%|█████████▎| 185/200 [06:21<00:31,  2.13s/it]

Acc: 0.5027027027027027


 93%|█████████▎| 186/200 [06:22<00:27,  2.00s/it]

Acc: 0.5053763440860215


 94%|█████████▎| 187/200 [06:24<00:24,  1.91s/it]

Acc: 0.5026737967914439


 94%|█████████▍| 188/200 [06:26<00:22,  1.88s/it]

Acc: 0.5


 94%|█████████▍| 189/200 [06:27<00:19,  1.81s/it]

Acc: 0.4973544973544973


 95%|█████████▌| 190/200 [06:29<00:18,  1.80s/it]

Acc: 0.49473684210526314


 96%|█████████▌| 191/200 [06:31<00:15,  1.68s/it]

Acc: 0.49214659685863876


 96%|█████████▌| 192/200 [06:33<00:14,  1.85s/it]

Acc: 0.4895833333333333


 96%|█████████▋| 193/200 [06:35<00:13,  1.88s/it]

Acc: 0.49222797927461137


 97%|█████████▋| 194/200 [06:36<00:10,  1.75s/it]

Acc: 0.4896907216494845


 98%|█████████▊| 195/200 [06:38<00:08,  1.65s/it]

Acc: 0.48717948717948717


 98%|█████████▊| 196/200 [06:39<00:06,  1.62s/it]

Acc: 0.4846938775510204


 98%|█████████▊| 197/200 [06:41<00:04,  1.66s/it]

Acc: 0.48223350253807107


 99%|█████████▉| 198/200 [06:43<00:03,  1.73s/it]

Acc: 0.4797979797979798


100%|█████████▉| 199/200 [06:45<00:01,  1.75s/it]

Acc: 0.47738693467336685


100%|██████████| 200/200 [06:48<00:00,  2.04s/it]

Acc: 0.48
Tokens Used: 56657
	Prompt Tokens: 34341
	Completion Tokens: 22316
Successful Requests: 200
Total Cost (USD): $0.05064450000000001



