In [1]:
!pip install groq python-dotenv numpy tqdm datasets

Collecting groq
  Downloading groq-0.30.0-py3-none-any.whl.metadata (16 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading groq-0.30.0-py3-none-any.whl (131 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, groq
Successfully installed groq-0.30.0 python-dotenv-1.1.1


In [3]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [4]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama3-8b-8192"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content

    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [5]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello there! I'm thrilled to meet you! I'm a math-solving assistant, here to help you with any math problems you're struggling with. Whether it's algebra, geometry, calculus, or anything in between, I'm here to lend a hand.

So, what's the math problem that's got you stumped? Share it with me, and I'll do my best to help you solve it!


#### GSM8K 데이터셋 확인해보기

In [6]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [5]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    return results[-1] if results else None

In [6]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama3-8b-8192",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                predicted_answer = float(predicted_answer.replace(",", ""))

            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False

            if is_correct:
                correct += 1
            total += 1

            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [7]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"

    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"

    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [10]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [11]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:02<00:01,  2.60it/s]

Progress: [5/10]
Current Acc.: [20.00%]


100%|██████████| 10/10 [00:04<00:00,  2.08it/s]

Progress: [10/10]
Current Acc.: [30.00%]





In [12]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    prompt = construct_direct_prompt(shot)
    results, accuracy = run_benchmark_test(gsm8k_test, prompt, num_samples=50)

    print(f"[Direct Prompting] {shot}-shot accuracy: {accuracy:.2%}")

    save_final_result(results, accuracy, f"direct_prompting_{shot}.txt")

 12%|█▏        | 6/50 [00:01<00:07,  5.64it/s]

Progress: [5/50]
Current Acc.: [40.00%]


 18%|█▊        | 9/50 [00:01<00:05,  6.98it/s]

Progress: [10/50]
Current Acc.: [40.00%]


 32%|███▏      | 16/50 [00:02<00:05,  6.71it/s]

Progress: [15/50]
Current Acc.: [26.67%]


 42%|████▏     | 21/50 [00:03<00:03,  8.32it/s]

Progress: [20/50]
Current Acc.: [30.00%]


 50%|█████     | 25/50 [00:12<00:40,  1.61s/it]

Progress: [25/50]
Current Acc.: [28.00%]


 60%|██████    | 30/50 [00:23<00:42,  2.12s/it]

Progress: [30/50]
Current Acc.: [26.67%]


 70%|███████   | 35/50 [00:34<00:32,  2.20s/it]

Progress: [35/50]
Current Acc.: [22.86%]


 80%|████████  | 40/50 [00:45<00:21,  2.19s/it]

Progress: [40/50]
Current Acc.: [22.50%]


 90%|█████████ | 45/50 [00:56<00:10,  2.20s/it]

Progress: [45/50]
Current Acc.: [24.44%]


100%|██████████| 50/50 [01:13<00:00,  1.47s/it]


Progress: [50/50]
Current Acc.: [26.00%]
[Direct Prompting] 0-shot accuracy: 26.00%


 10%|█         | 5/50 [00:10<01:38,  2.18s/it]

Progress: [5/50]
Current Acc.: [20.00%]


 20%|██        | 10/50 [00:21<01:28,  2.22s/it]

Progress: [10/50]
Current Acc.: [30.00%]


 30%|███       | 15/50 [00:32<01:18,  2.23s/it]

Progress: [15/50]
Current Acc.: [20.00%]


 40%|████      | 20/50 [00:44<01:06,  2.22s/it]

Progress: [20/50]
Current Acc.: [25.00%]


 50%|█████     | 25/50 [00:55<00:55,  2.23s/it]

Progress: [25/50]
Current Acc.: [24.00%]


 60%|██████    | 30/50 [01:06<00:44,  2.21s/it]

Progress: [30/50]
Current Acc.: [23.33%]


 70%|███████   | 35/50 [01:26<01:14,  4.99s/it]

Progress: [35/50]
Current Acc.: [20.00%]


 82%|████████▏ | 41/50 [01:38<00:12,  1.33s/it]

Progress: [40/50]
Current Acc.: [20.00%]


 90%|█████████ | 45/50 [01:43<00:07,  1.50s/it]

Progress: [45/50]
Current Acc.: [20.00%]


100%|██████████| 50/50 [01:54<00:00,  2.30s/it]


Progress: [50/50]
Current Acc.: [20.00%]
[Direct Prompting] 3-shot accuracy: 20.00%


 10%|█         | 5/50 [00:11<01:41,  2.25s/it]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:22<01:29,  2.23s/it]

Progress: [10/50]
Current Acc.: [20.00%]


 30%|███       | 15/50 [00:33<01:19,  2.26s/it]

Progress: [15/50]
Current Acc.: [13.33%]


 40%|████      | 20/50 [00:44<01:06,  2.22s/it]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [01:00<01:18,  3.13s/it]

Progress: [25/50]
Current Acc.: [24.00%]


 62%|██████▏   | 31/50 [01:26<01:07,  3.58s/it]

Progress: [30/50]
Current Acc.: [26.67%]


 70%|███████   | 35/50 [01:37<00:47,  3.16s/it]

Progress: [35/50]
Current Acc.: [22.86%]


 80%|████████  | 40/50 [01:55<00:35,  3.55s/it]

Progress: [40/50]
Current Acc.: [22.50%]


 90%|█████████ | 45/50 [02:16<00:19,  3.90s/it]

Progress: [45/50]
Current Acc.: [22.22%]


100%|██████████| 50/50 [02:38<00:00,  3.16s/it]

Progress: [50/50]
Current Acc.: [22.00%]
[Direct Prompting] 5-shot accuracy: 22.00%





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [13]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = "Instruction:\nSolve the following math problems step by step, showing your reasoning clearly. After reasoning, give the final answer after a tag, '####'.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [15]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    prompt = construct_CoT_prompt(shot)
    results, accuracy = run_benchmark_test(gsm8k_test, prompt, num_samples=50)

    print(f"[CoT Prompting] {shot}-shot accuracy: {accuracy:.2%}")

    save_final_result(results, accuracy, f"CoT_prompting_{shot}.txt")

 10%|█         | 5/50 [00:01<00:13,  3.28it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [00:14<01:57,  2.93s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [00:32<02:06,  3.62s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [01:07<04:23,  8.77s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [01:11<00:47,  1.92s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████    | 30/50 [01:23<00:52,  2.63s/it]

Progress: [30/50]
Current Acc.: [66.67%]


 70%|███████   | 35/50 [01:37<00:39,  2.66s/it]

Progress: [35/50]
Current Acc.: [71.43%]


 80%|████████  | 40/50 [01:54<00:34,  3.49s/it]

Progress: [40/50]
Current Acc.: [65.00%]


 90%|█████████ | 45/50 [02:19<00:20,  4.01s/it]

Progress: [45/50]
Current Acc.: [64.44%]


100%|██████████| 50/50 [02:30<00:00,  3.01s/it]


Progress: [50/50]
Current Acc.: [66.00%]
[CoT Prompting] 0-shot accuracy: 66.00%


 10%|█         | 5/50 [00:31<05:00,  6.67s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:04<04:14,  6.37s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [01:37<03:51,  6.61s/it]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [02:12<03:15,  6.53s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [02:41<02:27,  5.88s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [03:25<03:16,  9.81s/it]

Progress: [30/50]
Current Acc.: [63.33%]


 70%|███████   | 35/50 [03:42<01:19,  5.30s/it]

Progress: [35/50]
Current Acc.: [68.57%]


 80%|████████  | 40/50 [04:13<01:03,  6.33s/it]

Progress: [40/50]
Current Acc.: [67.50%]


 90%|█████████ | 45/50 [04:44<00:25,  5.02s/it]

Progress: [45/50]
Current Acc.: [68.89%]


100%|██████████| 50/50 [05:17<00:00,  6.34s/it]


Progress: [50/50]
Current Acc.: [70.00%]
[CoT Prompting] 3-shot accuracy: 70.00%


 10%|█         | 5/50 [00:41<04:13,  5.62s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [01:27<05:51,  8.79s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [02:14<05:04,  8.70s/it]

Progress: [15/50]
Current Acc.: [53.33%]


 40%|████      | 20/50 [03:09<05:05, 10.19s/it]

Progress: [20/50]
Current Acc.: [50.00%]


 50%|█████     | 25/50 [03:44<03:27,  8.28s/it]

Progress: [25/50]
Current Acc.: [48.00%]


 60%|██████    | 30/50 [04:27<02:41,  8.06s/it]

Progress: [30/50]
Current Acc.: [53.33%]


 70%|███████   | 35/50 [05:16<02:32, 10.15s/it]

Progress: [35/50]
Current Acc.: [60.00%]


 80%|████████  | 40/50 [05:53<01:23,  8.31s/it]

Progress: [40/50]
Current Acc.: [57.50%]


 90%|█████████ | 45/50 [06:38<00:43,  8.75s/it]

Progress: [45/50]
Current Acc.: [60.00%]


100%|██████████| 50/50 [07:24<00:00,  8.88s/it]

Progress: [50/50]
Current Acc.: [64.00%]
[CoT Prompting] 5-shot accuracy: 64.00%





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [11]:
### 자유롭게 수정해도 됩니다! 완전히 새로 함수를 만들어도 돼요.
def construct_my_prompt(example_list: List[str], num_examples: int = 3):
    # TODO: 구현해주세요!

    prompt = (
    "Solve the following math problems step-by-step. After solving, double-check your work.\n"
    "Then write the final answer on a new line, prefixed with '####'.\n\n"
)


    for example in example_list[:num_examples]:
        prompt += f"{example}\n\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [12]:
example_list = []

manual_examples = [
    {
        "question": "Janet’s ducks lay 16 eggs per day. She eats 3 and bakes with 4. She sells the rest at $2 each. How much does she make per day?",
        "reasoning": (
            "Janet eats 3 eggs and uses 4 for baking, so 16 - 3 - 4 = 9 eggs left.\n"
            "She sells 9 eggs at $2 each → 9 * 2 = $18.\n\n"
            "Double-check:\n3 + 4 = 7 eggs used\n16 - 7 = 9 left\n9 * 2 = 18"
        ),
        "final": "18"
    },
    {
        "question": "Mike has $50. He buys a game for $18 and a book for $12. How much money does he have left?",
        "reasoning": (
            "He spends 18 + 12 = $30.\n"
            "He has 50 - 30 = $20 left.\n\n"
            "Double-check:\n18 + 12 = 30\n50 - 30 = 20"
        ),
        "final": "20"
    },
    {
        "question": "Tom buys 4 packs of pencils. Each pack has 6 pencils. He gives away 5 pencils. How many does he have left?",
        "reasoning": (
            "4 * 6 = 24 pencils bought.\n"
            "He gives away 5 → 24 - 5 = 19 pencils left.\n\n"
            "Double-check:\n4 * 6 = 24\n24 - 5 = 19"
        ),
        "final": "19"
    },
    {
        "question": "Sarah read 12 pages on Monday and 15 pages on Tuesday. Her goal is to read 40 pages. How many pages does she have left?",
        "reasoning": (
            "12 + 15 = 27 pages read.\n"
            "40 - 27 = 13 pages left to meet her goal.\n\n"
            "Double-check:\n12 + 15 = 27\n40 - 27 = 13"
        ),
        "final": "13"
    },
    {
        "question": "Jenny baked 24 cookies. She gave 5 to a friend, 3 to a neighbor, and ate 4 herself. How many cookies does she have left?",
        "reasoning": (
            "5 + 3 + 4 = 12 cookies used.\n"
            "24 - 12 = 12 cookies left.\n\n"
            "Double-check:\n5 + 3 + 4 = 12\n24 - 12 = 12"
        ),
        "final": "12"
    }
]



for ex in manual_examples:
    formatted = (
        f"Question:\n{ex['question']}\n"
        f"Answer:\n{ex['reasoning']}\n#### {ex['final']}"
    )
    example_list.append(formatted)

In [13]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    prompt = construct_my_prompt(example_list, shot)
    results, accuracy = run_benchmark_test(gsm8k_test, prompt, num_samples=50)

    print(f"[My Prompting] {shot}-shot accuracy: {accuracy:.2%}")

    save_final_result(results, accuracy, f"My_prompting_{shot}.txt")

 10%|█         | 5/50 [00:03<00:31,  1.43it/s]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██        | 10/50 [00:10<01:04,  1.61s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███       | 15/50 [00:28<01:56,  3.32s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|████      | 20/50 [00:43<01:33,  3.11s/it]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|█████     | 25/50 [01:25<03:55,  9.42s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████    | 30/50 [01:30<00:50,  2.55s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [01:36<00:18,  1.26s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [01:46<00:24,  2.45s/it]

Progress: [40/50]
Current Acc.: [72.50%]


 90%|█████████ | 45/50 [02:03<00:17,  3.53s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [02:19<00:00,  2.80s/it]


Progress: [50/50]
Current Acc.: [72.00%]
[My Prompting] 0-shot accuracy: 72.00%


 10%|█         | 5/50 [00:25<03:49,  5.10s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [00:54<03:50,  5.77s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [01:26<03:47,  6.51s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [01:55<02:32,  5.10s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [02:22<02:09,  5.18s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [02:50<01:48,  5.41s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [03:16<01:03,  4.23s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [03:43<00:52,  5.29s/it]

Progress: [40/50]
Current Acc.: [67.50%]


 90%|█████████ | 45/50 [04:12<00:28,  5.63s/it]

Progress: [45/50]
Current Acc.: [68.89%]


100%|██████████| 50/50 [04:43<00:00,  5.68s/it]


Progress: [50/50]
Current Acc.: [70.00%]
[My Prompting] 3-shot accuracy: 70.00%


 10%|█         | 5/50 [00:33<05:07,  6.83s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:17<05:24,  8.11s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [01:51<04:29,  7.69s/it]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [02:31<03:12,  6.42s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [03:06<02:48,  6.73s/it]

Progress: [25/50]
Current Acc.: [60.00%]


 60%|██████    | 30/50 [03:43<01:57,  5.89s/it]

Progress: [30/50]
Current Acc.: [63.33%]


 70%|███████   | 35/50 [04:19<01:41,  6.79s/it]

Progress: [35/50]
Current Acc.: [68.57%]


 80%|████████  | 40/50 [04:59<01:15,  7.59s/it]

Progress: [40/50]
Current Acc.: [62.50%]


 90%|█████████ | 45/50 [05:32<00:34,  6.94s/it]

Progress: [45/50]
Current Acc.: [66.67%]


100%|██████████| 50/50 [06:11<00:00,  7.43s/it]

Progress: [50/50]
Current Acc.: [68.00%]
[My Prompting] 5-shot accuracy: 68.00%





### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!