In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
PROJECT_DIR = "/content/drive/MyDrive/Colab Notebooks/llm/0127-1 LLM"
os.chdir(PROJECT_DIR)

print("cwd:", os.getcwd())
print("env exists?", os.path.exists(".env"))
print("files:", os.listdir())


Mounted at /content/drive
cwd: /content/drive/MyDrive/Colab Notebooks/llm/0127-1 LLM
env exists? True
files: ['.env.example', 'PROMPTING.md', '[7회차] LLM 과제.zip', '[7회차] LLM 과제.pdf', '0127_LLM.pdf', '.env', 'example.txt', 'direct_prompting_0.txt', 'direct_prompting_3.txt', 'direct_prompting_5.txt', 'CoT_prompting_0.txt', 'CoT_prompting_3.txt', 'CoT_prompting_5.txt', 'My_prompting_0.txt', 'My_prompting_3.txt', 'My_prompting_5.txt', 'prompting.ipynb']


In [3]:
!pip install groq python-dotenv numpy tqdm datasets

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-1.0.0


In [4]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [5]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama-3.1-8b-instant"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content

    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [6]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello world! It's nice to meet you. I'm here to help with any math problems you might have. What kind of math are you working on today? Do you need help with algebra, geometry, calculus, or something else?


#### GSM8K 데이터셋 확인해보기

In [7]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [8]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    return results[-1] if results else None

In [9]:
### 수정해도 됩니다!
import re
import math
from tqdm import tqdm

def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama-3.1-8b-instant",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    """
    Robust benchmark runner.

    - Never crashes on parse failures (empty strings, None, weird formats)
    - Keeps the same signature + return type: (results: list[dict], accuracy: float)
    - Skips samples with no response (same behavior as original: total increments only when response exists)
    """

    def _safe_extract_number_from_gsm8k_answer(ans_text: str):
        """
        GSM8K answer typically contains '#### <number>'.
        This returns a float if possible; otherwise None.
        """
        if ans_text is None:
            return None

        # Prefer after '####' if present
        tail = ans_text.split("####")[-1] if "####" in ans_text else ans_text
        # Remove commas, keep digits/sign/dot
        tail = tail.replace(",", " ").strip()

        # Find the first number in tail
        m = re.search(r"[-+]?\d+(?:\.\d+)?", tail)
        if not m:
            return None
        try:
            return float(m.group(0))
        except Exception:
            return None

    def _to_float_or_none(x):
        """
        Convert predicted_answer into float robustly.
        Accepts: float/int/str (possibly with commas/spaces), else None.
        """
        if x is None:
            return None

        # Already numeric
        if isinstance(x, (int, float)):
            # Guard NaN/inf
            if isinstance(x, float) and (math.isnan(x) or math.isinf(x)):
                return None
            return float(x)

        # String cases
        if isinstance(x, str):
            s = x.strip()
            if s == "":
                return None

            # Common cleanups: commas, currency, extra tokens
            s = s.replace(",", " ")

            # Prefer '#### <number>' if it exists in the string
            m = re.search(r"####\s*([-+]?\d+(?:\.\d+)?)", s)
            if m:
                try:
                    return float(m.group(1))
                except Exception:
                    return None

            # Fallback: last number in the string (often the final answer)
            nums = re.findall(r"[-+]?\d+(?:\.\d+)?", s)
            if not nums:
                return None
            try:
                return float(nums[-1])
            except Exception:
                return None

        # Unknown type
        return None

    correct = 0
    total   = 0
    results = []

    n = min(num_samples, len(dataset))

    for i in tqdm(range(n)):
        question = dataset[i].get("question", None)
        raw_answer = dataset[i].get("answer", None)

        # If dataset row is malformed, skip safely
        if question is None or raw_answer is None:
            if VERBOSE:
                print(f"[WARN] Missing question/answer at index={i}. Skipped.")
            continue

        correct_answer = _safe_extract_number_from_gsm8k_answer(raw_answer)
        if correct_answer is None:
            # Can't score this sample; skip to avoid crashing / unfair scoring
            if VERBOSE:
                print(f"[WARN] Could not parse correct answer at index={i}. Skipped.")
                print("RAW ANSWER:", raw_answer)
            continue

        # Call model
        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        # Keep original semantics: only score when response is truthy
        if not response:
            if VERBOSE:
                print(f"[WARN] Empty response at index={i}. Skipped.")
            continue

        if VERBOSE:
            print("="*50)
            print(response)
            print("="*50)

        # Extract and parse prediction robustly
        predicted_raw = None
        predicted_answer = None
        parse_error = None

        try:
            predicted_raw = extract_final_answer(response)
            predicted_answer = _to_float_or_none(predicted_raw)
        except Exception as e:
            parse_error = repr(e)
            predicted_answer = None

        # Score
        if predicted_answer is None:
            is_correct = False
            diff = None
        else:
            diff = abs(predicted_answer - correct_answer)
            is_correct = (diff < 1e-5)

        if is_correct:
            correct += 1
        total += 1

        results.append({
            'question': question,
            'correct_answer': correct_answer,
            'predicted_answer': predicted_answer,
            'response': response,
            'correct': is_correct,
            # Debug fields (안 써도 되지만 결과 분석에 도움)
            'predicted_raw': predicted_raw,
            'parse_error': parse_error,
            'diff': diff
        })

        # Progress print (원래 코드와 최대한 동일하게 유지)
        if (i + 1) % 5 == 0:
            current_acc = correct / total if total > 0 else 0
            print(f"Progress: [{i+1}/{num_samples}]")
            print(f"Current Acc.: [{current_acc:.2%}]")

    return results, (correct / total if total > 0 else 0.0)


In [10]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"

    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"

    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [11]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [None]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:01<00:01,  3.15it/s]

Progress: [5/10]
Current Acc.: [60.00%]


100%|██████████| 10/10 [00:03<00:00,  3.26it/s]

Progress: [10/10]
Current Acc.: [50.00%]





In [None]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!

In [None]:
for shot in [0, 3, 5]:
  PROMPT = construct_direct_prompt(shot)
  VERBOSE = False

  results, accuracy = run_benchmark_test(
      dataset=gsm8k_test,
      prompt=PROMPT,
      VERBOSE=VERBOSE,
      num_samples=50
  )
  file_name = f"direct_prompting_{shot}.txt"
  save_final_result(results, accuracy, file_name)

 12%|█▏        | 6/50 [00:01<00:11,  3.97it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 22%|██▏       | 11/50 [00:03<00:09,  4.10it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:04<00:09,  3.58it/s]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [00:14<01:34,  3.14s/it]

Progress: [20/50]
Current Acc.: [55.00%]


 50%|█████     | 25/50 [00:25<00:56,  2.25s/it]

Progress: [25/50]
Current Acc.: [60.00%]


 60%|██████    | 30/50 [00:43<01:15,  3.75s/it]

Progress: [30/50]
Current Acc.: [63.33%]


 70%|███████   | 35/50 [00:54<00:34,  2.28s/it]

Progress: [35/50]
Current Acc.: [68.57%]


 80%|████████  | 40/50 [01:04<00:22,  2.24s/it]

Progress: [40/50]
Current Acc.: [70.00%]


 90%|█████████ | 45/50 [01:14<00:10,  2.11s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [01:25<00:00,  1.71s/it]


Progress: [50/50]
Current Acc.: [74.00%]


 10%|█         | 5/50 [00:18<02:53,  3.85s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:36<02:29,  3.74s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:53<02:01,  3.46s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [01:11<01:46,  3.54s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [01:35<01:37,  3.92s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [01:52<01:08,  3.44s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [02:08<00:49,  3.29s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [02:26<00:36,  3.70s/it]

Progress: [40/50]
Current Acc.: [72.50%]


 90%|█████████ | 45/50 [02:45<00:18,  3.71s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [03:04<00:00,  3.69s/it]


Progress: [50/50]
Current Acc.: [72.00%]


 10%|█         | 5/50 [00:23<03:42,  4.94s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:55<03:35,  5.38s/it]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [01:17<02:42,  4.64s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [01:40<02:16,  4.55s/it]

Progress: [20/50]
Current Acc.: [60.00%]


 50%|█████     | 25/50 [02:02<01:49,  4.40s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████    | 30/50 [02:24<01:28,  4.41s/it]

Progress: [30/50]
Current Acc.: [73.33%]


 70%|███████   | 35/50 [02:46<01:06,  4.41s/it]

Progress: [35/50]
Current Acc.: [77.14%]


 80%|████████  | 40/50 [03:09<00:47,  4.78s/it]

Progress: [40/50]
Current Acc.: [77.50%]


 90%|█████████ | 45/50 [03:33<00:22,  4.59s/it]

Progress: [45/50]
Current Acc.: [80.00%]


100%|██████████| 50/50 [03:57<00:00,  4.74s/it]

Progress: [50/50]
Current Acc.: [80.00%]





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [19]:
import random

def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    # 1. '핵심 로직' 위주로 서술하되, 너무 세세한 산수는 생략하라는 지침
    prompt = (
        "Solve the following math problem step-by-step.\n"
        "Rules:\n"
        "1) Focus on the core logic and main equations.\n"
        "2) Provide a clear but concise reasoning flow.\n"
        "3) Use the format: Reasoning: -> Conclusion: #### <number>\n"
        "\n"
        "Examples:\n"
    )

    for i in range(num_examples):
        idx = sampled_indices[i]
        q = train_dataset["question"][idx].strip()
        a_content = train_dataset["answer"][idx].strip()

        # 정답에서 최종 숫자 추출
        final_number = a_content.split("####")[-1].strip()

        # 2. '성능 보정': 핵심 식을 세우는 과정을 예시에 포함
        # "이거랑 이걸 계산하면 된다"는 식의 가이드를 주어 모델이 정답 근처까지 가게 함
        prompt += (
            f"\nExample {i+1}\n"
            f"Question:\n{q}\n"
            f"Reasoning:\nTo find the answer, we need to determine the total based on the given conditions. Let's calculate the main values and combine them. After evaluating the steps, the result is {final_number}.\n"
            f"Conclusion:\n#### {final_number}\n"
        )

    # 3. 모델이 생각을 '시작'할 수 있게 유도
    prompt += "\nQuestion:\n{question}\nReasoning:\nLet's think step-by-step.\nConclusion:\n"

    return prompt

In [None]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!

In [20]:
# CoT prompting: 0 shot, 3 shot, 5 shot
# 항상 num_samples=50
for shot in [0, 3, 5]:
    PROMPT = construct_CoT_prompt(num_examples=shot)
    VERBOSE = False

    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        model="llama-3.1-8b-instant",
        VERBOSE=VERBOSE,
        num_samples=50
    )

    file_name = f"CoT_prompting_{shot}.txt"
    save_final_result(results, accuracy, file_name)


 10%|█         | 5/50 [00:01<00:16,  2.74it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [00:04<00:20,  1.91it/s]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [00:13<01:03,  1.83s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [00:24<01:07,  2.26s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [00:36<00:58,  2.36s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [00:48<00:48,  2.42s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [01:00<00:35,  2.34s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [01:12<00:24,  2.43s/it]

Progress: [40/50]
Current Acc.: [72.50%]


 90%|█████████ | 45/50 [01:25<00:12,  2.46s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [01:38<00:00,  1.97s/it]


Progress: [50/50]
Current Acc.: [68.00%]


 10%|█         | 5/50 [00:32<05:02,  6.71s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [01:05<04:27,  6.70s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [01:37<03:44,  6.41s/it]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [02:09<03:12,  6.41s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [02:41<02:38,  6.33s/it]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|██████    | 30/50 [03:13<02:09,  6.46s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [03:45<01:35,  6.34s/it]

Progress: [35/50]
Current Acc.: [71.43%]


 80%|████████  | 40/50 [04:24<01:14,  7.44s/it]

Progress: [40/50]
Current Acc.: [70.00%]


 90%|█████████ | 45/50 [04:57<00:33,  6.66s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [05:29<00:00,  6.59s/it]


Progress: [50/50]
Current Acc.: [72.00%]


 10%|█         | 5/50 [00:38<05:49,  7.77s/it]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [01:17<05:12,  7.82s/it]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [01:55<04:23,  7.54s/it]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [02:34<03:56,  7.88s/it]

Progress: [20/50]
Current Acc.: [60.00%]


 50%|█████     | 25/50 [03:11<03:05,  7.43s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [03:51<02:35,  7.80s/it]

Progress: [30/50]
Current Acc.: [60.00%]


 70%|███████   | 35/50 [04:23<01:46,  7.09s/it]

Progress: [35/50]
Current Acc.: [62.86%]


 80%|████████  | 40/50 [05:01<01:17,  7.74s/it]

Progress: [40/50]
Current Acc.: [62.50%]


100%|██████████| 50/50 [06:18<00:00,  7.57s/it]

Progress: [45/50]
Current Acc.: [66.67%]
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-8b-instant` in organization `org_01kg530a2bf74a05nz39bqwb00` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500000, Requested 736. Please try again in 2m7.1808s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-8b-instant` in organization `org_01kg530a2bf74a05nz39bqwb00` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500000, Requested 732. Please try again in 2m6.4896s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-8b-instant` in org




### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [12]:
import random

def construct_my_prompt(num_examples: int = 3) -> str:
    # 데이터셋 참조 (실행 환경에 맞게 조정 필요)
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset["question"]))],
        num_examples
    )

    # 1. System Instruction: 역할 정의 및 엄격한 규칙 부여
    prompt = (
        "You are an expert mathematical reasoning engine. Your primary goal is absolute accuracy.\n"
        "Solve the problem by breaking it down into atomic logical steps. Use the following structure precisely.\n\n"
        "### Guidelines ###\n"
        "1) Plan: Outline a concise, 1-2 sentence strategy to solve the problem.\n"
        "2) Compute: Perform calculation steps. Each line should represent one logical operation. Keep numbers and units clear.\n"
        "3) Verify: Re-calculate the final step or perform a quick sanity check (e.g., 'Does the scale of the answer make sense?').\n"
        "4) Final: Output ONLY the final numeric result following the '#### <number>' format.\n\n"
        "### Constraints ###\n"
        "- Never skip intermediate steps in 'Compute'.\n"
        "- Do not include explanatory text or conversational fillers after the #### marker.\n"
        "--------------------\n"
    )

    # 2. Few-shot Examples (Dynamic sampling)
    for i in range(num_examples):
        idx = sampled_indices[i]
        q = train_dataset["question"][idx].strip()
        a = train_dataset["answer"][idx].strip()

        # 기존 answer에서 #### 정답 분리 (Compute 섹션에 풀이만 넣기 위함)
        if "####" in a:
            reasoning, final_val = a.split("####")
            reasoning = reasoning.strip()
            final_val = final_val.strip()
        else:
            reasoning = a
            final_val = "[Result]"

        prompt += (
            f"Example {i+1}\n"
            f"Question: {q}\n"
            f"Answer:\n"
            f"Plan: Analyze the given quantities and apply necessary arithmetic operations step-by-step.\n"
            f"Compute: {reasoning}\n"
            f"Verify: Double-checked calculation steps for any arithmetic or logical errors.\n"
            f"Final: #### {final_val}\n"
            f"--------------------\n"
        )

    # 3. Target Question
    prompt += (
        "Question: {question}\n"
        "Answer:\n"
        "Plan:\n"
    )

    return prompt

In [None]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!

In [13]:
for shot in [0, 3, 5]:
  PROMPT = construct_my_prompt(num_examples=shot)
  VERBOSE = False

  results, accuracy = run_benchmark_test(
      dataset=gsm8k_test,
      prompt=PROMPT,
      VERBOSE=VERBOSE,
      num_samples=50
  )
  file_name = f"My_prompting_{shot}.txt"
  save_final_result(results, accuracy, file_name)


 10%|█         | 5/50 [00:02<00:22,  2.02it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:05<00:22,  1.75it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:16<01:28,  2.53s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|████      | 20/50 [00:34<01:38,  3.29s/it]

Progress: [20/50]
Current Acc.: [80.00%]


 50%|█████     | 25/50 [00:51<01:24,  3.36s/it]

Progress: [25/50]
Current Acc.: [76.00%]


 60%|██████    | 30/50 [01:08<01:09,  3.45s/it]

Progress: [30/50]
Current Acc.: [76.67%]


 70%|███████   | 35/50 [01:25<00:50,  3.38s/it]

Progress: [35/50]
Current Acc.: [80.00%]


 80%|████████  | 40/50 [01:42<00:34,  3.49s/it]

Progress: [40/50]
Current Acc.: [75.00%]


 90%|█████████ | 45/50 [02:00<00:17,  3.52s/it]

Progress: [45/50]
Current Acc.: [75.56%]


100%|██████████| 50/50 [02:25<00:00,  2.90s/it]


Progress: [50/50]
Current Acc.: [76.00%]


 10%|█         | 5/50 [00:53<08:13, 10.96s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:48<07:21, 11.03s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███       | 15/50 [02:40<06:03, 10.37s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|████      | 20/50 [03:35<05:27, 10.91s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [04:28<04:24, 10.58s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [05:24<03:40, 11.03s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [06:16<02:38, 10.59s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [07:10<01:49, 10.93s/it]

Progress: [40/50]
Current Acc.: [72.50%]


 90%|█████████ | 45/50 [08:02<00:53, 10.77s/it]

Progress: [45/50]
Current Acc.: [75.56%]


100%|██████████| 50/50 [08:57<00:00, 10.74s/it]


Progress: [50/50]
Current Acc.: [76.00%]


 10%|█         | 5/50 [01:25<12:55, 17.24s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [02:53<11:42, 17.57s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███       | 15/50 [04:22<10:17, 17.64s/it]

Progress: [15/50]
Current Acc.: [86.67%]


 40%|████      | 20/50 [05:47<08:29, 16.97s/it]

Progress: [20/50]
Current Acc.: [90.00%]


 50%|█████     | 25/50 [07:15<07:16, 17.47s/it]

Progress: [25/50]
Current Acc.: [84.00%]


 60%|██████    | 30/50 [08:43<05:51, 17.58s/it]

Progress: [30/50]
Current Acc.: [86.67%]


 70%|███████   | 35/50 [10:09<04:19, 17.27s/it]

Progress: [35/50]
Current Acc.: [88.57%]


 80%|████████  | 40/50 [11:37<02:56, 17.61s/it]

Progress: [40/50]
Current Acc.: [87.50%]


 90%|█████████ | 45/50 [13:05<01:28, 17.61s/it]

Progress: [45/50]
Current Acc.: [88.89%]


100%|██████████| 50/50 [14:40<00:00, 17.61s/it]

Progress: [50/50]
Current Acc.: [88.00%]





### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!

In [22]:
import os, re
import pandas as pd

files = {
    ("Direct", 0): "direct_prompting_0.txt",
    ("Direct", 3): "direct_prompting_3.txt",
    ("Direct", 5): "direct_prompting_5.txt",
    ("CoT", 0): "CoT_prompting_0.txt",
    ("CoT", 3): "CoT_prompting_3.txt",
    ("CoT", 5): "CoT_prompting_5.txt",
    ("My", 0): "My_prompting_0.txt",
    ("My", 3): "My_prompting_3.txt",
    ("My", 5): "My_prompting_5.txt",
}

def read_accuracy(path: str):
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        head = "".join([next(f, "") for _ in range(10)])  # 첫 10줄만 확인
    m = re.search(r"(ACCURACY|accuracy)\s*:\s*([0-9]*\.?[0-9]+)", head)
    if not m:
        return None
    return float(m.group(2))

rows = []
for (method, shot), fn in files.items():
    acc = read_accuracy(fn)
    rows.append({"method": method, "shot": shot, "accuracy": acc, "file": fn})

df = pd.DataFrame(rows)
pivot = df.pivot(index="method", columns="shot", values="accuracy").loc[["Direct","CoT","My"]]
pivot_percent = pivot.applymap(lambda x: f"{x*100:.2f}%" if x is not None else "N/A")

print("=== Accuracy Table ===")
display(pivot_percent)

missing = df[df["accuracy"].isna()]
if len(missing) > 0:
    print("\n[WARN] accuracy를 못 읽은 항목들(파일 없거나 포맷 다름):")
    display(missing[["method","shot","file"]])


=== Accuracy Table ===


  pivot_percent = pivot.applymap(lambda x: f"{x*100:.2f}%" if x is not None else "N/A")


shot,0,3,5
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Direct,74.00%,72.00%,80.00%
CoT,68.00%,72.00%,66.67%
My,76.00%,76.00%,88.00%
