In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from datasets import load_dataset
from tqdm.auto import tqdm
import json
import re

# Hugging Face identifier of the checkpoint; the tokenizer and the vLLM
# engine below are both loaded from this same name.
MODEL = "neuralmagic/DeepSeek-R1-Distill-Qwen-7B-quantized.w8a8"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# max_model_len is passed straight through to vLLM's LLM constructor.
llm = LLM(
    model=MODEL,
    max_model_len=8600,
)

# Test split of the "main" configuration of the gsm8k dataset.
test = load_dataset(path="gsm8k", name="main", split="test")

In [None]:
from typing import List, Tuple

# System message used by prepare_prompt. It asks for numbered step-by-step
# reasoning inside <think></think>, a self-assessment of that reasoning, and a
# final "Answer:" / "Confidence:" pair where the confidence is one of ten
# verbal classes (names only, without the probability ranges).
system_prompt = """
You will be given a math question
First, reason through the question step by step to arrive at an answer.
Number your reasoning steps and calculations (Step 1:, Step 2:, etc) inside the <think></think> tags
Then, thoroughly assess your confidence in that answer by evaluating your thinking process so far.
Finally, classify your confidence into one of the following classes based on how likely your answer is to be
correct:
- "Almost no chance" (0.0–0.1)
- "Highly unlikely" (0.1–0.2)
- "Chances are slight" (0.2–0.3)
- "Unlikely" (0.3–0.4)
- "Less than even" (0.4–0.5)
- "Better than even" (0.5–0.6)
- "Likely" (0.6–0.7)
- "Very good chance" (0.7–0.8)
- "Highly likely" (0.8–0.9)
- "Almost certain" (0.9–1.0)
Each category reflects the probability that your answer is correct.
At the very end of your output, format your answer and confidence as
Answer: $ANSWER
Confidence: $CLASS
where CLASS is one of the names (only the names without the probability ranges) of the classes above.
"""

# Text that prepare_prompt appends after the chat-template output, so the
# model's completion continues from an already-written "Step 1:" line.
force_step_reasoning = """
I will number my reasoning steps and calculations starting with
Step 1:
"""

# Sentence the inject_doubt_* helpers write into the reasoning as a new step.
self_doubt = "Wait, I think I made a mistake in one of the earlier steps"

# The tokenizer's EOS id doubles as the pad id.
# NOTE(review): pad_token_id is not referenced anywhere in the visible code.
pad_token_id = tokenizer.eos_token_id
eos_token_id = tokenizer.eos_token_id
# Only the first id of each encoding is kept; this assumes "<think>" and
# "</think>" each encode to a single token in this tokenizer — TODO confirm.
think_start_token_id = tokenizer.encode("<think>", add_special_tokens=False)[0]
think_end_token_id = tokenizer.encode("</think>", add_special_tokens=False)[0]

def format_num(text: str) -> float:
    """Parse a number written with optional "$" signs and thousands commas.

    Args:
        text: Numeric text such as " $1,234.50 ".

    Returns:
        The value as a float.

    Raises:
        ValueError: If what remains after removing "," and "$" is not a
            valid float literal.
    """
    trimmed = text.strip()
    # One pass that deletes every comma and dollar sign.
    drop_symbols = str.maketrans("", "", ",$")
    return float(trimmed.translate(drop_symbols))

def prepare_prompt(text: str) -> str:
    """Build the raw prompt string for one question.

    The system prompt and the question are rendered through the tokenizer's
    chat template (as text, with the generation prompt appended), and the
    module-level `force_step_reasoning` text is then added at the end.

    Args:
        text: The question to place in the user turn.

    Returns:
        The rendered prompt followed by `force_step_reasoning`.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    rendered = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered + force_step_reasoning

def generate(prompts: List[str], stop_token_ids: List[int], max_new_tokens: int = 4096) -> List[str]:
    """Sample one continuation per prompt and return prompt + continuation.

    Sampling uses temperature 0.6, top_p 0.95 and top_k 20.

    Args:
        prompts: Prompt strings handed to the module-level `llm`.
        stop_token_ids: Token ids passed to vLLM as `stop_token_ids`.
        max_new_tokens: Passed to vLLM as `max_tokens`.

    Returns:
        One string per result from `llm.generate`: the result's prompt
        concatenated with the text of its first output.
    """
    params = SamplingParams(
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        max_tokens=max_new_tokens,
        stop_token_ids=stop_token_ids,
    )
    completed = []
    for request in llm.generate(prompts, params):
        first_completion = request.outputs[0]
        completed.append(request.prompt + first_completion.text)
    return completed

def inject_doubt_14(text: str) -> str:
    """Cut the reasoning about a quarter of the way in and append a doubt step.

    The text after the last "<think>" is split on the marker made of a
    newline followed by "Step ". With n marker-delimited steps found, the
    steps up to and including number ceil(n / 4) are kept, the rest are
    dropped, and a new "Step k:" line (k = kept steps + 1) carrying the
    module-level `self_doubt` sentence is appended.

    Args:
        text: Prompt plus generated reasoning.

    Returns:
        Everything before the last "<think>", the tag itself, and the
        truncated reasoning with the doubt step. If no step marker is found
        the reasoning is returned without any injection. If `text` has no
        "<think>", the tag is prepended to the whole text.
    """
    prefix, _, reason = text.rpartition("<think>")
    chunks = reason.split("\nStep ")
    step_count = len(chunks) - 1
    cutoff = (step_count + 3) // 4  # ceil(step_count / 4)

    if cutoff == 0:
        # No step markers at all: keep the reasoning as it is.
        body = chunks[0]
    else:
        kept = "\nStep ".join(chunks[: cutoff + 1])
        body = f"{kept}\nStep {cutoff + 1}:{self_doubt}"
    return f"{prefix}<think>{body}"

def inject_doubt_34(text: str) -> str:
    """Cut the reasoning about three quarters in and append a doubt step.

    The text after the last "<think>" is split on the marker made of a
    newline followed by "Step ". With n marker-delimited steps found, the
    steps up to and including number ceil(3n / 4) are kept, the rest are
    dropped, and a new "Step k:" line (k = kept steps + 1) carrying the
    module-level `self_doubt` sentence is appended.

    Args:
        text: Prompt plus generated reasoning.

    Returns:
        Everything before the last "<think>", the tag itself, and the
        truncated reasoning with the doubt step. If no step marker is found
        the reasoning is returned without any injection. If `text` has no
        "<think>", the tag is prepended to the whole text.
    """
    prefix, _, reason = text.rpartition("<think>")
    chunks = reason.split("\nStep ")
    step_count = len(chunks) - 1
    cutoff = (3 * step_count + 3) // 4  # ceil(3 * step_count / 4)

    if cutoff == 0:
        # No step markers at all: keep the reasoning as it is.
        body = chunks[0]
    else:
        kept = "\nStep ".join(chunks[: cutoff + 1])
        body = f"{kept}\nStep {cutoff + 1}:{self_doubt}"
    return f"{prefix}<think>{body}"

def inject_doubt_mid(text: str) -> str:
    """Cut the reasoning at its midpoint and append a doubt step.

    The text after the last "<think>" is split on the marker made of a
    newline followed by "Step ". With n marker-delimited steps found, the
    steps up to and including number ceil(n / 2) are kept, the rest are
    dropped, and a new "Step k:" line (k = kept steps + 1) carrying the
    module-level `self_doubt` sentence is appended.

    Args:
        text: Prompt plus generated reasoning.

    Returns:
        Everything before the last "<think>", the tag itself, and the
        truncated reasoning with the doubt step. If no step marker is found
        the reasoning is returned without any injection. If `text` has no
        "<think>", the tag is prepended to the whole text.
    """
    prefix, _, reason = text.rpartition("<think>")
    chunks = reason.split("\nStep ")
    step_count = len(chunks) - 1
    cutoff = (step_count + 1) // 2  # ceil(step_count / 2)

    if cutoff == 0:
        # No step markers at all: keep the reasoning as it is.
        body = chunks[0]
    else:
        kept = "\nStep ".join(chunks[: cutoff + 1])
        body = f"{kept}\nStep {cutoff + 1}:{self_doubt}"
    return f"{prefix}<think>{body}"

def inject_doubt_end(text: str) -> str:
    """Append a doubt step after the last step of the reasoning.

    The text after the last "<think>" is split on the marker made of a
    newline followed by "Step ". All n marker-delimited steps are kept and a
    new "Step n+1:" line carrying the module-level `self_doubt` sentence is
    appended after them.

    Args:
        text: Prompt plus generated reasoning.

    Returns:
        Everything before the last "<think>", the tag itself, and the
        reasoning followed by the doubt step. If no step marker is found the
        reasoning is returned without any injection. If `text` has no
        "<think>", the tag is prepended to the whole text.
    """
    prefix, _, reason = text.rpartition("<think>")
    chunks = reason.split("\nStep ")
    step_count = len(chunks) - 1

    if step_count == 0:
        # No step markers at all: keep the reasoning as it is.
        body = chunks[0]
    else:
        # Re-joining every chunk reproduces the reasoning text verbatim.
        kept = "\nStep ".join(chunks)
        body = f"{kept}\nStep {step_count + 1}:{self_doubt}"
    return f"{prefix}<think>{body}"

def parse_ans_conf(text: str) -> Tuple[str, str]:
    """Extract the raw answer text and the confidence label from an output.

    Only the part after the last "</think>" is inspected. It is stripped and
    all newlines are removed before searching, so "Answer:" and "Confidence:"
    end up on a single line.

    Args:
        text: Full model output (prompt and completion).

    Returns:
        A tuple `(answer_str, confidence)`:
        - `answer_str` is the text between the last "Answer:" and the
          following "Confidence:", not stripped.
        - `confidence` is the stripped text after the last "Confidence:".
        When a marker is missing, the corresponding split yields the whole
        post-reasoning text instead, so callers should validate the values.
    """
    tail = text.split("</think>")[-1].strip().replace("\n", "")
    confidence = tail.split("Confidence:")[-1].strip()
    # Take what follows the last "Answer:" and cut it at "Confidence:".
    answer_str = tail.split("Answer:")[-1].split("Confidence:")[0]
    return answer_str, confidence

# Run the three-stage pipeline (reason -> inject doubt -> force answer) over
# the dataset and collect one record per question in `res`.
res = []
sample = test

questions = sample["question"]
# Each gold answer is the number after "####"; thousands commas are removed.
golds = [float(answer.split("####")[-1].strip().replace(",", "")) for answer in sample["answer"]]

# Stage 1: let the model complete its reasoning for the question
pre_reasoning_prompts = [prepare_prompt(question) for question in questions]
reasoning_outputs = generate(pre_reasoning_prompts, [think_end_token_id, eos_token_id])

# Stage 2: inject self doubt into the reasoning and let the model continue.
# NOTE(review): if vLLM includes the text of the "</think>" stop token in the
# completion, the injected step lands after the closing tag — confirm against
# actual outputs.
injection_prompt = [inject_doubt_end(reasoning) for reasoning in reasoning_outputs]
post_injection = generate(injection_prompt, [think_end_token_id, eos_token_id])

# Stage 3: force the answer and confidence in the format "Answer: ...."
pre_answer_prompts = [reasoning + "\n</think>\nAnswer:" for reasoning in post_injection]
outputs = generate(pre_answer_prompts, [eos_token_id], max_new_tokens=100)

# Compiled once instead of on every loop iteration.
number_pattern = re.compile(r"-?\d[\d,]*\.?\d*")

for output, question, gold in zip(outputs, questions, golds):

    answer_str, confidence = parse_ans_conf(output)

    try:
        # Preferred: the first number inside the parsed "Answer:" span.
        answer = format_num(number_pattern.findall(answer_str)[0])
    except (IndexError, ValueError):
        # Fallback: the last number anywhere in the full output.
        try:
            answer = format_num(number_pattern.findall(output)[-1])
        except (IndexError, ValueError):
            # Nothing parsable at all: record the miss instead of aborting
            # the loop after all generation has already been paid for.
            answer = None
        print(answer, confidence, gold)

    res.append({
        "question": question,
        "gold": gold,
        "text": output,
        "answer": answer,
        "confidence": confidence,
        "correct": answer == gold
    })

In [None]:
# Persist the collected records as JSON indented by two spaces.
with open("doubt-end-7.json", "w") as out_file:
    serialized = json.dumps(res, indent=2)
    out_file.write(serialized)