# Code Demo For Trajectory Pruning
This file shows the basic implementation of the experiments of the trajectory pruning.

In [12]:
import os
from collections import defaultdict
import random
random.seed(42)

import numpy as np
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

### Preparation

load tokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/QwQ-32B")

load AIME24 data

In [3]:
dataset = load_dataset("HuggingFaceH4/aime_2024", split="train")

### Prepare Original Trajectories

define verify function

In [7]:
def score(predictions, references):
    try:
        from latex2sympy2_extended import NormalizationConfig
        from math_verify import (ExprExtractionConfig,
                                    LatexExtractionConfig, parse, verify)
    except ImportError:
        raise ImportError('Failed to import required modules. Please '
                            'install the necessary packages: '
                            'pip install math_verify latex2sympy2_extended')

    correct = 0
    count = 0
    details = []
    for i, j in zip(predictions, references):
        count += 1
        j_with_env = f'${j}$'
        gold_parsed = parse(
            j_with_env,
            extraction_mode='first_match',
            extraction_config=[
                LatexExtractionConfig(),
                ExprExtractionConfig(),
            ],
        )

        if len(gold_parsed) != 0:
            # We require the answer to be provided in correct
            # latex (no malformed operators)
            answer_parsed = parse(
                i,
                extraction_config=[
                    LatexExtractionConfig(
                        normalization_config=NormalizationConfig(
                            nits=False,
                            malformed_operators=False,
                            basic_latex=True,
                            equations=True,
                            boxed='all',
                            units=True,
                        ),
                        # Ensures that boxed is tried first
                        boxed_match_priority=0,
                        try_extract_without_anchor=False,
                    )
                ],
                extraction_mode='first_match',
            )

            answer_correct = float(verify(answer_parsed, gold_parsed))
            correct += answer_correct
            detail = {
                'pred': str(answer_parsed),
                'answer': str(gold_parsed),
                'correct': True if answer_correct else False,
            }
            details.append(detail)
    return details

load model

In [None]:
model = LLM("Qwen/Qwen3-32B", tensor_parallel_size=2, gpu_memory_utilization=0.8)

construct prompts

In [5]:
prompt_template = "{question}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}"
inputs = []
answers = []
questions = []
for example in dataset:
    questions.append(example["problem"])
    messages = [
        {"role": "user", "content": prompt_template.format(question=questions[-1])}
    ]
    inputs.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True))
    answers.append(example["answer"])

generate response, only one sample for simple demo

In [6]:
sampling_params = SamplingParams(
    max_tokens=32768,  
    temperature=0.6,  
    top_p=0.95,
    min_p=0.0,
    top_k=40,
    n=16,
    skip_special_tokens=False,
)
outputs = model.generate(inputs[:1], sampling_params, use_tqdm=True)

Processed prompts: 100%|██████████| 16/16 [08:58<00:00, 33.64s/it, est. speed input: 4.70 toks/s, output: 130.23 toks/s]


parse and evaluate the trajectories

In [11]:
trajectories = []
references = []
for output, answer in zip(outputs, answers[:1]):
    prompt = output.prompt
    for r in output.outputs:
        trajectories.append(r.text)
        references.append(answer)
scores = score(trajectories, references)

print("mean accuracy: ", np.mean([score["correct"] for score in scores]))



mean accuracy:  1.0


In [13]:
del model
torch.cuda.empty_cache()

### Prepare Summarized Trajectories

load summarization model, we utilize Qwen3-32B-Instruct

In [14]:
sum_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
sum_model = LLM("Qwen/Qwen2.5-32B-Instruct", trust_remote_code=True, tensor_parallel_size=2, gpu_memory_utilization=0.7)

INFO 05-26 07:22:29 [config.py:583] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 05-26 07:22:29 [config.py:1515] Defaulting to use mp for distributed inference
INFO 05-26 07:22:29 [config.py:1693] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-26 07:22:34 [__init__.py:256] Automatically detected platform cuda.
[2025-05-26 07:22:36,005] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
INFO 05-26 07:22:36 [core.py:53] Initializing a V1 LLM engine (v0.8.1) with config: model='Qwen/Qwen2.5-32B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-32B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable

Loading safetensors checkpoint shards:   0% Completed | 0/17 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   6% Completed | 1/17 [00:04<01:12,  4.53s/it]
Loading safetensors checkpoint shards:  12% Completed | 2/17 [00:09<01:11,  4.76s/it]
Loading safetensors checkpoint shards:  18% Completed | 3/17 [00:13<01:02,  4.45s/it]
Loading safetensors checkpoint shards:  24% Completed | 4/17 [00:17<00:57,  4.40s/it]
Loading safetensors checkpoint shards:  29% Completed | 5/17 [00:22<00:52,  4.35s/it]
Loading safetensors checkpoint shards:  35% Completed | 6/17 [00:26<00:47,  4.33s/it]
Loading safetensors checkpoint shards:  41% Completed | 7/17 [00:30<00:43,  4.30s/it]
Loading safetensors checkpoint shards:  47% Completed | 8/17 [00:35<00:38,  4.32s/it]
Loading safetensors checkpoint shards:  53% Completed | 9/17 [00:40<00:36,  4.56s/it]
Loading safetensors checkpoint shards:  59% Completed | 10/17 [00:44<00:32,  4.66s/it]
Loading safetensors checkpoint shards:  65% Completed | 11/17

[1;36m(VllmWorker rank=0 pid=3194096)[0;0m INFO 05-26 07:24:09 [loader.py:429] Loading weights took 77.91 seconds
[1;36m(VllmWorker rank=1 pid=3194147)[0;0m INFO 05-26 07:24:09 [loader.py:429] Loading weights took 77.35 seconds
[1;36m(VllmWorker rank=0 pid=3194096)[0;0m INFO 05-26 07:24:10 [gpu_model_runner.py:1176] Model loading took 30.7098 GB and 79.461885 seconds
[1;36m(VllmWorker rank=1 pid=3194147)[0;0m INFO 05-26 07:24:10 [gpu_model_runner.py:1176] Model loading took 30.7098 GB and 79.465753 seconds
[1;36m(VllmWorker rank=0 pid=3194096)[0;0m INFO 05-26 07:24:24 [backends.py:409] Using cache directory: /root/.cache/vllm/torch_compile_cache/4f59ca9ae3/rank_0_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=0 pid=3194096)[0;0m INFO 05-26 07:24:24 [backends.py:419] Dynamo bytecode transform time: 14.91 s
[1;36m(VllmWorker rank=1 pid=3194147)[0;0m INFO 05-26 07:24:25 [backends.py:409] Using cache directory: /root/.cache/vllm/torch_compile_cache/4f59ca9ae3/rank_1_0 for 

construct prompts

In [16]:
summarize_prompt_template = "{trajectory}\n\n\nSummarize the aforementioned reasoning process and not explicitly include the final conclusion and answer. Only provide the English summary."

summarize_messages = [
    [
        {
            "role": "user", "content": summarize_prompt_template.format(
            # remove answer
            trajectory=sum_tokenizer.decode(sum_tokenizer.encode(trajectory.split("</think>")[0].split("\n\n**Final Answer**")[0])[:24576])
        )},
    ] for trajectory in trajectories
]
summarize_prompts = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True) for message in summarize_messages]

generate summarized trajectories

In [18]:
sampling_params = SamplingParams(n=1, max_tokens=8192, seed=42, temperature=1.0)
request_outputs = sum_model.generate(summarize_prompts, sampling_params, use_tqdm=True) 
summarized_trajectories = [output.text for resquest_output in request_outputs for output in resquest_output.outputs]

Processed prompts: 100%|██████████| 16/16 [00:31<00:00,  1.94s/it, est. speed input: 1829.23 toks/s, output: 217.19 toks/s]


In [27]:
trajectories[2]

'Okay, let\'s see. So, Aya goes for a 9-kilometer walk every morning and stops at a coffee shop. The problem gives me two scenarios where her walking speed affects the total time, including the time spent at the coffee shop, which is the same t minutes in both cases. Then, they want me to find out how long it takes her when she walks at s + 1/2 km/h, again including the t minutes at the coffee shop. \n\nFirst, let me parse the information. \n\nIn the first scenario, when she walks at a constant speed of s km/h, the total time (walking + coffee shop) is 4 hours. In the second scenario, when she walks at s + 2 km/h, the total time is 2 hours and 24 minutes. The coffee shop time t is the same in both cases. \n\nSo, I need to find t and s first, and then use those to calculate the total time when her speed is s + 1/2. \n\nLet me start by converting all the times to hours or all to minutes? Probably hours since her speed is in km per hour. Let me check:\n\nFirst scenario: total time is 4 ho

In [24]:
summarized_trajectories[2]

"The problem gives two scenarios with Aya walking at different speeds and needing to account for a fixed time t in the coffee shop. The goal is to determine the total time taken when she walks at a speed of s + 1/2 km/h, including the time spent at the coffee shop. The initial steps involve converting the total times given in the scenarios (4 hours and 2 hours and 24 minutes) into equations involving the walking speeds. To create these equations, we use the relationship between speed, distance, and time, where the distance is fixed at 9 km in both cases.\n\nThe first scenario's equation is 9/s = 4 - t/60 and the second scenario's equation is 9/(s + 2) = 2.4 - t/60. By subtracting the second equation from the first, the t/60 terms cancel, leaving us with an equation to solve for s. After solving, we find that s = 2.5 km/h. Plugging s back into one of the initial equations allows us to solve for t, giving us t = 24 minutes.\n\nFor the final scenario where she walks at s + 1/2 km/h (makin

In [21]:
del sum_model
torch.cuda.empty_cache()

### Generate Answer Based on the Summarized Trajectories

load model

In [22]:
model = LLM("Qwen/Qwen3-32B", tensor_parallel_size=2, gpu_memory_utilization=0.8)

INFO 05-26 07:41:50 [config.py:583] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 05-26 07:41:50 [config.py:1515] Defaulting to use mp for distributed inference
INFO 05-26 07:41:50 [config.py:1693] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-26 07:41:56 [__init__.py:256] Automatically detected platform cuda.
[2025-05-26 07:41:57,696] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
INFO 05-26 07:41:58 [core.py:53] Initializing a V1 LLM engine (v0.8.1) with config: model='Qwen/Qwen3-32B', speculative_config=None, tokenizer='Qwen/Qwen3-32B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=Fa

Loading safetensors checkpoint shards:   0% Completed | 0/17 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   6% Completed | 1/17 [00:01<00:19,  1.21s/it]
Loading safetensors checkpoint shards:  12% Completed | 2/17 [00:02<00:18,  1.22s/it]
Loading safetensors checkpoint shards:  18% Completed | 3/17 [00:03<00:14,  1.05s/it]
Loading safetensors checkpoint shards:  24% Completed | 4/17 [00:04<00:14,  1.10s/it]
Loading safetensors checkpoint shards:  29% Completed | 5/17 [00:05<00:13,  1.15s/it]
Loading safetensors checkpoint shards:  35% Completed | 6/17 [00:06<00:12,  1.17s/it]
Loading safetensors checkpoint shards:  41% Completed | 7/17 [00:08<00:11,  1.19s/it]
Loading safetensors checkpoint shards:  47% Completed | 8/17 [00:09<00:10,  1.19s/it]
Loading safetensors checkpoint shards:  53% Completed | 9/17 [00:10<00:09,  1.19s/it]
Loading safetensors checkpoint shards:  59% Completed | 10/17 [00:11<00:08,  1.18s/it]
Loading safetensors checkpoint shards:  65% Completed | 11/17

[1;36m(VllmWorker rank=1 pid=3199230)[0;0m INFO 05-26 07:42:35 [loader.py:429] Loading weights took 19.99 seconds


Loading safetensors checkpoint shards:  94% Completed | 16/17 [00:18<00:01,  1.14s/it]


[1;36m(VllmWorker rank=1 pid=3199230)[0;0m INFO 05-26 07:42:35 [gpu_model_runner.py:1176] Model loading took 30.5159 GB and 23.406672 seconds


Loading safetensors checkpoint shards: 100% Completed | 17/17 [00:19<00:00,  1.14s/it]
Loading safetensors checkpoint shards: 100% Completed | 17/17 [00:19<00:00,  1.16s/it]
[1;36m(VllmWorker rank=0 pid=3199191)[0;0m 


[1;36m(VllmWorker rank=0 pid=3199191)[0;0m INFO 05-26 07:42:36 [loader.py:429] Loading weights took 19.78 seconds
[1;36m(VllmWorker rank=0 pid=3199191)[0;0m INFO 05-26 07:42:36 [gpu_model_runner.py:1176] Model loading took 30.5159 GB and 24.727743 seconds
INFO 05-26 07:42:40 [kv_cache_utils.py:537] GPU KV cache size: 206,672 tokens
INFO 05-26 07:42:40 [kv_cache_utils.py:540] Maximum concurrency for 40,960 tokens per request: 5.05x
INFO 05-26 07:42:40 [kv_cache_utils.py:537] GPU KV cache size: 206,672 tokens
INFO 05-26 07:42:40 [kv_cache_utils.py:540] Maximum concurrency for 40,960 tokens per request: 5.05x
[1;36m(VllmWorker rank=1 pid=3199230)[0;0m INFO 05-26 07:42:50 [custom_all_reduce.py:229] Registering 0 cuda graph addresses
[1;36m(VllmWorker rank=0 pid=3199191)[0;0m INFO 05-26 07:42:50 [custom_all_reduce.py:229] Registering 0 cuda graph addresses
[1;36m(VllmWorker rank=1 pid=3199230)[0;0m INFO 05-26 07:42:50 [gpu_model_runner.py:1499] Graph capturing finished in 10 secs,

construct conditioned prompts

In [25]:
messages = [
    {"role": "user", "content": prompt_template.format(question=questions[0])},
]
completion_prompts = []
for summarized_trajectory in summarized_trajectories:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + summarized_trajectory + "\n\n**Final Answer**\n\\boxed{"
    completion_prompts.append(prompt)

generate answers conditioned on the summarized trajectories

In [26]:
sampling_params = SamplingParams(n=1, max_tokens=20, stop=["}"], seed=42, temperature=0.0)
request_outputs = model.generate(completion_prompts, sampling_params)
responses = [output.text for resquest_output in request_outputs for output in resquest_output.outputs]
scores = score(["\\boxed{" + respose + "}" for respose in responses], [answers[0]] * len(responses))

print("trajectory pruning mean accuracy: ", np.mean([score["correct"] for score in scores]))

Processed prompts: 100%|██████████| 16/16 [00:01<00:00, 10.76it/s, est. speed input: 6288.64 toks/s, output: 43.06 toks/s]


trajectory pruning mean accuracy:  0.9375
