In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Set environment variables
os.environ["VLLM_USE_V1"] = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
)

# Initialize LLM with steering vector capability
llm = LLM(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1
)



  from .autonotebook import tqdm as notebook_tqdm


INFO 10-15 17:00:51 [__init__.py:244] Automatically detected platform cuda.
INFO 10-15 17:01:00 [config.py:841] This model supports multiple tasks: {'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 10-15 17:01:00 [config.py:1472] Using max model len 131072


2025-10-15 17:01:01,051	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 10-15 17:01:01 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7506+g9004da86c.d20251015) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=None, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.92it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.92it/s]



INFO 10-15 17:01:04 [default_loader.py:272] Loading weights took 0.87 seconds
INFO 10-15 17:01:05 [model_runner.py:1255] Model loading took 3.3461 GiB and 1.443162 seconds
INFO 10-15 17:01:06 [worker.py:295] Memory profiling takes 1.34 seconds
INFO 10-15 17:01:06 [worker.py:295] the current vLLM instance can use total_gpu_memory (93.00GiB) x gpu_memory_utilization (0.90) = 83.70GiB
INFO 10-15 17:01:06 [worker.py:295] model weights take 3.35GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 8.10GiB; the rest of the memory reserved for KV Cache is 72.11GiB.
INFO 10-15 17:01:06 [executor_base.py:115] # cuda blocks: 168773, # CPU blocks: 9362
INFO 10-15 17:01:06 [executor_base.py:120] Maximum concurrency for 131072 tokens per request: 20.60x
INFO 10-15 17:01:10 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 4.95 seconds


In [2]:
# Define the suffix for newline tokens in the tokenizer
target_suffix = "ĊĊ"  # "\n\n" is tokenized as "ĊĊ"

# Get complete tokenizer vocabulary
vocab = tokenizer.get_vocab()

# Find all tokens and their IDs that end with the target suffix
# These are the newline tokens we'll apply steering to
matching_tokens_ids = [
    token_id
    for token, token_id in vocab.items()
    if isinstance(token, str) and token.endswith(target_suffix)
]

# Configure steering vector request for SEAL control
sv_request = SteerVectorRequest(
    # Name and ID for the steering vector
    steer_vector_name="complex_control",
    steer_vector_id=4,
    
    # Configure the three steering vectors (execution, reflection, transition)
    vector_configs=[
        # Execution vector (positive scale to promote execution-like text)
        VectorConfig(
            path="execution_avg_vector.gguf",
            scale=0.5,                            # Positive scale promotes this behavior
            target_layers=[20],                   # Apply at layer 20
            generate_trigger_tokens=matching_tokens_ids,  # Apply to newline tokens
            algorithm="direct",                   # Direct application
            normalize=False                       # Do not normalize vectors
        ),
        
        # Reflection vector (negative scale to suppress reflection)
        VectorConfig(
            path="reflection_avg_vector.gguf",
            scale=-0.5,                           # Negative scale suppresses this behavior
            target_layers=[20],
            generate_trigger_tokens=matching_tokens_ids,
            algorithm="direct",
            normalize=False
        ),
        
        # Transition vector (negative scale to suppress transitions)
        VectorConfig(
            path="transition_avg_vector.gguf",
            scale=-0.5,                           # Negative scale suppresses this behavior
            target_layers=[20],
            generate_trigger_tokens=matching_tokens_ids,
            algorithm="direct", 
            normalize=False
        ),
    ],
    
    # Additional parameters
    debug=False,                        # Don't output debug info
    conflict_resolution="sequential"    # Apply vectors in sequence
)

# MATH500

In [3]:
import json
file_path = "/home/yequan/Project/R-KV/HuggingFace/data/math.jsonl"

problems = []
answers = []

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        problems.append(item["problem"])
        answers.append(item["answer"])

# 看看前两个
print("Problems:", problems[:2])
print("Answers:", answers[:2])


examples = ["Please reason step by step, and put your final answer within \\boxed{}.\nUser: " + prompt + "\nAssistant: <think>" for prompt in problems]


Problems: ['Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$', 'Define\n\\[p = \\sum_{k = 1}^\\infty \\frac{1}{k^2} \\quad \\text{and} \\quad q = \\sum_{k = 1}^\\infty \\frac{1}{k^3}.\\]Find a way to write\n\\[\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3}\\]in terms of $p$ and $q.$']
Answers: ['\\left( 3, \\frac{\\pi}{2} \\right)', 'p - q']


In [4]:
# Generate response with SEAL steering
example_answers = llm.generate(
    examples[:5], 
    SamplingParams(
        temperature=0,
        max_tokens=8192,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

Adding requests: 100%|██████████| 5/5 [00:00<00:00, 119.51it/s]
Processed prompts: 100%|██████████| 5/5 [02:19<00:00, 27.93s/it, est. speed input: 4.83 toks/s, output: 104.53 toks/s] 


In [5]:
from math_verify import parse, verify, LatexExtractionConfig, ExprExtractionConfig
outputs = [output.outputs[0].text for output in example_answers]
extraction_target = (ExprExtractionConfig(), LatexExtractionConfig())
results = []
for i, llm_output in enumerate(outputs):
    gold = parse(f"${answers[i]}$", extraction_config=extraction_target)
    answer = parse(llm_output, extraction_config=extraction_target)
    result = verify(gold, answer)
    results.append(result)
accuracy = sum(results) / len(results)
print(accuracy)

ModuleNotFoundError: No module named 'math_verify'

In [18]:
tokenizer = AutoTokenizer.from_pretrained(
    "/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/"
)
length = 0
for i in range(len(outputs)):
    length += len(tokenizer.tokenize(outputs[i], add_special_tokens=True))
print("Length: ", length/len(outputs))

Length:  3074.668


# GSM8k

In [None]:
import json
file_path = "/home/yequan/Project/R-KV/HuggingFace/data/gsm8k.jsonl"

problems = []
answers = []

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        problems.append(item["question"])
        answers.append(item["answer"])

# 看看前两个
print("Problems:", problems[:2])
print("Answers:", answers[:2])


examples = ["Please reason step by step, and put your final answer within \\boxed{}.\nUser: " + prompt + "\nAssistant: <think>" for prompt in problems]


Problems: ["Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", 'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?']
Answers: ['Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18', 'It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric\n#### 3']


In [None]:
example_answers = llm.generate(
    examples, 
    SamplingParams(
        temperature=0,
        max_tokens=8192,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

In [5]:
from math_verify import parse, verify, LatexExtractionConfig, ExprExtractionConfig
outputs = [output.outputs[0].text for output in example_answers]
extraction_target = (ExprExtractionConfig(), LatexExtractionConfig())
results = []
for i, llm_output in enumerate(outputs):
    gold = parse(f"${answers[i]}$", extraction_config=extraction_target)
    answer = parse(llm_output, extraction_config=extraction_target)
    result = verify(gold, answer)
    results.append(result)
accuracy = sum(results) / len(results)
print(accuracy)

0.8233510235026535


In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    "/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/"
)
length = 0
for i in range(len(outputs)):
    length += len(tokenizer.tokenize(outputs[i], add_special_tokens=True))
print("Length: ", length/len(outputs))

Length:  1460.1266110689917
