In [1]:
import json
from typing import Iterable, Union, Any
def load_jsonl(file) -> Iterable[Any]:
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                yield json.loads(line)
            except:
                print("Error in loading:", line)
                exit()
for i in range(1, 41):
    cot_data = list(load_jsonl(f"/home/wenhao/Time-Constrained-CoT/outputs/12_11/models--Qwen--Qwen2.5-3B-Instruct/math/test_corse-to-fine-structured_-1_seed0_t0.6_s0_e-1_cot_{i}.jsonl"))
    print(cot_data[0]['code'][0])

**Coarse Reasoning**

1. Let the unknown negative number be \( x \).
2. According to the problem, \( 6x = x - 20 \).
3. Simplify to find \( 5x = -20 \).
4. Therefore, \( x = -4 \).

\(\boxed{-4}\)

**Fine Reasoning**

1. Let the unknown negative number be \( x \).
2. The problem states that when the number is multiplied by six, the result is 20 less than the original number. This can be written as:
   \[
   6x = x - 20
   \]
3. To isolate \( x \), subtract \( x \) from both sides of the equation:
   \[
   6x - x = x - 20 - x
   \]
   Simplifying, we get:
   \[
   5x = -20
   \]
4. To solve for \( x \), divide both sides by 5:
   \[
   x = \frac{-20}{5}
   \]
   Simplifying, we get:
   \[
   x = -4
   \]

**Final Answer** within \(\boxed{}\)

\(\boxed{-4}\)
**Coarse Reasoning**

Let's denote the negative number as \( x \).

Given:
\[ 6x = x - 20 \]

Solve for \( x \):
\[ 6x - x = -20 \]
\[ 5x = -20 \]
\[ x = -4 \]

So, the original number is \(-4\).

**Fine Reasoning**

Let's denote the

In [17]:
from transformers import Qwen2Tokenizer

# 加载 QwenTokenizer
tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")

# 要编码的文本
text = "hello world! How are you? I am fine, what about going to dinner"

# 编码文本
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# 手动计算偏移量
offsets = []
current_position = 0

for token in tokens:
    # 处理特殊前缀（如 'Ġ' 表示空格）
    if token.startswith("Ġ"):
        token_text = token[1:]  # 去掉 'Ġ'
        # 查找起始位置并考虑前置空格
        start = text.find(token_text, current_position)
        if start > 0 and text[start - 1] == " ":
            start -= 1
    else:
        token_text = token
        start = text.find(token_text, current_position)
    
    # 验证匹配是否成功
    if start == -1:
        print(f"Warning: Token '{token}' not found in text.")
        continue

    end = start + len(token_text)
    offsets.append((start, end))
    current_position = end  # 更新当前偏移量

# 输出每个 token 及其对应的文本部分
for token_id, (start, end) in zip(token_ids, offsets):
    print(f"Token ID: {token_id}, Text: '{text[start:end]}', Start: {start}, End: {end}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Token ID: 14990, Text: 'hello', Start: 0, End: 5
Token ID: 1879, Text: ' worl', Start: 5, End: 10
Token ID: 0, Text: '!', Start: 11, End: 12
Token ID: 2585, Text: ' Ho', Start: 12, End: 15
Token ID: 525, Text: ' ar', Start: 16, End: 19
Token ID: 498, Text: ' yo', Start: 20, End: 23
Token ID: 30, Text: '?', Start: 24, End: 25
Token ID: 358, Text: ' ', Start: 25, End: 26
Token ID: 1079, Text: ' a', Start: 27, End: 29
Token ID: 6915, Text: ' fin', Start: 30, End: 34
Token ID: 11, Text: ',', Start: 35, End: 36
Token ID: 1128, Text: ' wha', Start: 36, End: 40
Token ID: 911, Text: ' abou', Start: 41, End: 46
Token ID: 2087, Text: ' goin', Start: 47, End: 52
Token ID: 311, Text: ' t', Start: 53, End: 55
Token ID: 13856, Text: ' dinne', Start: 56, End: 62


In [23]:
from transformers import Qwen2Tokenizer

# 加载 QwenTokenizer
tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")

# 要编码的文本
text = "This is a sentence"

# 编码文本
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# 手动计算偏移量
offsets = []
current_position = 0

for token in tokens:
    token_text = token.replace("▁", "")  # 部分 tokenizer 使用 '▁' 表示空格
    start = text.find(token_text, current_position)
    end = start + len(token_text)
    offsets.append((start, end))
    current_position = end

# 输出每个 token 及其对应的文本部分
for token_id, (start, end) in zip(token_ids, offsets):
    print(f"Token ID: {token_id}, Text: '{text[start:end]}'", start, end)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Token ID: 1986, Text: 'This' 0 4
Token ID: 374, Text: '' -1 2
Token ID: 264, Text: '' -1 1
Token ID: 11652, Text: '' -1 8


In [25]:
from transformers import Qwen2Tokenizer

# 加载 QwenTokenizer
tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")

# 要编码的文本
text = "hello world! How are you? I am fine, what about going to dinner"

# 编码文本
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# 初始化偏移量
offsets = []
current_position = 0

for token in tokens:
    # 去掉分词前缀（如特殊符号）以提取纯文本部分
    token_text = token.replace("Ġ", "").strip()
    
    # 当前部分逐字匹配，避免使用 `find`
    start = current_position
    end = start + len(token_text)
    
    # 检查匹配的文本是否与 token_text 对应
    if text[start:end] != token_text:
        while start < len(text) and text[start:end] != token_text:
            start += 1
            end = start + len(token_text)
    
    # 添加偏移量并更新当前位置
    offsets.append((start, end))
    current_position = end

# 输出每个 token 及其对应的文本部分
for token_id, (start, end) in zip(token_ids, offsets):
    print(f"Token ID: {token_id}, Text: '{text[start:end]}', Start: {start}, End: {end}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Token ID: 14990, Text: 'hello', Start: 0, End: 5
Token ID: 1879, Text: 'world', Start: 6, End: 11
Token ID: 0, Text: '!', Start: 11, End: 12
Token ID: 2585, Text: 'How', Start: 13, End: 16
Token ID: 525, Text: 'are', Start: 17, End: 20
Token ID: 498, Text: 'you', Start: 21, End: 24
Token ID: 30, Text: '?', Start: 24, End: 25
Token ID: 358, Text: 'I', Start: 26, End: 27
Token ID: 1079, Text: 'am', Start: 28, End: 30
Token ID: 6915, Text: 'fine', Start: 31, End: 35
Token ID: 11, Text: ',', Start: 35, End: 36
Token ID: 1128, Text: 'what', Start: 37, End: 41
Token ID: 911, Text: 'about', Start: 42, End: 47
Token ID: 2087, Text: 'going', Start: 48, End: 53
Token ID: 311, Text: 'to', Start: 54, End: 56
Token ID: 13856, Text: 'dinner', Start: 57, End: 63


In [31]:
from transformers import AutoTokenizer
def crop_cot(cot, token_budget, tokenizer):
    
    tokens = tokenizer.tokenize(cot)
    if len(tokens) <= token_budget:
        return cot
    
    # 初始化偏移量
    offsets = []
    current_position = 0

    for token in tokens[:token_budget]:
        # 去掉分词前缀（如特殊符号）以提取纯文本部分
        token_text = token.replace("Ġ", "").strip()
        
        # 当前部分逐字匹配，避免使用 `find`
        start = current_position
        end = start + len(token_text)
        
        # 检查匹配的文本是否与 token_text 对应
        if cot[start:end] != token_text:
            while start < len(cot) and cot[start:end] != token_text:
                start += 1
                end = start + len(token_text)
        
        # 添加偏移量并更新当前位置
        offsets.append((start, end))
        current_position = end
    
    return cot[:current_position]

tokenizer = AutoTokenizer.from_pretrained(
                '/data03/sunyi/hf_cache/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1', trust_remote_code=True
            )
cot = "This is a very long sentence very that we want to crop to fit within a token budget."
token_budget = 5
cropped_cot = crop_cot(cot, token_budget, tokenizer)
print(cropped_cot)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


This is a very long
