In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from dataset import ReasoningHashDataset
# Checkpoint directory; the tokenizer is loaded from it here and the model
# is loaded from the same path in a later cell.
checkpoint_path = "model_20250205_103231/model_checkpoint_batch_50"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)

# Settings for the synthetic hash-chain dataset, collected in one place so
# they are easy to tweak.
# NOTE(review): the exact semantics of `vary_hash` and `rl` are defined in
# dataset.py and are not visible from this notebook -- confirm there.
dataset_config = {
    "num_samples": 10000,
    "hash_length": 4,      # characters per hash (sample outputs show 4-char hashes)
    "chains": [2, 3, 4],   # chain lengths, per the original note on this argument
    "num_chains": 3,
    "vary_hash": True,
    "device": "cuda",      # hard-coded: this cell requires a CUDA device
    "rl": True,
}
dataset = ReasoningHashDataset(tokenizer=tokenizer, **dataset_config)



In [2]:
# Inspect one evaluation sample. evaluate_reasoning_hash unpacks this tuple as
# (full_text, hash_list, start, actual_target, prompt).
dataset.get_eval_item(0)

('Map:\no7in=>alwr\no7in=>nm3e\nxp6q=>mitl\nfzjz=>oxfv\nalwr=>xp6q\nnm3e=>v2a7\n3hgn=>fzjz\no7in=>3hgn\nStart: o7in\nTask: Multiple hash chains are provided. Find the shortest chain and provide the end 4 char hash. Think hard in tag! Circle your answer in <circle>HERE</circle> after </think>\n-----\nSTART\n<think>Lets reconstruct the list:\nnm3e=>v2a7\no7in=>nm3e\nfzjz=>oxfv\nalwr=>xp6q\n3hgn=>fzjz\nxp6q=>mitl\no7in=>3hgn\no7in=>alwr\n</think>\n<circle>v2a7</circle>',
 [('o7in', 'alwr'),
  ('o7in', 'nm3e'),
  ('xp6q', 'mitl'),
  ('fzjz', 'oxfv'),
  ('alwr', 'xp6q'),
  ('nm3e', 'v2a7'),
  ('3hgn', 'fzjz'),
  ('o7in', '3hgn')],
 'o7in',
 'v2a7',
 'Map:\no7in=>alwr\no7in=>nm3e\nxp6q=>mitl\nfzjz=>oxfv\nalwr=>xp6q\nnm3e=>v2a7\n3hgn=>fzjz\no7in=>3hgn\nStart: o7in\nTask: Multiple hash chains are provided. Find the shortest chain and provide the end 4 char hash. Think hard in tag! Circle your answer in <circle>HERE</circle> after </think>\n-----\nSTART\n<think>')

In [23]:
# Decode the "target" token ids of sample 0 back to text to eyeball the label.
# skip_special_tokens is not set, so any special/padding tokens in the target
# show up in the decoded string.
tokenizer.decode(dataset[0]["target"])

'h6lc<|endoftext|><|endoftext|><|endoftext|>'

In [24]:
import torch as T

# Load the model from the same checkpoint as the tokenizer, in bfloat16.
# device_map="auto" leaves device placement to transformers.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=T.bfloat16
)

# Set model to eval mode for inference
model.eval()

# Smoke test on the first dataset sample.
# Fetch the sample once so input_ids and attention_mask are guaranteed to come
# from the same item, add a batch dimension, and move both tensors to the
# model's device so the cell does not depend on where the dataset put them.
sample = dataset[0]
sample_inputs = {
    key: sample["input"][key].unsqueeze(0).to(model.device)
    for key in ("input_ids", "attention_mask")
}

with T.no_grad():
    output = model.generate(
        **sample_inputs,
        max_new_tokens=400,
        # Same value generate() otherwise falls back to (with a warning).
        pad_token_id=tokenizer.eos_token_id,
    )

print("Test output:", tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Test output: Map:
qo09=>h6lc
7t8s=>hj60
rnzf=>gy3g
52ic=>rnzf
qo09=>7t8s
qo09=>52ic
Start: qo09
Task: Multiple hash chains are provided. Find the shortest chain and provide the end 4 char hash. Think hard in tag! Circle your answer in <circle>HERE</circle> after </think>
-----
START
<think>52ic=>rnzf is not provided, so we can't find the shortest chain.
qo09=>h6lc is provided, so we can't find the shortest chain.
7t8s=>hj60 is provided, so we can't find the shortest chain.
qo09=>52ic is provided, so we can't find the shortest chain.
52ic=>rnzf is provided, so we can't find the shortest chain.
qo09=>7t8s is provided, so we can't find the shortest chain.
</circle>h6lc</think>
<circle>h6lc</circle>


In [32]:
# Check the EOS token id; generate_hash passes this value as pad_token_id.
tokenizer.eos_token_id

151643

In [16]:
from typing import List
from tqdm import tqdm
import torch

def _extract_circled_answer(generated_text):
    """Return the text that follows the first ``<circle>`` tag.

    The answer ends at the next ``<circle>`` or ``</circle>`` tag, or at the
    end of the text when the tag is never closed. Returns ``"NO CIRCLE"``
    when there is no opening tag at all.
    """
    segments = generated_text.split("<circle>")
    if len(segments) < 2:
        return "NO CIRCLE"
    return segments[1].split("</circle>")[0]


def generate_hash(model, tokenizer, prompt, max_length=100, max_new_tokens=None):
    """Sample one completion for ``prompt`` and return the circled answer.

    Args:
        model: Causal LM exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and
            ``eos_token_id`` (used as the pad token id).
        prompt: Prompt text to complete.
        max_length: Accepted for backward compatibility only; it is not
            forwarded to ``model.generate``.
        max_new_tokens: Optional cap on the number of generated tokens. When
            ``None`` (default) the model's own generation settings apply.

    Returns:
        The text inside the first ``<circle>...</circle>`` of the completion,
        or ``"NO CIRCLE"`` if the completion has no ``<circle>`` tag.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)

    generation_kwargs = dict(
        attention_mask=attention_mask,
        num_return_sequences=1,
        temperature=0.6,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )
    if max_new_tokens is not None:
        generation_kwargs["max_new_tokens"] = max_new_tokens

    output = model.generate(input_ids, **generation_kwargs)

    # Drop the prompt by token count instead of searching for the prompt
    # string in the decoded text: decode(encode(prompt)) is not guaranteed to
    # reproduce the prompt byte-for-byte, and a failed search would raise.
    # This relies on the generated sequence starting with the prompt tokens,
    # as it does for decoder-only models.
    completion_ids = output[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(completion_ids)

    return _extract_circled_answer(generated_text)

def evaluate_reasoning_hash(model, tokenizer, num_tests: int = 100, hash_length: int = 5, 
                            chains: List[int] = [3, 4, 5, 6], vary_hash: bool = True, num_chains: int = 4,
                            compare_chars: int = 3):
    """Evaluate ``model`` on a freshly built ``ReasoningHashDataset``.

    For every sample the prompt is completed with ``generate_hash`` and the
    circled answer is compared, case-insensitively, with the expected hash.
    Only the first ``compare_chars`` characters of each are compared.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        tokenizer: Tokenizer passed to the dataset and to ``generate_hash``.
        num_tests: Number of samples to build and evaluate.
        hash_length: Forwarded to ``ReasoningHashDataset``.
        chains: Forwarded (as a copy) to ``ReasoningHashDataset``.
        vary_hash: Forwarded to ``ReasoningHashDataset``.
        num_chains: Forwarded to ``ReasoningHashDataset``.
        compare_chars: Length of the prefix that must match. The default of 3
            keeps the historical behaviour; pass ``hash_length`` (or more) to
            require the whole hash to match.

    Returns:
        ``(overall_accuracy, chain_accuracies)``. ``chain_accuracies`` maps
        ``find_shortest_path(hash_list, start) + 1`` to a dict with
        ``'correct'``, ``'total'`` and ``'accuracy'``. ``overall_accuracy`` is
        ``0.0`` when no sample was evaluated.
    """
    model.eval()

    # Pass a copy of ``chains`` so neither the shared default list nor the
    # caller's list can be modified by the dataset.
    eval_dataset = ReasoningHashDataset(tokenizer, num_samples=num_tests, hash_length=hash_length,
                                        chains=list(chains), vary_hash=vary_hash, num_chains=num_chains)

    correct_predictions = 0
    total_predictions = 0
    chain_accuracies = {}

    for idx in tqdm(range(len(eval_dataset)), desc="Evaluating Reasoning Hash", leave=False):
        _full_text, hash_list, start, actual_target, prompt = eval_dataset.get_eval_item(idx)

        predicted_target = generate_hash(model, tokenizer, prompt)[:compare_chars].lower()
        actual_target = actual_target[:compare_chars].lower()

        # One correctness rule feeds both the overall and the per-chain
        # counters, so the two can never disagree.
        is_correct = predicted_target == actual_target

        # Bucket the sample by its shortest-path value (+1).
        chain_length = ReasoningHashDataset.find_shortest_path(hash_list, start) + 1
        stats = chain_accuracies.setdefault(chain_length, {'correct': 0, 'total': 0})

        total_predictions += 1
        stats['total'] += 1
        if is_correct:
            correct_predictions += 1
            stats['correct'] += 1
        else:
            print(f"@@@Failed with {predicted_target} and {actual_target}")

    # Every bucket was created together with its first sample, so total >= 1.
    for stats in chain_accuracies.values():
        stats['accuracy'] = stats['correct'] / stats['total']

    # Guard the empty-dataset case (e.g. num_tests=0) against division by zero.
    overall_accuracy = correct_predictions / total_predictions if total_predictions else 0.0

    return overall_accuracy, chain_accuracies

In [25]:
# Evaluation run on 100 fresh samples with 4-character hashes.
# generate_hash samples with do_sample=True and no seed is set, and the
# dataset is rebuilt on every call, so the numbers differ between runs.
evaluate_reasoning_hash(
    model,
    tokenizer,
    num_tests=100,
    hash_length=4,
    chains=[6, 5, 4, 3, 2],
    vary_hash=True,
    num_chains=3,
)

Evaluating Reasoning Hash:   1%|          | 1/100 [00:02<03:45,  2.28s/it]

@@@Failed with k0n and zve


Evaluating Reasoning Hash:   3%|▎         | 3/100 [00:06<03:46,  2.33s/it]

@@@Failed with dpr and grc


Evaluating Reasoning Hash:   4%|▍         | 4/100 [00:09<03:44,  2.34s/it]

@@@Failed with pro and 5e0


Evaluating Reasoning Hash:   7%|▋         | 7/100 [00:16<03:35,  2.31s/it]

@@@Failed with kip and 2yk


Evaluating Reasoning Hash:   8%|▊         | 8/100 [00:18<03:29,  2.28s/it]

@@@Failed with mr6 and frc


Evaluating Reasoning Hash:   9%|▉         | 9/100 [00:20<03:25,  2.26s/it]

@@@Failed with 2ku and llt


Evaluating Reasoning Hash:  11%|█         | 11/100 [00:25<03:23,  2.29s/it]

@@@Failed with unk and fao


Evaluating Reasoning Hash:  12%|█▏        | 12/100 [00:27<03:17,  2.25s/it]

@@@Failed with eml and 38m


Evaluating Reasoning Hash:  13%|█▎        | 13/100 [00:30<03:26,  2.38s/it]

@@@Failed with rpl and eza


Evaluating Reasoning Hash:  16%|█▌        | 16/100 [00:36<03:15,  2.33s/it]

@@@Failed with jm9 and mj9


Evaluating Reasoning Hash:  17%|█▋        | 17/100 [00:39<03:09,  2.29s/it]

@@@Failed with grz and nby


Evaluating Reasoning Hash:  18%|█▊        | 18/100 [00:41<03:10,  2.32s/it]

@@@Failed with 8qs and qa3


Evaluating Reasoning Hash:  19%|█▉        | 19/100 [00:43<03:06,  2.30s/it]

@@@Failed with 42p and jfc


Evaluating Reasoning Hash:  21%|██        | 21/100 [01:14<13:27, 10.22s/it]

@@@Failed with no  and ruy


Evaluating Reasoning Hash:  22%|██▏       | 22/100 [01:16<10:08,  7.81s/it]

@@@Failed with 8tn and 4r4


Evaluating Reasoning Hash:  23%|██▎       | 23/100 [01:19<07:50,  6.11s/it]

@@@Failed with 28l and 7xe


Evaluating Reasoning Hash:  24%|██▍       | 24/100 [01:21<06:14,  4.93s/it]

@@@Failed with uq9 and umd


Evaluating Reasoning Hash:  25%|██▌       | 25/100 [01:23<05:06,  4.09s/it]

@@@Failed with ard and ntf


Evaluating Reasoning Hash:  27%|██▋       | 27/100 [01:27<03:49,  3.15s/it]

@@@Failed with uj9 and t9e


Evaluating Reasoning Hash:  30%|███       | 30/100 [01:34<03:03,  2.62s/it]

@@@Failed with 1dg and 5gx


Evaluating Reasoning Hash:  31%|███       | 31/100 [01:36<02:48,  2.44s/it]

@@@Failed with rep and cgg


Evaluating Reasoning Hash:  32%|███▏      | 32/100 [01:39<02:40,  2.36s/it]

@@@Failed with 9ma and 2l5


Evaluating Reasoning Hash:  34%|███▍      | 34/100 [01:43<02:32,  2.30s/it]

@@@Failed with bed and nqb


Evaluating Reasoning Hash:  35%|███▌      | 35/100 [01:46<02:35,  2.39s/it]

@@@Failed with gfw and 3dd


Evaluating Reasoning Hash:  36%|███▌      | 36/100 [01:48<02:32,  2.39s/it]

@@@Failed with 8b8 and 9zl


Evaluating Reasoning Hash:  37%|███▋      | 37/100 [01:50<02:25,  2.31s/it]

@@@Failed with jbc and kao


Evaluating Reasoning Hash:  38%|███▊      | 38/100 [01:52<02:23,  2.31s/it]

@@@Failed with pr9 and 5t1


Evaluating Reasoning Hash:  40%|████      | 40/100 [01:57<02:13,  2.22s/it]

@@@Failed with wt1 and 0td


Evaluating Reasoning Hash:  43%|████▎     | 43/100 [02:04<02:12,  2.33s/it]

@@@Failed with 2k5 and si6


Evaluating Reasoning Hash:  44%|████▍     | 44/100 [02:06<02:07,  2.28s/it]

@@@Failed with ano and ybf


Evaluating Reasoning Hash:  46%|████▌     | 46/100 [02:11<02:02,  2.27s/it]

@@@Failed with ypa and 7sh


Evaluating Reasoning Hash:  47%|████▋     | 47/100 [02:13<02:00,  2.28s/it]

@@@Failed with 6sa and 5gf


Evaluating Reasoning Hash:  49%|████▉     | 49/100 [02:17<01:55,  2.27s/it]

@@@Failed with dkr and k9w


Evaluating Reasoning Hash:  52%|█████▏    | 52/100 [02:24<01:45,  2.19s/it]

@@@Failed with r88 and n8l


Evaluating Reasoning Hash:  53%|█████▎    | 53/100 [02:26<01:43,  2.20s/it]

@@@Failed with nrr and zse


Evaluating Reasoning Hash:  55%|█████▌    | 55/100 [02:30<01:39,  2.21s/it]

@@@Failed with hu3 and jcv


Evaluating Reasoning Hash:  58%|█████▊    | 58/100 [02:37<01:32,  2.19s/it]

@@@Failed with 257 and t6y


Evaluating Reasoning Hash:  61%|██████    | 61/100 [02:43<01:23,  2.15s/it]

@@@Failed with 14l and u9v


Evaluating Reasoning Hash:  62%|██████▏   | 62/100 [03:12<06:28, 10.22s/it]

@@@Failed with no  and j8f


Evaluating Reasoning Hash:  63%|██████▎   | 63/100 [03:14<04:48,  7.79s/it]

@@@Failed with iy9 and ls3


Evaluating Reasoning Hash:  65%|██████▌   | 65/100 [03:19<02:57,  5.07s/it]

@@@Failed with 2a7 and 23r


Evaluating Reasoning Hash:  66%|██████▌   | 66/100 [03:21<02:21,  4.15s/it]

@@@Failed with l11 and gqr


Evaluating Reasoning Hash:  68%|██████▊   | 68/100 [03:25<01:38,  3.07s/it]

@@@Failed with 4fv and gex


Evaluating Reasoning Hash:  69%|██████▉   | 69/100 [03:28<01:27,  2.81s/it]

@@@Failed with 56m and w9p


Evaluating Reasoning Hash:  70%|███████   | 70/100 [03:30<01:19,  2.65s/it]

@@@Failed with 0rp and t5i


Evaluating Reasoning Hash:  71%|███████   | 71/100 [03:33<01:16,  2.63s/it]

@@@Failed with zew and f89


Evaluating Reasoning Hash:  74%|███████▍  | 74/100 [03:39<01:01,  2.37s/it]

@@@Failed with 13t and lyx


Evaluating Reasoning Hash:  75%|███████▌  | 75/100 [03:42<01:01,  2.45s/it]

@@@Failed with 0yu and c0i


Evaluating Reasoning Hash:  76%|███████▌  | 76/100 [04:11<04:09, 10.41s/it]

@@@Failed with aok and j07


Evaluating Reasoning Hash:  78%|███████▊  | 78/100 [04:42<05:14, 14.30s/it]

@@@Failed with no  and i4i


Evaluating Reasoning Hash:  79%|███████▉  | 79/100 [04:45<03:44, 10.70s/it]

@@@Failed with w6m and n6q


Evaluating Reasoning Hash:  80%|████████  | 80/100 [04:47<02:45,  8.28s/it]

@@@Failed with ms3 and 2j1


Evaluating Reasoning Hash:  81%|████████  | 81/100 [04:49<02:02,  6.47s/it]

@@@Failed with fxe and a83


Evaluating Reasoning Hash:  86%|████████▌ | 86/100 [05:01<00:41,  2.96s/it]

@@@Failed with xcd and p13


Evaluating Reasoning Hash:  89%|████████▉ | 89/100 [05:07<00:27,  2.48s/it]

@@@Failed with 0qv and dtc


Evaluating Reasoning Hash:  90%|█████████ | 90/100 [05:09<00:23,  2.37s/it]

@@@Failed with 0ia and v60


Evaluating Reasoning Hash:  91%|█████████ | 91/100 [05:12<00:20,  2.32s/it]

@@@Failed with aii and rfb


Evaluating Reasoning Hash:  92%|█████████▏| 92/100 [05:13<00:17,  2.17s/it]

@@@Failed with ys7 and n8j


Evaluating Reasoning Hash:  93%|█████████▎| 93/100 [05:16<00:15,  2.18s/it]

@@@Failed with 9m5 and yfm


Evaluating Reasoning Hash:  95%|█████████▌| 95/100 [05:20<00:11,  2.23s/it]

@@@Failed with bnf and j3n


Evaluating Reasoning Hash:  96%|█████████▌| 96/100 [05:22<00:08,  2.24s/it]

@@@Failed with mpb and zcu


Evaluating Reasoning Hash:  97%|█████████▋| 97/100 [05:25<00:06,  2.26s/it]

@@@Failed with ghq and dgn


Evaluating Reasoning Hash:  99%|█████████▉| 99/100 [05:29<00:02,  2.29s/it]

@@@Failed with zp5 and dxn


                                                                            

(0.37,
 {3: {'correct': 11, 'total': 22, 'accuracy': 0.5},
  4: {'correct': 4, 'total': 32, 'accuracy': 0.125},
  5: {'correct': 5, 'total': 19, 'accuracy': 0.2631578947368421},
  2: {'correct': 17, 'total': 27, 'accuracy': 0.6296296296296297}})

In [17]:
# Repeat of the same configuration: 100 fresh samples, 4-character hashes.
# Sampling is unseeded and the dataset is rebuilt per call, so this run is
# not expected to reproduce other runs exactly.
evaluate_reasoning_hash(
    model,
    tokenizer,
    num_tests=100,
    hash_length=4,
    chains=[6, 5, 4, 3, 2],
    vary_hash=True,
    num_chains=3,
)

Evaluating Reasoning Hash:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating Reasoning Hash:   1%|          | 1/100 [00:02<03:49,  2.32s/it]

@@@Failed with lke and dvh


Evaluating Reasoning Hash:   2%|▏         | 2/100 [00:04<03:58,  2.43s/it]

@@@Failed with qbv and hid


Evaluating Reasoning Hash:   3%|▎         | 3/100 [00:07<04:01,  2.49s/it]

@@@Failed with 772 and 0no


Evaluating Reasoning Hash:   4%|▍         | 4/100 [00:09<04:02,  2.53s/it]

@@@Failed with d19 and t8n


Evaluating Reasoning Hash:   6%|▌         | 6/100 [00:15<04:07,  2.64s/it]

@@@Failed with yzx and hho


Evaluating Reasoning Hash:   8%|▊         | 8/100 [00:20<03:54,  2.55s/it]

@@@Failed with 2po and u3k


Evaluating Reasoning Hash:   9%|▉         | 9/100 [00:22<03:51,  2.54s/it]

@@@Failed with bmi and sxy


Evaluating Reasoning Hash:  10%|█         | 10/100 [00:25<03:53,  2.60s/it]

@@@Failed with 1os and miv


Evaluating Reasoning Hash:  12%|█▏        | 12/100 [00:30<03:41,  2.52s/it]

@@@Failed with wpg and 7gc


Evaluating Reasoning Hash:  13%|█▎        | 13/100 [00:32<03:31,  2.43s/it]

@@@Failed with e0t and yi1


Evaluating Reasoning Hash:  14%|█▍        | 14/100 [00:35<03:30,  2.45s/it]

@@@Failed with any and 028


Evaluating Reasoning Hash:  15%|█▌        | 15/100 [00:37<03:26,  2.43s/it]

@@@Failed with imp and kek


Evaluating Reasoning Hash:  18%|█▊        | 18/100 [00:45<03:31,  2.58s/it]

@@@Failed with 2uw and 2tz


Evaluating Reasoning Hash:  20%|██        | 20/100 [00:50<03:33,  2.67s/it]

@@@Failed with stb and 7jf


Evaluating Reasoning Hash:  22%|██▏       | 22/100 [00:55<03:14,  2.49s/it]

@@@Failed with ep8 and quq


Evaluating Reasoning Hash:  23%|██▎       | 23/100 [00:58<03:14,  2.52s/it]

@@@Failed with k84 and jgx


Evaluating Reasoning Hash:  26%|██▌       | 26/100 [01:03<02:26,  1.98s/it]

@@@Failed with qg2 and zbs


Evaluating Reasoning Hash:  28%|██▊       | 28/100 [01:08<02:44,  2.29s/it]

@@@Failed with 8ui and fxf


Evaluating Reasoning Hash:  29%|██▉       | 29/100 [01:11<02:47,  2.35s/it]

@@@Failed with 3iw and dwz


Evaluating Reasoning Hash:  33%|███▎      | 33/100 [01:20<02:31,  2.27s/it]

@@@Failed with a5x and 7pj


Evaluating Reasoning Hash:  34%|███▍      | 34/100 [01:23<02:38,  2.40s/it]

@@@Failed with x5t and 5fd


Evaluating Reasoning Hash:  35%|███▌      | 35/100 [01:25<02:45,  2.54s/it]

@@@Failed with 8so and umy


Evaluating Reasoning Hash:  37%|███▋      | 37/100 [01:31<02:41,  2.56s/it]

@@@Failed with xkc and u9u


Evaluating Reasoning Hash:  40%|████      | 40/100 [01:38<02:33,  2.56s/it]

@@@Failed with ywl and 0vs


Evaluating Reasoning Hash:  42%|████▏     | 42/100 [01:43<02:25,  2.51s/it]

@@@Failed with caf and 9k4


Evaluating Reasoning Hash:  43%|████▎     | 43/100 [01:46<02:20,  2.46s/it]

@@@Failed with skq and tk6


Evaluating Reasoning Hash:  44%|████▍     | 44/100 [01:48<02:24,  2.58s/it]

@@@Failed with eoa and efd


Evaluating Reasoning Hash:  45%|████▌     | 45/100 [01:51<02:16,  2.49s/it]

@@@Failed with nw3 and q74


Evaluating Reasoning Hash:  46%|████▌     | 46/100 [01:53<02:07,  2.37s/it]

@@@Failed with wwu and wu3


Evaluating Reasoning Hash:  47%|████▋     | 47/100 [01:55<02:04,  2.34s/it]

@@@Failed with xi4 and u1u


Evaluating Reasoning Hash:  48%|████▊     | 48/100 [01:58<02:08,  2.48s/it]

@@@Failed with nhk and km1


Evaluating Reasoning Hash:  50%|█████     | 50/100 [02:04<02:12,  2.65s/it]

@@@Failed with 6xg and jzj


Evaluating Reasoning Hash:  51%|█████     | 51/100 [02:06<02:04,  2.54s/it]

@@@Failed with pnu and nwe


Evaluating Reasoning Hash:  52%|█████▏    | 52/100 [02:08<02:01,  2.54s/it]

@@@Failed with kmy and p0b


Evaluating Reasoning Hash:  54%|█████▍    | 54/100 [02:13<01:50,  2.40s/it]

@@@Failed with 2ek and kn2


Evaluating Reasoning Hash:  58%|█████▊    | 58/100 [02:23<01:40,  2.39s/it]

@@@Failed with 5iy and 1y5


Evaluating Reasoning Hash:  59%|█████▉    | 59/100 [02:25<01:39,  2.44s/it]

@@@Failed with zsq and a1i


Evaluating Reasoning Hash:  60%|██████    | 60/100 [02:28<01:45,  2.64s/it]

@@@Failed with 1g1 and iev


Evaluating Reasoning Hash:  61%|██████    | 61/100 [02:30<01:37,  2.51s/it]

@@@Failed with lpj and 0ly


Evaluating Reasoning Hash:  64%|██████▍   | 64/100 [02:38<01:29,  2.49s/it]

@@@Failed with 8nf and xig


Evaluating Reasoning Hash:  67%|██████▋   | 67/100 [02:45<01:22,  2.50s/it]

@@@Failed with 684 and jrq


Evaluating Reasoning Hash:  71%|███████   | 71/100 [02:55<01:12,  2.50s/it]

@@@Failed with s6w and ru7


Evaluating Reasoning Hash:  72%|███████▏  | 72/100 [02:58<01:10,  2.53s/it]

@@@Failed with nrp and h1u


Evaluating Reasoning Hash:  73%|███████▎  | 73/100 [03:00<01:08,  2.54s/it]

@@@Failed with ah9 and i8u


Evaluating Reasoning Hash:  76%|███████▌  | 76/100 [03:08<01:03,  2.63s/it]

@@@Failed with 0g6 and zd4


Evaluating Reasoning Hash:  77%|███████▋  | 77/100 [03:10<00:58,  2.53s/it]

@@@Failed with 3uu and 0uq


Evaluating Reasoning Hash:  80%|████████  | 80/100 [03:17<00:48,  2.44s/it]

@@@Failed with vin and uze


Evaluating Reasoning Hash:  81%|████████  | 81/100 [03:20<00:46,  2.47s/it]

@@@Failed with 2wb and 7nv


Evaluating Reasoning Hash:  83%|████████▎ | 83/100 [03:25<00:41,  2.47s/it]

@@@Failed with zrp and 7e0


Evaluating Reasoning Hash:  86%|████████▌ | 86/100 [03:32<00:33,  2.38s/it]

@@@Failed with x8t and ap2


Evaluating Reasoning Hash:  87%|████████▋ | 87/100 [03:35<00:32,  2.51s/it]

@@@Failed with 40e and kn8


Evaluating Reasoning Hash:  88%|████████▊ | 88/100 [03:38<00:31,  2.61s/it]

@@@Failed with oz4 and j8g


Evaluating Reasoning Hash:  92%|█████████▏| 92/100 [03:47<00:19,  2.43s/it]

@@@Failed with zip and mcp


Evaluating Reasoning Hash:  93%|█████████▎| 93/100 [03:50<00:17,  2.45s/it]

@@@Failed with aeb and pqq


Evaluating Reasoning Hash:  94%|█████████▍| 94/100 [03:52<00:14,  2.48s/it]

@@@Failed with vep and yc0


Evaluating Reasoning Hash:  95%|█████████▌| 95/100 [03:55<00:12,  2.47s/it]

@@@Failed with n4v and 2qb


Evaluating Reasoning Hash:  96%|█████████▌| 96/100 [03:57<00:09,  2.48s/it]

@@@Failed with 61x and k4l


Evaluating Reasoning Hash:  99%|█████████▉| 99/100 [04:04<00:02,  2.43s/it]

@@@Failed with 3vn and a4k


                                                                            

(0.42,
 {5: {'correct': 8, 'total': 34, 'accuracy': 0.23529411764705882},
  3: {'correct': 14, 'total': 25, 'accuracy': 0.56},
  4: {'correct': 12, 'total': 26, 'accuracy': 0.46153846153846156},
  2: {'correct': 8, 'total': 15, 'accuracy': 0.5333333333333333}})

In [19]:
# Same evaluation with the chain lengths listed in ascending order.
# Sampling is unseeded and the dataset is rebuilt per call, so results vary
# from run to run.
evaluate_reasoning_hash(
    model,
    tokenizer,
    num_tests=100,
    hash_length=4,
    chains=[2, 3, 4, 5, 6],
    vary_hash=True,
    num_chains=3,
)

Evaluating Reasoning Hash:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating Reasoning Hash:   1%|          | 1/100 [00:02<04:05,  2.48s/it]

@@@Failed with mjr and yiq


Evaluating Reasoning Hash:   2%|▏         | 2/100 [00:04<03:49,  2.34s/it]

@@@Failed with 9qw and ssb


Evaluating Reasoning Hash:   3%|▎         | 3/100 [00:07<04:10,  2.59s/it]

@@@Failed with y8j and ceh


Evaluating Reasoning Hash:   4%|▍         | 4/100 [00:10<04:10,  2.61s/it]

@@@Failed with m39 and nwg


Evaluating Reasoning Hash:   5%|▌         | 5/100 [00:12<03:58,  2.51s/it]

@@@Failed with o8s and 7cw


Evaluating Reasoning Hash:   6%|▌         | 6/100 [00:15<04:00,  2.56s/it]

@@@Failed with 2ii and r8w


Evaluating Reasoning Hash:   7%|▋         | 7/100 [00:17<03:59,  2.57s/it]

@@@Failed with hik and fzj


Evaluating Reasoning Hash:   8%|▊         | 8/100 [00:19<03:39,  2.39s/it]

@@@Failed with ytv and 1zg


Evaluating Reasoning Hash:   9%|▉         | 9/100 [00:22<03:46,  2.49s/it]

@@@Failed with v0m and 56a


Evaluating Reasoning Hash:  10%|█         | 10/100 [00:24<03:41,  2.46s/it]

@@@Failed with r9n and 1eb


Evaluating Reasoning Hash:  13%|█▎        | 13/100 [00:32<03:31,  2.44s/it]

@@@Failed with 55n and 0tw


Evaluating Reasoning Hash:  14%|█▍        | 14/100 [00:35<03:37,  2.53s/it]

@@@Failed with 7nw and yaj


Evaluating Reasoning Hash:  15%|█▌        | 15/100 [00:37<03:37,  2.55s/it]

@@@Failed with uef and p2z


Evaluating Reasoning Hash:  17%|█▋        | 17/100 [00:41<03:18,  2.39s/it]

@@@Failed with oxi and l4z


Evaluating Reasoning Hash:  19%|█▉        | 19/100 [00:46<03:07,  2.31s/it]

@@@Failed with osr and tdx


Evaluating Reasoning Hash:  20%|██        | 20/100 [00:49<03:10,  2.38s/it]

@@@Failed with pd7 and tiw


Evaluating Reasoning Hash:  23%|██▎       | 23/100 [00:56<03:07,  2.44s/it]

@@@Failed with s84 and q9d


Evaluating Reasoning Hash:  24%|██▍       | 24/100 [00:58<03:02,  2.41s/it]

@@@Failed with w01 and 9qn


Evaluating Reasoning Hash:  25%|██▌       | 25/100 [01:01<03:05,  2.48s/it]

@@@Failed with pd3 and 689


Evaluating Reasoning Hash:  28%|██▊       | 28/100 [01:08<03:01,  2.52s/it]

@@@Failed with dfn and 0lm


Evaluating Reasoning Hash:  31%|███       | 31/100 [01:16<02:56,  2.56s/it]

@@@Failed with mst and lb3


Evaluating Reasoning Hash:  32%|███▏      | 32/100 [01:18<02:48,  2.48s/it]

@@@Failed with 5x0 and 3a2


Evaluating Reasoning Hash:  33%|███▎      | 33/100 [01:21<02:41,  2.42s/it]

@@@Failed with y0z and vvy


Evaluating Reasoning Hash:  34%|███▍      | 34/100 [01:23<02:43,  2.48s/it]

@@@Failed with qza and 269


Evaluating Reasoning Hash:  37%|███▋      | 37/100 [01:30<02:27,  2.34s/it]

@@@Failed with c9c and dv8


Evaluating Reasoning Hash:  38%|███▊      | 38/100 [01:33<02:28,  2.40s/it]

@@@Failed with t2f and pwd


Evaluating Reasoning Hash:  40%|████      | 40/100 [01:38<02:24,  2.41s/it]

@@@Failed with 9ah and 9ll


Evaluating Reasoning Hash:  41%|████      | 41/100 [01:40<02:25,  2.46s/it]

@@@Failed with ki1 and fw7


Evaluating Reasoning Hash:  42%|████▏     | 42/100 [01:43<02:20,  2.42s/it]

@@@Failed with qrr and fcb


Evaluating Reasoning Hash:  43%|████▎     | 43/100 [01:45<02:21,  2.48s/it]

@@@Failed with ee2 and 3h0


Evaluating Reasoning Hash:  44%|████▍     | 44/100 [01:48<02:25,  2.60s/it]

@@@Failed with cdf and wot


Evaluating Reasoning Hash:  45%|████▌     | 45/100 [01:50<02:16,  2.48s/it]

@@@Failed with xf7 and ntc


Evaluating Reasoning Hash:  49%|████▉     | 49/100 [02:00<02:05,  2.46s/it]

@@@Failed with 162 and gaj


Evaluating Reasoning Hash:  50%|█████     | 50/100 [02:02<02:00,  2.42s/it]

@@@Failed with w4j and cxm


Evaluating Reasoning Hash:  53%|█████▎    | 53/100 [02:09<01:47,  2.28s/it]

@@@Failed with lhr and wd7


Evaluating Reasoning Hash:  54%|█████▍    | 54/100 [02:12<01:51,  2.43s/it]

@@@Failed with kd7 and vsj


Evaluating Reasoning Hash:  55%|█████▌    | 55/100 [02:14<01:51,  2.48s/it]

@@@Failed with pr0 and pia


Evaluating Reasoning Hash:  56%|█████▌    | 56/100 [02:17<01:46,  2.42s/it]

@@@Failed with x8o and oz0


Evaluating Reasoning Hash:  57%|█████▋    | 57/100 [02:19<01:45,  2.46s/it]

@@@Failed with 42f and rsw


Evaluating Reasoning Hash:  60%|██████    | 60/100 [02:27<01:38,  2.47s/it]

@@@Failed with bsu and q74


Evaluating Reasoning Hash:  63%|██████▎   | 63/100 [02:34<01:31,  2.48s/it]

@@@Failed with wz7 and dt3


Evaluating Reasoning Hash:  65%|██████▌   | 65/100 [02:39<01:23,  2.38s/it]

@@@Failed with ku and pz2


Evaluating Reasoning Hash:  66%|██████▌   | 66/100 [02:41<01:19,  2.35s/it]

@@@Failed with 6ug and 7vb


Evaluating Reasoning Hash:  67%|██████▋   | 67/100 [02:44<01:20,  2.45s/it]

@@@Failed with t6f and j3y


Evaluating Reasoning Hash:  73%|███████▎  | 73/100 [02:59<01:07,  2.50s/it]

@@@Failed with ai2 and 7aa


Evaluating Reasoning Hash:  74%|███████▍  | 74/100 [03:02<01:04,  2.49s/it]

@@@Failed with tri and 3ky


Evaluating Reasoning Hash:  75%|███████▌  | 75/100 [03:04<01:03,  2.54s/it]

@@@Failed with r07 and oq0


Evaluating Reasoning Hash:  77%|███████▋  | 77/100 [03:09<00:58,  2.55s/it]

@@@Failed with aau and apv


Evaluating Reasoning Hash:  78%|███████▊  | 78/100 [03:11<00:53,  2.44s/it]

@@@Failed with ze2 and xao


Evaluating Reasoning Hash:  80%|████████  | 80/100 [03:17<00:50,  2.53s/it]

@@@Failed with ycx and won


Evaluating Reasoning Hash:  83%|████████▎ | 83/100 [03:24<00:41,  2.45s/it]

@@@Failed with 984 and 794


Evaluating Reasoning Hash:  84%|████████▍ | 84/100 [03:26<00:38,  2.38s/it]

@@@Failed with 8h8 and l9q


Evaluating Reasoning Hash:  86%|████████▌ | 86/100 [03:31<00:32,  2.32s/it]

@@@Failed with x9v and ure


Evaluating Reasoning Hash:  87%|████████▋ | 87/100 [03:33<00:30,  2.33s/it]

@@@Failed with smb and b8y


Evaluating Reasoning Hash:  92%|█████████▏| 92/100 [03:46<00:20,  2.52s/it]

@@@Failed with ohp and 7jy


Evaluating Reasoning Hash:  94%|█████████▍| 94/100 [03:50<00:14,  2.34s/it]

@@@Failed with u7j and umv


Evaluating Reasoning Hash:  96%|█████████▌| 96/100 [03:55<00:09,  2.41s/it]

@@@Failed with jma and e4f


Evaluating Reasoning Hash:  97%|█████████▋| 97/100 [03:58<00:07,  2.42s/it]

@@@Failed with luo and a4h


Evaluating Reasoning Hash:  98%|█████████▊| 98/100 [04:01<00:05,  2.57s/it]

@@@Failed with h7c and q46


Evaluating Reasoning Hash:  99%|█████████▉| 99/100 [04:03<00:02,  2.53s/it]

@@@Failed with f55 and nhc


                                                                            

@@@Failed with nyt and b5b




(0.39,
 {4: {'correct': 8, 'total': 24, 'accuracy': 0.3333333333333333},
  3: {'correct': 8, 'total': 25, 'accuracy': 0.32},
  5: {'correct': 9, 'total': 29, 'accuracy': 0.3103448275862069},
  2: {'correct': 14, 'total': 22, 'accuracy': 0.6363636363636364}})