In [1]:
import os
import sys
import glob
import subprocess
import shutil
import time
from pathlib import Path
from functools import lru_cache
import torch
import fla.utils

In [2]:
# ==============================================================================
# 2. MODEL LOADING (PAPER REPLICATION MODE)
# ==============================================================================
import torch
from pathlib import Path
from peft import PeftModel
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from fla.models.delta_net import DeltaNetForCausalLM, DeltaNetConfig

# --- Configuration ---
USE_ADAPTER = False 
BASE_MODEL_ID = "fla-hub/delta_net-1.3B-100B"
DEVICE_MAP = "cuda:0" # Explicitly set to GPU

# --- REPLICATION FACTOR 1: PRECISION ---
# The paper used bfloat16 (standard for 1.3B training).
# We stick to this. (Unlike our previous 'float32' fix which boosted scores).
DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# --- A. Register Architecture ---
try:
    AutoConfig.register("delta_net", DeltaNetConfig)
    AutoModelForCausalLM.register(DeltaNetConfig, DeltaNetForCausalLM)
    print("‚úÖ DeltaNet architecture registered.")
except ValueError:
    pass

# --- B. Load Base Model ---
print(f"‚è≥ Loading Base Model: {BASE_MODEL_ID}...")
model = DeltaNetForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map=DEVICE_MAP,
    torch_dtype=DTYPE,
)

# --- C. Load Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # LM-Harness usually prefers left for gen, but right is standard for base models

# --- D. REPLICATION FACTOR 2: Context Limit (CRITICAL) ---
# Your previous code: 131072 (128k) -> Allowed extrapolation
# Paper Replication:  4096  (4k)   -> Forces truncation on long tasks
#print("üîí LOCKING Context Limit to 4096 (Paper Replication Mode)...")
#model.config.max_position_embeddings = 4096
#model.config.max_length = 4096
#tokenizer.model_max_length = 4096

model.eval()

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!


‚úÖ DeltaNet architecture registered.
‚è≥ Loading Base Model: fla-hub/delta_net-1.3B-100B...


DeltaNetForCausalLM(
  (model): DeltaNetModel(
    (embeddings): Embedding(32000, 2048, padding_idx=2)
    (layers): ModuleList(
      (0-23): 24 x DeltaNetBlock(
        (attn_norm): RMSNorm(2048, eps=1e-06)
        (attn): DeltaNet(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (b_proj): Linear(in_features=2048, out_features=16, bias=False)
          (q_conv1d): ShortConvolution(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048, bias=False, activation=silu, backend=triton)
          (k_conv1d): ShortConvolution(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048, bias=False, activation=silu, backend=triton)
          (v_conv1d): ShortConvolution(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048, bias=False, activation=silu, backend=triton)
 

In [13]:
# ==============================================================================
# 4. RULER EVALUATION HARNESS (Force-Unlocked)
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_2" 
LENGTHS_TO_TEST = [2048] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    limit=50,
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
üöÄ Starting RULER Evaluation: niah_single_2
üìè Testing Lengths: [2048]


niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 565.77it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 1366.79it/s]
Running generate_until requests: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [07:00<00:00,  8.42s/it]



üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     0|  2048|   |    1|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |   -1|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [14]:
# ==============================================================================
# 4. RULER EVALUATION HARNESS (Force-Unlocked)
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_3" 
LENGTHS_TO_TEST = [2048] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    limit=50,
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
üöÄ Starting RULER Evaluation: niah_single_3
üìè Testing Lengths: [2048]


niah_single_3: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:01<00:00, 262.23it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 1376.54it/s]
Running generate_until requests: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [06:44<00:00,  8.09s/it]



üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_3|      1|none  |     0|  2048|   | 0.72|¬±  |0.0641|
|             |       |none  |     0|  4096|‚Üë  |-1.00|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [13]:
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
import json
import pandas as pd  # Recommended for the final summary table

# ... (Previous model/tokenizer setup code here) ...

# 1. SETUP MODEL WRAPPER
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)
lm_obj._max_length = 131072
print(f"‚úÖ Max Length set to: {lm_obj.max_length}")

# 2. EVALUATION CONFIGURATION
TASKS = ["niah_single_1", "niah_single_2", "niah_single_3"]
FEWSHOT_COUNTS = [2]
LENGTHS_TO_TEST = [2048, 4096, 8192]

all_summary_results = []

print(f"üöÄ Starting Multi-Task Evaluation...")

# 3. LOOP THROUGH FEWSHOT COMBINATIONS
for n_shot in FEWSHOT_COUNTS:
    print(f"\n--- üìä Running {n_shot}-shot Evaluations ---")
    
    # We run all tasks for the current shot count in one go
    results = simple_evaluate(
        model=lm_obj,
        tasks=TASKS,
        device="cuda",
        num_fewshot=n_shot,
        metadata={
            "max_seq_lengths": LENGTHS_TO_TEST,
            "tokenizer": BASE_MODEL_ID 
        }
    )
    
    # Extract scores for each task
    for task_name in TASKS:
        # Note: The metric key might vary (e.g., 'acc', 'exact_match'). 
        # Adjust 'acc' if your specific RULER task uses a different metric name.
        score = results["results"][task_name].get("acc,none") or \
                results["results"][task_name].get("acc") or \
                results["results"][task_name].get("exact_match")
        
        all_summary_results.append({
            "Task": task_name,
            "Fewshot": n_shot,
            "Score": round(score, 4) if score is not None else "N/A"
        })

# 4. SHOW OVERLOOKABLE RESULTS
print("\n" + "="*50)
print("üèÜ FINAL COMPARISON TABLE")
print("="*50)

df = pd.DataFrame(all_summary_results)
pivot_df = df.pivot(index='Task', columns='Fewshot', values='Score')
print(pivot_df)

# 5. SAVE FULL RAW DATA
output_file = "niah_comprehensive_results.json"
with open(output_file, "w") as f:
    json.dump(all_summary_results, f, indent=4)
print(f"\nüíæ Summary saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


‚úÖ Max Length set to: 131072
üöÄ Starting Multi-Task Evaluation...

--- üìä Running 2-shot Evaluations ---


niah_single_1: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: repeat | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 588.19it/s]
Generating synthetic samples: repeat | 4096: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:02<00:00, 212.77it/s]
Generating synthetic samples: repeat | 8192: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:04<00:00, 121.63it/s]
niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: e

KeyboardInterrupt: 

In [5]:
# ==============================================================================
# 4. RULER EVALUATION HARNESS (Force-Unlocked)
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_2" 
LENGTHS_TO_TEST = [2048] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    limit=20,
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
üöÄ Starting RULER Evaluation: niah_single_2
üìè Testing Lengths: [2048]


niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 649.87it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:00<00:00, 1381.57it/s]
Running generate_until requests: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:31<00:00,  7.58s/it]



üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     0|  2048|   |    1|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |   -1|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [1]:
import os
import sys

# --- CRITICAL WINDOWS FIX ---
# Force Single-Threaded Compilation.
# This prevents the race condition that corrupted the cache previously.
os.environ['TRITON_WORKER_COUNT'] = '1'

print("‚úÖ Triton Windows Configuration Applied.")

# --- NOW IMPORT LIBRARIES ---
import torch
import fla
from lm_eval import simple_evaluate

# Test if it works immediately
print("üöÄ Testing Triton compilation...")
try:
    import triton
    import triton.language as tl
    # Simple test kernel
    @triton.jit
    def add_kernel(x_ptr, y_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
        pass
    print("‚úÖ Triton Compiler is responding!")
except Exception as e:
    print(f"‚ö†Ô∏è Warning: Triton might still be unstable: {e}")

‚úÖ Triton Windows Configuration Applied.
üöÄ Testing Triton compilation...
‚úÖ Triton Compiler is responding!


In [2]:
import os
import sys
import triton.runtime.cache

# --- WINDOWS TRITON CONFIGURATION ---

# 1. Force Single Thread (Required for Windows stability)
os.environ['TRITON_WORKER_COUNT'] = '1'

# 2. Local Cache (Prevents permission errors)
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.makedirs(local_cache, exist_ok=True)

# 3. Simple Retry for File Locking
def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    # Serialize
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data

    # Skip if exists
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        return filepath

    # Write with retry
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except (PermissionError, OSError):
        pass 
        
    return filepath

# Apply Patch
triton.runtime.cache.FileCacheManager.put = simple_put
print(f"‚úÖ Windows Environment Ready.\n   Cache: {local_cache}")

# --- IMPORTS ---
import torch
import fla
from lm_eval import simple_evaluate

‚úÖ Windows Environment Ready.
   Cache: D:\Users\Louis\PycharmProjects\Master_thesis\LCA-Thesis\Evlauation\triton_cache


In [2]:
import os
import subprocess
import sys
import tempfile

# --- WINDOWS MSVC COMPILER SETUP FOR NOTEBOOKS (FIXED) ---

def setup_msvc_environment():
    """
    Detects the Visual Studio installation and injects the necessary 
    LIB/INCLUDE paths into the current Python process so Triton can compile.
    """
    if 'VCToolsInstallDir' in os.environ:
        print("‚úÖ MSVC Environment already active.")
        return

    print("‚öôÔ∏è Configuring MSVC environment...")
    
    # 1. Find vcvars64.bat
    #    Checking standard paths + the 'Insiders' path from your error log
    possible_paths = [
        r"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\18\Insiders\VC\Auxiliary\Build\vcvars64.bat" 
    ]
    
    vcvars_path = None
    for p in possible_paths:
        if os.path.exists(p):
            vcvars_path = p
            break
            
    # Fallback: Use vswhere.exe if standard paths fail
    if not vcvars_path:
        try:
            path = subprocess.check_output(
                r'"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -latest -find VC\Auxiliary\Build\vcvars64.bat', 
                shell=True
            ).decode().strip()
            if os.path.exists(path):
                vcvars_path = path
        except:
            pass

    if not vcvars_path:
        print("‚ùå Could not find 'vcvars64.bat'. Please ensure C++ Build Tools are installed.")
        return

    print(f"   Found Script: {vcvars_path}")

    # 2. Extract Environment Variables
    try:
        # Create a dummy batch file that calls vcvars64 then prints the env
        with tempfile.NamedTemporaryFile(suffix='.bat', delete=False, mode='w') as f:
            # CORRECTION: Used the correct variable 'vcvars_path' here
            f.write(f'call "{vcvars_path}" > nul\n')
            f.write('set\n')
            temp_bat = f.name
        
        # Run it and capture output
        output = subprocess.check_output(temp_bat, shell=True).decode('utf-8', errors='ignore')
        
        # Cleanup temp file
        os.remove(temp_bat)

        # 3. Apply to current process
        count = 0
        for line in output.splitlines():
            if '=' in line:
                key, _, value = line.partition('=')
                # We inject everything related to the compiler
                if key.upper() in ['PATH', 'LIB', 'INCLUDE', 'LIBPATH', 'VCTOOLSINSTALLDIR']:
                    os.environ[key] = value
                    count += 1
        
        print(f"‚úÖ Injected {count} MSVC environment variables.")
        
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to load MSVC environment: {e}")

# Run the setup
setup_msvc_environment()

# --- STANDARD TRITON LOCK FIX (Keep this!) ---
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.environ['TRITON_WORKER_COUNT'] = '1'

import triton.runtime.cache
def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0: return filepath
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except: pass
    return filepath
triton.runtime.cache.FileCacheManager.put = simple_put
print("‚úÖ Triton File Lock Patch Applied.")

# --- NOW IMPORT ---
import torch
import fla
from lm_eval import simple_evaluate

‚úÖ MSVC Environment already active.
‚úÖ Triton File Lock Patch Applied.


In [2]:
import torch
import triton
import triton.language as tl
import fla.modules.activations

# --- WINDOWS FLA PATCH: MANUAL KERNEL CALL ---

def patch_swiglu():
    print("üîß Applying manual patch to 'swiglu_fwd'...")

    # 1. Extract the raw kernel (bypass the broken Autotuner object)
    #    If we already patched it, 'swiglu_fwd_kernel' is the raw function. 
    #    If not, we grab '.fn' from the Autotuner wrapper.
    kernel_obj = fla.modules.activations.swiglu_fwd_kernel
    if hasattr(kernel_obj, 'fn'):
        raw_kernel = kernel_obj.fn
    else:
        raw_kernel = kernel_obj

    # 2. Define a robust replacement function
    #    We manually calculate the Grid and Block Size (B) instead of letting Triton guess.
    def manual_swiglu_fwd(x, y):
        T, D = x.numel(), x.shape[-1]
        z = torch.empty_like(x)
        
        # HARDCODED CONFIGURATION (The fix)
        # We pick a safe block size (128) that works on all GPUs.
        B_SIZE = 128
        
        # Calculate grid manually: ceil(T / B)
        grid = (triton.cdiv(T, B_SIZE),)
        
        # EXPLICIT CALL:
        # We pass T, D as positional args, and B as a keyword arg (constexpr).
        # This satisfies the kernel's signature "def kernel(..., T, D, B: tl.constexpr)"
        raw_kernel[grid](x, y, z, T, D, B=B_SIZE)
        
        return z

    # 3. Overwrite the library function
    fla.modules.activations.swiglu_fwd = manual_swiglu_fwd
    print("   ‚úÖ Patch applied. Autotuner bypassed.")

patch_swiglu()

# --- STANDARD TRITON FILE LOCK FIX ---
import os
import triton.runtime.cache
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.environ['TRITON_WORKER_COUNT'] = '1'

def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0: return filepath
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except: pass
    return filepath
triton.runtime.cache.FileCacheManager.put = simple_put
print("‚úÖ File Lock Patch Applied.")

# --- NOW RUN EVALUATION ---
import fla
from lm_eval import simple_evaluate
# ... Paste your evaluation code here ...

üîß Applying manual patch to 'swiglu_fwd'...
   ‚úÖ Patch applied. Autotuner bypassed.
‚úÖ File Lock Patch Applied.


In [3]:
import sys
import torch
import triton
import triton.language as tl
import fla.modules.activations

# --- WINDOWS STABILITY FIX: DISABLE AUTOTUNER ---

def strip_autotuner():
    print("üîß Patching FLA kernels to bypass Autotuner crash...")
    
    # 1. Target the failing kernel object
    #    The 'swiglu_fwd_kernel' is currently wrapped in an Autotuner object.
    #    This wrapper is what causes the IndexError.
    autotuner_wrapper = fla.modules.activations.swiglu_fwd_kernel
    
    # 2. Extract the raw JIT function
    #    Every Autotuner wraps a raw '.fn'. We want that raw function.
    if hasattr(autotuner_wrapper, 'fn'):
        raw_kernel = autotuner_wrapper.fn
        
        # 3. Overwrite the library's reference
        #    Now, when FLA calls this kernel, it calls the raw compiler directly.
        fla.modules.activations.swiglu_fwd_kernel = raw_kernel
        print("   ‚úÖ Successfully stripped Autotuner from 'swiglu_fwd_kernel'")
    else:
        print("   ‚ö†Ô∏è Kernel appeared to be already patched or different type.")

strip_autotuner()

# --- STANDARD FILE LOCK PATCH (Required) ---
import os
import triton.runtime.cache
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.environ['TRITON_WORKER_COUNT'] = '1'

def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0: return filepath
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except: pass
    return filepath
triton.runtime.cache.FileCacheManager.put = simple_put
print("‚úÖ File Lock Patch Applied.")

# --- NOW RUN EVALUATION ---
import fla
from lm_eval import simple_evaluate
# ... Paste your evaluation code here ...

üîß Patching FLA kernels to bypass Autotuner crash...
   ‚úÖ Successfully stripped Autotuner from 'swiglu_fwd_kernel'
‚úÖ File Lock Patch Applied.


In [5]:
# ==============================================================================
# üõ†Ô∏è AUTO-PATCHER: PERMANENTLY FIX TRITON ON DISK
# ==============================================================================
import os
import triton.runtime.cache

# 1. Find the file
cache_file = os.path.abspath(triton.runtime.cache.__file__)
print(f"üìç Targeting file: {cache_file}")

# 2. Read the content
with open(cache_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# 3. Apply the patch
new_lines = []
patched = False

for line in lines:
    # We are looking for the exact line causing the crash:
    if "os.replace(temp_path, filepath)" in line and "try:" not in line:
        indent = line.split("os.replace")[0] # Keep indentation
        
        # We replace the single line with a robust Try/Except block
        new_lines.append(f"{indent}# --- WINDOWS FIX (AUTO-PATCHED) ---\n")
        new_lines.append(f"{indent}try:\n")
        new_lines.append(f"{indent}    os.replace(temp_path, filepath)\n")
        new_lines.append(f"{indent}except OSError as e:\n")
        new_lines.append(f"{indent}    # WinError 5 = Access Denied (File is locked/loaded)\n")
        new_lines.append(f"{indent}    if getattr(e, 'winerror', None) == 5 or e.errno == 13:\n")
        new_lines.append(f"{indent}        pass # File exists and is locked. Assume success.\n")
        new_lines.append(f"{indent}    else:\n")
        new_lines.append(f"{indent}        raise e\n")
        new_lines.append(f"{indent}# --------------------------------\n")
        patched = True
    else:
        new_lines.append(line)

# 4. Save changes
if patched:
    try:
        with open(cache_file, "w", encoding="utf-8") as f:
            f.writelines(new_lines)
        print("\n‚úÖ SUCCESS: Library file patched on disk.")
        print("   The PermissionError is now physically impossible.")
    except PermissionError:
        print("\n‚ùå ERROR: Could not write to disk. Run VS Code/Jupyter as Administrator.")
else:
    if any("WINDOWS FIX" in line for line in lines):
        print("\n‚úÖ File was ALREADY patched. You are good.")
    else:
        print("\n‚ö†Ô∏è Warning: Could not find the line to patch. Check file manually.")

üìç Targeting file: D:\Users\Louis\PycharmProjects\Master_thesis\Babilong_Benchmark\.venv\Lib\site-packages\triton\runtime\cache.py

‚úÖ SUCCESS: Library file patched on disk.
   The PermissionError is now physically impossible.


In [19]:
# ==============================================================================
# 4. RULER EVALUATION 2
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 1. PRE-PATCH THE CONFIG (Just in case)

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)

# --- üõ†Ô∏è STEP 4.5: THE NUCLEAR FIX üõ†Ô∏è ---
lm_obj._max_length = 131072
print(f"‚úÖ Force-set Harness Max Length to: {lm_obj.max_length}")
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_1" 
LENGTHS_TO_TEST = [2048, 4096, 8192]
#[2048, 4096, 8192] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    num_fewshot=2,
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
‚úÖ Force-set Harness Max Length to: 131072
üöÄ Starting RULER Evaluation: niah_single_1
üìè Testing Lengths: [2048, 4096, 8192]


niah_single_1: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.


Generating synthetic samples: repeat | 2048:   0%|                                              | 0/500 [00:00<?, ?it/s][A[A

Generating synthetic samples: repeat | 2048:  11%|‚ñà‚ñà‚ñà‚ñâ                                | 54/500 [00:00<00:00, 532.98it/s][A[A

Generating synthetic samples: repeat | 2048:  22%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                           | 108/500 [00:00<00:00, 524.32it/s][A[A

Generating synthetic samples: repeat | 2048:  32%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                       | 161/500 [00:00<00:00, 510.11it/s][A[A

Generating synthetic samples: repeat | 2048:  43%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ                    | 213/500 [00:00<00:00, 511.61it/s][A[A

Generating synthetic samples: repeat | 2048:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ


üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_1|      1|none  |     2|  2048|   |0.750|¬±  |0.0194|
|             |       |none  |     2|  4096|‚Üë  |0.656|¬±  |   N/A|
|             |       |none  |     2|  8192|‚Üë  |0.576|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [20]:
# ==============================================================================
# 4. RULER EVALUATION 2
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 1. PRE-PATCH THE CONFIG (Just in case)

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)

# --- üõ†Ô∏è STEP 4.5: THE NUCLEAR FIX üõ†Ô∏è ---
lm_obj._max_length = 131072
print(f"‚úÖ Force-set Harness Max Length to: {lm_obj.max_length}")
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_2" 
LENGTHS_TO_TEST = [2048, 4096, 8192]
#[2048, 4096, 8192] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    num_fewshot=2,
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
‚úÖ Force-set Harness Max Length to: 131072
üöÄ Starting RULER Evaluation: niah_single_2
üìè Testing Lengths: [2048, 4096, 8192]


niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 515.50it/s]
Generating synthetic samples: essay | 4096: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:02<00:00, 190.60it/s]
Generating synthetic samples: essay | 8192: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:06<00:00, 73.87it/s]
Overwriting default num_fewshot of niah_single_2 from None to 2
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1500/1500 [00:02<00:00, 702.81it/s]



üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     2|  2048|   |0.338|¬±  |0.0212|
|             |       |none  |     2|  4096|‚Üë  |0.306|¬±  |   N/A|
|             |       |none  |     2|  8192|‚Üë  |0.088|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [21]:
# ==============================================================================
# 4. RULER EVALUATION 2
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 1. PRE-PATCH THE CONFIG (Just in case)

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)

# --- üõ†Ô∏è STEP 4.5: THE NUCLEAR FIX üõ†Ô∏è ---
lm_obj._max_length = 131072
print(f"‚úÖ Force-set Harness Max Length to: {lm_obj.max_length}")
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_3" 
LENGTHS_TO_TEST = [2048, 4096, 8192]
#[2048, 4096, 8192] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    num_fewshot=2,
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
‚úÖ Force-set Harness Max Length to: 131072
üöÄ Starting RULER Evaluation: niah_single_3
üìè Testing Lengths: [2048, 4096, 8192]


niah_single_3: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 520.96it/s]
Generating synthetic samples: essay | 4096: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:02<00:00, 190.08it/s]
Generating synthetic samples: essay | 8192: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:06<00:00, 73.10it/s]
Overwriting default num_fewshot of niah_single_3 from None to 2
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1500/1500 [00:03<00:00, 451.18it/s]



üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_3|      1|none  |     2|  2048|   |0.094|¬±  |0.0131|
|             |       |none  |     2|  4096|‚Üë  |0.060|¬±  |   N/A|
|             |       |none  |     2|  8192|‚Üë  |0.022|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [4]:
# ==============================================================================
# ABLATION TEST: Chunked vs Recurrent Mode for DeltaNet
# ==============================================================================
# This tests whether the computation mode affects NIAH accuracy
# Run this BEFORE loading the model (or restart kernel between tests)

import os
import gc
import torch
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table

# --- TEST CONFIGURATION ---
TASK = "niah_single_2"
LENGTHS = [2048]
LIMIT = 20  # Small sample for quick comparison
BASE_MODEL_ID = "fla-hub/delta_net-1.3B-100B"

results_summary = {}

# ==============================================================================
# TEST 1: Default Chunked Mode (Triton)
# ==============================================================================
print("=" * 60)
print("üß™ TEST 1: CHUNKED MODE (Default Triton)")
print("=" * 60)

# Set environment for chunked mode
os.environ["FLA_BACKEND"] = "triton"
if "FLA_CHUNK_SIZE" in os.environ:
    del os.environ["FLA_CHUNK_SIZE"]

# Import FLA after setting env vars
import fla.modules.mlp
from fla.models.delta_net import DeltaNetForCausalLM, DeltaNetConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Disable fused SwiGLU to avoid Triton issues
fla.modules.mlp.GatedMLP.fuse_swiglu = False
DeltaNetConfig.fuse_swiglu = False

# Register architecture
try:
    AutoConfig.register("delta_net", DeltaNetConfig)
    AutoModelForCausalLM.register(DeltaNetConfig, DeltaNetForCausalLM)
except ValueError:
    pass

# Load model
model_chunked = DeltaNetForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)
model_chunked.eval()

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap and evaluate
lm_obj = HFLM(pretrained=model_chunked, tokenizer=tokenizer, batch_size=1, trust_remote_code=True)
lm_obj._max_length = 131072

results_chunked = simple_evaluate(
    model=lm_obj,
    tasks=[TASK],
    device="cuda",
    num_fewshot=0,
    limit=LIMIT,
    metadata={"max_seq_lengths": LENGTHS, "tokenizer": BASE_MODEL_ID}
)

print("\nüìä CHUNKED MODE RESULTS:")
print(make_table(results_chunked))
results_summary["chunked"] = results_chunked

# Cleanup
del model_chunked, lm_obj
gc.collect()
torch.cuda.empty_cache()

# ==============================================================================
# TEST 2: Force Small Chunk Size (More Sequential-like)
# ==============================================================================
print("\n" + "=" * 60)
print("üß™ TEST 2: SMALL CHUNK SIZE (chunk_size=64)")
print("=" * 60)

# Force smaller chunk size (more sequential behavior)
os.environ["FLA_CHUNK_SIZE"] = "64"

# Reload model with new settings
# Note: You may need to restart kernel for env vars to fully take effect
model_sequential = DeltaNetForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)
model_sequential.eval()

lm_obj2 = HFLM(pretrained=model_sequential, tokenizer=tokenizer, batch_size=1, trust_remote_code=True)
lm_obj2._max_length = 131072

results_sequential = simple_evaluate(
    model=lm_obj2,
    tasks=[TASK],
    device="cuda",
    num_fewshot=0,
    limit=LIMIT,
    metadata={"max_seq_lengths": LENGTHS, "tokenizer": BASE_MODEL_ID}
)

print("\nüìä SMALL CHUNK RESULTS:")
print(make_table(results_sequential))
results_summary["small_chunk"] = results_sequential

# ==============================================================================
# COMPARISON SUMMARY
# ==============================================================================
print("\n" + "=" * 60)
print("üìà ABLATION SUMMARY")
print("=" * 60)
print(f"Task: {TASK} @ {LENGTHS[0]} tokens")
print(f"Samples: {LIMIT}")
print()
print("Mode           | Accuracy")
print("-" * 30)
for mode, res in results_summary.items():
    # Extract accuracy (adjust key based on actual result structure)
    try:
        acc = res["results"][TASK].get("acc,none", res["results"][TASK].get("exact_match,none", "N/A"))
        print(f"{mode:14} | {acc}")
    except:
        print(f"{mode:14} | See table above")

üß™ TEST 1: CHUNKED MODE (Default Triton)


`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 635.96it/s]
Overwriting default num_fewshot of niah_single_2 from None to 0
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚


üìä CHUNKED MODE RESULTS:
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     0|  2048|   |    1|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |   -1|¬±  |   N/A|


üß™ TEST 2: SMALL CHUNK SIZE (chunk_size=64)


`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 635.65it/s]
Overwriting default num_fewshot of niah_single_2 from None to 0
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚


üìä SMALL CHUNK RESULTS:
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     0|  2048|   |    1|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |   -1|¬±  |   N/A|


üìà ABLATION SUMMARY
Task: niah_single_2 @ 2048 tokens
Samples: 20

Mode           | Accuracy
------------------------------
chunked        | N/A
small_chunk    | N/A
