In [1]:
# ==============================================================================
# 1. SETUP & ENVIRONMENT
# ==============================================================================
import glob
import os
import re
import sys
from functools import lru_cache
from pathlib import Path

import torch

# --- A. Detect Project Root (Critical for Notebooks) ---
# The working directory is the project root when it contains a 'source' folder.
# In every other case the parent directory is used: either because it holds
# 'source', or as the fallback when no 'source' folder is found at either level.
current_path = Path.cwd()
has_local_source = (current_path / "source").exists()
project_root = current_path if has_local_source else current_path.parent

# Make project modules importable from the notebook; skip if already present
# so re-running the cell does not add duplicate entries.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)
    print(f"‚úÖ Added Project Root to Path: {project_root}")

# --- B. Windows Compiler Config (DeltaNet Requirement) ---
# Locates the Visual Studio C++ compiler (cl.exe) and exports it through the
# CC / CXX environment variables for JIT compilation.
def msvc_toolset_key(cl_path):
    """Build a sort key that orders cl.exe paths by MSVC toolset version.

    A plain string sort ranks '14.9.x' above '14.38.x' because '9' > '3'.
    This key compares the numeric components of the '...\\MSVC\\<version>\\...'
    path segment instead, and falls back to a natural (digit-aware) ordering
    of the whole path when that segment is absent or versions tie.

    Args:
        cl_path: Path string of a candidate cl.exe.

    Returns:
        A tuple ``(version_tuple, natural_tokens)`` usable as a sort key.
    """
    toolset = re.search(r"[\\/]MSVC[\\/](\d+(?:\.\d+)*)[\\/]", cl_path, flags=re.IGNORECASE)
    version = tuple(int(part) for part in toolset.group(1).split(".")) if toolset else ()
    # re.split with a capturing group alternates text / digit tokens, so the
    # odd positions are always digit runs and can be compared as integers.
    natural = [
        int(token) if index % 2 else token.lower()
        for index, token in enumerate(re.split(r"(\d+)", cl_path))
    ]
    return (version, natural)


def pick_newest_compiler(candidates):
    """Return the candidate with the highest toolset version, or None if empty."""
    return max(candidates, key=msvc_toolset_key, default=None)


print("\n‚öôÔ∏è Configuring Windows Environment...")
patterns = [
    r"C:\Program Files\Microsoft Visual Studio\**\Hostx64\x64\cl.exe",
    r"C:\Program Files (x86)\Microsoft Visual Studio\**\Hostx64\x64\cl.exe"
]
compiler_path = None
# Patterns are tried in order; the first install location with any match wins.
for pattern in patterns:
    matches = glob.glob(pattern, recursive=True)
    if matches:
        compiler_path = pick_newest_compiler(matches)  # numerically newest toolset
        break

if compiler_path:
    os.environ["CC"] = compiler_path
    os.environ["CXX"] = compiler_path
    print(f"‚úÖ Compiler configured: {os.path.basename(compiler_path)}")
else:
    print("‚ö†Ô∏è Warning: Visual Studio 'cl.exe' not found. Triton/CUDA kernels may fail.")

# --- C. GPU Patch for FLA Library ---
# Monkey-patch 'fla' to force CUDA usage on Windows: the device lookup is
# replaced by a cached function that always answers 'cuda', and
# `_cpu_device_warning` (presumably the CPU-fallback warning hook - confirm
# against the installed fla version) is replaced by a no-op.
import fla.utils


@lru_cache(maxsize=None)
def _forced_cuda_device():
    """Stand-in for fla.utils.get_available_device that always returns 'cuda'."""
    return 'cuda'


def _silenced_cpu_warning():
    """No-op stand-in for fla.utils._cpu_device_warning."""
    return None


fla.utils.get_available_device = _forced_cuda_device
fla.utils._cpu_device_warning = _silenced_cpu_warning
print("‚úÖ DeltaNet GPU Lock removed.")


‚öôÔ∏è Configuring Windows Environment...


Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.


‚úÖ Compiler configured: cl.exe
‚úÖ DeltaNet GPU Lock removed.


In [4]:
!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git
  Cloning https://github.com/EleutherAI/lm-evaluation-harness.git to c:\users\louis\appdata\local\temp\pip-req-build-utm3yx1v
  Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 69ecd0b929701d346c1119d5cd0563ade4ab3536
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git 'C:\Users\Louis\AppData\Local\Temp\pip-req-build-utm3yx1v'


In [3]:
# ==============================================================================
# 2. MODEL LOADING & CONFIGURATION
# ==============================================================================
import torch
from pathlib import Path
from peft import PeftModel
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from fla.models.delta_net import DeltaNetForCausalLM, DeltaNetConfig

# --- Configuration ---
USE_ADAPTER = False  # Set to True to load fine-tuned weights
BASE_MODEL_ID = "fla-hub/delta_net-1.3B-100B"
ADAPTER_PATH = Path("../babilong_deltanet_finetune").resolve()

# Use bfloat16 for stability with Linear Attention models
DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
DEVICE_MAP = {"": "cuda"}

# --- A. Register Custom Architecture ---
# Registers 'delta_net' so AutoModel and PeftModel can recognize it.
# Each registration has its own guard: a ValueError from one call (raised when
# that entry is already registered) must not skip the other call, otherwise a
# half-registered state would never be repaired.
registered_now = False

try:
    AutoConfig.register("delta_net", DeltaNetConfig)
    registered_now = True
except ValueError:
    pass  # config type already known to Transformers

try:
    AutoModelForCausalLM.register(DeltaNetConfig, DeltaNetForCausalLM)
    registered_now = True
except ValueError:
    pass  # model class already known to Transformers

if registered_now:
    print("‚úÖ DeltaNet architecture registered in Transformers.")
else:
    print("‚ÑπÔ∏è  DeltaNet already registered.")

# --- B. Load Base Model ---
print(f"‚è≥ Loading Base Model: {BASE_MODEL_ID}...")
load_kwargs = {
    "trust_remote_code": True,
    "device_map": DEVICE_MAP,
    "torch_dtype": DTYPE,
}
model = DeltaNetForCausalLM.from_pretrained(BASE_MODEL_ID, **load_kwargs)

# --- C. Load Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- D. Context Limit Hack (CRITICAL) ---
# Per the notebook author, the HF config caps the context at 2048 even though
# DeltaNet itself has no such limit. The limits are raised by hand so RULER can
# test long sequences (4k, 8k+).
print("üîì Unlocking Model Context Limits to 128k...")
UNLOCKED_CONTEXT = 131072  # 128 * 1024 tokens
for config_field in ("max_position_embeddings", "max_length"):
    setattr(model.config, config_field, UNLOCKED_CONTEXT)
tokenizer.model_max_length = UNLOCKED_CONTEXT

# --- E. Load Adapter (Optional) ---
def _has_adapter_config(folder):
    """True when `folder` contains an adapter_config.json file."""
    return (folder / "adapter_config.json").exists()


if not USE_ADAPTER:
    print("‚ÑπÔ∏è  Running in Baseline Mode (No Adapter).")
else:
    # Try the alternative location only when the configured one has no adapter.
    if not _has_adapter_config(ADAPTER_PATH):
        fallback = Path("../Pretraining/babilong_deltanet_finetune").resolve()
        if _has_adapter_config(fallback):
            ADAPTER_PATH = fallback

    # Neither location worked: report the originally configured path.
    if not _has_adapter_config(ADAPTER_PATH):
        raise FileNotFoundError(f"‚ùå Adapter not found at: {ADAPTER_PATH}")

    print(f"üîó Loading LoRA Adapter from: {ADAPTER_PATH}")
    model = PeftModel.from_pretrained(model, str(ADAPTER_PATH))
    print("‚úÖ Adapter attached successfully.")

# Switch to inference mode in both the baseline and the adapter case.
model.eval()
print("‚úÖ Model ready for evaluation.")

‚úÖ Registered DeltaNet architecture in Transformers registry.
‚è≥ Loading DeltaNet Base: fla-hub/delta_net-1.3B-100B...
‚ÑπÔ∏è  Running in Baseline Mode (No Adapter).
‚úÖ Base DeltaNet Loaded Successfully.


DeltaNetForCausalLM(
  (model): DeltaNetModel(
    (embeddings): Embedding(32000, 2048, padding_idx=2)
    (layers): ModuleList(
      (0-23): 24 x DeltaNetBlock(
        (attn_norm): RMSNorm(2048, eps=1e-06)
        (attn): DeltaNet(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (b_proj): Linear(in_features=2048, out_features=16, bias=False)
          (q_conv1d): ShortConvolution(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048, bias=False, activation=silu, backend=triton)
          (k_conv1d): ShortConvolution(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048, bias=False, activation=silu, backend=triton)
          (v_conv1d): ShortConvolution(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048, bias=False, activation=silu, backend=triton)
 

In [5]:
# ==============================================================================
# 4. RULER EVALUATION HARNESS (Force-Unlocked)
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 1. PRE-PATCH THE CONFIG (Just in case)
print("üîì Unlocking DeltaNet Config Limits...")
# Set every attribute name that might be consulted for the context size;
# 'seq_length' is included because some custom configs use that name.
HARNESS_CONTEXT = 131072
for limit_field in ("max_position_embeddings", "max_length", "seq_length"):
    setattr(model.config, limit_field, HARNESS_CONTEXT)
tokenizer.model_max_length = HARNESS_CONTEXT

# 2. WRAP THE MODEL
# The context window is passed through HFLM's own `max_length` argument rather
# than by overwriting the private `_max_length` attribute after construction.
# The explicit value takes precedence over any length the wrapper would
# otherwise derive from the model config or tokenizer.
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
    max_length=131072,
)
# Echo the effective limit so a silently ignored override is noticed right away.
print(f"‚úÖ Force-set Harness Max Length to: {lm_obj.max_length}")

# 3. DEFINE TASK
TASK_NAME = "niah_single_2"
LENGTHS_TO_TEST = [2048, 4096, 8192]

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
# Task metadata: the sequence lengths to generate samples for, and the
# tokenizer id handed to the task.
task_metadata = {
    "max_seq_lengths": LENGTHS_TO_TEST,
    "tokenizer": BASE_MODEL_ID,
}
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    metadata=task_metadata,
)

# 5. PRINT & SAVE
# Labels depend on whether the adapter (fine-tuned) or the baseline model ran.
run_label = 'Fine-Tuned' if USE_ADAPTER else 'Baseline'
file_tag = 'finetuned' if USE_ADAPTER else 'baseline'
banner = "=" * 40

print("\n" + banner)
print(f"üèÜ RESULTS: DeltaNet ({run_label})")
print(banner)
print(make_table(results))

# Only the per-task metrics (the "results" entry) are written to disk.
output_file = f"results_deltanet_{file_tag}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
üîì Unlocking DeltaNet Config Limits...
‚úÖ Force-set Harness Max Length to: 131072
üöÄ Starting RULER Evaluation: niah_single_2
üìè Testing Lengths: [2048, 4096, 8192]


niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 590.80it/s]
Generating synthetic samples: essay | 4096: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:02<00:00, 208.54it/s]
Generating synthetic samples: essay | 8192: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:06<00:00, 73.57it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1500/1500 [00:01<00:00, 1133.40it/s]
Running generate_until requests: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1500/1500 [2:53:28<00:00,  6.94s/it]  



üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     0|  2048|   |1.000|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |0.684|¬±  |   N/A|
|             |       |none  |     0|  8192|‚Üë  |0.230|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json
