In [1]:
import os
import sys
import torch
import triton

# 1. THE CRITICAL BYPASS: This forces the model to use stable PyTorch kernels
# and avoids the Triton bug that causes "IndexError: list index out of range"
os.environ["FLA_BACKEND"] = "torch"

# 2. Diagnostic Checks
print(f"--- Environment Status ---")
print(f"Python Version: {sys.version.split()[0]}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Triton Version: {triton.__version__}")
print(f"Backend Forced to: {os.environ.get('FLA_BACKEND')}")

print(f"\n--- Hardware Status ---")
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

--- Environment Status ---
Python Version: 3.11.14
PyTorch Version: 2.5.1+cu121
Triton Version: 3.1.0
Backend Forced to: torch

--- Hardware Status ---
GPU Available: True
GPU Name: NVIDIA GeForce RTX 3080
VRAM Total: 10.74 GB


In [None]:
import sys
import torch

# 1. Patch the torch.cpu bug we saw earlier
if not hasattr(torch.cpu, 'device'):
    torch.cpu.device = lambda index=None: torch.device('cpu')

# 2. Hard-disable the broken Triton SwiGLU kernel globally
import fla.modules.mlp
fla.modules.mlp.GatedMLP.fuse_swiglu = False

# 3. Force the config to match
from fla.models.delta_net.configuration_delta_net import DeltaNetConfig
DeltaNetConfig.fuse_swiglu = False

print("üõë Triton SwiGLU hard-disabled. The model will now use stable PyTorch math.")

Current Triton version 3.1.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.


In [3]:
import torch

# This creates the dummy attribute the library is looking for
if not hasattr(torch.cpu, 'device'):
    torch.cpu.device = lambda index=None: torch.device('cpu')
    print("ü©π Applied emergency fix for torch.cpu.device")

ü©π Applied emergency fix for torch.cpu.device


In [7]:
# This installs all specific RULER requirements at once
!pip install "lm-eval[ruler]"

Collecting wonderwords (from lm-eval[ruler])
  Using cached wonderwords-3.0.1-py3-none-any.whl.metadata (11 kB)
Using cached wonderwords-3.0.1-py3-none-any.whl (51 kB)
Installing collected packages: wonderwords
Successfully installed wonderwords-3.0.1


In [5]:
# 1. Install PEFT (and standard dependencies just in case)
!pip install peft transformers accelerate

# 2. If you are using bitsandbytes for 8-bit/4-bit loading
!pip install bitsandbytes

Collecting peft
  Using cached peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting accelerate
  Using cached accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Using cached peft-0.18.0-py3-none-any.whl (556 kB)
Using cached accelerate-1.12.0-py3-none-any.whl (380 kB)
Installing collected packages: accelerate, peft
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2/2[0m [peft][32m1/2[0m [peft]
[1A[2KSuccessfully installed accelerate-1.12.0 peft-0.18.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m11.4 MB/s[0m  [33m0:00:05[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: b

In [4]:
import os
import torch

# Force FLA to use the triton backend
os.environ["FLA_BACKEND"] = "triton"

# Verify the GPU is visible to Torch
print(f"CUDA Available: {torch.cuda.is_available()}")

CUDA Available: True


In [2]:
import os
import sys
import glob
import subprocess
import shutil
import time
from pathlib import Path
from functools import lru_cache
import torch
import fla.utils

Current Triton version 3.1.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.


In [3]:
import os
import torch

# 1. Force the library to use Triton instead of rolling back to CPU
os.environ["FLA_BACKEND"] = "triton"

# 2. Verify GPU again just to be safe
if torch.cuda.is_available():
    print(f"üöÄ GPU is Active: {torch.cuda.get_device_name(0)}")
else:
    print("‚ùå GPU not found by Torch")

üöÄ GPU is Active: NVIDIA GeForce RTX 3080


In [9]:
import fla
from fla.models.delta_net.configuration_delta_net import DeltaNetConfig
from fla.models.delta_net.modeling_delta_net import DeltaNetForCausalLM

# 1. Force the config to disable the broken Triton kernel
DeltaNetConfig.fuse_swiglu = False

# 2. Tell the internal module not to even attempt fusing
import fla.modules.mlp
fla.modules.mlp.GatedMLP.fuse_swiglu = False

print("üõë Triton SwiGLU Fusing hard-disabled. Bypassing Autotuner.")

üõë Triton SwiGLU Fusing hard-disabled. Bypassing Autotuner.


In [12]:
import triton
from fla.modules.activations import swiglu_fwd_kernel

print("--- Triton Kernel Inspection ---")
print(f"Kernel Name: {swiglu_fwd_kernel.fn.__name__}")
print(f"Expected Arg Names: {swiglu_fwd_kernel.fn.arg_names}")
# This is where the bug likely lives:
print(f"Autotuner Key Indices: {swiglu_fwd_kernel.key_idx}") 

# Check if the number of keys exceeds the number of arguments
if max(swiglu_fwd_kernel.key_idx) >= len(swiglu_fwd_kernel.fn.arg_names):
    print("‚ùå BUG FOUND: Autotuner is looking for an argument index that doesn't exist!")
else:
    print("‚úÖ Autotuner indices are within range. The issue may be internal to Triton's cache.")

--- Triton Kernel Inspection ---
Kernel Name: swiglu_fwd_kernel
Expected Arg Names: ['x', 'y', 'z', 'T', 'B', 'D']
Autotuner Key Indices: [5]
‚úÖ Autotuner indices are within range. The issue may be internal to Triton's cache.


In [4]:
import torch
import fla.modules.activations
import fla.modules.mlp

# 1. Define a pure-PyTorch version of SwiGLU 
# This does the exact same math as the Triton kernel but safely
def manual_swiglu_fwd(x, y):
    # Standard SwiGLU: x * silu(x) * y
    return torch.nn.functional.silu(x) * y

# 2. "Monkey Patch" the library at the root level
# We replace the function that calls the Triton kernel
fla.modules.activations.swiglu_fwd = manual_swiglu_fwd

# 3. Disable the fusing flag at the class level to be safe
fla.modules.mlp.GatedMLP.fuse_swiglu = False

print("ü©π Manual PyTorch Patch applied to 'swiglu_fwd'.")
print("‚úÖ Triton Autotuner bypassed for this layer. You can now run the RULER cell.")

ü©π Manual PyTorch Patch applied to 'swiglu_fwd'.
‚úÖ Triton Autotuner bypassed for this layer. You can now run the RULER cell.


In [4]:
!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git
  Cloning https://github.com/EleutherAI/lm-evaluation-harness.git to c:\users\louis\appdata\local\temp\pip-req-build-utm3yx1v
  Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 69ecd0b929701d346c1119d5cd0563ade4ab3536
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git 'C:\Users\Louis\AppData\Local\Temp\pip-req-build-utm3yx1v'


In [5]:
# ==============================================================================
# 2. MODEL LOADING & CONFIGURATION
# ==============================================================================
import torch
from pathlib import Path
from peft import PeftModel
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from fla.models.delta_net import DeltaNetForCausalLM, DeltaNetConfig

# --- Configuration ---
USE_ADAPTER = False  # Set to True to load fine-tuned weights
BASE_MODEL_ID = "fla-hub/delta_net-1.3B-100B"
ADAPTER_PATH = Path("../babilong_deltanet_finetune").resolve()

# Use bfloat16 for stability with Linear Attention models
DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
DEVICE_MAP = {"": 0}

# --- A. Register Custom Architecture ---
# Registers 'delta_net' so AutoModel and PeftModel can recognize it
try:
    AutoConfig.register("delta_net", DeltaNetConfig)
    AutoModelForCausalLM.register(DeltaNetConfig, DeltaNetForCausalLM)
    print("‚úÖ DeltaNet architecture registered in Transformers.")
except ValueError:
    print("‚ÑπÔ∏è  DeltaNet already registered.")

# --- B. Load Base Model ---
print(f"‚è≥ Loading Base Model: {BASE_MODEL_ID}...")
model = DeltaNetForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map=DEVICE_MAP,
    torch_dtype=DTYPE,
)

# --- C. Load Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- D. Context Limit Hack (CRITICAL) ---
# DeltaNet supports infinite context, but the HF config defaults to 2048.
# We must manually override this to allow RULER to test long sequences (4k, 8k+).
print("üîì Unlocking Model Context Limits to 128k...")
model.config.max_position_embeddings = 131072
model.config.max_length = 131072
tokenizer.model_max_length = 131072

# --- E. Load Adapter (Optional) ---
if USE_ADAPTER:
    # Logic to find the adapter if it's in a slightly different path
    if not (ADAPTER_PATH / "adapter_config.json").exists():
        fallback = Path("../Pretraining/babilong_deltanet_finetune").resolve()
        if (fallback / "adapter_config.json").exists():
            ADAPTER_PATH = fallback
    
    if (ADAPTER_PATH / "adapter_config.json").exists():
        print(f"üîó Loading LoRA Adapter from: {ADAPTER_PATH}")
        model = PeftModel.from_pretrained(model, str(ADAPTER_PATH))
        print("‚úÖ Adapter attached successfully.")
    else:
        raise FileNotFoundError(f"‚ùå Adapter not found at: {ADAPTER_PATH}")
else:
    print("‚ÑπÔ∏è  Running in Baseline Mode (No Adapter).")

model.eval()
print("‚úÖ Model ready for evaluation.")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!


‚úÖ DeltaNet architecture registered in Transformers.
‚è≥ Loading Base Model: fla-hub/delta_net-1.3B-100B...
üîì Unlocking Model Context Limits to 128k...
‚ÑπÔ∏è  Running in Baseline Mode (No Adapter).
‚úÖ Model ready for evaluation.


In [7]:
# --- STEP 4.1: MODEL WRAPPING ---
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,        # Essential for 8k+ on a 10GB GPU
    max_length=131072,   # Pass it here directly to avoid the 'nuclear' hack
    trust_remote_code=True
)

# --- STEP 4.2: THE SAFETY CHECK ---
# Manually verify the harness sees the expanded limit
print(f"‚úÖ Harness effective max length: {lm_obj.max_length}")

# --- STEP 4.3: RUN EVALUATION ---
# NIAH tasks often require 'num_fewshot=0' for pure retrieval testing
results = simple_evaluate(
    model=lm_obj,
    tasks=["ruler_niah_s1"], # Double-check this name with --tasks list
    num_fewshot=0,
    device="cuda",
    limit=10, # Number of samples per length
    metadata={"max_seq_lengths": [2048]} 
)

NameError: name 'HFLM' is not defined

In [16]:
# ==============================================================================
# 4. RULER EVALUATION HARNESS (Force-Unlocked)
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_2" 
LENGTHS_TO_TEST = [2048, 4096, 8192] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
üöÄ Starting RULER Evaluation: niah_single_2
üìè Testing Lengths: [2048, 4096, 8192]


niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.




Generating synthetic samples: essay | 2048:   0%|                                               | 0/500 [00:00<?, ?it/s][A[A[A[A



Generating synthetic samples: essay | 2048:   9%|‚ñà‚ñà‚ñà‚ñé                                 | 44/500 [00:00<00:01, 434.43it/s][A[A[A[A



Generating synthetic samples: essay | 2048:  18%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                              | 88/500 [00:00<00:00, 416.24it/s][A[A[A[A



Generating synthetic samples: essay | 2048:  27%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                          | 135/500 [00:00<00:00, 437.27it/s][A[A[A[A



Generating synthetic samples: essay | 2048:  37%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                      | 185/500 [00:00<00:00, 460.21it/s][A[A[A[A



Generating synthetic samples: essay | 2048:  47%


üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_2|      1|none  |     0|  2048|   |1.000|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |0.676|¬±  |   N/A|
|             |       |none  |     0|  8192|‚Üë  |0.222|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [17]:
# ==============================================================================
# 4. RULER EVALUATION 2
# ==============================================================================
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import simple_evaluate
from lm_eval.utils import make_table
import json

print("\nüîå Plugging DeltaNet into Evaluation Harness...")

# 1. PRE-PATCH THE CONFIG (Just in case)

# 2. WRAP THE MODEL
lm_obj = HFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
)

# --- üõ†Ô∏è STEP 4.5: THE NUCLEAR FIX üõ†Ô∏è ---
lm_obj._max_length = 131072
print(f"‚úÖ Force-set Harness Max Length to: {lm_obj.max_length}")
# ---------------------------------------

# 3. DEFINE TASK
TASK_NAME = "niah_single_1" 
LENGTHS_TO_TEST = [2048, 4096, 8192] 

print(f"üöÄ Starting RULER Evaluation: {TASK_NAME}")
print(f"üìè Testing Lengths: {LENGTHS_TO_TEST}")

# 4. RUN EVALUATION
results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK_NAME],
    device="cuda",
    metadata={
        "max_seq_lengths": LENGTHS_TO_TEST,
        "tokenizer": BASE_MODEL_ID 
    }
)

# 5. PRINT & SAVE
print("\n" + "="*40)
print(f"üèÜ RESULTS: DeltaNet ({'Fine-Tuned' if USE_ADAPTER else 'Baseline'})")
print("="*40)
print(make_table(results))

output_file = f"results_deltanet_{'finetuned' if USE_ADAPTER else 'baseline'}.json"
with open(output_file, "w") as f:
    json.dump(results["results"], f, indent=4)
print(f"\nüíæ Results saved to {output_file}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
HF model type is neither marked as CausalLM or Seq2SeqLM.                     This is expected if your model requires `trust_remote_code=True` but may be an error otherwise.Setting backend to causal
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration



üîå Plugging DeltaNet into Evaluation Harness...
‚úÖ Force-set Harness Max Length to: 131072
üöÄ Starting RULER Evaluation: niah_single_1
üìè Testing Lengths: [2048, 4096, 8192]


niah_single_1: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: repeat | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:01<00:00, 444.65it/s]
Generating synthetic samples: repeat | 4096: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:02<00:00, 192.19it/s]
Generating synthetic samples: repeat | 8192: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:05<00:00, 85.39it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà


üèÜ RESULTS: DeltaNet (Baseline)
|    Tasks    |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|-------------|------:|------|-----:|-----:|---|----:|---|------|
|niah_single_1|      1|none  |     0|  2048|   |    1|¬±  |     0|
|             |       |none  |     0|  4096|‚Üë  |    1|¬±  |   N/A|
|             |       |none  |     0|  8192|‚Üë  |    1|¬±  |   N/A|


üíæ Results saved to results_deltanet_baseline.json


In [5]:
import os
import sys

# CRITICAL FIX for Windows:
# Force Triton to use only 1 worker to prevent file locking race conditions.
os.environ['TRITON_WORKER_COUNT'] = '1'

# OPTIONAL: Move cache to a local folder to avoid User/OneDrive permission locks
# This creates a 'triton_cache' folder in your project directory
os.environ['TRITON_CACHE_DIR'] = os.path.join(os.getcwd(), "triton_cache")

print(f"üîß Triton Windows Fix Applied:\n - Worker Count: {os.environ['TRITON_WORKER_COUNT']}\n - Cache Dir: {os.environ['TRITON_CACHE_DIR']}")

üîß Triton Windows Fix Applied:
 - Worker Count: 1
 - Cache Dir: D:\Users\Louis\PycharmProjects\Master_thesis\LCA-Thesis\Evlauation\triton_cache


In [1]:
import os
import sys

# --- CRITICAL WINDOWS FIX ---
# Force Single-Threaded Compilation.
# This prevents the race condition that corrupted the cache previously.
os.environ['TRITON_WORKER_COUNT'] = '1'

print("‚úÖ Triton Windows Configuration Applied.")

# --- NOW IMPORT LIBRARIES ---
import torch
import fla
from lm_eval import simple_evaluate

# Test if it works immediately
print("üöÄ Testing Triton compilation...")
try:
    import triton
    import triton.language as tl
    # Simple test kernel
    @triton.jit
    def add_kernel(x_ptr, y_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
        pass
    print("‚úÖ Triton Compiler is responding!")
except Exception as e:
    print(f"‚ö†Ô∏è Warning: Triton might still be unstable: {e}")

‚úÖ Triton Windows Configuration Applied.
üöÄ Testing Triton compilation...
‚úÖ Triton Compiler is responding!


In [2]:
import os
import sys
import triton.runtime.cache

# --- WINDOWS TRITON CONFIGURATION ---

# 1. Force Single Thread (Required for Windows stability)
os.environ['TRITON_WORKER_COUNT'] = '1'

# 2. Local Cache (Prevents permission errors)
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.makedirs(local_cache, exist_ok=True)

# 3. Simple Retry for File Locking
def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    # Serialize
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data

    # Skip if exists
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        return filepath

    # Write with retry
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except (PermissionError, OSError):
        pass 
        
    return filepath

# Apply Patch
triton.runtime.cache.FileCacheManager.put = simple_put
print(f"‚úÖ Windows Environment Ready.\n   Cache: {local_cache}")

# --- IMPORTS ---
import torch
import fla
from lm_eval import simple_evaluate

‚úÖ Windows Environment Ready.
   Cache: D:\Users\Louis\PycharmProjects\Master_thesis\LCA-Thesis\Evlauation\triton_cache


In [2]:
import os
import subprocess
import sys
import tempfile

# --- WINDOWS MSVC COMPILER SETUP FOR NOTEBOOKS (FIXED) ---

def setup_msvc_environment():
    """
    Detects the Visual Studio installation and injects the necessary 
    LIB/INCLUDE paths into the current Python process so Triton can compile.
    """
    if 'VCToolsInstallDir' in os.environ:
        print("‚úÖ MSVC Environment already active.")
        return

    print("‚öôÔ∏è Configuring MSVC environment...")
    
    # 1. Find vcvars64.bat
    #    Checking standard paths + the 'Insiders' path from your error log
    possible_paths = [
        r"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Auxiliary\Build\vcvars64.bat",
        r"C:\Program Files\Microsoft Visual Studio\18\Insiders\VC\Auxiliary\Build\vcvars64.bat" 
    ]
    
    vcvars_path = None
    for p in possible_paths:
        if os.path.exists(p):
            vcvars_path = p
            break
            
    # Fallback: Use vswhere.exe if standard paths fail
    if not vcvars_path:
        try:
            path = subprocess.check_output(
                r'"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -latest -find VC\Auxiliary\Build\vcvars64.bat', 
                shell=True
            ).decode().strip()
            if os.path.exists(path):
                vcvars_path = path
        except:
            pass

    if not vcvars_path:
        print("‚ùå Could not find 'vcvars64.bat'. Please ensure C++ Build Tools are installed.")
        return

    print(f"   Found Script: {vcvars_path}")

    # 2. Extract Environment Variables
    try:
        # Create a dummy batch file that calls vcvars64 then prints the env
        with tempfile.NamedTemporaryFile(suffix='.bat', delete=False, mode='w') as f:
            # CORRECTION: Used the correct variable 'vcvars_path' here
            f.write(f'call "{vcvars_path}" > nul\n')
            f.write('set\n')
            temp_bat = f.name
        
        # Run it and capture output
        output = subprocess.check_output(temp_bat, shell=True).decode('utf-8', errors='ignore')
        
        # Cleanup temp file
        os.remove(temp_bat)

        # 3. Apply to current process
        count = 0
        for line in output.splitlines():
            if '=' in line:
                key, _, value = line.partition('=')
                # We inject everything related to the compiler
                if key.upper() in ['PATH', 'LIB', 'INCLUDE', 'LIBPATH', 'VCTOOLSINSTALLDIR']:
                    os.environ[key] = value
                    count += 1
        
        print(f"‚úÖ Injected {count} MSVC environment variables.")
        
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to load MSVC environment: {e}")

# Run the setup
setup_msvc_environment()

# --- STANDARD TRITON LOCK FIX (Keep this!) ---
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.environ['TRITON_WORKER_COUNT'] = '1'

import triton.runtime.cache
def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0: return filepath
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except: pass
    return filepath
triton.runtime.cache.FileCacheManager.put = simple_put
print("‚úÖ Triton File Lock Patch Applied.")

# --- NOW IMPORT ---
import torch
import fla
from lm_eval import simple_evaluate

‚úÖ MSVC Environment already active.
‚úÖ Triton File Lock Patch Applied.


In [2]:
import torch
import triton
import triton.language as tl
import fla.modules.activations

# --- WINDOWS FLA PATCH: MANUAL KERNEL CALL ---

def patch_swiglu():
    print("üîß Applying manual patch to 'swiglu_fwd'...")

    # 1. Extract the raw kernel (bypass the broken Autotuner object)
    #    If we already patched it, 'swiglu_fwd_kernel' is the raw function. 
    #    If not, we grab '.fn' from the Autotuner wrapper.
    kernel_obj = fla.modules.activations.swiglu_fwd_kernel
    if hasattr(kernel_obj, 'fn'):
        raw_kernel = kernel_obj.fn
    else:
        raw_kernel = kernel_obj

    # 2. Define a robust replacement function
    #    We manually calculate the Grid and Block Size (B) instead of letting Triton guess.
    def manual_swiglu_fwd(x, y):
        T, D = x.numel(), x.shape[-1]
        z = torch.empty_like(x)
        
        # HARDCODED CONFIGURATION (The fix)
        # We pick a safe block size (128) that works on all GPUs.
        B_SIZE = 128
        
        # Calculate grid manually: ceil(T / B)
        grid = (triton.cdiv(T, B_SIZE),)
        
        # EXPLICIT CALL:
        # We pass T, D as positional args, and B as a keyword arg (constexpr).
        # This satisfies the kernel's signature "def kernel(..., T, D, B: tl.constexpr)"
        raw_kernel[grid](x, y, z, T, D, B=B_SIZE)
        
        return z

    # 3. Overwrite the library function
    fla.modules.activations.swiglu_fwd = manual_swiglu_fwd
    print("   ‚úÖ Patch applied. Autotuner bypassed.")

patch_swiglu()

# --- STANDARD TRITON FILE LOCK FIX ---
import os
import triton.runtime.cache
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.environ['TRITON_WORKER_COUNT'] = '1'

def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0: return filepath
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except: pass
    return filepath
triton.runtime.cache.FileCacheManager.put = simple_put
print("‚úÖ File Lock Patch Applied.")

# --- NOW RUN EVALUATION ---
import fla
from lm_eval import simple_evaluate
# ... Paste your evaluation code here ...

üîß Applying manual patch to 'swiglu_fwd'...
   ‚úÖ Patch applied. Autotuner bypassed.
‚úÖ File Lock Patch Applied.


In [3]:
import sys
import torch
import triton
import triton.language as tl
import fla.modules.activations

# --- WINDOWS STABILITY FIX: DISABLE AUTOTUNER ---

def strip_autotuner():
    print("üîß Patching FLA kernels to bypass Autotuner crash...")
    
    # 1. Target the failing kernel object
    #    The 'swiglu_fwd_kernel' is currently wrapped in an Autotuner object.
    #    This wrapper is what causes the IndexError.
    autotuner_wrapper = fla.modules.activations.swiglu_fwd_kernel
    
    # 2. Extract the raw JIT function
    #    Every Autotuner wraps a raw '.fn'. We want that raw function.
    if hasattr(autotuner_wrapper, 'fn'):
        raw_kernel = autotuner_wrapper.fn
        
        # 3. Overwrite the library's reference
        #    Now, when FLA calls this kernel, it calls the raw compiler directly.
        fla.modules.activations.swiglu_fwd_kernel = raw_kernel
        print("   ‚úÖ Successfully stripped Autotuner from 'swiglu_fwd_kernel'")
    else:
        print("   ‚ö†Ô∏è Kernel appeared to be already patched or different type.")

strip_autotuner()

# --- STANDARD FILE LOCK PATCH (Required) ---
import os
import triton.runtime.cache
local_cache = os.path.join(os.getcwd(), "triton_cache")
os.environ['TRITON_CACHE_DIR'] = local_cache
os.environ['TRITON_WORKER_COUNT'] = '1'

def simple_put(self, data, filename, binary=True):
    filepath = os.path.join(self.cache_dir, filename)
    if not isinstance(data, (str, bytes)): data = str(data)
    if binary:
        mode = "wb"; data = data.encode('utf-8') if isinstance(data, str) else data
    else:
        mode = "w"; data = data.decode('utf-8') if isinstance(data, bytes) else data
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0: return filepath
    temp_path = filepath + f".tmp.{os.getpid()}"
    try:
        with open(temp_path, mode) as f: f.write(data)
        os.replace(temp_path, filepath)
    except: pass
    return filepath
triton.runtime.cache.FileCacheManager.put = simple_put
print("‚úÖ File Lock Patch Applied.")

# --- NOW RUN EVALUATION ---
import fla
from lm_eval import simple_evaluate
# ... Paste your evaluation code here ...

üîß Patching FLA kernels to bypass Autotuner crash...
   ‚úÖ Successfully stripped Autotuner from 'swiglu_fwd_kernel'
‚úÖ File Lock Patch Applied.


In [5]:
# ==============================================================================
# üõ†Ô∏è AUTO-PATCHER: PERMANENTLY FIX TRITON ON DISK
# ==============================================================================
import os
import triton.runtime.cache

# 1. Find the file
cache_file = os.path.abspath(triton.runtime.cache.__file__)
print(f"üìç Targeting file: {cache_file}")

# 2. Read the content
with open(cache_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# 3. Apply the patch
new_lines = []
patched = False

for line in lines:
    # We are looking for the exact line causing the crash:
    if "os.replace(temp_path, filepath)" in line and "try:" not in line:
        indent = line.split("os.replace")[0] # Keep indentation
        
        # We replace the single line with a robust Try/Except block
        new_lines.append(f"{indent}# --- WINDOWS FIX (AUTO-PATCHED) ---\n")
        new_lines.append(f"{indent}try:\n")
        new_lines.append(f"{indent}    os.replace(temp_path, filepath)\n")
        new_lines.append(f"{indent}except OSError as e:\n")
        new_lines.append(f"{indent}    # WinError 5 = Access Denied (File is locked/loaded)\n")
        new_lines.append(f"{indent}    if getattr(e, 'winerror', None) == 5 or e.errno == 13:\n")
        new_lines.append(f"{indent}        pass # File exists and is locked. Assume success.\n")
        new_lines.append(f"{indent}    else:\n")
        new_lines.append(f"{indent}        raise e\n")
        new_lines.append(f"{indent}# --------------------------------\n")
        patched = True
    else:
        new_lines.append(line)

# 4. Save changes
if patched:
    try:
        with open(cache_file, "w", encoding="utf-8") as f:
            f.writelines(new_lines)
        print("\n‚úÖ SUCCESS: Library file patched on disk.")
        print("   The PermissionError is now physically impossible.")
    except PermissionError:
        print("\n‚ùå ERROR: Could not write to disk. Run VS Code/Jupyter as Administrator.")
else:
    if any("WINDOWS FIX" in line for line in lines):
        print("\n‚úÖ File was ALREADY patched. You are good.")
    else:
        print("\n‚ö†Ô∏è Warning: Could not find the line to patch. Check file manually.")

üìç Targeting file: D:\Users\Louis\PycharmProjects\Master_thesis\Babilong_Benchmark\.venv\Lib\site-packages\triton\runtime\cache.py

‚úÖ SUCCESS: Library file patched on disk.
   The PermissionError is now physically impossible.
