In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `src` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical warnings
# from torch/transformers/peft — consider narrowing it by category or module.
warnings.filterwarnings("ignore")

In [3]:
import sys
from pathlib import Path

# Make the repository root importable so that `src.*` resolves from this notebook.
# The path is resolved to an absolute location once (so a later change of the
# working directory cannot redirect it) and is only appended when missing, which
# keeps the cell idempotent when it is re-run.
PROJECT_ROOT = str(Path("..").resolve())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [4]:
import os
import json
import pandas as pd
import warnings
import random
import gc
from typing import Dict, List


In [5]:

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    set_seed,
    GenerationConfig
)
from peft import LoraConfig, get_peft_model
import evaluate

In [6]:
from src.preprocess.deer import DeerToTriplets

In [7]:
from src.preprocess.utils import to_text

In [8]:
from src.models.inference.inference import build_inference_prompt, generate_answers_with, PromptAnswerCollator

In [9]:
from src.models.evaluate import eval_metrics, print_compare

In [10]:
from src.utils.io.read import RawDataReader

In [11]:
from src.settings import Settings

In [12]:
import src.config as cfg 

In [13]:
# Reader for the raw datasets, rooted at the path configured in Settings.
rdr = RawDataReader(Settings.paths.RAW_DATA_PATH)

In [14]:
# Load the IR triplets; this collection is later split into train/validation.
# presumably a list of dict-like records — TODO confirm against RawDataReader
ir_triplets_dataset = rdr.read_ir_triplets()

In [15]:
# Load the DEER dataset, which is converted to triplets in the following cells.
deer_dataset = rdr.read_deer()

In [16]:
# Converter that turns DEER records into the triplet format.
deer_to_triplets_converter = DeerToTriplets()

In [17]:
# Run the conversion; the result is read back from the converter's `.triplets`
# attribute in a later cell.
# presumably `process` stores its output on the instance — TODO confirm
deer_to_triplets_converter.process(deer_dataset)

In [18]:
# Export the Hub token from the project config. The original variable name is
# kept because a later cell reads it back from os.environ.
os.environ["HUGGINGFACE_HUB_TOKEN"] = cfg.HUGGINGFACE_HUB_TOKEN
# huggingface_hub itself looks for `HF_TOKEN`, not `HUGGINGFACE_HUB_TOKEN`, so
# also export it under the recognised name. setdefault leaves any token that is
# already present in the environment untouched.
os.environ.setdefault("HF_TOKEN", cfg.HUGGINGFACE_HUB_TOKEN)
# NOTE(review): make sure src/config.py obtains the token from the environment
# or a secrets store rather than from a literal committed to the repository.

In [19]:
# Read the token back from the environment; it is passed as `token=` when the
# tokenizer is loaded. Raises KeyError if the variable has not been set.
HF_TOKEN = os.environ["HUGGINGFACE_HUB_TOKEN"]

In [20]:
# Confirm that a token is loaded without echoing the secret itself: a bare
# `HF_TOKEN` expression writes the credential into the saved notebook output.
print(f"HF token loaded: {bool(HF_TOKEN)}")

'hf_***REDACTED***'

In [21]:
# NOTE(review): this cell repeats imports (and the warnings filter) that were
# already executed in earlier cells; it is redundant and can be merged into the
# single import cell at the top of the notebook.
import os
import json
import pandas as pd
import warnings
import random
import gc
from typing import Dict, List

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    set_seed,
    GenerationConfig
)
from peft import LoraConfig, get_peft_model
import evaluate

# Silence all warnings (same blanket filter as at the top of the notebook).
warnings.filterwarnings("ignore")

In [22]:
from huggingface_hub import login, notebook_login

# Authenticate against the Hugging Face Hub. A token is already available in
# HF_TOKEN, so log in non-interactively; this keeps "Restart & Run All" from
# stalling on a widget. The interactive widget is only used as a fallback when
# the token is empty.
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:


# Triplets produced by the DEER converter.
# NOTE(review): in the visible notebook `val_data` is only referenced from a
# commented-out line of the split cell, so it is currently unused.
val_data = deer_to_triplets_converter.triplets

In [24]:
# --- Run configuration -------------------------------------------------------
# Every value comes from src.config, so the notebook has a single place to tune.
MODEL_ID = cfg.MODEL_ID
OUTPUT_DIR = cfg.OUTPUT_DIR
SEED = cfg.SEED
VAL_FRACTION = cfg.VAL_FRACTION
MAX_SEQ_LEN = cfg.MAX_SEQ_LEN

# --- LoRA adapter hyper-parameters -------------------------------------------
LORA_R = cfg.LORA_R
LORA_ALPHA = cfg.LORA_ALPHA
LORA_DROPOUT = cfg.LORA_DROPOUT
TARGET_MODULES = cfg.TARGET_MODULES

# --- Optimisation, logging and generation settings ---------------------------
LR = cfg.LR
EPOCHS = cfg.EPOCHS
BATCH_SIZE = cfg.BATCH_SIZE
GRAD_ACCUM = cfg.GRAD_ACCUM
LOG_STEPS = cfg.LOG_STEPS
EVAL_STEPS = cfg.EVAL_STEPS
SAVE_STEPS = cfg.SAVE_STEPS
GEN_MAX_NEW_TOKENS = cfg.GEN_MAX_NEW_TOKENS


In [25]:
# `data` is an alias of ir_triplets_dataset, not a copy: any in-place operation
# on `data` also affects ir_triplets_dataset.
data = ir_triplets_dataset 

In [26]:


set_seed(SEED)

# Split data.
# Shuffle a *copy*: shuffling `data` in place would also reorder the loaded
# dataset it aliases, so re-running this cell would start from an
# already-shuffled list and silently produce a different train/validation
# split. With a copy, the split depends only on SEED and the loaded order.
shuffled = list(data)
random.Random(SEED).shuffle(shuffled)
split_idx = int(len(shuffled) * (1 - VAL_FRACTION))
train_raw, val_raw = shuffled[:split_idx], shuffled[split_idx:]

# Alternative set-up (disabled): train on all of `data` and validate on the
# DEER-derived triplets instead:
# train_raw = data
# val_raw = val_data

print(f"Training examples: {len(train_raw)}")
print(f"Validation examples: {len(val_raw)}")


Training examples: 1264
Validation examples: 543


In [27]:
 

# Convert every raw example with `to_text` and wrap both splits in a DatasetDict.
train_ds = Dataset.from_list([to_text(example) for example in train_raw])
val_ds = Dataset.from_list([to_text(example) for example in val_raw])
ds = DatasetDict({"train": train_ds, "validation": val_ds})


def _observations_of(prompt):
    """Return the text before the question marker, without the observations header."""
    head = prompt.split("Question:\n")[0]
    return head.replace("Training Observations:\n", "").strip()


def _question_of(prompt):
    """Return the text between the question marker and the answer marker."""
    tail = prompt.split("Question:\n")[1]
    return tail.split("\n\nAnswer:\n")[0].strip()


# Recover observations, questions and references from the validation split;
# these drive generation and scoring for both the baseline and fine-tuned model.
val_prompts = ds["validation"]["prompt"]
val_obs = [_observations_of(prompt) for prompt in val_prompts]
val_qs = [_question_of(prompt) for prompt in val_prompts]
val_refs = [reference.strip() for reference in ds["validation"]["response"]]

# Tokenizer for MODEL_ID; fall back to the EOS token when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Metric objects handed to eval_metrics later on.
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")



In [28]:
# Load the untouched base model that serves as the comparison baseline.
baseline_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    # NOTE(review): trust_remote_code executes code shipped with the model
    # repository — confirm that MODEL_ID points to a trusted source.
    trust_remote_code=True,
    device_map="auto",
)
# Optional: generation could additionally be wrapped in a bfloat16 autocast context.


In [None]:
# Generate one answer per validation (observations, question) pair with the
# baseline model; the predictions are scored against val_refs in the next cell.
print("Generating baseline predictions...")
baseline_preds = generate_answers_with(baseline_model,tokenizer,val_obs, val_qs)

In [30]:

# Score the baseline predictions against the references with the loaded
# ROUGE and sacreBLEU metric objects.
baseline_metrics = eval_metrics(baseline_preds, val_refs, rouge, bleu)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:


print("Baseline metrics:", baseline_metrics)

# Save baseline predictions as JSON Lines (one record per validation example).
os.makedirs(OUTPUT_DIR, exist_ok=True)
baseline_path = os.path.join(OUTPUT_DIR, "val_predictions_baseline.jsonl")
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default (which can fail
# or produce mojibake on systems whose default is not UTF-8).
with open(baseline_path, "w", encoding="utf-8") as f:
    for obs, q, ref, pred in zip(val_obs, val_qs, val_refs, baseline_preds):
        record = {
            "Training Observations": obs,
            "Question": q,
            "Reference": ref,
            "Prediction": pred,
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

# baseline_model is intentionally NOT deleted here: it is analysed again later
# in the notebook. The calls below therefore only release unreferenced garbage
# and cached CUDA blocks, not the model weights themselves.
gc.collect()
torch.cuda.empty_cache()


Baseline metrics: {'rouge1': np.float64(0.02747859826509425), 'rouge2': np.float64(0.002061315914361044), 'rougeL': np.float64(0.022668664160191032), 'rougeLsum': np.float64(0.022727330004289076), 'bleu': 0.02064748741328389}


In [32]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Re-create the tokenizer that is used for fine-tuning and later generation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, token=HF_TOKEN)
# A fresh instance does not carry over attributes set on the previous one, so
# re-apply the pad-token fallback; batching/padding needs a pad token for
# models that do not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [33]:
# Report the installed torch version, the CUDA version torch was built with
# (None for CPU-only builds) and whether a CUDA device is usable.
print("torch:", torch.__version__, "cuda:", torch.version.cuda, "gpu?", torch.cuda.is_available())


torch: 2.8.0 cuda: None gpu? False


In [34]:

# 4-bit NF4 quantization settings with double quantization; the compute dtype
# is bfloat16 when CUDA is available and float32 otherwise.
# NOTE(review): this config is not passed to any active from_pretrained call in
# this notebook. bitsandbytes 4-bit loading presumably needs a CUDA device,
# which the recorded run does not have — confirm before enabling it.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

In [35]:
# Same environment report as the earlier torch/CUDA print.
# NOTE(review): `sys` and `platform` are imported here but not used in this cell.
import torch, sys, platform
print("Torch:", torch.__version__, "CUDA build:", torch.version.cuda, "GPU available:", torch.cuda.is_available())


Torch: 2.8.0 CUDA build: None GPU available: False


In [39]:

# Disabled alternative: load the model 4-bit quantized with `bnb_cfg`.
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     quantization_config=bnb_cfg,
#     device_map="auto",
#     low_cpu_mem_usage=True,
# )

# Active path: load the model to be fine-tuned in full float32 precision.
# No device_map is passed, so the default device placement applies.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # Load in float32 first
    # Don't use device_map for now to avoid potential issues
)


In [40]:



# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID
# )

In [41]:
# Step 2: prepare the model for fine-tuning.
# The model itself is loaded in an earlier cell; a disabled duplicate of that
# load used to live here:
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float32,  # Load in float32 first
#     # Don't use device_map for now to avoid potential issues
# )



# Manual move to GPU (disabled):
# model = model.cuda()

# LoRA adapter configuration, driven by the constants from the config cell.
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES
)

# NOTE(review): this rebinds `model` to the PEFT-wrapped model, so re-running
# the cell without reloading the base model would wrap an already wrapped model.
model = get_peft_model(model, peft_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [42]:


# Debug: report which parameters will receive gradients.
print("Checking gradient requirements:")
trainable_params = 0
total_params = 0
for name, param in model.named_parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
        print(f"  Trainable: {name} - shape: {param.shape}")
    else:
        print(f"  Frozen: {name} - shape: {param.shape}")

print(f"Trainable: {trainable_params}, Total: {total_params}")

# Explicitly enable gradients for every parameter whose name contains "lora".
for name, param in model.named_parameters():
    if 'lora' in name.lower():
        param.requires_grad_(True)
        print(f"Enabled gradients for: {name}")

print("Trainable parameters:")
model.print_trainable_parameters()

# Training arguments.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=2,  # Reduced for float32 memory usage
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    load_best_model_at_end=False,
    gradient_checkpointing=False,  # Disable to avoid potential LoRA conflicts
    report_to="none",
    # Trainer seeds the RNGs from `args.seed`; without passing it, the library
    # default would be used and the configured SEED ignored during training.
    seed=SEED,
    # Mixed precision is left disabled to avoid LoRA compatibility issues.
    # Pinned host memory only helps (and is only supported) with CUDA, so
    # enable it only when a CUDA device is actually present.
    dataloader_pin_memory=torch.cuda.is_available(),
    dataloader_num_workers=0,  # Set to 0 to avoid potential issues
    # Keep the raw columns so the custom collator still receives them.
    remove_unused_columns=False,
    prediction_loss_only=True,
    eval_strategy="no",  # Disable evaluation during training for speed
)

# Data collator that builds the training batches from dataset records.
data_collator = PromptAnswerCollator(tokenizer, max_seq_len=MAX_SEQ_LEN)

# Smoke-test the collator on two examples before handing it to the Trainer.
print("Testing data collator...")
test_batch = data_collator([ds["train"][0], ds["train"][1]])
print(f"Batch shapes: {[f'{k}: {v.shape}' for k, v in test_batch.items()]}")

# Trainer (training only; no eval dataset because evaluation is disabled above).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    data_collator=data_collator,
)

# Train the model
print("Starting fine-tuning...")
trainer.train()


Checking gradient requirements:
  Frozen: base_model.model.model.embed_tokens.weight - shape: torch.Size([151936, 896])
  Frozen: base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight - shape: torch.Size([896, 896])
  Frozen: base_model.model.model.layers.0.self_attn.q_proj.base_layer.bias - shape: torch.Size([896])
  Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight - shape: torch.Size([16, 896])
  Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight - shape: torch.Size([896, 16])
  Frozen: base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight - shape: torch.Size([128, 896])
  Frozen: base_model.model.model.layers.0.self_attn.k_proj.base_layer.bias - shape: torch.Size([128])
  Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight - shape: torch.Size([16, 896])
  Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight - shape: torch.Size([128, 16]

Step,Training Loss
10,3.8363
20,3.4726
30,3.1338
40,2.9612
50,2.798
60,2.904
70,2.7451
80,2.7221
90,2.6996
100,2.498


Generating fine-tuned predictions...


TypeError: generate_answers_with() missing 1 required positional argument: 'q_list'

In [None]:

# Step 3: Evaluate fine-tuned model
# Generate answers for the same validation inputs as the baseline so the two
# sets of predictions can be compared example by example.
print("Generating fine-tuned predictions...")
finetuned_preds = generate_answers_with(model, tokenizer, val_obs, val_qs)

In [44]:

# Score the fine-tuned predictions with the same metric objects as the baseline.
finetuned_metrics = eval_metrics(finetuned_preds, val_refs, rouge, bleu)

print("Fine-tuned metrics:", finetuned_metrics)

# Save fine-tuned predictions as JSON Lines (one record per validation example).
finetuned_path = os.path.join(OUTPUT_DIR, "val_predictions_finetuned.jsonl")
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open(finetuned_path, "w", encoding="utf-8") as f:
    for obs, q, ref, pred in zip(val_obs, val_qs, val_refs, finetuned_preds):
        record = {
            "Training Observations": obs,
            "Question": q,
            "Reference": ref,
            "Prediction": pred,
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Save model and tokenizer side by side so the adapter directory is self-contained.
adapter_dir = os.path.join(OUTPUT_DIR, "adapter")
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# Print comparison
print_compare(baseline_metrics, finetuned_metrics)

print(f"\nSaved outputs in: {OUTPUT_DIR}")
print("Baseline preds: val_predictions_baseline.jsonl")
print("Finetuned preds: val_predictions_finetuned.jsonl")
print("LoRA adapter: adapter/")

# Print some example outputs for inspection (at most three; observations are
# truncated to their first 100 characters).
print("\n===== Example Outputs =====")
for i in range(min(3, len(val_obs))):
    print(f"\n--- Example {i+1} ---")
    print(f"Observations: {val_obs[i][:100]}...")
    print(f"Question: {val_qs[i]}")
    print(f"Reference: {val_refs[i]}")
    print(f"Baseline: {baseline_preds[i]}")
    print(f"Fine-tuned: {finetuned_preds[i]}")

print("\n===== Training Complete =====")
print("Check the example outputs above to see if fine-tuning improved the answers.")
print("Training loss should have decreased during fine-tuning.")

Fine-tuned metrics: {'rouge1': np.float64(0.04165053704125564), 'rouge2': np.float64(0.007719141805907616), 'rougeL': np.float64(0.03287722479511852), 'rougeLsum': np.float64(0.032829857208427966), 'bleu': 0.2661169413763071}

===== OOS Metrics (Baseline vs Fine-tuned) =====
rouge1      base:  0.027   ft:  0.042   Δ: +0.014
rouge2      base:  0.002   ft:  0.008   Δ: +0.006
rougeL      base:  0.023   ft:  0.033   Δ: +0.010
rougeLsum   base:  0.023   ft:  0.033   Δ: +0.010
bleu        base:  0.021   ft:  0.266   Δ: +0.245

Saved outputs in: swiss_apertus_8b_ft_inductive_smalllm_compare
Baseline preds: val_predictions_baseline.jsonl
Finetuned preds: val_predictions_finetuned.jsonl
LoRA adapter: adapter/

===== Example Outputs =====

--- Example 1 ---
Observations: High persistence, non-stationarity (ADF p≈0.41), clustered 2008–09 change-points, weak seasonality, ...
Question: What best explains the observed dynamics: calendar effects, random noise, or macro regime shifts?
Reference: Macro

In [45]:
import weightwatcher as ww
# Analyse the baseline model with WeightWatcher and reduce the per-layer
# details to a summary for comparison with the fine-tuned model.
# NOTE(review): this cell requires `baseline_model` to still be in memory.
base_watcher = ww.WeightWatcher(model=baseline_model)
base_details = base_watcher.analyze()
base_summary = base_watcher.get_summary(base_details)



Running on macOS.
Running on macOS.


In [46]:
# Same WeightWatcher analysis for the fine-tuned model.
# NOTE(review): `model` is the PEFT-wrapped model loaded in float32 while the
# baseline was loaded in bfloat16; the analysed layer sets presumably differ
# (LoRA matrices may be included), so confirm the summaries are comparable.
watcher = ww.WeightWatcher(model=model)
details = watcher.analyze()
summary = watcher.get_summary(details)

In [47]:
# Display the WeightWatcher summary of the baseline model.
base_summary

{'log_norm': np.float64(2.557011775868817),
 'alpha': np.float64(6.332740802794254),
 'alpha_weighted': np.float64(3.8719353960280394),
 'log_alpha_norm': np.float64(4.251105918200468),
 'log_spectral_norm': np.float64(0.7483215650030496),
 'stable_rank': np.float64(84.57115935377053)}

In [48]:
# Display the WeightWatcher summary of the fine-tuned (LoRA-wrapped) model.
summary

{'log_norm': np.float64(1.2556405013247842),
 'alpha': np.float64(6.067527066402183),
 'alpha_weighted': np.float64(0.44905566868882674),
 'log_alpha_norm': np.float64(0.8055762928364159),
 'log_spectral_norm': np.float64(-0.05616470487310674),
 'stable_rank': np.float64(44.85347772637995)}