In [6]:
import subprocess
import os

def parse_env_lines(env_text):
    """Parse ``KEY=VALUE`` lines, as printed by ``env``, into a dict.

    Lines without ``=`` are skipped. Each line is split on the first ``=``
    only, so values that themselves contain ``=`` are preserved intact.

    Args:
        env_text: Multi-line string of ``KEY=VALUE`` entries.

    Returns:
        dict mapping variable names to their string values.
    """
    parsed = {}
    for line in env_text.splitlines():
        if '=' in line:
            var, value = line.split('=', 1)
            parsed[var] = value
    return parsed


# Source /etc/network_turbo in a bash subshell and copy the proxy-related
# variables it exports into this process's environment. This runs once: the
# snippet used to be pasted twice, which only repeated the same work.
# bash is invoked directly (argument list) instead of through shell=True, which
# wrapped the command in a second, redundant shell.
try:
    result = subprocess.run(
        ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
        capture_output=True,
        text=True,
    )
    output = result.stdout
except OSError as exc:
    # Best effort: without bash no proxy can be configured, but the rest of the
    # notebook can still run.
    print(f"Skipping proxy setup: {exc}")
    result = None
    output = ""

# When the script is missing or exports no proxy variables, stdout is empty
# and the environment is left unchanged.
os.environ.update(parse_env_lines(output))

import os
import math
import inspect
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from data_collactor import DataCollator

# --- Configuration ---
# Hugging Face model id loaded by main() for both the model and the tokenizer.
MODEL_ID = "deepseek-ai/deepseek-coder-7b-instruct-v1.5" # Or "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
# JSON files passed to load_dataset() as the "train" and "validation" splits.
DATA_PATH = "lora_data_train.json"
VAL_DATA_PATH = "lora_data_val.json"
# Trainer output directory; main() also saves the adapter to OUTPUT_DIR/final_adapter.
OUTPUT_DIR = "./results_qlora"

# LoRA adapter parameters (main() loads the base model in 8-bit).
# Forwarded to LoraConfig as r, lora_alpha and lora_dropout respectively.
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.05


def perplexity_from_loss(loss):
    """Return ``exp(loss)``, the perplexity corresponding to an eval loss.

    Returns ``float("inf")`` instead of raising ``OverflowError`` when the
    loss is too large for ``math.exp`` (e.g. a diverged run).
    """
    try:
        return math.exp(loss)
    except OverflowError:
        return float("inf")


def report_eval(stage, metrics):
    """Print the eval loss and perplexity for one evaluation pass.

    Args:
        stage: Label used as the message prefix ("Baseline" or "Final").
        metrics: Dict returned by ``trainer.evaluate()``.

    Returns:
        The value of ``metrics["eval_loss"]``, or ``None`` when that key is
        absent (in which case the raw metrics dict is printed instead).
    """
    loss = metrics.get("eval_loss")
    if loss is not None:
        print(f"{stage} eval_loss: {loss:.4f} | ppl: {perplexity_from_loss(loss):.2f}")
    else:
        print(f"{stage} eval metrics: {metrics}")
    return loss


def main():
    """Fine-tune MODEL_ID with LoRA adapters on an 8-bit quantized base model.

    Steps: load the quantized model and tokenizer, load the train/validation
    JSON datasets, build the collator and LoRA config, evaluate once before
    training, train, evaluate again, report the loss improvement, and save the
    result under ``OUTPUT_DIR/final_adapter``.
    """
    print(f"Loading model: {MODEL_ID}")

    # 1. Quantization Config (8-bit loading)
    # 8-bit provides better precision than 4-bit, but uses more VRAM (~10GB vs ~6GB for 7B model).
    # BitsAndBytesConfig only defines quant_type / use_double_quant /
    # compute_dtype options for 4-bit loading (the bnb_4bit_* arguments). The
    # bnb_8bit_* keyword arguments previously passed here are not defined
    # options and had no effect, so only load_in_8bit is set.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    # 2. Load Model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map={"": 0},  # place the whole model on GPU 0
        trust_remote_code=True
    )

    # 3. Load Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # NOTE(review): reusing EOS as PAD means a collator that masks labels by
    # pad_token_id would also mask real EOS tokens - confirm against DataCollator.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right" # Fix for fp16 training

    # 4. Load Dataset
    dataset = load_dataset("json", data_files={"train": DATA_PATH, "validation": VAL_DATA_PATH})

    # 5. Define Masking (Crucial Step)
    # We want the model to learn ONLY the Assistant's response.
    # The project-local DataCollator (data_collactor module) is presumably what
    # masks the prompt tokens; no response template is passed here, only the
    # tokenizer and a max_length - TODO confirm the masking rule in that module.
    collator = DataCollator(tokenizer=tokenizer, max_length=1024)

    # 6. LoRA Configuration
    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] # Target attention layers
    )

    # 7. Training Arguments
    args = SFTConfig(
        output_dir=OUTPUT_DIR,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        logging_steps=10,
        gradient_checkpointing=True,
        save_strategy="epoch",
        eval_strategy="steps",
        eval_steps=50,
        report_to="none",
        # The custom collator consumes the raw dataset columns, so the trainer
        # must neither drop columns nor pre-tokenize the dataset itself.
        remove_unused_columns=False,
        per_device_eval_batch_size=1,
        dataset_kwargs={"skip_prepare_dataset": True},
    )

    # 8. Trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        processing_class=tokenizer,
        data_collator=collator,
        peft_config=peft_config,
        args=args
    )

    # Evaluate before any update so the post-training loss has a reference point.
    print("Running baseline eval before training...")
    base_loss = report_eval("Baseline", trainer.evaluate())

    print("Starting training...")
    trainer.train()

    print("Running eval after training...")
    final_loss = report_eval("Final", trainer.evaluate())
    if final_loss is not None and base_loss is not None:
        improvement = base_loss - final_loss
        print(f"Loss improvement: {improvement:.4f}")

    print("Saving model...")
    trainer.save_model(os.path.join(OUTPUT_DIR, "final_adapter"))

# Entry point: run the full fine-tuning pipeline. The guard is also true inside
# a notebook kernel, so executing this cell starts training (see the cell output).
if __name__ == "__main__":
    main()





Loading model: deepseek-ai/deepseek-coder-7b-instruct-v1.5


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Running baseline eval before training...




The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 100015}.


Baseline eval_loss: 1.6175 | ppl: 5.04
Starting training...


Step,Training Loss,Validation Loss,Model Preparation Time,Entropy,Num Tokens,Mean Token Accuracy
50,0.218,0.166968,0.0049,0.657846,78356.0,0.948561
100,0.2366,0.14994,0.0049,0.636915,156914.0,0.953703


Running eval after training...




Final eval_loss: 0.1493 | ppl: 1.16
Loss improvement: 1.4682
Saving model...


In [7]:
import trl, inspect, trl.trainer.utils as u
# Show the installed TRL version, then list every attribute name of
# trl.trainer.utils (bound to `u` by the import above) containing "Collator".
print("trl:", trl.__version__)
names = list(filter(lambda attr: "Collator" in attr, dir(u)))
print(names)

trl: 0.26.2
['DPODataCollatorWithPadding', 'DataCollatorForChatML', 'RewardDataCollatorWithPadding']


In [3]:
import importlib

import trl
import inspect


def is_importable(module_name, attr_name):
    """Return True when ``from module_name import attr_name`` would succeed.

    Args:
        module_name: Dotted module path, e.g. ``"trl.trainer.utils"``.
        attr_name: Name to look up on that module.

    Returns:
        False if the module cannot be imported or does not provide the name,
        True otherwise.
    """
    try:
        getattr(importlib.import_module(module_name), attr_name)
    except (ImportError, AttributeError):
        # `from m import x` reports a missing attribute as ImportError; a plain
        # getattr raises AttributeError, so both mean "not available here".
        return False
    return True


print(f"TRL version: {trl.__version__}")

# (module, class) pairs to probe, in the order they are reported.
COLLATOR_PROBES = [
    ("trl", "DataCollatorForCompletionOnlyLM"),
    ("trl.trainer", "DataCollatorForCompletionOnlyLM"),
    ("trl.trainer.utils", "DataCollatorForCompletionOnlyLM"),
    ("trl", "DataCollatorForChatML"),
    ("trl.trainer.utils", "DataCollatorForChatML"),
]

for module_name, attr_name in COLLATOR_PROBES:
    status = "found" if is_importable(module_name, attr_name) else "NOT found"
    print(f"{attr_name} {status} in {module_name}")


TRL version: 0.26.2
DataCollatorForCompletionOnlyLM NOT found in trl
DataCollatorForCompletionOnlyLM NOT found in trl.trainer
DataCollatorForCompletionOnlyLM NOT found in trl.trainer.utils
DataCollatorForChatML NOT found in trl
DataCollatorForChatML found in trl.trainer.utils


In [8]:
# Imports are repeated here so this diagnostic cell runs on a fresh kernel
# without first executing the training cell (it previously relied on
# `inspect` and `AutoTokenizer` leaking from that cell).
import inspect

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from trl.trainer.utils import DataCollatorForChatML

MODEL_ID = "deepseek-ai/deepseek-coder-7b-instruct-v1.5" # Or "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
DATA_PATH = "lora_data_train.json"
VAL_DATA_PATH = "lora_data_val.json"
OUTPUT_DIR = "./results_qlora"

# --- Leakage check: do train and validation share ids or queries? ---
dataset = load_dataset("json", data_files={"train": DATA_PATH, "validation": VAL_DATA_PATH})
train_ids = set(dataset["train"]["id"])
val_ids   = set(dataset["validation"]["id"])
print("id overlap:", len(train_ids & val_ids))

train_q = set(dataset["train"]["query"])
val_q   = set(dataset["validation"]["query"])
print("query overlap:", len(train_q & val_q))

print(len(dataset["train"]), len(dataset["validation"]))
print(dataset["train"][0]["id"], dataset["validation"][0]["id"])

# --- Label-masking check: what fraction of tokens carries a training label? ---
response_template = "<|assistant|>"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix for fp16 training

# Pass response_template only if this TRL version's constructor accepts it.
sig = inspect.signature(DataCollatorForChatML)
kwargs = {"tokenizer": tokenizer}
if "response_template" in sig.parameters:
    kwargs["response_template"] = response_template

collator = DataCollatorForChatML(**kwargs)
if not hasattr(collator, "response_template"):
    # NOTE(review): this only attaches a plain attribute; it is unclear whether
    # the collator ever reads it - confirm against the installed TRL source.
    collator.response_template = response_template
if hasattr(collator, "prompt_key"):
    # Read the prompt from the "text" column instead of raw messages.
    # NOTE(review): no formatting step producing "text" is visible in this
    # notebook - confirm the dataset actually has that column.
    collator.prompt_key = "text"


# Collate a single training example and measure how many of its positions are
# labeled (i.e. not the -100 ignore value).
dl = DataLoader(dataset["train"], batch_size=1, collate_fn=collator)
batch = next(iter(dl))
labels = batch["labels"]
print("labeled token ratio:", (labels != -100).float().mean().item())


id overlap: 0
query overlap: 37
2340 260
l2_market_92ff02cd l2_crossref_fc4c51a4
labeled token ratio: 0.09540636092424393


In [9]:
# The previous cell's output shows queries shared between train and validation
# (possible data leakage) and a low labeled-token ratio. Inspect the first
# sample of that batch to see where its labeled (non -100) span sits.
ids = batch["input_ids"][0]
labels = batch["labels"][0]
# Decoded text of the sample, kept for manual inspection.
text = tokenizer.decode(ids, skip_special_tokens=False)

# Roughly count labeled span length
labeled_positions = (labels != -100).nonzero().squeeze(-1)
print("labeled tokens:", labeled_positions.numel())
print("total tokens:", labels.numel())
if labeled_positions.numel() > 0:
    print("first labeled index:", labeled_positions[0].item(), "last:", labeled_positions[-1].item())
else:
    # A fully masked sample has no labeled positions; indexing [0] / [-1] on
    # the empty tensor would raise IndexError.
    print("no labeled tokens in this sample")


labeled tokens: 27
total tokens: 283
first labeled index: 256 last: 282
