In [1]:
from src.DecoderTrainer import DecoderTrainer
from transformers import TrainingArguments

# Project wrapper around the base checkpoint that will be fine-tuned.
trainer = DecoderTrainer("Qwen/Qwen2.5-3B")
# LoRA hyper-parameters handed to the wrapper: rank 8, alpha 32.
trainer.configure_lora(lora_alpha=32, r=8)

# Hugging Face training configuration, collected in one place so every knob
# is visible before TrainingArguments is built.
training_config = dict(
    # Where the Trainer writes its output/logs.
    output_dir="outputs/logs",
    # One example per device per step; gradients are accumulated over 2 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    # Mixed-precision training; evaluation is not forced into full fp16.
    fp16=True,
    fp16_full_eval=False,
    num_train_epochs=2,
    # Evaluate once per epoch, log every 50 steps.
    evaluation_strategy="epoch",
    logging_steps=50,
    # No intermediate checkpoints and no external experiment trackers.
    save_strategy="no",
    report_to="none",
)
training_args = TrainingArguments(**training_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from datasets import load_dataset

# JSON file for each split, relative to the notebook's working directory.
nvd_split_files = {
    "train": "../data/nvd/processed/train.json",
    "validation": "../data/nvd/processed/val.json",
    "test": "../data/nvd/processed/test.json",
}

# Load all three splits at once with the generic "json" builder.
dataset = load_dataset("json", data_files=nvd_split_files)


In [None]:
def format_prompt(example, tokenizer):
    """Render one example as a single instruction/input/response training string.

    Args:
        example: Mapping with the keys ``"instruction"``, ``"input"`` and
            ``"output"`` (the output is the target label, e.g. "CWE-79").
        tokenizer: Object exposing ``eos_token``; that token is appended
            directly after the output.

    Returns:
        A dict with one key, ``"text"``, holding the assembled prompt.
    """
    # The three sections are separated by blank lines; the response section
    # ends with the EOS token and has no trailing newline.
    prompt = (
        f"### Instruction:\n{example['instruction']}\n\n"
        f"### Input:\n{example['input']}\n\n"
        f"### Response:\n{example['output']}{tokenizer.eos_token}"
    )
    return dict(text=prompt)


In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the tokenizer owned by the project trainer wrapper.
tokenizer = trainer.tokenizer

# format_prompt(example, tokenizer) needs the tokenizer as a second argument
# (it appends the EOS token to every example). `map` only passes the example,
# so the tokenizer is forwarded through `fn_kwargs`; without it the call
# fails with a TypeError for the missing positional argument.
dataset = dataset.map(
    format_prompt,
    fn_kwargs={"tokenizer": tokenizer},
    batched=False,          # IMPORTANT: format_prompt handles one example at a time
    num_proc=1,             # IMPORTANT: keep the mapping in a single process
    desc="Formatting prompts",
)

# Collator for causal language modelling (mlm=False).
# NOTE(review): this collator presumably masks label positions equal to the
# pad token; if the tokenizer's pad token is the same as its EOS token, the
# EOS appended by format_prompt would be excluded from the loss — confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,   # causal LM
)

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping whose ``"text"`` entry holds the prompt strings.

    Returns:
        Whatever the tokenizer call returns for those strings, truncated to
        at most 128 tokens and left unpadded.
    """
    # NOTE(review): the label sits at the very end of each prompt; if the
    # tokenizer truncates from the right, prompts longer than 128 tokens
    # would lose the response and EOS — confirm the length budget.
    encode_options = {
        "truncation": True,
        "max_length": 128,
        "padding": False,  # padding is deferred to the data collator
    }
    return tokenizer(batch["text"], **encode_options)

# Drop every pre-tokenization column (taken from the train split) so only
# the tokenizer's outputs remain in the mapped dataset.
source_columns = dataset["train"].column_names

# Tokenize all splits in batches of 128 examples.
tokenized_ds = dataset.map(
    tokenize,
    desc="Tokenizing",
    remove_columns=source_columns,
    batch_size=128,
    batched=True,
)

In [None]:
def add_token_length(batch):
    """Compute the token count of every sequence in a batch.

    Args:
        batch: Mapping whose ``"input_ids"`` entry is a list of token-id
            sequences.

    Returns:
        A dict with one key, ``"token_len"``, listing each sequence's length
        in the same order.
    """
    token_counts = list(map(len, batch["input_ids"]))
    return {"token_len": token_counts}

# Attach a "token_len" column to every split, computed batch-wise.
tokenized_ds = tokenized_ds.map(add_token_length, batched=True)

import numpy as np

# Token counts of the training split. These are measured on the tokenized
# (already truncated) sequences, so they describe what the model will see.
lengths = tokenized_ds["train"]["token_len"]

# Summary statistics: extremes, mean, and 95th percentile.
length_stats = {}
length_stats["min"] = min(lengths)
length_stats["max"] = max(lengths)
length_stats["mean"] = np.mean(lengths)
length_stats["p95"] = np.percentile(lengths, 95)
print(length_stats)


In [None]:
# Splits used for optimisation and for the per-epoch evaluation.
train_split = tokenized_ds["train"]
eval_split = tokenized_ds["validation"]

# Run fine-tuning through the project trainer wrapper.
trainer.train(
    train_dataset=train_split,
    eval_dataset=eval_split,
    training_args=training_args,
    data_collator=data_collator,
)

# Persist the LoRA adapters once training has returned.
trainer.save_lora_adapters("outputs/checkpoints/lora_qwen")

In [None]:
# Collect (name, requires_grad) for every parameter tensor that still
# receives gradients; frozen parameters are skipped.
trainable = []
for param_name, param in trainer.model.named_parameters():
    if not param.requires_grad:
        continue
    trainable.append((param_name, param.requires_grad))

# The count is the number of trainable parameter tensors, not scalar weights.
print(f"Trainable params: {len(trainable)}")
# Show the first ten entries as the cell's output.
trainable[:10]
