In [1]:
from src.llm_trainer import LLMTrainer
from transformers import TrainingArguments

trainer = LLMTrainer(
    model_name="Qwen/Qwen2.5-3B",
    model_type="decoder",
    load_in_4bit=False
)

trainer.configure_lora(r=8, lora_alpha=32)

training_args = TrainingArguments(
    output_dir="outputs/logs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    fp16=True,
    fp16_full_eval=False,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    logging_steps=50,
    save_strategy="no",
    report_to="none",  
)

# trainer.train(train_ds, val_ds, "outputs/logs", training_args)
# trainer.save_lora("outputs/checkpoints/lora_qwen")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 1,843,200 || all params: 3,087,781,888 || trainable%: 0.059693335438076124


In [2]:
prompt = "Classify the sentiment: Copper prices fell due to weak demand."
print(trainer.generate(prompt=prompt, max_new_tokens=50))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Classify the sentiment: Copper prices fell due to weak demand. The sentiment of the statement "Copper prices fell due to weak demand" is negative. This is because the statement indicates that the price of copper has decreased, which is generally considered a negative outcome. Additionally, the reason given for the decrease in price


In [3]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": "../data/processed/train.json",
        "validation": "../data/processed/val.json",
        "test": "../data/processed/test.json",
    }
)


In [4]:
def format_prompt(example):
    instruction = example["instruction"]
    input_text = example["input"]
    output = example["output"]

    text = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{input_text}\n\n"
        "### Response:\n"
        f"{output}{tokenizer.eos_token}"
    )

    return {"text": text}


In [5]:
from transformers import DataCollatorForLanguageModeling

tokenizer = trainer.tokenizer

dataset = dataset.map(
    format_prompt,
    batched=False,          # ðŸ‘ˆ IMPORTANT
    num_proc=1,             # ðŸ‘ˆ IMPORTANT
    desc="Formatting prompts"
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,   # causal LM
)

In [6]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=128,
        padding=False,   # dynamic padding later
    )

tokenized_ds = dataset.map(
    tokenize,
    batched=True,
    batch_size=128,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing",
)

In [7]:
def add_token_length(batch):
    return {
        "token_len": [len(ids) for ids in batch["input_ids"]]
    }

tokenized_ds = tokenized_ds.map(
    add_token_length,
    batched=True,
)

import numpy as np

lengths = tokenized_ds["train"]["token_len"]

print({
    "min": min(lengths),
    "max": max(lengths),
    "mean": np.mean(lengths),
    "p95": np.percentile(lengths, 95),
})


{'min': 24, 'max': 128, 'mean': 51.29545454545455, 'p95': 84.0}


In [8]:
trainer.train(
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    training_args=training_args,
    data_collator=data_collator
)
trainer.save_lora_adapters("outputs/checkpoints/lora_qwen")

RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'

In [None]:
trainable = [
    (n, p.requires_grad)
    for n, p in trainer.model.named_parameters()
    if p.requires_grad
]

print(f"Trainable params: {len(trainable)}")
trainable[:10]
