In [1]:
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1

In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install wandb

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub identifier of the base checkpoint that is quantized and fine-tuned.
model_id = "Qwen/Qwen2.5-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map {"": 0} places the entire model on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [4]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the model before handing it to PEFT.
model.gradient_checkpointing_enable()
# PEFT helper that returns the model prepared for k-bit (quantized) training.
# NOTE(review): the exact adjustments it makes (e.g. freezing weights, dtype
# casts) are not visible in this notebook — confirm against the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [5]:
def _logical_numel(param):
    """Return the logical element count of ``param``.

    bitsandbytes ``Params4bit`` tensors pack two 4-bit values into every
    storage byte, so ``numel()`` under-reports their size; scale it back up
    (same correction PEFT applies when it counts parameters).
    """
    count = param.numel()
    if type(param).__name__ == "Params4bit":
        bytes_per_element = param.element_size() if hasattr(param, "element_size") else 1
        count *= 2 * bytes_per_element
    return count


def print_trainable_parameters(model):
    """Print the trainable / total parameter counts of ``model`` and their ratio.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``
            and ``requires_grad``.

    The percentage is reported as ``0.0`` for a model with no parameters
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = _logical_numel(param)
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    # Guard the division so an empty model does not crash the report.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter hyper-parameters for causal language modeling.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters, then report how many parameters will train.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [7]:
from datasets import load_dataset, DatasetDict

# Read the corpus with the "text" loader and take its "train" split directly.
dataset = load_dataset(
    "text",
    split="train",
    data_files={"train": "finetuning_data.txt"},
)

In [None]:
# Hold out 10% of the examples for validation. The seed is fixed so the same
# examples land in the validation set on every run; an unseeded split would
# reshuffle on each execution and let validation examples leak into training
# when the notebook is re-run or training is resumed.
train_val_split = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

In [None]:
def tokenize_function(examples):
    """Run the notebook's tokenizer over the ``text`` column of a batch."""
    texts = examples["text"]
    return tokenizer(texts)


# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_train = train_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)
tokenized_val = val_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

In [None]:
from itertools import chain

# Number of tokens per training sequence.
block_size = 2048

def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping of column name -> list of per-example token lists.

    Returns:
        Dict with the same keys, where each value is a list of chunks of
        exactly ``block_size`` items. The trailing remainder that does not
        fill a whole block is dropped, so a batch holding fewer than
        ``block_size`` tokens yields empty lists.
    """
    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    if not concatenated:
        return {}
    first_column = next(iter(concatenated.values()))
    # Round down to a whole number of blocks.
    total_length = (len(first_column) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    return result

# Pack each tokenized split into block_size-token sequences, 1000 examples per batch.
lm_val_datasets = tokenized_val.map(
    group_texts,
    batch_size=1000,
    batched=True,
)
lm_train_datasets = tokenized_train.map(
    group_texts,
    batch_size=1000,
    batched=True,
)

In [11]:
from transformers import DataCollatorForLanguageModeling

class MyDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    """Language-modeling collator that casts its id tensors to ``long``."""

    def __call__(self, examples):
        """Collate ``examples`` with the parent class, then cast ``input_ids``
        (and ``labels`` when present) to ``long``."""
        collated = super().__call__(examples)
        collated["input_ids"] = collated["input_ids"].long()
        if "labels" not in collated:
            return collated
        collated["labels"] = collated["labels"].long()
        return collated


# mlm=False: causal (next-token) language modeling rather than masked LM.
data_collator = MyDataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# Mixed precision should match the quantization compute dtype (bfloat16, set in
# BitsAndBytesConfig). Use bf16 whenever the GPU supports it and fall back to
# fp16 only on hardware without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 sequences
    warmup_steps=2,
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=10,
    save_total_limit=3,  # keep only the three most recent checkpoints
    report_to=["wandb"],
    run_name="finetune",
    num_train_epochs=100,
    eval_strategy="epoch",
    do_eval=True,
)

# The generation KV cache is not needed for training, so switch it off
# before the training loop starts.
model.config.use_cache = False

trainer = Trainer(
    model=model,
    train_dataset=lm_train_datasets,
    eval_dataset=lm_val_datasets,
    args=training_args,
    data_collator=data_collator,
)
trainer.train()