In [1]:
%%capture
! pip install -U torch transformers datasets bitsandbytes peft huggingface_hub

**In this notebook we use QLoRA to fine-tune Llama 3.1 8B on Marcus Aurelius' *Meditations* and see how well the model can mimic the Roman emperor.**

In [2]:
import huggingface_hub

# Hub authentication is disabled here; uncomment the call below to log in
# interactively (presumably needed to download the Llama weights -- confirm).
#huggingface_hub.login()

In [3]:
# Load the 4-bit quantized base model and its tokenizer.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

modelname = 'meta-llama/Llama-3.1-8B-Instruct'

# Quantization settings, built once and handed to `from_pretrained` below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights with 4-bit quantization
    bnb_4bit_quant_type='nf4',              # quantization type
    bnb_4bit_use_double_quant=True,         # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

model = AutoModelForCausalLM.from_pretrained(
    modelname,
    quantization_config=bnb_config,
    device_map={"": 0},          # put the whole model on device 0
    torch_dtype=torch.bfloat16,  # default precision
)

tokenizer = AutoTokenizer.from_pretrained(modelname)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Display the module tree to see the layer names (e.g. q_proj, k_proj, v_proj)
# that the LoRA configuration can target.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [5]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for causal language modelling. Only the attention
# query/key/value projections receive adapters; 'o_proj' is deliberately left out.
peft_config = LoraConfig(
    r=4,                    # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['q_proj', 'k_proj', 'v_proj'],
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,   # configure for training rather than inference
)

# Wrap the quantized base model with the adapters and report how many
# parameters are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 8,032,620,544 || trainable%: 0.0294


In [6]:
# Read the whole text and split it into paragraphs on blank lines. The encoding
# is given explicitly so the result does not depend on the platform's locale.
with open('/kaggle/input/meditations-marcus-aurelius/meditations.txt', encoding='utf-8') as f:
    paragraphs = f.read().split('\n\n')

# Flatten each paragraph onto a single line and drop the 'BOOK ...' headings.
# NOTE(review): the first entries are still archive boilerplate (provenance
# line, title, translator, separator) and end up in the training data --
# consider filtering them out as well.
data = [
    paragraph.replace('\n', ' ')
    for paragraph in paragraphs
    if not paragraph.startswith('BOOK')
]
data[:5]

['Provided by The Internet Classics Archive. See bottom for copyright. Available online at     http://classics.mit.edu//Antoninus/meditations.html',
 'The Meditations By Marcus Aurelius',
 ' Translated by George Long',
 '----------------------------------------------------------------------',
 'From my grandfather Verus I learned good morals and the government of my temper. ']

In [7]:
from datasets import Dataset

# Wrap the list of paragraphs in a Dataset with a single "text" column.
dataset = Dataset.from_dict(dict(text=data))
dataset

Dataset({
    features: ['text'],
    num_rows: 525
})

In [8]:
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(sample, padding=True, return_tensors='pt', max_length=128):
    """Tokenize one text or a batch of texts, truncating to `max_length` tokens.

    Args:
        sample: a string or list of strings to tokenize.
        padding: forwarded to the tokenizer; the default pads to the longest
            sequence in `sample`.
        return_tensors: forwarded to the tokenizer; the default returns
            PyTorch tensors, pass None to get plain Python lists.
        max_length: truncation length in tokens.

    Returns:
        The tokenizer output containing `input_ids` and `attention_mask`.
    """
    return tokenizer(
        sample,
        return_tensors=return_tensors,
        padding=padding,
        truncation=True,
        max_length=max_length,
    )

# Store the sequences unpadded and as plain lists: padding here would stretch
# every row to the longest row of the map batch, whereas the data collator used
# for training pads each training batch only as far as it needs.
tokenized_dataset = dataset.map(
    lambda batch: tokenize(batch['text'], padding=False, return_tensors=None),
    batched=True,
    remove_columns=['text'],
)
tokenized_dataset

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 525
})

In [9]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Rebind `model` to a DataParallel wrapper on multi-GPU machines, as before,
# but never wrap twice when this cell is re-run.
# NOTE(review): Trainer parallelises across GPUs on its own, so this wrapper is
# only kept for code that may already rely on `model` being wrapped -- drop it
# if nothing else needs it.
if torch.cuda.device_count() > 1 and not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)

# NOTE(review): the model is loaded with bfloat16 as its compute dtype while
# training uses fp16 mixed precision -- confirm this combination is intended.
training_args = TrainingArguments(
    output_dir='output',
    remove_unused_columns=False,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch = 4 accumulated steps per optimizer update
    learning_rate=2e-4,
    num_train_epochs=5,
    fp16=True,
    report_to='none',
)

trainer = Trainer(
    # Give Trainer the underlying PEFT model rather than the DataParallel
    # wrapper, so it is not wrapped a second time and Trainer can treat it as
    # a PEFT model (e.g. when saving checkpoints).
    model=model.module if isinstance(model, torch.nn.DataParallel) else model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # Causal-LM collation (mlm=False): pads each batch and builds the labels.
    data_collator=DataCollatorForLanguageModeling(
        tokenizer,
        mlm=False,
        return_tensors='pt',
    ),
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run the fine-tuning loop configured above (5 epochs per `training_args`).
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
