# IIC-3670 NLP UC

QLoRa paper: https://arxiv.org/abs/2305.14314

In [1]:
import transformers

print(transformers.__version__)

4.31.0


In [2]:
import datasets

print(datasets.__version__)

2.12.0


## Vamos a trabajar con un modelo base cuantizado a 4b, bnb_config define los parámetros de la cuantización

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "EleutherAI/gpt-neo-2.7b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/marcelo/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-12.0/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /home/marcelo/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...


  warn(msg)
2024-05-21 15:15:37.215414: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of GPTNeoForCausalLM were not initialized from the model checkpoint at EleutherAI/gpt-neo-2.7b and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

## Usamos una configuración de LoRa

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["q_proj", "v_proj"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2621440 || all params: 1395637760 || trainable%: 0.18783097413472102


## Este dataset contiene citas célebres en Inglés

In [7]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

Found cached dataset json (/home/marcelo/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/marcelo/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-a5c0e98ac00357d9.arrow


## El trainer usa un paged_adamw_8bit, el cual es importante para QLoRa (ver paper!)

In [8]:
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmarcelo-mendoza-rocha[0m ([33mmendoza-lab[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.0564
2,2.3957
3,2.6589
4,3.295
5,2.7226
6,2.3972
7,2.4743
8,2.5937
9,2.3531
10,2.7645




TrainOutput(global_step=627, training_loss=2.345097280195075, metrics={'train_runtime': 734.4168, 'train_samples_per_second': 3.415, 'train_steps_per_second': 0.854, 'total_flos': 820073507573760.0, 'train_loss': 2.345097280195075, 'epoch': 1.0})

In [9]:
model.save_pretrained("gpt-neo-2_7B-quotes-qlora")

In [10]:
model.push_to_hub("mmendoza/gpt-neo-2_7B-quotes-qlora", use_auth_token=True)

adapter_model.bin:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mmendoza/gpt-neo-2_7B-quotes-qlora/commit/8ab35d85079b69108b990971a3c1938590164c71', commit_message='Upload model', commit_description='', oid='8ab35d85079b69108b990971a3c1938590164c71', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
trainer.save_model("gpt-neo-2_7B-quotes-qlora.h5")

## Para inferencia, debemos combinar el base con el adapter

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig


base_model = "EleutherAI/gpt-neo-2.7b"
adapter_model = "mmendoza/gpt-neo-2_7B-quotes-qlora"

model = AutoModelForCausalLM.from_pretrained(base_model)
model = PeftModel.from_pretrained(model, adapter_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

adapter_config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

In [13]:
model = model.to("cuda")
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoForCausalLM(
      (transformer): GPTNeoModel(
        (wte): Embedding(50257, 2560)
        (wpe): Embedding(2048, 2560)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-31): 32 x GPTNeoBlock(
            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (attn): GPTNeoAttention(
              (attention): GPTNeoSelfAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
                (resid_dropout): Dropout(p=0.0, inplace=False)
                (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
                (v_proj): Linear(
                  in_features=2560, out_features=2560, bias=False
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2560, out_features

## Y ahora podemos usarlo

In [21]:
text = "That is one small step for men"
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


That is one small step for men, one giant leap for mankind.



In [29]:
inputs = tokenizer("The capital of Chile is", return_tensors="pt")
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_length=7, num_beams = 4, temperature = 0.8, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

The capital of Chile is Santiago.
