# IIC-3670 NLP UC

LoRA paper - https://arxiv.org/abs/2106.09685
Colab Pro+ con A100

## !git clone https://github.com/gururise/AlpacaDataCleaned.git

In [1]:
ls AlpacaDataCleaned/

alpaca_data_cleaned_archive.json  eval/                    README.md
alpaca_data_cleaned.json          generate_instruction.py  requirements.txt
alpaca_data.json                  gui/                     schema.json
alpacaModifier.py                 LICENSE                  seed_tasks.jsonl
assets/                           modifierGui.py           tools/
DATA_LICENSE                      prompt.txt               utils.py
dataset_extensions/               pyproject.toml


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import transformers

print(transformers.__version__)

4.31.0


In [4]:
import datasets

print(datasets.__version__)

2.12.0


## Vamos a trabajar sobre un modelo base GPT-J. Lo vamos a alinear usando Alpaca.

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer that ships with the GPT-J base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")


# Disabled alternative: reuse the EOS token as the padding token.
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id

# Read the original Alpaca JSON file from the cloned repository.
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data.json")


def generate_prompt(data_point):
    """Render one Alpaca record as a single instruction-following prompt.

    Args:
        data_point: Mapping with the string fields "instruction", "input"
            and "output". "input" may be empty when the task needs no
            extra context.

    Returns:
        The prompt text. When "input" is non-empty the template includes an
        "### Input:" section; otherwise the shorter instruction-only
        template is used. The expected answer follows "### Response:".
    """
    # taken from https://github.com/tloen/alpaca-lora
    # The template choice depends on the optional "input" field. Branching on
    # "instruction" would leave an empty "### Input:" section in every record
    # that has no input.
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""


# Add a "prompt" column holding the tokenizer output for each rendered prompt.
data = data.map(
    lambda record: {"prompt": tokenizer(generate_prompt(record))}
)

# Bare expression so the notebook displays the resulting dataset summary.
data

Found cached dataset json (/home/marcelo/.cache/huggingface/datasets/json/default-635f23300096765a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/marcelo/.cache/huggingface/datasets/json/default-635f23300096765a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-1cc3e361d62b331c.arrow


DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input', 'prompt'],
        num_rows: 52002
    })
})

In [6]:
import torch

print(torch.__version__)

2.1.0.dev20230610+cu118


In [7]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
# Name of CUDA device 0.
print(torch.cuda.get_device_name(0))
# CUDA version string exposed by torch.version.
print(torch.version.cuda)

True
Quadro RTX 6000
11.8


In [9]:
import bitsandbytes as bnb
import peft
print(peft.__version__)

2024-05-20 10:57:00.333857: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


0.4.0.dev0


## Fine-tuning GPT using Alpaca

In [10]:
import os

#os.environ["CUDA_VISIBLE_DEVICES"] = "0" # si solo vamos a usar la primera GPU que tengamos

import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

## Aquí definimos los parámetros de LoRA

In [11]:
# Training and LoRA hyperparameters.
# NOTE(review): the previous header mentioned both A100 and 3090; the values
# below use the smaller micro-batch, with 8 suggested for an A100.
MICRO_BATCH_SIZE = 4  # change to 8 for A100
BATCH_SIZE = 128  # target number of examples per optimizer step
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE  # micro-batches per BATCH_SIZE
EPOCHS = 2  # paper uses 3
LEARNING_RATE = 2e-5  
CUTOFF_LEN = 256  # token length passed as max_length when tokenizing prompts
LORA_R = 4  # passed as r to LoraConfig
LORA_ALPHA = 16  # passed as lora_alpha to LoraConfig
LORA_DROPOUT = 0.05  # passed as lora_dropout to LoraConfig

## Tanto el modelo como el tokenizer son del modelo base

In [12]:
# Reload the base GPT-J tokenizer for the training run.
# NOTE(review): add_eos_token is an option of the LLaMA tokenizers. The training
# log reports a GPT2TokenizerFast here, so confirm this flag is honored;
# otherwise no EOS token is appended to the training examples.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B",
                                          add_eos_token=True, 
                                          )



In [13]:
# Load the GPT-J base checkpoint with 8-bit loading enabled (load_in_8bit) and
# let device_map="auto" decide the device placement of the weights.
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B",
                                  load_in_8bit=True,
                                  device_map="auto", 
                                  )


In [14]:
# Run PEFT's int8-training preparation on the 8-bit model, with gradient
# checkpointing requested.
model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)



## Configuramos LoRA: vamos a aplicar el adapter a las proyecciones q y v de un modelo causal

In [15]:
# LoRA adapter configuration, built from the constants defined in the settings cell.
config = LoraConfig(
    r=LORA_R,  # adapter rank
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],  # adapt only the query and value projections
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",  # the base model is a causal language model
)

# Wrap the prepared base model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

## Voy a trabajar con una versión procesada de Alpaca, con un poco menos de instrucciones

In [16]:
# Use token id 0 for padding so that padding differs from the EOS token.
# NOTE(review): the earlier comment called id 0 "unk", which fits a LLaMA
# vocabulary; with the GPT-2 style tokenizer used by GPT-J, id 0 is presumably
# an ordinary token — verify, since padded positions are treated specially by
# the data collator.
tokenizer.pad_token_id = 0

# Load the cleaned Alpaca JSON file from the cloned repository.
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data_cleaned.json")

Downloading and preparing dataset json/default to /home/marcelo/.cache/huggingface/datasets/json/default-fe3fd93367acf025/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/marcelo/.cache/huggingface/datasets/json/default-fe3fd93367acf025/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## Armamos los batches de datos para entrenar (son batches de instrucciones)

In [18]:
# Shuffle the examples, then tokenize each rendered prompt to a fixed length.
# A fixed seed keeps the shuffled order identical across kernel restarts, so
# the training run can be reproduced.
data = data.shuffle(seed=42).map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,  # cut prompts longer than CUTOFF_LEN tokens
        max_length=CUTOFF_LEN,
        padding="max_length",  # pad shorter prompts up to CUTOFF_LEN tokens
    )
)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

## Es un diccionario!

In [19]:
data

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'input_ids', 'attention_mask'],
        num_rows: 51760
    })
})

## El trainer declara cómo vamos a hacer el forward (en FP16). Hay un data collator. En alignment entrenamos en modo autorregresivo, por lo que mlm = False.

In [20]:
# Optimisation settings for the run; checkpoints go to "gpt-j-6B-alpaca".
training_args = transformers.TrainingArguments(
    output_dir="gpt-j-6B-alpaca",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    fp16=True,
    logging_steps=1,
    save_total_limit=3,
)

# Collator for language modelling with masked-LM mode disabled (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# use_cache is switched off on the model config before training starts.
model.config.use_cache = False
# Start from scratch rather than from a saved checkpoint.
trainer.train(resume_from_checkpoint=False)

# Write the trained (PEFT-wrapped) model to a local directory.
model.save_pretrained("gpt-j-6B-alpaca")

[34m[1mwandb[0m: Currently logged in as: [33mmarcelo-mendoza-rocha[0m ([33mmendoza-lab[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.1189
2,2.1096
3,2.0668
4,2.0562
5,2.0673
6,2.0817
7,2.0611
8,2.0388
9,2.0302
10,1.9764


wandb: Network error (ConnectionError), entering retry loop.


## Guardo los pesos del adapter LoRA en Hugging Face

In [21]:
# Upload the model to the Hub repository "mmendoza/gpt-j-6B-alpaca",
# authenticating with the stored login token (use_auth_token=True).
model.push_to_hub("mmendoza/gpt-j-6B-alpaca", use_auth_token=True)

adapter_model.bin:   0%|          | 0.00/7.38M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mmendoza/gpt-j-6B-alpaca/commit/72f44498b09fe0902782b7fd769e2f12423e3b8d', commit_message='Upload model', commit_description='', oid='72f44498b09fe0902782b7fd769e2f12423e3b8d', pr_url=None, pr_revision=None, pr_num=None)

## También los guardo local

In [22]:
# Keep a local copy, saved through the Trainer.
# NOTE(review): the ".h5" suffix suggests a single HDF5 file, but save_model
# presumably treats this path as an output directory — confirm and consider
# renaming.
trainer.save_model("gpt-j-6B-alpaca.h5")

## Aquí debemos liberar la memoria de la GPU (reiniciar el kernel)

## Para poder usar el modelo alineado, debemos combinarlo con los pesos del adapter. El tokenizer viene del base.

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig


base_model = "EleutherAI/gpt-j-6B"  # Hub id of the base checkpoint
adapter_model = "mmendoza/gpt-j-6B-alpaca"  # Hub repo holding the LoRA adapter

# Load the base weights first, then attach the trained adapter on top of them.
model = AutoModelForCausalLM.from_pretrained(base_model)
model = PeftModel.from_pretrained(model, adapter_model)
# The tokenizer is taken from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_model)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/marcelo/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-12.0/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /home/marcelo/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...


  warn(msg)
2024-05-21 10:54:48.456647: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## Coloco los pesos en CUDA para hacer inferencia

In [2]:
# Move the combined model to the GPU.
model = model.to("cuda")
# Switch to evaluation mode; as the last expression, this also makes the
# notebook display the module structure.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTJForCausalLM(
      (transformer): GPTJModel(
        (wte): Embedding(50400, 4096)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-27): 28 x GPTJBlock(
            (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): GPTJAttention(
              (attn_dropout): Dropout(p=0.0, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_fe

## Ahora puedo hacer un prompt

In [4]:
import torch

# Tokenize the prompt into PyTorch tensors and move the ids to the GPU.
inputs = tokenizer("The capital of Chile is", return_tensors="pt")
prompt_ids = inputs["input_ids"].to("cuda")

# Generation needs no gradients; the total length is capped at 48 tokens.
with torch.no_grad():
    outputs = model.generate(
        input_ids=prompt_ids,
        max_length=48,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the generated ids back to text and show the first sequence.
decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
print(decoded[0])

The capital of Chile is Santiago, which is located in the Central Valley of the country. It is the second largest city in the country, after Valparaíso. Santiago is the economic, cultural, and political center of Chile.
