In [None]:
# Install latest bitsandbytes & transformers, accelerate from source
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets

In [1]:
# Load the tokenizer and the quantized base model.
# Note: downloading the LLaMA checkpoint can take a while.
# The 13B checkpoint is used here and is loaded in 4 bits.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "meta-llama/Llama-2-13b-hf"

print(f"Starting to load the model {model_name} into memory")

# Quantization settings: 4-bit loading, double quantization, the "nf4" quant
# type, and bfloat16 as the compute dtype.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map has a single "" key whose value is the current CUDA device index.
target_device = torch.cuda.current_device()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": target_device},
)

print(f"Successfully loaded the model {model_name} into memory")

  from .autonotebook import tqdm as notebook_tqdm


Starting to load the model meta-llama/Llama-2-13b-hf into memory


Loading checkpoint shards: 100%|██████████| 3/3 [00:54<00:00, 18.05s/it]


Successfully loaded the model meta-llama/Llama-2-13b-hf into memory


In [2]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the loaded model, then pass it through
# PEFT's k-bit training preparation helper; `model` is rebound to the result.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports 0.0 percent
    instead of raising ``ZeroDivisionError``.

    NOTE(review): counts are raw ``numel()`` values; quantized weights may be
    stored in packed form, so totals can be lower than the nominal model
    size -- confirm before quoting these numbers.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so a parameterless model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [4]:
from peft import LoraConfig, get_peft_model

# Names of the modules passed to LoRA as `target_modules`.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

# LoRA settings for a causal language modelling task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the adapter configuration and report how many
# parameters are left trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 13107200 || all params: 6685086720 || trainable%: 0.19606626733482374


from datasets import load_dataset

# Load the "Abirate/english_quotes" dataset and tokenize its "quote" column in batches.
# NOTE(review): `data` is reassigned by the later cell that loads "webis/tldr-17",
# so this result does not reach the trainer -- confirm whether this cell is still needed.
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

In [5]:
from datasets import load_dataset

# Wiping the Hugging Face datasets cache forces the whole dataset to be
# downloaded and regenerated again, and also removes the cached copies of
# every other dataset. It is therefore opt-in: set the flag to True only to
# recover from a broken cache.
CLEAR_DATASETS_CACHE = False

if CLEAR_DATASETS_CACHE:
    import shutil
    from pathlib import Path

    # Same location as `~/.cache/huggingface/datasets`; ignore_errors keeps a
    # missing directory from raising.
    shutil.rmtree(Path.home() / ".cache" / "huggingface" / "datasets", ignore_errors=True)

data = load_dataset("webis/tldr-17")

Downloading builder script: 100%|██████████| 4.33k/4.33k [00:00<00:00, 10.1MB/s]
Downloading metadata: 100%|██████████| 2.79k/2.79k [00:00<00:00, 13.1MB/s]
Downloading readme: 100%|██████████| 9.14k/9.14k [00:00<00:00, 35.3MB/s]
Downloading data: 100%|██████████| 3.14G/3.14G [06:55<00:00, 7.57MB/s]
Generating train split: 100%|██████████| 3848330/3848330 [02:52<00:00, 22280.44 examples/s]


In [6]:
# Tokenize the "answer" column in batches and rebind `data` to the mapped dataset.
# NOTE(review): confirm that "webis/tldr-17" actually exposes an "answer" column;
# its published schema appears to use "content"/"summary" for text -- TODO verify.
data = data.map(lambda samples: tokenizer(samples["answer"]), batched=True)

Map: 100%|██████████| 3848330/3848330 [02:24<00:00, 26558.47 examples/s]


In [7]:
import transformers

# Use the end-of-sequence token as the padding token for batch collation.
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=1e-4,
        fp16=True,
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        report_to="none",
        save_steps=10,
        # `resume_from_checkpoint` is deliberately not set here: the
        # TrainingArguments field is not consumed by Trainer itself. To resume,
        # call `trainer.train(resume_from_checkpoint=True)` once a checkpoint
        # exists in `output_dir`.
    ),
    # Causal language modelling collator (mlm=False).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

trainer.train()

# Persist the trained adapter together with the model's real configuration.
# Previously a blank, default-constructed PretrainedConfig was written here,
# which carried no information about this model. Note that the saved config
# reflects `use_cache = False` as set above.
model.save_pretrained("outputs")
model.config.save_pretrained("outputs")

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,3.8332
