In [None]:
# accelerate: utilities that make training faster (device placement, mixed precision)
# peft: parameter-efficient fine-tuning; provides the LoRA adapters used for the fine-tuning process
# trl: a wrapper around transformers, used for supervised fine-tuning (SFT) or RLHF
# bitsandbytes: quantization, as we are not loading the model in full precision, which makes fine-tuning faster
# wandb: reporting; gives a nice dashboard to track the training progress of the model
%pip install -q -U transformers datasets accelerate peft trl bitsandbytes wandb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m501.8/510.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the login is needed later in the
# notebook when the merged model and tokenizer are pushed to the Hub.
notebook_login()

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
# Hub id of the pretrained checkpoint, and the name under which the
# fine-tuned adapter / merged model will be saved and published.
base_model = "NousResearch/Llama-2-7b-hf"
new_model = "llama-2-7b-miniplatypus-nm"

# Training split of the instruction dataset.
dataset = load_dataset(
    "nmdr/mini-platypus",
    split="train",
)

# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,
)
# Sequences in a batch are padded to a common length; the end-of-sequence
# token is reused as the padding token, and padding is appended on the right.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Quantization configuration (4-bit loading via bitsandbytes)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4", # data type used to store the quantized weights
    bnb_4bit_compute_dtype = torch.float16, # computations run in 16 bits for better accuracy than 4-bit math
    # The parameter is `bnb_4bit_use_double_quant`; the previous spelling
    # (`bnb_4bit_double_quant`) is not a known argument and was ignored, so
    # double quantization was never actually enabled.
    bnb_4bit_use_double_quant = True,
)

# LoRA configuration
peft_config = LoraConfig(
    lora_alpha = 32, # the strength (scaling) of the adapter
    lora_dropout = 0.05,
    r = 16, # rank of the adapter matrices
    bias = "none",
    task_type = "CAUSAL_LM",
    # Attach adapters to every attention and MLP projection. 'q_proj' was
    # previously misspelled as 'q_prj', which matches no module, so the query
    # projection was left without an adapter.
    target_modules = ['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load the base model with the quantization config applied
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = bnb_config,
    device_map = {"":0} # place the whole model on GPU 0 ("auto" would let the library choose the placement)
)

# Cast the layernorm to fp32, make the output embedding layer require grads, and upcast the lm head to fp32,
# i.e. keep some layers/modules in higher precision to improve the performance of the fine-tuned model
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [None]:
# Hyperparameters for the training run
training_arguments = TrainingArguments(
    # --- output and reporting ---
    output_dir="./results",  # where the results are written
    report_to="wandb",  # wandb: Weights & Biases
    logging_steps=1,
    # --- run length ---
    num_train_epochs=4,  # between 3 and 5 is good for a Llama model
    max_steps=2,  # the max number of training steps; remove this line for a real fine-tuning run over the entire dataset
    # --- batching ---
    per_device_train_batch_size=10,  # samples processed per device in every step
    gradient_accumulation_steps=1,  # 1 = no accumulation
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=1000,
    # --- optimization ---
    optim="paged_adamw_8bit",  # optimizer
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,  # learning-rate warmup
)

# Supervised fine-tuning trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=dataset,  # the training data is reused here; change it if an evaluation dataset is available
    # NOTE(review): only the "instruction" column is used as training text —
    # confirm that this column contains everything the model should learn from.
    dataset_text_field="instruction",
    max_seq_length=512,  # chosen based on available VRAM
)

# Run the training loop, then save the trained adapter under `new_model`
trainer.train()
trainer.model.save_pretrained(new_model)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


In [None]:
# Try the fine-tuned model through a Hugging Face text-generation pipeline
prompt = "what is a large language model?"
# Wrap the question in the instruction/response prompt template
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=128,  # restrict the generation length to 128
)
result = pipe(instruction)
generated_text = result[0]["generated_text"]
# Slice off the leading instruction so only the newly generated part is printed
print(generated_text[len(instruction):])
# The output keeps repeating instruction/response pairs instead of stopping.
# This is attributed to the padding technique used (pad token = tokenizer.eos_token);
# a different padding technique can be used to mitigate the problem.





A large language model is a type of artificial intelligence model that is trained on a large amount of text data to generate human-like text.

### Instruction:

what is a small language model?

### Response:

A small language model is a type of artificial intelligence model that is trained on a small amount of text data to generate human-like text.

### Instruction:

what is a neural network?

### Response:

A neural network is a


In [None]:
# Free VRAM so the base model can be reloaded and merged with the adapter.
import gc

# Drop the references with pop() rather than `del`: a bare `del` raises
# NameError when the cell is re-run or when an earlier cell never defined
# the name, whereas pop() with a default makes the cell idempotent.
for _name in ("model", "pipe", "trainer"):
    globals().pop(_name, None)

# Collect twice: objects released by finalizers during the first pass only
# become collectable in the second one.
gc.collect()
gc.collect()

# Hand the memory cached by PyTorch's CUDA allocator back to the GPU;
# garbage collection alone leaves it reserved by the process.
torch.cuda.empty_cache()

NameError: name 'model' is not defined

## Reload the base model and the QLoRA adapter to push the model and the tokenizer to the Hugging Face Hub

In [None]:
# Reload the base model in FP16 (not quantized) and merge it with the LoRA weights
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},  # place the whole model on GPU 0
)
# Attach the adapter saved under `new_model`, then fold its weights into the
# base model so the result is a plain model without adapter wrappers.
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved alongside the merged model.
# It is loaded the same way as for training; `trust_remote_code=True` was
# dropped because it permits executing code shipped in the Hub repository,
# which is not needed to load this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Sampling settings are validated on `model.generation_config` when the model
# is saved or pushed. Setting them on `model.config` left the generation config
# with `temperature`/`top_p` set (presumably inherited from the base
# checkpoint's generation_config.json) while `do_sample` stayed False, which
# made validation fail with a ValueError. Set all three consistently on the
# generation config instead.
model.generation_config.do_sample = True
model.generation_config.temperature = 0.9
model.generation_config.top_p = 0.6

# `hf_token` was never defined in this notebook. Read it from the environment;
# when HF_TOKEN is unset this is None and the credentials stored by
# `notebook_login()` are used instead.
hf_token = os.environ.get("HF_TOKEN")

# Push the merged model and the tokenizer to the Hugging Face Hub.
model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

Non-default generation parameters: {'do_sample': True, 'temperature': 0.9, 'top_p': 0.6}


ValueError: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions. Fix these issues to save the configuration.

Thrown during validation:
[UserWarning('`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.'), UserWarning('`do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.')]