## Model: Fine-tuning

In [None]:
!pip install -Uq accelerate bitsandbytes datasets transformers peft trl sentencepiece wandb langchain huggingface_hub

In [2]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget; presumably the stored token
# is what authorises the later push of the fine-tuned model to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os

# Name of the Weights & Biases project that training runs are reported to
# (the trainer is configured with report_to="wandb").
os.environ.update({"WANDB_PROJECT": "finetuning"})

In [4]:
# SECURITY: a 40-character token that looked like an API key used to be pasted
# here as a comment. Never store credentials in the notebook; provide the W&B
# key through the WANDB_API_KEY environment variable or the interactive login
# prompt, and revoke any key that was committed.

import wandb

# Close a run left over from a previous execution of this notebook so the next
# training run is logged as a fresh W&B run.
if wandb.run is not None:
    wandb.finish()

In [5]:
from datasets import load_dataset

# Hub identifier of the dataset; also embedded in the name of the fine-tuned model.
dataset_name = "squad_v2"

# Load the training split first, then the validation split used for evaluation.
dataset, eval_dataset = (
    load_dataset(dataset_name, split=split_name)
    for split_name in ("train", "validation")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Display the training split summary (feature names and number of rows).
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})

In [7]:
# Display the validation split summary (feature names and number of rows).
eval_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

## The SQuAD v2 dataset is composed of the following features:

```
{'id': Value(dtype='string', id=None),
'title': Value(dtype='string', id=None),
'context': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None),
'answers': Sequence(feature={'text': Value(dtype='string', id=None),
'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}
```

In [8]:
# Base checkpoint that is loaded and fine-tuned in the cells below.
model_id = "openlm-research/open_llama_3b_v2"
# Repository name used when the fine-tuned adapter is pushed to the Hub;
# it embeds the dataset name so the training data is visible in the repo id.
new_model_name = f"openllama-3b-peft-{dataset_name}_pankaj"

### Load the model:

In [9]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the weights in 4-bit NF4 precision and run the matrix multiplications
# in float16, which keeps the 3B model small enough for a single Colab GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Let accelerate decide where each layer is placed. Defined once here and
# passed to the loader so the placement can be changed in a single spot.
device_map = "auto"

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)
# The generation key/value cache is not needed while training.
base_model.config.use_cache = False

In [10]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the training output directory used
# later in this notebook lives under this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
# Directory on the mounted Drive that receives the trainer's output and the
# final adapter checkpoint saved after training.
output_dir = "/content/gdrive/My Drive/results"

### Set up the tokenizer

In [12]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the end-of-sequence token for padding (presumably because the
# tokenizer ships without a dedicated pad token — verify).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so the real tokens stay at the start of each sequence.
tokenizer.padding_side = "right"

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Set up LoRA and other training arguments:

In [13]:
from transformers import TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig

# Use the standard (non-sliced) linear-layer computation.
# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1

# LoRA adapter settings: rank-64 update matrices scaled by alpha=16, with 10%
# dropout on the adapter inputs and no bias parameters trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Evaluate and checkpoint on the same schedule. `load_best_model_at_end` can
# only restore a checkpoint that was actually written, and the default save
# interval is longer than this 100-step demo run.
EVAL_STEPS = 5

training_args = TrainingArguments(
    output_dir=output_dir,
    # Effective batch size per device: 4 samples x 4 accumulation steps = 16.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    # Kept low for demo purposes. A positive max_steps overrides
    # num_train_epochs, so no epoch count is specified.
    max_steps=100,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=EVAL_STEPS,
    # Keep only the most recent checkpoints.
    save_total_limit=5,
    push_to_hub=False,
    load_best_model_at_end=True,
    report_to="wandb",
)

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning trainer: applies the LoRA configuration to the
# quantized base model and trains on the text of a single dataset column.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    # NOTE(review): only the "question" column is used as training text, so
    # contexts and answers are never seen by the model — confirm this is intended.
    dataset_text_field="question",  # this depends on the dataset!
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
    # The run is capped at 100 steps with an evaluation every 5 steps (at most
    # 20 evaluations), so patience must be a handful of evaluations for early
    # stopping to be able to trigger at all.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# W&B credentials must come from the WANDB_API_KEY environment variable or the
# interactive login prompt — never from a token pasted into the notebook.
trainer.train()

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33mpankajshakya627[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
5,No log,4.484687
10,4.283000,4.078515
15,4.283000,3.733849


Step,Training Loss,Validation Loss
5,No log,4.484687
10,4.283000,4.078515
15,4.283000,3.733849


In [None]:
# Write the trained adapter to a "final_checkpoint" folder inside the output directory.
final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint")
trainer.model.save_pretrained(final_checkpoint_dir)

In [None]:
# Upload the trained model to the Hugging Face Hub under the repository name chosen earlier.
trainer.model.push_to_hub(repo_id=new_model_name)

#### Load the model back using a combination of our Hugging Face username and the repository name

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline


# Hub repository of the fine-tuned adapter: "<Hugging Face username>/<repository name>".
adapter_repo_id = "pankajshakya627/openllama-3b-peft-squad_v2_pankaj"

# The adapter config records the base checkpoint it was trained on; reading it
# from there keeps the base model and the adapter in sync. It is expected to
# resolve to 'openlm-research/open_llama_3b_v2'.
config = PeftConfig.from_pretrained(adapter_repo_id)
model_id = config.base_model_name_or_path

# Load the plain base model, then attach the fine-tuned adapter weights to it.
# NOTE(review): this loads the base model with default dtype/device settings;
# consider half precision and GPU placement if memory is tight.
inference_base_model = AutoModelForCausalLM.from_pretrained(model_id)
model = PeftModel.from_pretrained(inference_base_model, adapter_repo_id)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the end-of-sequence token for padding, as during training.
tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline limited to 256 tokens in total (prompt + completion).
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,
)

# Expose the pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)
