# Japanese LLM Instruction Tuning with Hugging Face and Weights & Biases
<!--- @wandbcode{llm-finetune-hf} -->
- Fine-tune a lightweight LLM with LoRA and 8-bit quantization
- Checkpoint the LoRA adapter weights as artifacts

# Setup

In [1]:
!pip install update -q bitsandbytes datasets accelerate loralib 

[0m

In [2]:
import bitsandbytes as bnb
import copy
import glob
import os
import wandb
import json
from tqdm import tqdm
from types import SimpleNamespace
import datasets
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/opt/conda/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32




In [3]:
# Process-wide settings read by CUDA, PyTorch and the W&B integration.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WANDB_ENTITY"] = "japan-demo"
os.environ["WANDB_PROJECT"] = "jp-instruction-tuning"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"] = "gradients"
# The CUDA caching-allocator options are written as "<option>:<value>";
# with "=" the setting is not parsed as max_split_size_mb.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mkeisuke-kamata[0m ([33mwandb[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Load Model and Dataset

In [4]:
# Experiment configuration: base checkpoint name, LoRA adapter settings and the
# keyword arguments meant for the Hugging Face training arguments.
# NOTE(review): both max_steps and num_train_epochs are set; per the
# TrainingArguments documentation a positive max_steps takes precedence — confirm
# which one is intended.
config = SimpleNamespace(
    BASE_MODEL="cyberagent/open-calm-7b",
    lora_config=SimpleNamespace(**{
        "r": 32,
        "lora_alpha": 16,
        "target_modules": ["query_key_value"],
        "lora_dropout": 0.1,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }),
    training=SimpleNamespace(**{
        "dataloader_num_workers": 16,
        "evaluation_strategy": "steps",
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 2,
        "report_to": "wandb",  # wandb integration
        "warmup_steps": 10,
        "max_steps": 100,
        "num_train_epochs": 3,
        "learning_rate": 2e-4,
        "fp16": True,
        "logging_steps": 5,
        "save_steps": 25,
        "output_dir": './outputs',
    }),
)

In [5]:
# Release the GPU memory cached by PyTorch before loading the base model.
torch.cuda.empty_cache()

# Options for loading the base checkpoint: 8-bit weights, automatic device map.
_model_load_options = {
    "load_in_8bit": True,
    "device_map": 'auto',
}
model = AutoModelForCausalLM.from_pretrained(config.BASE_MODEL, **_model_load_options)

# The tokenizer comes from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(config.BASE_MODEL)

ImportError: Using `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes `pip install -i https://test.pypi.org/simple/ bitsandbytes` or pip install bitsandbytes` 

In [None]:
# Load the "kunishou/databricks-dolly-15k-ja" dataset via the `datasets` library.
# NOTE(review): no `split` is requested, so this presumably returns a mapping of
# split name -> dataset rather than a single dataset — confirm before iterating it.
dolly_ja = datasets.load_dataset(path="kunishou/databricks-dolly-15k-ja")

## Dataset

In [None]:
# Prompt templates for instruction tuning. Each template ends with
# "### Response:" followed by a newline, so that the answer appended by the
# caller starts on its own line instead of being glued onto the header text.

# Template for examples without extra context; expects {instruction}.
PROMPT_NO_INPUT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
"""

# Template for examples with extra context; expects {instruction} and {context}.
PROMPT_WITH_INPUT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
Input:
{context}
### Response:
"""

class InstructDataset(Dataset):
    """Tokenized instruction/response pairs for causal-LM fine-tuning.

    Every record is rendered into a prompt, the answer and the tokenizer's EOS
    token are appended, and the result is tokenized. ``labels`` is a copy of
    ``input_ids`` in which the prompt positions are overwritten with
    ``ignore_index`` so that they are excluded from the loss.

    Args:
        json_list: Iterable of dict-like records with an ``instruction`` and an
            ``output`` entry and, optionally, context under ``input`` (or
            ``context``).
        tokenizer: Tokenizer that is callable with ``return_tensors='pt'`` and
            exposes ``eos_token``.
        ignore_index: Label value written over the prompt positions.
        max_length: Maximum number of tokens kept per sequence; longer
            sequences are truncated.
    """

    def __init__(self, json_list, tokenizer, ignore_index=-100, max_length=512):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index
        self.max_length = max_length
        self.features = []

        for j in tqdm(json_list):
            source_text = self._build_prompt(j)
            # Prompt + answer, terminated by EOS so the model learns where to stop.
            example_text = source_text + j['output'] + self.tokenizer.eos_token

            # The prompt is tokenized on its own only to learn how many leading
            # tokens of the full example belong to it.
            source_ids = self._encode(source_text)
            input_ids = self._encode(example_text)

            # The target is the input itself, with the prompt part masked out.
            labels = input_ids.clone()
            labels[:len(source_ids)] = self.ignore_index

            self.features.append({
                'input_ids': input_ids,
                'labels': labels
            })

    @staticmethod
    def _build_prompt(record):
        """Render the prompt for one record, with or without context."""
        # A context column may be present but empty (e.g. open QA), so test
        # for a non-empty value rather than for the key alone.
        context = record.get('input') or record.get('context')
        if context:
            return PROMPT_WITH_INPUT_FORMAT.format(
                instruction=record['instruction'], context=context
            )
        return PROMPT_NO_INPUT_FORMAT.format(instruction=record['instruction'])

    def _encode(self, text):
        """Tokenize ``text`` (truncated to ``max_length``) and return its ids."""
        # Padding is not requested: a single sequence has nothing to be padded to.
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return encoded['input_ids'][0]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

class InstructCollator():
    """Pad variable-length examples into a single batch of tensors.

    Args:
        tokenizer: Object exposing ``pad_token_id`` (and ``eos_token_id`` as a
            fallback padding value).
        ignore_index: Value used to pad ``labels``.
    """

    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        # Honor the caller's value instead of a hard-coded -100.
        self.ignore_index = ignore_index

    def __call__(self, examples):
        """Collate ``examples`` (dicts with 1-D ``input_ids``/``labels`` tensors).

        Returns:
            dict with right-padded ``input_ids`` and ``labels`` and a boolean
            ``attention_mask`` that is True on the original (unpadded) positions.
        """
        input_batch = [example['input_ids'] for example in examples]
        label_batch = [example['labels'] for example in examples]

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            # Padded positions are masked out below, so any id works as filler.
            eos_id = getattr(self.tokenizer, 'eos_token_id', None)
            pad_id = eos_id if eos_id is not None else 0

        input_ids = pad_sequence(
            input_batch, batch_first=True, padding_value=pad_id
        )
        labels = pad_sequence(
            label_batch, batch_first=True, padding_value=self.ignore_index
        )

        # Build the mask from sequence lengths rather than from token equality,
        # so real tokens that share the padding id (e.g. EOS) stay visible.
        lengths = torch.tensor(
            [len(ids) for ids in input_batch], device=input_ids.device
        )
        positions = torch.arange(input_ids.size(1), device=input_ids.device)
        attention_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)

        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask
        }

In [None]:
# A DatasetDict is keyed by split name, so iterating it would hand split names
# (not records) to InstructDataset; select the "train" split when that is what
# was loaded.
train_records = dolly_ja["train"] if isinstance(dolly_ja, datasets.DatasetDict) else dolly_ja
train_dataset = InstructDataset(train_records, tokenizer)
collator = InstructCollator(tokenizer)

## Post processing on the model

In [None]:
# Inspect the model internals
print(model.gpt_neox.layers[0].attention)
# Output:
# GPTNeoXAttention(
#   (rotary_emb): RotaryEmbedding()
#   (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
#   (dense): Linear(in_features=4096, out_features=4096, bias=True)

# Freeze the base model (the adapters are trained later) and cast the small
# one-dimensional parameters (e.g. layernorm) to fp32 for stability.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim != 1:
        continue
    weight.data = weight.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is cast to float32."""

    def forward(self, x):
        result = super().forward(x)
        return result.to(torch.float32)


# Wrap the output head so that its result is returned in float32.
model.lm_head = CastOutputToFloat(model.lm_head)

## Training

In [None]:
def _to_plain_dict(value):
    """Recursively convert SimpleNamespace objects and mappings to plain dicts."""
    if isinstance(value, SimpleNamespace):
        value = vars(value)
    if hasattr(value, "items"):
        return {key: _to_plain_dict(item) for key, item in value.items()}
    return value


# The config is handed to W&B as nested dicts so that its sections can be read
# back by key and unpacked with ** below.
with wandb.init(config=_to_plain_dict(config), job_type="training") as run:
    # Read the settings back from the run; a separate name keeps the
    # module-level `config` intact for later cells.
    run_config = wandb.config

    # Setup for LoRA
    lora_config = LoraConfig(**dict(run_config["lora_config"]))
    model_peft = get_peft_model(model, lora_config)
    model_peft.print_trainable_parameters()

    training_kwargs = dict(run_config["training"])
    # No eval_dataset is passed to the Trainer, so step-wise evaluation cannot
    # run; drop the setting and use the TrainingArguments default.
    training_kwargs.pop("evaluation_strategy", None)
    training_args = TrainingArguments(**training_kwargs)

    trainer = transformers.Trainer(
        model=model_peft,
        data_collator=collator,
        args=training_args,
        train_dataset=train_dataset
    )

    trainer.train()

    # Save the adapter weights and attach them to the run as a model artifact.
    model_peft.save_pretrained("./output")
    model_ft = wandb.Artifact("finetuned-model", type="model")
    model_ft.add_dir("./output")
    run.log_artifact(model_ft)
    run.log_code()

## (Advanced) Sweep

In [None]:
# Default configuration for the sweep runs: base checkpoint name, LoRA adapter
# settings and keyword arguments meant for the Hugging Face training arguments.
# `report_to` is given exactly once; repeating a keyword argument is a
# SyntaxError.
config = SimpleNamespace(
    BASE_MODEL="cyberagent/open-calm-7b",
    lora_config=SimpleNamespace(
        r=32,
        lora_alpha=16,
        target_modules=["query_key_value"],
        lora_dropout=.1,
        bias="none",
        task_type="CAUSAL_LM"
    ),
    training=SimpleNamespace(
        dataloader_num_workers=16,
        evaluation_strategy="steps",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        report_to="wandb",  # wandb integration
        warmup_steps=10,
        max_steps=100,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=5,
        save_steps=25,
        output_dir='./outputs',
    )
)

# Random-search sweep over the LoRA rank, LoRA alpha and the learning rate.
# The dotted parameter names mirror the sections of `config`
# ("<section>.<field>").
sweep_configuration = {
    "method": "random",
    # NOTE(review): the Hugging Face W&B callback logs the training loss as
    # "train/loss"; confirm the metric name against the run's logged keys.
    "metric": {"goal": "minimize", "name": "train/loss"},
    "parameters": {
        "lora_config.r": {"values": [2, 4, 8, 16, 32]},
        "lora_config.lora_alpha": {"values": [2, 4, 8, 16]},
        "training.learning_rate": {"max": 2e-3, "min": 2e-4},
    },
}

# Freeze every base-model parameter (the adapters are trained later) and keep
# the one-dimensional parameters in float32.
# NOTE(review): this repeats the preparation from the earlier
# "Post processing on the model" cell; if both cells run, lm_head is wrapped a
# second time — confirm that is intended.
for base_param in model.parameters():
    base_param.requires_grad = False
    is_one_dimensional = base_param.ndim == 1
    if is_one_dimensional:
        base_param.data = base_param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
    """nn.Sequential variant that returns its forward result as float32."""

    def forward(self, x):
        sequential_out = super().forward(x)
        return sequential_out.to(torch.float32)


model.lm_head = CastOutputToFloat(model.lm_head)

def _to_plain_dict(value):
    """Recursively convert SimpleNamespace objects and mappings to plain dicts."""
    if isinstance(value, SimpleNamespace):
        value = vars(value)
    if hasattr(value, "items"):
        return {key: _to_plain_dict(item) for key, item in value.items()}
    return value


def _merge_overrides(defaults, overrides):
    """Return a copy of nested dict ``defaults`` with ``overrides`` applied.

    A dotted key ("section.field") addresses a nested entry, dict values are
    merged recursively into an existing dict, and any other value replaces
    the default.
    """
    merged = copy.deepcopy(defaults)
    for key, value in overrides.items():
        *parents, leaf = key.split(".")
        target = merged
        for part in parents:
            target = target.setdefault(part, {})
        if isinstance(value, dict) and isinstance(target.get(leaf), dict):
            target[leaf] = _merge_overrides(target[leaf], value)
        else:
            target[leaf] = value
    return merged


def train_func():
    """Run one fine-tuning trial and log the resulting adapter as an artifact.

    Takes no arguments. The defaults come from the module-level ``config``;
    values present in the W&B run config (e.g. chosen by a sweep) are laid on
    top of them.
    """
    # The module-level `config` is only read here: assigning to that name
    # inside the function would make it local and break this first use.
    defaults = _to_plain_dict(config)
    with wandb.init(config=defaults, job_type="training") as run:
        # Combine the defaults with whatever the run config holds, whether the
        # overrides arrive as dotted keys or as nested sections.
        run_config = _merge_overrides(
            defaults, _to_plain_dict(dict(wandb.config.items()))
        )

        # Setup for LoRA
        lora_config = LoraConfig(**run_config["lora_config"])
        # NOTE(review): get_peft_model is applied to the same `model` object on
        # every trial — confirm that repeated calls do not accumulate adapter
        # state across trials.
        model_peft = get_peft_model(model, lora_config)

        training_kwargs = dict(run_config["training"])
        # No eval_dataset is passed to the Trainer, so step-wise evaluation
        # cannot run; drop the setting and use the TrainingArguments default.
        training_kwargs.pop("evaluation_strategy", None)
        training_args = TrainingArguments(**training_kwargs)

        trainer = transformers.Trainer(
            model=model_peft,
            data_collator=collator,
            args=training_args,
            train_dataset=train_dataset
        )
        trainer.train()

        # Save the adapter weights and attach them to the run as a model artifact.
        model_peft.save_pretrained("./output")
        model_ft = wandb.Artifact("finetuned-model", type="model")
        model_ft.add_dir("./output")
        run.log_artifact(model_ft)
        run.log_code()
    
# Register the sweep and keep its id.
sweep_id = wandb.sweep(sweep=sweep_configuration)
# Run the sweep with the training function defined above (`train_func`).
# NOTE(review): no `count` is passed; confirm whether the number of random
# trials should be bounded.
wandb.agent(sweep_id, function=train_func)