# Japanese LLM Instruction Tuning with Hugging Face and Weights & Biases
<!--- @wandbcode{llm-finetune-hf} -->
- Fine-tune a lightweight LLM with LoRA and 8-bit quantization
- Checkpoint the LoRA adapter weights as artifacts

# Setup

In [None]:
import bitsandbytes as bnb
import copy
import glob
import os
import wandb
import json
from tqdm import tqdm
from types import SimpleNamespace
import datasets
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model

In [11]:
# Process-wide settings read by CUDA, PyTorch and the W&B integration.
# They are set before any model is loaded so that they take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WANDB_ENTITY"] = "japan-demo"
os.environ["WANDB_PROJECT"] = "jp-instruction-tuning"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"] = "gradients"
# The caching allocator expects "<option>:<value>" pairs; the previous
# "max_split_size_mb=1024" spelling is not a valid setting string.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"
wandb.login()

## Load Model and Dataset

In [None]:
# Experiment configuration: base checkpoint name, LoRA adapter settings and
# the keyword arguments intended for transformers.TrainingArguments.
config = SimpleNamespace(
    BASE_MODEL="cyberagent/open-calm-7b",
    # Keyword arguments for peft.LoraConfig.
    lora_config=SimpleNamespace(
        r=32,
        lora_alpha=16,
        target_modules=["query_key_value"],
        lora_dropout=.1,
        bias="none",
        task_type="CAUSAL_LM",
    ),
    # Keyword arguments for transformers.TrainingArguments.
    training=SimpleNamespace(
        dataloader_num_workers=16,
        # No evaluation dataset is passed to the Trainer anywhere in this
        # notebook, so periodic evaluation is disabled; switch back to "steps"
        # once an eval_dataset is supplied.
        evaluation_strategy="no",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        warmup_steps=10,
        # Per the TrainingArguments docs a positive max_steps takes precedence
        # over num_train_epochs.
        max_steps=100,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=5,
        save_steps=25,
        output_dir='./outputs',
        # W&B integration. This keyword used to be listed twice, which is a
        # SyntaxError ("keyword argument repeated").
        report_to="wandb",
    ),
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

In [None]:
# Drop cached CUDA blocks left by earlier cells before loading the checkpoint.
torch.cuda.empty_cache()

# Load the base causal LM with 8-bit weights and automatic device placement.
quantized_load_options = dict(load_in_8bit=True, device_map='auto')
model = AutoModelForCausalLM.from_pretrained(config.BASE_MODEL, **quantized_load_options)

# The tokenizer comes from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(config.BASE_MODEL)

In [None]:
# Japanese translation of the Dolly 15k instruction dataset, fetched from the Hugging Face Hub.
dolly_ja = datasets.load_dataset(path="kunishou/databricks-dolly-15k-ja")

## Dataset

In [9]:
# Prompt used when a record carries only an instruction (no extra context).
# NOTE(review): the template ends with "### Response" and the answer is
# appended directly after it; confirm whether a trailing ":\n" is wanted.
PROMPT_NO_INPUT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response"""

# Prompt used when a record also carries context text. The placeholder is
# named {input} because the templates are filled with `format_map(record)`
# and the records are keyed by "input"; the previous {context} placeholder
# raised KeyError.
PROMPT_WITH_INPUT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
Input:
{input}
### Response"""

# Lookup table under the key names used by code that selects a template by name.
PROMPT_DICT = {
    'prompt_input': PROMPT_WITH_INPUT_FORMAT,
    'prompt_no_input': PROMPT_NO_INPUT_FORMAT,
}

class InstructDataset(Dataset):
    """Tokenized instruction-tuning examples with the prompt masked out of the loss.

    Each record is rendered into a prompt (instruction, plus context when
    present), the answer and the EOS token are appended, and the result is
    tokenized. `labels` is a copy of `input_ids` in which the prompt positions
    are replaced by `ignore_index`, so only the answer contributes to the loss.

    Args:
        json_list: Iterable of dict records with `instruction` and `output`
            keys and an optional `input` key holding context text.
        tokenizer: Tokenizer exposing `eos_token` and callable on a string.
        ignore_index: Label value written over the prompt positions.
    """

    def __init__(self, json_list, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index
        self.features = []

        for j in tqdm(json_list):
            # Records that need no context (e.g. open_qa) have a missing or
            # empty `input`; those use the instruction-only template.
            context = j.get('input')
            if context:
                # Expose the context under both placeholder names so the
                # template may refer to it as {input} or {context}.
                fields = {**j, 'input': context, 'context': context}
                source_text = PROMPT_WITH_INPUT_FORMAT.format_map(fields)
            else:
                source_text = PROMPT_NO_INPUT_FORMAT.format_map(j)

            # Prompt + answer, terminated by EOS so the model learns to stop.
            example_text = source_text + j['output'] + self.tokenizer.eos_token

            # Tokenize the prompt alone; only its token count is needed.
            # (No padding argument: a single sequence has nothing to pad to.)
            source_tokenized = self.tokenizer(
                source_text,
                truncation=True,
                max_length=512,
                return_length=True,
                return_tensors='pt',
            )

            # Tokenize the full prompt + answer text.
            example_tokenized = self.tokenizer(
                example_text,
                truncation=True,
                max_length=512,
                return_tensors='pt',
            )

            input_ids = example_tokenized['input_ids'][0]

            # The training target is the input sequence itself.
            labels = input_ids.clone()

            # Number of leading tokens that belong to the prompt.
            # NOTE(review): assumes the prompt tokenizes to the same prefix
            # inside the full text; a token merge at the boundary could shift
            # this by one.
            source_len = int(source_tokenized['length'][0])

            # Mask the prompt so the loss covers the answer only.
            labels[:source_len] = self.ignore_index

            self.features.append({
                'input_ids': input_ids,
                'labels': labels,
            })

    def __len__(self):
        """Return the number of prepared examples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the `{'input_ids', 'labels'}` dict stored at position `idx`."""
        return self.features[idx]

class InstructCollator():
    """Pad a list of `{'input_ids', 'labels'}` examples into one batch.

    Args:
        tokenizer: Object exposing `pad_token_id`, used to pad `input_ids`.
        ignore_index: Value used to pad `labels`.
    """

    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        # Honour the caller's value; it was previously hard-coded to -100.
        self.ignore_index = ignore_index

    def __call__(self, examples):
        """Return batch-first `input_ids`, `labels` and `attention_mask` tensors."""
        input_batch = [example['input_ids'] for example in examples]
        label_batch = [example['labels'] for example in examples]

        input_ids = pad_sequence(
            input_batch, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        # Label padding uses ignore_index, like the masked prompt positions.
        labels = pad_sequence(
            label_batch, batch_first=True, padding_value=self.ignore_index
        )
        # True wherever input_ids differs from the pad token id.
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
        }

In [7]:
# load_dataset() without a `split` argument returns a DatasetDict, and
# iterating that yields split names rather than records, so the "train" split
# is selected explicitly.
train_dataset = InstructDataset(dolly_ja["train"], tokenizer)
collator = InstructCollator(tokenizer)

# Inspect one batch (uncomment to run)
#loader = DataLoader(train_dataset, collate_fn=collator, batch_size=8, shuffle=True)
#batch = next(iter(loader))
#batch

## Training

In [None]:
def _namespace_to_dict(value):
    """Recursively turn SimpleNamespace objects (and dict values) into plain dicts."""
    if isinstance(value, SimpleNamespace):
        value = vars(value)
    if isinstance(value, dict):
        return {key: _namespace_to_dict(item) for key, item in value.items()}
    return value


class CastOutputToFloat(nn.Sequential):
    """Wrap a module so that its output is cast to float32."""

    def forward(self, x):
        return super().forward(x).to(torch.float32)


# The nested namespaces are converted to plain dicts so the LoRA and training
# sections can be read back from the run config as dicts.
with wandb.init(config=_namespace_to_dict(config), job_type="training") as run:
    # A separate name keeps the notebook-level `config` intact.
    run_config = run.config

    # Print one attention block to see which sub-module names LoRA can target.
    print(model.gpt_neox.layers[0].attention)
    #GPTNeoXAttention(
    #  (rotary_emb): RotaryEmbedding()
    #  (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
    #  (dense): Linear(in_features=4096, out_features=4096, bias=True)
    #)

    # Freeze the base model; only the adapters added below are trained.
    for param in model.parameters():
        param.requires_grad = False
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)
    model.gradient_checkpointing_enable()  # reduce number of stored activations
    model.enable_input_require_grads()

    # GPT-NeoX models name their output projection `embed_out`; other
    # architectures commonly use `lm_head`. Wrap whichever one is present.
    head_name = "embed_out" if hasattr(model, "embed_out") else "lm_head"
    setattr(model, head_name, CastOutputToFloat(getattr(model, head_name)))

    lora_config = LoraConfig(**run_config["lora_config"])
    model_peft = get_peft_model(model, lora_config)
    # Report how many parameters are trainable.
    model_peft.print_trainable_parameters()

    # `training_args` was never defined; build it from the training section.
    training_args = TrainingArguments(**run_config["training"])
    trainer = transformers.Trainer(
        model=model_peft,
        data_collator=collator,
        args=training_args,
        train_dataset=train_dataset
    )

    # Train the adapters.
    trainer.train()

    # Save the adapter weights and log them to W&B as a model artifact.
    model_peft.save_pretrained("./output")
    model_ft = wandb.Artifact("finetuned-model", type="model")
    model_ft.add_dir("./output")
    run.log_artifact(model_ft)
    run.log_code()

## (Advanced) Sweep

In [None]:
# Default configuration for sweep trials; values chosen by the sweep override
# the matching entries.
config = SimpleNamespace(
    BASE_MODEL="cyberagent/open-calm-7b",
    # Keyword arguments for peft.LoraConfig.
    lora_config=SimpleNamespace(
        r=32,
        lora_alpha=16,
        target_modules=["query_key_value"],
        lora_dropout=.1,
        bias="none",
        task_type="CAUSAL_LM",
    ),
    # Keyword arguments for transformers.TrainingArguments.
    training=SimpleNamespace(
        dataloader_num_workers=16,
        # No evaluation dataset is passed to the Trainer anywhere in this
        # notebook, so periodic evaluation is disabled; switch back to "steps"
        # once an eval_dataset is supplied.
        evaluation_strategy="no",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        warmup_steps=10,
        # Per the TrainingArguments docs a positive max_steps takes precedence
        # over num_train_epochs.
        max_steps=100,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=5,
        save_steps=25,
        output_dir='./outputs',
        # W&B integration. This keyword used to be listed twice, which is a
        # SyntaxError ("keyword argument repeated").
        report_to="wandb",
    ),
)

# Random-search sweep over the LoRA rank/alpha and the learning rate.
# Parameter names use "<section>.<key>" to address entries of the nested
# default configuration.
sweep_configuration = {
    "method": "random",
    # The Hugging Face W&B callback logs the training loss as "train/loss".
    "metric": {"goal": "minimize", "name": "train/loss"},
    "parameters": {
        # A comma was missing after this entry, which is a SyntaxError.
        "lora_config.r": {"values": [2, 4, 8, 16, 32]},
        "lora_config.lora_alpha": {"values": [2, 4, 8, 16]},
        "training.learning_rate": {'max': 2e-3, 'min': 2e-4},
    },
}

# One-time preparation of the shared base model before sweep trials run.
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
    """Wrap a module so that its output is cast to float32."""

    def forward(self, x):
        return super().forward(x).to(torch.float32)


# GPT-NeoX models name their output projection `embed_out`; other
# architectures commonly use `lm_head`. Wrap whichever one is present.
_head_name = "embed_out" if hasattr(model, "embed_out") else "lm_head"
setattr(model, _head_name, CastOutputToFloat(getattr(model, _head_name)))

def _namespace_to_dict(value):
    """Recursively turn SimpleNamespace objects (and dict values) into plain dicts."""
    if isinstance(value, SimpleNamespace):
        value = vars(value)
    if isinstance(value, dict):
        return {key: _namespace_to_dict(item) for key, item in value.items()}
    return value


def _apply_sweep_overrides(defaults, overrides):
    """Return a deep copy of `defaults` with `overrides` merged in.

    Override keys may be nested dicts or dotted paths such as
    "lora_config.r". Dotted paths are applied last so that they win over a
    nested section carrying the default value for the same entry.
    """
    merged = copy.deepcopy(defaults)
    # Stable sort: plain keys first, dotted keys afterwards.
    for key, value in sorted(overrides.items(), key=lambda item: "." in item[0]):
        *parents, leaf = key.split(".")
        target = merged
        for parent in parents:
            target = target.setdefault(parent, {})
            if not isinstance(target, dict):
                break
        if not isinstance(target, dict):
            # The path runs through a non-dict value; nothing to override.
            continue
        if isinstance(value, dict) and isinstance(target.get(leaf), dict):
            target[leaf] = _apply_sweep_overrides(target[leaf], value)
        else:
            target[leaf] = value
    return merged


def train_func():
    """Run one sweep trial: train LoRA adapters and log them as an artifact.

    Reads the notebook-level `config`, `model`, `collator` and
    `train_dataset`. The hyperparameters chosen by the sweep are taken from
    the run config and merged over the defaults.
    """
    defaults = _namespace_to_dict(config)
    with wandb.init(config=defaults, job_type="training") as run:
        # Setup for LoRA with the values picked for this trial.
        params = _apply_sweep_overrides(defaults, run.config.as_dict())
        lora_config = LoraConfig(**params["lora_config"])
        # NOTE(review): every trial attaches adapters to the same shared
        # `model` object; confirm the installed peft version handles repeated
        # get_peft_model() calls on one base model as intended.
        model_peft = get_peft_model(model, lora_config)

        training_args = TrainingArguments(**params["training"])
        trainer = transformers.Trainer(
            model=model_peft,
            data_collator=collator,
            args=training_args,
            train_dataset=train_dataset
        )
        trainer.train()

        # Save the adapter weights and log them to W&B as a model artifact.
        model_peft.save_pretrained("./output")
        model_ft = wandb.Artifact("finetuned-model", type="model")
        model_ft.add_dir("./output")
        run.log_artifact(model_ft)
        run.log_code()
    
# Register the sweep with W&B.
sweep_id = wandb.sweep(sweep=sweep_configuration)
# Run the sweep in this process. The trial function is `train_func`; the
# previously referenced `my_train_func` does not exist.
# NOTE(review): no `count` is passed, so the number of trials is not capped
# here; consider adding one for random search.
wandb.agent(sweep_id, function=train_func)