<a href="https://colab.research.google.com/github/nkyc-no-name/Llama3_Korean/blob/main/llama3_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Library

In [None]:
# pip install -U accelerate==0.29.3 peft==0.10.0 bitsandbytes==0.43.1 transformers==4.40.1 trl==0.8.6  datasets==2.19.0
!pip install -U accelerate==0.29.3 peft==0.10.0 bitsandbytes==0.43.1 transformers==4.40.1 trl==0.8.6 datasets==2.19.0

Collecting accelerate==0.29.3
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import torch
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

# 1. set model and dataset

In [4]:
# set base model path
base_model = "beomi/Llama-3-Open-Ko-8B"

In [5]:
# 현재 사용 중인 GPU의 주요 아키텍처 버전을 반환 8버전 이상 시 bfloat16 활용
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

# BitsAndBytesConfig 객체활용 양자화 설정
# 모델을 4비트 양자화하여 로드
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False,
)

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
    # device_map="auto"
)
model.config.use_cache = True
model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/3.00G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
dataset = load_dataset("NoNameFactory/callcenter", '금융보험_상품 가입 및 해지', split = "train")

# ShareGPT {"from": "human", "value" : "Hi"} to ChatML {"role": "user", "content" : "Hi"}
def convert_chat_format(chat):
    conversion_map = {"human": "user", "gpt": "assistant"}
    return [{"role": conversion_map.get(entry["from"], entry["from"]), "content": entry["value"]} for entry in chat]

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [ convert_chat_format(convo) for convo in convos]
    return { "ChatML" : texts, }
pass

dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/303k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4330 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/547 [00:00<?, ? examples/s]

Map:   0%|          | 0/4330 [00:00<?, ? examples/s]

In [9]:
# ChatML {"role": "user", "content" : "Hi"} to Llama3 <|begin_of_text|><|start_header_id|>user<|end_header_id|>Hello!<|eot_id|>
def formatting_prompts_func2(examples):
    convos = examples["ChatML"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

dataset = dataset.map(formatting_prompts_func2, batched = True,)

Map:   0%|          | 0/4330 [00:00<?, ? examples/s]

# 2. Config efficient fine-tuning with low-rank adaptation.

# 3. Config training parameter for LoRA (Parameter-Efficient Fine-Tuning (PEFT)

https://huggingface.co/docs/peft/conceptual_guides/lora

In [10]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# 4. Train Model

In [11]:
dataset = dataset.select(range(100))

In [31]:
dataset['text'][0]

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n신용카드를 만들려고 하는데 어떻게 하면 되죠?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n네. 고객님. 현재 나이가 어떻게 되시나요?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n20살이예요<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n가까운 영업점이나, 인터넷뱅킹에서 발급 신청하시면 됩니다.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n발급까지 얼마나 걸리나요?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n카드발급심사 기간에 따라 달라 질 수 있는데 보통 2주 정도 소요 됩니다.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n심사는 어떤걸 하나요?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n현재 직업이 있으신가요?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n아니요. 대학생인데요<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n직업이 없으시면 본인 명의의 재산으로 발급심사가 이루어집니다.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n본인 명의의 재산이 어떤거에요?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n고객님 명의의 부동산 이나 다른 자산을 말합니다.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n재산이 

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=128, # None for unlimited
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)
trainer.train()

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss
25,3.6539
50,2.6684
75,2.6206




# 5. Verify and Save Model

In [None]:
messages = [
    {"role": "system", "content": "친절한 챗봇으로서 상대방의 요청에 최대한 자세하고 친절하게 답하자. 모든 대답은 한국어(Korean)으로 대답해줘."},
    {"role": "user", "content": "대한민국에서 가장 가볼만한 곳은 어디니?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    max_new_tokens=512,
    eos_token_id=terminators,
    do_sample=True,
    temperature=1,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))