In [1]:
%%capture
# Install Unsloth from its GitHub repository, then the companion packages
# (Xformers for Flash Attention, TRL, PEFT, Accelerate, bitsandbytes).
# The second command passes --no-deps so pip does not pull in their own
# dependencies, and caps xformers (<0.0.27) and trl (<0.9.0).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
# Hugging Face Hub ids of the base checkpoint and the training dataset.
pretrain_model = "beomi/Llama-3-Open-Ko-8B"
dataset_name = "lwef/aihub-ko-dialogue"

# Maximum sequence length. Choose any value; RoPE scaling is handled internally.
max_seq_length = 2048
# None lets the loader auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# 4-bit quantization reduces memory usage. Can be False.
load_in_4bit = True

In [3]:
import os
from getpass import getpass

from huggingface_hub import HfFolder

# Never hard-code the access token in the notebook: it would be saved (and
# shared) with the file. Read it from the HF_TOKEN environment variable, and
# fall back to an interactive prompt that does not echo the input.
HfFolder.save_token(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [4]:
# Summarization prompt with two placeholders: {dialogue} and {summary}.
# The template starts with a newline and leaves one blank line between the
# dialogue section and the summary header.
prompt_template = (
    "\n"
    "아래 대화를 요약해 주세요. 대화 형식은 '#대화 참여자#: 대화 내용'입니다.\n"
    "### 대화 >>>{dialogue}\n"
    "\n"
    "### 요약 >>>{summary}"
)

In [5]:
from unsloth import FastLanguageModel
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# This list is informational only: it is not referenced by the call below,
# which loads `pretrain_model` from the configuration cell.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer using the settings from the
# configuration cell (model id, max sequence length, dtype, 4-bit flag).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = pretrain_model,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/3.00G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

beomi/Llama-3-Open-Ko-8B does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [6]:
# Attach LoRA adapters to the loaded model; `model` is rebound to the result.
# Besides the attention and MLP projections, target_modules also lists
# "embed_tokens" and "lm_head".
# NOTE(review): the training log reports ~1.09B trainable parameters, which is
# presumably due to those two extra targets - confirm this is intended.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


<a name="Data"></a>
### Data Prep
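We now use the Korean dialogue-summarization dataset `lwef/aihub-ko-dialogue` (columns `dialogue` and `summary`), formatted with the prompt template defined above. This section was adapted from the Unsloth Alpaca notebook, which used the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), a filtered version of the 52K original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.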

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [7]:
# End-of-sequence marker appended to every training example; without it the
# fine-tuned model may never stop generating.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the training text for every row of a batched dataset slice.

    Args:
        examples: Mapping from column name to a list of values. The
            "dialogue" and "summary" columns are read, index by index.

    Returns:
        A dict with a single "text" key holding one string per row: the
        prompt template filled with that row's dialogue and summary,
        followed by EOS_TOKEN.
    """
    return {
        "text": [
            f"{prompt_template.format(dialogue=examples['dialogue'][row], summary=summary)}{EOS_TOKEN}"
            for row, summary in enumerate(examples["summary"])
        ]
    }

from datasets import load_dataset

# Load the train split and add the formatted "text" column in batched mode.
dataset = load_dataset(dataset_name, split="train").map(
    formatting_prompts_func,
    batched=True,
)

# Sanity check: show the first two formatted examples.
print("출력 확인:::")
print(dataset[0], dataset[1], sep="\n")

# Train on the full dataset; for a quicker run use a subset instead, e.g.
# train_dataset = dataset.select(range(10000))
train_dataset = dataset

Downloading readme:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13119 [00:00<?, ? examples/s]

Map:   0%|          | 0/13119 [00:00<?, ? examples/s]

출력 확인:::
{'dialogue': '#P01#: 우리 스파 공부할 시간 잇나?\n#P02#: 스파 진짜 열받어  ㅠㅠㅠ 끝이없어\n#P01#: 내일 스파가 더 문제다 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 나 진짜 스파 하나도 안햇눈뎅 내일 스파 하나보나??\n#P03#: 엥 스파??? 스파시험화요일 ㅋㅋㅋ', 'summary': '스파 공부를 하나도 안 해서 내일 시험을 걱정하자 시험은 화요일이라고 말한다.', 'text': "\n아래 대화를 요약해 주세요. 대화 형식은 '#대화 참여자#: 대화 내용'입니다.\n### 대화 >>>\n#P01#: 우리 스파 공부할 시간 잇나?\n#P02#: 스파 진짜 열받어  ㅠㅠㅠ 끝이없어\n#P01#: 내일 스파가 더 문제다 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 나 진짜 스파 하나도 안햇눈뎅 내일 스파 하나보나??\n#P03#: 엥 스파??? 스파시험화요일 ㅋㅋㅋ\n\n### 요약 >>>\n스파 공부를 하나도 안 해서 내일 시험을 걱정하자 시험은 화요일이라고 말한다.\n<|end_of_text|>"}
{'dialogue': '#P01#: 삼성 ncs 문제중에 하나가 범이랑 호랑이중에 뭐가 한자어인지 맞추는거 나왓댕\n#P02#: 헐 근데 은근 모르는 사람 있을거 같애\n#P01#: 범 호 모르면 백퍼 틀릴거같앸ㅋ 범보다는 호랑이가 더 귀여워보이고 우리말같아서..\n#P03#: 엥 범이 우리나라 말이고 호랑이가 한자야?\n#P02#: ㅇㅇㅇㅇ\n#P01#: 응\n#P03#: 나 상식이 없네', 'summary': '삼성 국가직무능력표준(ncs)에 범과 호랑이 중에 무엇이 한자어인지 맞추는 문제가 나왔다.', 'text': "\n아래 대화를 요약해 주세요. 대화 형식은 '#대화 참여자#: 대화 내용'입니다.\n### 대화 >>>\n#P01#: 삼성 ncs 문제중에 하나가 범이랑 호랑이중에 뭐가 한자어인지 맞추는거 나왓댕\n#P02#: 헐 근데 은근 모르는 사람 있을거 같애\n#P01#: 범 호 모르면 백퍼 틀릴거같앸ㅋ 범보다는 호랑이가 더 귀여워보이고 우리말

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). Here we train for one full epoch (`num_train_epochs=1`); for a quick test run you can set `max_steps=60` instead. We also support TRL's `DPOTrainer`!

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters, kept apart from the trainer wiring below.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    num_train_epochs=1,              # number of epochs
    per_device_train_batch_size=16,  # batch size per device
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/13119 [00:00<?, ? examples/s]

In [9]:
# Run the fine-tuning loop and keep the value returned by train() for later inspection.
trainer_stats = trainer.train()

Counting untrained tokens:   0%|          | 0/13119 [00:00<?, ? examples/s]

Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 13,119 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 4
\        /    Total batch size = 64 | Total steps = 205
 "-____-"     Number of trainable parameters = 1,092,616,192


Step,Training Loss
1,2.7755
2,2.8216
3,2.7391
4,2.4935
5,2.3637
6,2.1696
7,2.0074
8,1.9949
9,1.8765
10,1.9504


Step,Training Loss
1,2.7755
2,2.8216
3,2.7391
4,2.4935
5,2.3637
6,2.1696
7,2.0074
8,1.9949
9,1.8765
10,1.9504


In [1]:
# Record the Python interpreter version used for this run.
# NOTE(review): this cell carries execution count 1 although it sits after the
# training cells - it was run out of order; consider moving it to the top.
import sys
print(sys.version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


<a name="Inference"></a>
### Inference
Let's run the model! You can change the dialogue - leave the summary blank!

In [23]:
# Summarize one sample dialogue with the fine-tuned model, reusing
# `prompt_template` from the cell near the top of the notebook.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
dialogue = '''
#P01#: 아 행삶 과제 너무 어려워... 5쪽 쓸게 없는데 ㅡㅡ #P02#: 몬냐몬냐너가더잘써 ㅎㅎ #P01#: 5쪽 대충 의식의 흐름대로 쭉 써야지..이제 1쪽씀 ;; 5쪽 에는 네줄만 적어야지 #P02#: 안대... 뭔가분량중요할거같아 거의꽉채워서쓰셈 #P01#: 못써 쓸말업써 #P02#: 이거중간대체여?? #P01#: ㄴㄴ 그냥 과제임 그래서 더 짜증남'''

# Build the formatted prompt; the summary slot is left empty for the model to fill in.
formatted_prompt = prompt_template.format(dialogue=dialogue, summary="")

# Tokenize the prompt and move the tensors to the GPU.
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens = 128,
    eos_token_id=tokenizer.eos_token_id, # Explicitly mark the end of the output with the EOS token.
    use_cache = True
)
# Decode the generated ids; the decoded text contains the prompt followed by the summary.
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
result = decoded_outputs[0]

print(result)


아래 대화를 요약해 주세요. 대화 형식은 '#대화 참여자#: 대화 내용'입니다.
### 대화 >>>

#P01#: 아 행삶 과제 너무 어려워... 5쪽 쓸게 없는데 ㅡㅡ #P02#: 몬냐몬냐너가더잘써 ㅎㅎ #P01#: 5쪽 대충 의식의 흐름대로 쭉 써야지..이제 1쪽씀 ;; 5쪽 에는 네줄만 적어야지 #P02#: 안대... 뭔가분량중요할거같아 거의꽉채워서쓰셈 #P01#: 못써 쓸말업써 #P02#: 이거중간대체여?? #P01#: ㄴㄴ 그냥 과제임 그래서 더 짜증남

### 요약 >>>

행삶 과제가 너무 어려워서 5쪽을 대충 의식의 흐름대로 써야겠다고 하자 분량이 중요할 것 같으니 꽉 채워서 쓰라고 한다.



In [None]:
# Log in to the Hugging Face Hub interactively (renders a login widget in the notebook).
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface upload

In [24]:
# Upload the trained model to the Hugging Face Hub.
# The upload is switched off by default: change the condition to True and
# replace "username/modelname" with your own repository to publish.
if False:
  model.push_to_hub("username/modelname",
                    token=True,  # use the stored login token; replaces the deprecated `use_auth_token`
                    commit_message="Initial commit",
                    private=False)

README.md:   0%|          | 0.00/565 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

Saved model to https://huggingface.co/lsef/dialogue_finetune-1


### open-ko-llm-leaderboard에 업로드할 때

아래의 과정을 수행한 후 submit해서 평가받을 수 있다.

In [None]:
# Base model that the adapters are merged into.
base_model = "beomi/Llama-3-Open-Ko-8B"
# Repository the merged model is uploaded to.
huggingface_repo = "llm-bench-upload-1"
# One of: "merged_4bit", "merged_4bit_forced", "merged_16bit", "lora".
save_method = "merged_16bit"

In [None]:
import os
from getpass import getpass

# Never hard-code the access token in the notebook: it would be saved (and
# shared) with the file. Read it from the HF_TOKEN environment variable, and
# fall back to an interactive prompt that does not echo the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Merge the LoRA weights according to `save_method` and upload the result,
# together with the tokenizer, to `huggingface_repo`.
model.push_to_hub_merged(
    huggingface_repo,
    tokenizer,
    save_method=save_method,
    token=hf_token,
)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.4 out of 12.67 RAM for saving.


 22%|██▏       | 7/32 [00:00<00:02, 10.48it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [04:17<00:00,  8.04s/it]


Unsloth: Saving to organization with address lwef/llm-bench-upload-1
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving to organization with address lwef/llm-bench-upload-1
Unsloth: Saving lwef/llm-bench-upload-1/pytorch_model-00001-of-00004.bin...
Unsloth: Saving lwef/llm-bench-upload-1/pytorch_model-00002-of-00004.bin...
Unsloth: Saving lwef/llm-bench-upload-1/pytorch_model-00003-of-00004.bin...
Unsloth: Saving lwef/llm-bench-upload-1/pytorch_model-00004-of-00004.bin...
Unsloth: Uploading all files... Please wait...


pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

Done.
Saved merged model to https://huggingface.co/None/llm-bench-upload-1


테스트 코드

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Smoke test: check that the uploaded repository loads through the Auto*
# classes. Note that this rebinds the notebook-level `model` and `tokenizer`.
revision = "main"
repo_id = "lwef/llm-bench-upload-1"

config = AutoConfig.from_pretrained(repo_id, revision=revision)
model = AutoModel.from_pretrained(repo_id, revision=revision)
tokenizer = AutoTokenizer.from_pretrained(repo_id, revision=revision)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]