# 1. 개발 환경 설정

### 1.1 필수 라이브러리 설치하기

In [1]:
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U trl==0.7.11
!pip3 install -q -U accelerate==0.27.2

### 1.2 Import modules

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

### 1.3 Huggingface 로그인

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 2. Dataset 생성 및 준비

### 2.1 데이터셋 로드

In [4]:
from datasets import load_dataset

dataset = load_dataset("mteb/amazon_reviews_multi", "en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### 2.2 데이터셋 탐색

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
})

### 2.3 데이터셋 예시

In [None]:
dataset['train'][0]

{'id': 'en_0964290',
 'text': "I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review ...\n\nArrived broken. Manufacturer defect. Two of the legs of the base were not completely formed, so there was no way to insert the casters. I unpackaged the entire chair and hardware before noticing this. So, I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review of part of a chair I never got to sit in. I will go so far as to include a picture of what their injection molding and quality assurance process missed though. I will be hesitant to buy again. It makes me wonder if there aren't missing structures and supports that don't impede the assembly process.",
 'label': 0,
 'label_text': '0'}

# 3. Gemma 모델의 한국어 요약 테스트

### 3.1 모델 로드

In [None]:
BASE_MODEL = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### 3.2 Gemma-it의 프롬프트 형식

In [None]:
doc = dataset['train']['text'][0]

In [None]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

In [None]:
messages = [
    {
        "role": "user",
        "content": "Add the appropriate score label to the next review \n\n{}".format(doc)
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
prompt

"<bos><start_of_turn>user\nAdd the appropriate score label to the next review \n\nI'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review ...\n\nArrived broken. Manufacturer defect. Two of the legs of the base were not completely formed, so there was no way to insert the casters. I unpackaged the entire chair and hardware before noticing this. So, I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review of part of a chair I never got to sit in. I will go so far as to include a picture of what their injection molding and quality assurance process missed though. I will be hesitant to buy again. It makes me wonder if there aren't missing structures and supports that don't impede the assembly process.<end_of_turn>\n<start_of_turn>model\n"

### 3.3 Gemma-it 추론

In [None]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)

In [None]:
print(outputs[0]["generated_text"][len(prompt):])

Sure, here's the revised review with the appropriate score label:

I am extremely disappointed with this chair. It arrived broken and was not packaged properly, resulting in damage during shipping. The manufacturer's defect is evident in the incomplete base construction, which prevented the casters from being inserted. I spent significant time boxing up the chair and packing it for return, resulting in a 2-star review. I will not be purchasing from this company again.


# 4. Gemma 파인튜닝

#### 주의: Colab GPU 메모리 한계로 이전장 추론에서 사용했던 메모리를 비워 줘야 파인튜닝을 진행 할 수 있습니다. <br> notebook 런타임 세션을 재시작 한 후 1번과 2번의 2.1 항목까지 다시 실행하여 로드 한 후 아래 과정을 진행합니다

In [None]:
!nvidia-smi

Mon Sep 30 06:13:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### 4.1 학습용 프롬프트 조정

In [7]:
def generate_prompt(example):
    prompt_list = []
    for i in range(len(example['text'])):
        prompt_list.append(r"""<bos><start_of_turn>user
Add the appropriate score label to the next review

{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn><eos>""".format(example['text'][i], example['label'][i] + 1))
    return prompt_list

In [8]:
train_data = dataset['train']
print(generate_prompt(train_data[:1])[0])

<bos><start_of_turn>user
Add the appropriate score label to the next review

I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review ...

Arrived broken. Manufacturer defect. Two of the legs of the base were not completely formed, so there was no way to insert the casters. I unpackaged the entire chair and hardware before noticing this. So, I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review of part of a chair I never got to sit in. I will go so far as to include a picture of what their injection molding and quality assurance process missed though. I will be hesitant to buy again. It makes me wonder if there aren't missing structures and supports that don't impede the assembly process.<end_of_turn>
<start_of_turn>model
1<end_of_turn><eos>


### 4.2 QLoRA 설정

In [9]:
lora_config = LoraConfig(
    r=6,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [10]:
BASE_MODEL = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
tokenizer.padding_side = 'right'



config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

### 4.3 Trainer 실행

In [11]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
#        num_train_epochs = 1,
        max_steps=3000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=0.03,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
trainer.train()

Step,Training Loss
100,3.321
200,2.1424
300,2.0828
400,2.0734
500,2.0178
600,1.9937
700,2.0135
800,1.9868
900,2.0133
1000,1.9785




TrainOutput(global_step=3000, training_loss=2.0246318359375, metrics={'train_runtime': 3918.912, 'train_samples_per_second': 3.062, 'train_steps_per_second': 0.766, 'total_flos': 9971982411595776.0, 'train_loss': 2.0246318359375, 'epoch': 0.06})

### 4.4 Finetuned Model 저장

In [13]:
ADAPTER_MODEL = "lora_adapter"

trainer.model.save_pretrained(ADAPTER_MODEL)



In [14]:
!ls -alh lora_adapter

total 29M
drwxr-xr-x 2 root root 4.0K Oct  2 08:26 .
drwxr-xr-x 1 root root 4.0K Oct  2 08:26 ..
-rw-r--r-- 1 root root  689 Oct  2 08:26 adapter_config.json
-rw-r--r-- 1 root root  29M Oct  2 08:26 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K Oct  2 08:26 README.md


In [15]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('gemma-2b-it-label-review')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [16]:
!ls -alh ./gemma-2b-it-label-review

total 4.7G
drwxr-xr-x 2 root root 4.0K Oct  2 08:28 .
drwxr-xr-x 1 root root 4.0K Oct  2 08:26 ..
-rw-r--r-- 1 root root  662 Oct  2 08:26 config.json
-rw-r--r-- 1 root root  132 Oct  2 08:26 generation_config.json
-rw-r--r-- 1 root root 4.7G Oct  2 08:28 model-00001-of-00002.safetensors
-rw-r--r-- 1 root root  65M Oct  2 08:28 model-00002-of-00002.safetensors
-rw-r--r-- 1 root root  14K Oct  2 08:28 model.safetensors.index.json


# 5. Gemma 한국어 요약 모델 추론

#### 주의: 마찬가지로 Colab GPU 메모리 한계로 학습 시 사용했던 메모리를 비워 줘야 파인튜닝된 결과를 확인 할 수 있습니다. <br> notebook 런타임 세션을 재시작 한 후 1번과 2번의 2.1 항목까지 다시 실행하여 로드 한 후 아래 과정을 진행합니다

In [5]:
!nvidia-smi

Wed Oct  2 07:12:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### 5.1 Fine-tuned 모델 로드

In [5]:
BASE_MODEL = "google/gemma-2b-it"
FINETUNE_MODEL = "./gemma-2b-it-label-review"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



### 5.2 Fine-tuned 모델 추론

In [6]:
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=512)

In [7]:
doc = dataset['test']['text'][1]

In [8]:
messages = [
    {
        "role": "user",
        "content": "Add the appropriate score label to the next review:\n\n{}".format(doc)
    }
]
prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [9]:
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])

1


## 6. Hugging Face에 업로드


### 6.1 Model과 Tokenizer 업로드


In [13]:
finetune_model.push_to_hub("gemma-2b-it-label-review", use_temp_dir=False)
tokenizer.push_to_hub("gemma-2b-it-label-review", use_temp_dir=False)

model-00003-of-00003.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

TypeError: Object of type method is not JSON serializable