In [1]:
import os
cache_dir = '/home/noah/workspace/dl-study/nlp_study/llama2/cache'

if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
    
os.environ['HF_HOME'] = cache_dir

In [2]:
from datasets import load_dataset

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from huggingface_hub import notebook_login

import wandb

In [3]:
print(torch.cuda.is_available())

True


In [3]:
model_name = 'meta-llama/Llama-2-7b-hf'
data_name = 'heegyu/open-korean-instructions'
fine_tuning_model_name = f'{model_name}-finetuned-open-korean-instructions'

device_map = 'auto'
# auto :라이브러리에게 사용 가능한 모든 디바이스를 자동으로 탐색하고 모델의 서로 다른 부분을 이 디바이스들에 자동으로 분배하도록 요청
auth_token = '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'

In [5]:
# LoRA의 하이퍼파라미터를 설정 
# 알파값을 16으로 설정하여 스케일링
# r은 64로 설정
# 입력 임베딩 사이즈 64랭크까지 압축
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias='none',
    task_type='CAUSAL_LM'
)

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
)

In [7]:
wandb.login()
wandb.init(project=fine_tuning_model_name.split('/')[-1])

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mletgoofthepizza[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
dataset = load_dataset(data_name, split='train[:10%]')
print((dataset))

Dataset({
    features: ['source', 'text'],
    num_rows: 37516
})


In [9]:
print(dataset[3])

{'source': 'OIG-smallchip2-ko', 'text': '<usr> 저는 발목이 삔 상태이고 더 빨리 낫도록 돕기 위해 제가 무엇을 할 수 있는지 알아야 합니다.\n<bot> 붓기와 염증을 줄이는 데 도움이 되는 얼음 요법을 시도해 볼 수 있습니다. 또한 탄력 붕대나 압축 랩을 사용하여 발목을 추가로 지지할 수 있습니다. 제대로 치료할 수 있도록 가능한 한 멀리 떨어져 있는 것도 중요합니다.'}


In [9]:
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config, # 양자화 설정
                                                  use_cache=False, # 모델이 출력을 캐시할지 여부
                                                  token=auth_token)
base_model.config.pretraining_tp = 1
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)
peft_model = get_peft_model(base_model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=auth_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [16]:
# print(len(tokenizer))
model_name = 'beomi/llama-2-ko-7b'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=auth_token)
print(len(tokenizer.eos_token))
print(len(tokenizer.bos_token))
print(len(tokenizer.pad_token))



4
3
4


In [12]:
training_args = TrainingArguments(
    output_dir=fine_tuning_model_name,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim='paged_adamw_32bit',
    logging_steps=5,
    save_strategy='epoch',
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type='cosine',
    disable_tqdm=True,
    report_to='wandb',
    seed=42
)

In [13]:
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=min(tokenizer.model_max_length, 2048),
    tokenizer=tokenizer,
    packing=True,
    args=training_args
)

In [14]:
trainer.train()



{'loss': 1.0731, 'learning_rate': 1.6393442622950818e-05, 'epoch': 0.01}
{'loss': 1.0793, 'learning_rate': 3.2786885245901635e-05, 'epoch': 0.01}
{'loss': 1.0466, 'learning_rate': 4.918032786885246e-05, 'epoch': 0.02}
{'loss': 1.0135, 'learning_rate': 6.557377049180327e-05, 'epoch': 0.03}
{'loss': 1.0082, 'learning_rate': 8.19672131147541e-05, 'epoch': 0.04}
{'loss': 0.9698, 'learning_rate': 9.836065573770493e-05, 'epoch': 0.04}
{'loss': 0.9428, 'learning_rate': 0.00011475409836065574, 'epoch': 0.05}
{'loss': 0.9418, 'learning_rate': 0.00013114754098360654, 'epoch': 0.06}
{'loss': 0.9139, 'learning_rate': 0.00014754098360655738, 'epoch': 0.07}
{'loss': 0.8944, 'learning_rate': 0.0001639344262295082, 'epoch': 0.07}
{'loss': 0.8798, 'learning_rate': 0.00018032786885245904, 'epoch': 0.08}
{'loss': 0.8416, 'learning_rate': 0.00019672131147540985, 'epoch': 0.09}
{'loss': 0.8476, 'learning_rate': 0.00019999790210013988, 'epoch': 0.1}
{'loss': 0.8153, 'learning_rate': 0.0001999893795328188, '

KeyboardInterrupt: 

In [16]:
wandb.finish()

In [17]:
trainer.save_model()

GatedRepoError: 401 Client Error. (Request ID: Root=1-65d30b97-7de783db232005b877a13825;2a53640e-01a4-4764-bd99-609606d431d3)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it.