In [2]:
from datasets import load_dataset

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from huggingface_hub import notebook_login

model_name = 'mistralai/Mistral-7B-v0.1'
data_name = 'heegyu/open-korean-instructions'
fine_tuning_model_name = f'{model_name}-finetuned-open-korean-instructions'
output_dir = "./test/checkpoint-2790"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right' 

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
)
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config, # 양자화 설정
                                                  use_cache=False) # 모델이 출력을 캐시할지 여부)

trained_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto'
)

lora_merged_model = trained_model.merge_and_unload()
lora_merged_model.save_pretrained('merged', safe_serialization=True)
tokenizer.save_pretrained('merged')
prompt = '<usr> 한국의 수도가 어디야? <bot>'
input_ids = tokenizer(prompt, return_tensors='pt', truncation=True).input_ids.cuda()

print(f"-------------------------\n")
print(f"Prompt:\n{prompt}\n")
print(f"-------------------------\n")

print(f"Base Model Response :\n")
output_base = base_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(output_base.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")

print(f"Trained Model Response :\n")
trained_model = lora_merged_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(trained_model.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")



ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [7]:
# prompt = '<usr> 붕어빵 3개를 친구 3명과 나눠먹는 방법 <bot>'
prompt = '딥러닝이 뭐야?'
input_ids = tokenizer(prompt, return_tensors='pt', truncation=True).input_ids.cuda()

print(f"-------------------------\n")
print(f"Prompt:\n{prompt}\n")
print(f"-------------------------\n")

print(f"Trained Model Response :\n")
trained_model = lora_merged_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(trained_model.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")\
    
print(f"Base Model Response :\n")
output_base = base_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(output_base.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


-------------------------

Prompt:
딥러닝이 뭐야?

-------------------------

Trained Model Response :



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



<bot> 딥 러닝은 인공 지능 분야의 하위 분야입니다. 딥 러닝은 뉴런과 층을 사용하여 복잡한 데이터 세트를 처리하는 데 중점을 둡니다. 이러한 층은 입력 데이터를 처리하고 결과를 출력하는 데 사용됩니다. 딥 러닝은 이미지, 음성, 텍스트 및 기타 형식의 데이터를 분류하거나 분석하는 데 사용됩니다.
-------------------------

Base Model Response :



- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 입니다.

- 딥러닝은 컴퓨터가 학습할 수 있는 알고리즘 
-------------------------



: 

In [4]:
from datasets import load_dataset

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from huggingface_hub import notebook_login

model_name = 'mistralai/Mistral-7B-v0.1'
ft_model_name = 'letgoofthepizza/Mistral-7B-v0.1-finetuned-open-korean-instructions'

tokenizer_base = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer_base.pad_token = tokenizer_base.eos_token
tokenizer_base.padding_side = 'right' 

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
)
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config, # 양자화 설정
                                                  use_cache=False) # 모델이 출력을 캐시할지 여부)


prompt = '<usr> 한국의 수도가 어디야? <bot>'
input_ids = tokenizer(prompt, return_tensors='pt', truncation=True).input_ids.cuda()

print(f"-------------------------\n")
print(f"Prompt:\n{prompt}\n")
print(f"-------------------------\n")

print(f"Base Model Response :\n")
output_base = base_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(output_base.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")




`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacty of 23.69 GiB of which 169.94 MiB is free. Including non-PyTorch memory, this process has 23.51 GiB memory in use. Of the allocated memory 22.70 GiB is allocated by PyTorch, and 517.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

In [3]:
from datasets import load_dataset

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from huggingface_hub import notebook_login

model_name = 'mistralai/Mistral-7B-v0.1'
ft_model_name = 'letgoofthepizza/Mistral-7B-v0.1-finetuned-open-korean-instructions'

tokenizer_base = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer_base.pad_token = tokenizer_base.eos_token
tokenizer_base.padding_side = 'right' 

tokenizer_ft = AutoTokenizer.from_pretrained(ft_model_name, trust_remote_code=True)
tokenizer_ft.pad_token = tokenizer_ft.eos_token
tokenizer_ft.padding_side = 'right' 

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
)
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config, # 양자화 설정
                                                  use_cache=False) # 모델이 출력을 캐시할지 여부)

ft_model = AutoModelForCausalLM.from_pretrained(ft_model_name,
                                                  quantization_config=bnb_config, # 양자화 설정
                                                  use_cache=False) # 모델이 출력을 캐시할지 여부)


prompt = '<usr> 한국의 수도가 어디야? <bot>'
input_ids = tokenizer(prompt, return_tensors='pt', truncation=True).input_ids.cuda()

print(f"-------------------------\n")
print(f"Prompt:\n{prompt}\n")
print(f"-------------------------\n")

print(f"Base Model Response :\n")
output_base = base_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer_base.batch_decode(output_base.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")

print(f"Trained Model Response :\n")
trained_model = ft_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer_ft.batch_decode(trained_model.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")



`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

NameError: name 'tokenizer' is not defined

: 