In [1]:
# https://medium.com/@miloszivic99/finetuning-large-language-models-customize-llama-3-8b-for-your-needs-bfe0f43cd239
import re 
import os
import os.path as osp
import torch
import pandas
import pandas as pd
import json
from typing import Union
from typing import List
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from datasets import concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

bin C:\Users\any\anaconda3\envs\oosij\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
c_path = './data/classification/ukairia777/finance_data.csv' # 분류 .csv
s_path = './data/summarization/aihub_news' # 요약 폴더내.json 들
template_path = './templates/multi.json' # 템플릿  .json
dataset_name = "Smoked-Salmon-s/empathetic_dialogues_ko" # 싱글/멀티 대화형 데이터셋  허깅페이스 :

model_name = "beomi/Llama-3-Open-Ko-8B-Instruct-preview"

name = model_name.split('/')[1]
new_model = name + "_train_v5" 
new_model

'Llama-3-Open-Ko-8B-Instruct-preview_train_v5'

In [None]:
def task_classification_dataset(c_path):
  # Read the CSV file into a pandas DataFrame
  df = pd.read_csv(c_path)
  # Replace values in the 'labels' column
  df['labels'] = df['labels'].replace({'neutral': '중립', 'negative': '부정', 'positive': '긍정'})

  ## 허깅페이스 데이터셋화

  # Hugging Face의 dataset 포맷으로 변환
  huggingface_dataset = Dataset.from_pandas(df.rename(columns={'labels': 'output', 'kor_sentence': 'input'}))

  # instruction 추가 및 컬럼 순서 변경
  huggingface_dataset = huggingface_dataset.map(
      lambda example: {"instruction": '밑의 내용의 감성을 분석하고, 그것이 긍정, 중립, 아니면 부정인지 결정하고 대답해주세요. 해당 감정 레이블은 "긍정", "중립" 또는 "부정"입니다.',
                       "input": example["input"],
                       "output": example["output"]
                       },
      remove_columns=['sentence']
  )

  # 필드와 값을 추가
  sentiment_dataset = huggingface_dataset.map(
      lambda example: {'instruction': example['instruction'], 'input': example['input'], 'output': example['output'], 'source': 'github/ukairia777', 'type': 'task_classification'})

  return sentiment_dataset

def task_summarization_dataset(s_path):
  folder_path  = s_path

  # Initialize an empty list to store data
  data_list = []

  # Get a list of all files in the folder
  file_list = [f for f in os.listdir(folder_path) if f.endswith('.json')]

  # Loop through each JSON file and extract 'passage' and 'summary2'
  for file_name in file_list:
      file_path = os.path.join(folder_path, file_name)

      # Read the JSON file
      with open(file_path, 'r', encoding='utf-8') as file:
          data = json.load(file)

      # Extract 'passage' and 'summary2' fields
      passage = data.get('Meta(Refine)', {}).get('passage', '')
      summary2 = data.get('Annotation', {}).get('summary2', '')

      #if len(summary2.split('. ')) <= 1: # 원인 불명 ?  그냥 문장 길이 미만인 거만 짜르도록
      if len(summary2) <= 100: # 문장 길이가 100 이하인 것은 버리기
        continue

      # Append data to the list
      data_list.append({'file_name': file_name, 'body': passage, 'summary': summary2})

  # Convert the list of dictionaries to a pandas DataFrame
  df = pd.DataFrame(data_list)

  # Hugging Face의 dataset으로 변환
  hf_dataset = Dataset.from_pandas(df)

  # 필요한 형태로 변환
  hf_dataset = hf_dataset.map(
      lambda example: {'instruction': '밑의 내용을 요약해주세요.',
                       'input': example['body'],
                       'output': example['summary']
                       },
      remove_columns=['file_name','body','summary']
                     )
  # 필드와 값을 추가
  summary_dataset = hf_dataset.map(
      lambda example: {'instruction': example['instruction'], 'input': example['input'], 'output': example['output'], 'source': 'aihub', 'type': 'task_summarization'})
  return summary_dataset


def task_combined_dataset(c_dataset, s_dataset):
  # 두 데이터셋 합치기
  combined_dataset = concatenate_datasets([c_dataset, s_dataset])
  # 데이터셋 섞기
  combined_dataset = combined_dataset.shuffle(seed=42)  # seed 값은 원하는 값으로 변경 가능
  # instruction과 input을 합쳐서 새로운 instruction에 값을 주기
  combined_dataset = combined_dataset.map(lambda example: {'new_instruction': example['instruction'].replace('#','').strip() + '\n\n' + example['input'].rstrip(), 'output': example['output'], 'source': example['source'], 'type': example['type']})

  # 기존의 instruction과 input 삭제
  combined_dataset = combined_dataset.remove_columns(['instruction', 'input'])
  # 'new_instruction' 필드의 이름을 'instruction'으로 변경
  combined_dataset = combined_dataset.rename_column('new_instruction', 'instruction')

  return combined_dataset

# 챗 템플릿 v004 : by llama 3 
def process_dataset(dataset):
    #your_system_message = "친절한 챗봇으로서 상대방의 요청에 최대한 자세하고 친절하게 답하자. 모든 대답은 한국어(Korean)으로 대답해줘."
    your_system_message = "당신은 업무적으로 도움을 주고 공감 능력이 있는 AI 봇 픽시입니다. 다음 대화의 흐름을 보고 상대방의 요구에 맞는 답변을 해주세요."
    
    begin_token = '<|begin_of_text|>'
    bos_token = '<|eot_id|>'
    sep_start_token = '<|start_header_id|>'
    sep_end_token = '<|end_header_id|>'

    assistant_full_token = bos_token + sep_start_token + 'assistant' + sep_end_token
    user_full_token = bos_token + sep_start_token + 'user' + sep_end_token
    
    assistant_token =  sep_start_token + 'assistant' + sep_end_token
    user_token = sep_start_token + 'user' + sep_end_token
    system_token =  sep_start_token + 'system' + sep_end_token


    def format_single(instruction, output):
         return system_token +f"\n\n{your_system_message}" + user_full_token + f"\n\n{instruction}" + assistant_full_token + f"\n\n{output}<|eot_id|>"

    def format_multi(instruction, output):
        instruction_parts = instruction.split('\n')
        messages = []
        for part in range(len(instruction_parts)):
            inst_part = instruction_parts[part]
            if inst_part.startswith('질문:'):
                messages.append(user_token + '\n\n' + f"{inst_part[len('질문: '):].strip()}"+ bos_token)
            elif inst_part.startswith('답변:'):
                messages.append(assistant_token + '\n\n'f"{inst_part[len('답변: '):].strip()}"+ bos_token)

        segments = []

        for m in range(len(messages)):
            if m == 0:
                # 시스템 메시지를 첫 번째 세그먼트에만 포함시키기
                system_msg = system_token +f"\n\n{your_system_message}"  + bos_token +  f"{messages[0]}"
                segments.append(system_msg)
            else:
                inst_msg = messages[m]
                segments.append(inst_msg)

        seg_response =  assistant_token + f'\n\n{output}' + '<|eot_id|>'
        segments.append(seg_response)

        return ''.join(segments)

    # Apply formatting to each row and construct the new dataset structure
    new_columns = {key: [] for key in dataset.column_names + ['text']}
    for row in dataset:
        for key in dataset.column_names:
            new_columns[key].append(row[key])
        
        if 'multi' in row['type']:
            in_text = format_multi(row['instruction'], row['output'])
            in_text = remove_emojis(in_text)
            new_columns['text'].append(in_text)
        else:
            in_text = format_single(row['instruction'], row['output'])
            in_text = remove_emojis(in_text)
            new_columns['text'].append(in_text)

    return Dataset.from_dict(new_columns)


def remove_emojis(text):
    # 모든 이모지에 대응하는 유니코드 범위를 좀 더 세밀하게 지정
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # emoticons
                               "\U0001F300-\U0001F5FF"  # symbols & pictographs
                               "\U0001F680-\U0001F6FF"  # transport & map symbols
                               "\U0001F700-\U0001F77F"  # alchemical symbols
                               "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               "\U00002702-\U000027B0"  # Dingbats
                               #"\U000024C2-\U0001F251"  # Enclosed characters
                               "\U00002500-\U000026FF"  # Miscellaneous Symbols
                               "\U00002B50-\U00002B55"  # Additional emoticons
                               "\U0001F000-\U0001F02F"  # Mahjong tiles
                               "\U0001F0A0-\U0001F0FF"  # Playing cards
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [None]:
dataset =  load_dataset(dataset_name) # train_args.data_path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [6]:
c_dataset= task_classification_dataset(c_path)
s_dataset = task_summarization_dataset(s_path)
chat_dataset = process_dataset(dataset['train'])

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

Map:   0%|          | 0/9836 [00:00<?, ? examples/s]

Map:   0%|          | 0/9836 [00:00<?, ? examples/s]

In [7]:
print(chat_dataset['text'][622])

<|start_header_id|>system<|end_header_id|>

당신은 업무적으로 도움을 주고 공감 능력이 있는 AI 봇 픽시입니다. 다음 대화의 흐름을 보고 상대방의 요구에 맞는 답변을 해주세요.<|eot_id|><|start_header_id|>user<|end_header_id|>

결혼식 준비로 인해 스트레스 받고 있어.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

결혼식 준비로 인해 스트레스를 받고 계시군요. 아름다운 순간을 위한 준비는 때때로 힘들 수 있어요. 그래도 괜찮아요, 여러분의 특별한 날을 위한 것이니까요. 어떤 부분이 가장 도움이 필요하신가요?<|eot_id|>


In [8]:
import time

start_time = time.time()

# 'output' 값에 대한 각 인덱스 구하기
positive_indices = [idx for idx, label in enumerate(c_dataset['output']) if label == '긍정']
neutral_indices = [idx for idx, label in enumerate(c_dataset['output']) if label == '중립']
negative_indices = [idx for idx, label in enumerate(c_dataset['output']) if label == '부정']

# 각 'output' 값에 대한 필요한 수의 샘플 선택
positive_train_size = 1263
neutral_train_size = 2296
negative_train_size = 504

positive_test_size = len(positive_indices) - positive_train_size
neutral_test_size = len(neutral_indices) - neutral_train_size
negative_test_size = len(negative_indices) - negative_train_size

# 훈련 데이터 인덱스 선택
positive_train_indices = positive_indices[:positive_train_size]
neutral_train_indices = neutral_indices[:neutral_train_size]
negative_train_indices = negative_indices[:negative_train_size]

# 테스트 데이터 인덱스 선택
positive_test_indices = positive_indices[positive_train_size:]
neutral_test_indices = neutral_indices[neutral_train_size:]
negative_test_indices = negative_indices[negative_train_size:]

# 훈련 데이터셋 및 테스트 데이터셋 생성
train_dataset_indices = positive_train_indices + neutral_train_indices + negative_train_indices
test_dataset_indices = positive_test_indices + neutral_test_indices + negative_test_indices

train_dataset = Dataset.from_dict({key: [c_dataset[key][idx] for idx in train_dataset_indices] for key in c_dataset.features})
test_dataset = Dataset.from_dict({key: [c_dataset[key][idx] for idx in test_dataset_indices] for key in c_dataset.features})

# 결과 확인
print("Train set shape:", len(train_dataset))
print("Test set shape:", len(test_dataset))

end_time = time.time()
elapsed_time = end_time - start_time
print("프로그램 실행 시간: ", elapsed_time, "초")

c_dataset = train_dataset

# 데이터셋의 크기
total_samples = len(s_dataset)

# 훈련 데이터의 크기
train_size = 4492

# 훈련 데이터 인덱스 범위 선택
train_indices = list(range(train_size))

# 테스트 데이터 인덱스 범위 선택 (나머지)
test_indices = list(range(train_size, total_samples))

# 훈련 데이터와 테스트 데이터로 분할
train_dataset = s_dataset.select(train_indices)
test_dataset = s_dataset.select(test_indices)

# 데이터셋 형태 확인
print("Train set shape:", len(train_dataset))
print("Test set shape:", len(test_dataset))

s_dataset = train_dataset


start_time = time.time()

# 각 타입에 해당하는 인덱스를 구합니다.
single_indices = [idx for idx, label in enumerate(chat_dataset['type']) if label == 'single']
multi2_indices = [idx for idx, label in enumerate(chat_dataset['type']) if label == 'multi_2']
multi3_indices = [idx for idx, label in enumerate(chat_dataset['type']) if label == 'multi_3']

# 각 타입에 대해 조정할 샘플 수를 정의합니다.
single_samples = 6094
multi2_samples = 3712
multi3_samples = 10756

# 샘플 인덱스를 선택합니다.
selected_indices = (single_indices[:single_samples] + 
                    multi2_indices[:multi2_samples] + 
                    multi3_indices[:multi3_samples])

# 선택된 인덱스를 제외한 나머지를 테스트용으로 사용합니다.
test_indices = set(range(len(chat_dataset))) - set(selected_indices)

# train 및 test 데이터셋을 생성합니다.
train_dataset = chat_dataset.select(selected_indices)
test_dataset = chat_dataset.select(list(test_indices))

# 결과 확인
print("Train set shape:", len(train_dataset))
print("Test set shape:", len(test_dataset))

end_time = time.time()
elapsed_time = end_time - start_time
print("프로그램 실행 시간: ", elapsed_time, "초")

chat_dataset = train_dataset

Train set shape: 4063
Test set shape: 783
프로그램 실행 시간:  206.30734992027283 초
Train set shape: 4492
Test set shape: 5344
Train set shape: 20562
Test set shape: 6100
프로그램 실행 시간:  0.2540562152862549 초


In [7]:
task_dataset = task_combined_dataset(c_dataset, s_dataset)
task_dataset = DatasetDict({'train': task_dataset})
task_dataset = process_dataset(task_dataset['train'])
# 두 데이터셋 합치기
combined_dataset = concatenate_datasets([chat_dataset,task_dataset])
dataset = combined_dataset.shuffle(seed=42)  # seed 값은 원하는 값으로 변경 가능
dataset = dataset.shuffle(seed=42)  # seed 값은 원하는 값으로 변경 가능

Map:   0%|          | 0/14682 [00:00<?, ? examples/s]

In [8]:
print(dataset)

Dataset({
    features: ['instruction', 'output', 'source', 'type', 'text'],
    num_rows: 41344
})


In [13]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0 # 0.1


################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [14]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4 # 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.01 # 0.001

# Optimizer to use
optim = 'paged_adamw_8bit'# "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type =  'linear'#"constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 37000

# Log every X updates steps
logging_steps = 2000

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None # 2048

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
#device_map = {"": 0}
device_map = 'auto'

'\ndevice_map =  {\n    0: [0, 1, 2, 3, 4, 5, 6, 7],   # GPU 1에 8개 레이어 할당\n    1: [8, 9, 10, 11, 12, 13, 14, 15],  # GPU 2에 다음 8개 레이어 할당\n    2: [16, 17, 18, 19, 20, 21, 22, 23],  # GPU 3에 나머지 8개 레이어 할당\n    3: [24, 25, 26, 27, 28, 29, 30, 31]\n    }\n'

In [15]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit, # True 
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)# Check GPU compatibility with bfloat16

In [16]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
#model.gradient_checkpointing_enable() # 훈련속도가 약간 감소하는 대신, 메모리 사용량을 크게 줄임, 속도 저하는 20~30 % 정도
# 테스트 결과 해당 옵션 문제였음 크게 감소도 안하니 생략 
model = prepare_model_for_kbit_training(model)

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha, # 16
    lora_dropout=lora_dropout, # 0
    r=lora_r, # 16 # The number of LoRA layers 8, 16, 32, 64
    #use_gradient_checkpointing = True, # Use gradient checkpointing
    #use_rslora = False, # Use RSLora
    #use_dora = False, # Use DoRa
    #loftq_config = None, # The LoFTQ configuration
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"], # The target modules
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# trainable params: 33,554,432 || all params: 6,889,410,560 || trainable%: 0.4870435824338505

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5195983464188562


In [19]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/41344 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [20]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2000,0.9914
4000,0.8998
6000,0.8464
8000,0.795
10000,0.7829


TrainOutput(global_step=10336, training_loss=0.8595301521820918, metrics={'train_runtime': 207235.7309, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.05, 'total_flos': 1.5255286391115448e+18, 'train_loss': 0.8595301521820918, 'epoch': 2.0})

In [2]:
from tensorboard import notebook
output_dir = "./results"

log_dir = "results/runs"
notebook.start("--logdir {} --port 4001".format(log_dir))

In [None]:
import os
import time

output_dir = "./results"

# Ensure TensorBoard logs directory exists
log_dir = os.path.join(output_dir, 'runs')

# Start TensorBoard server
os.system(f'tensorboard --logdir={log_dir} --port=6006')

# Wait a bit for TensorBoard to start
time.sleep(5)

In [22]:
logging.set_verbosity(logging.CRITICAL)
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (def

In [23]:
save_path_eval = 'models/' +  new_model + '_eval'

# Save trained model / tokenizer
trainer.model.save_pretrained(save_path_eval )
trainer.tokenizer.save_pretrained(save_path_eval )

('models/Llama-3-Open-Ko-8B-Instruct-preview_train_v5_eval\\tokenizer_config.json',
 'models/Llama-3-Open-Ko-8B-Instruct-preview_train_v5_eval\\special_tokens_map.json',
 'models/Llama-3-Open-Ko-8B-Instruct-preview_train_v5_eval\\tokenizer.json')