In [1]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

In [2]:
# 1. Load the "train" split of the dataset from the Hugging Face Hub
from datasets import load_dataset
import random

dataset = load_dataset("iamjoon/esg-survey-datasets", split="train")

# 2. OpenAI format으로 데이터 변환을 위한 함수 
def format_data(sample):
    """Convert one dataset row into an OpenAI-style chat record.

    Args:
        sample: Mapping that provides the keys ``system_prompt``,
            ``user_prompt`` and ``assistant``.

    Returns:
        A dict with a single ``"messages"`` key holding three
        ``{"role", "content"}`` dicts in system -> user -> assistant order.

    Raises:
        KeyError: If ``sample`` lacks one of the three source keys.
    """
    # (chat role, source column) pairs, in the order the turns must appear.
    role_to_column = (
        ("system", "system_prompt"),
        ("user", "user_prompt"),
        ("assistant", "assistant"),
    )
    conversation = [
        {"role": role, "content": sample[column]}
        for role, column in role_to_column
    ]
    return {"messages": conversation}

# 3. Apply the OpenAI-format conversion to every row
formatted_dataset = dataset.map(format_data)

# 4. Split into train/test at a 7:3 ratio (test_size=0.3) with the built-in
#    Hugging Face method; the fixed seed keeps the split reproducible
split_dataset = formatted_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# 5. Report the size of each split and its share of the full dataset
print(f"전체 데이터 수: {len(formatted_dataset)}")
print(f"학습 데이터 수: {len(train_dataset)} ({len(train_dataset)/len(formatted_dataset)*100:.1f}%)")
print(f"테스트 데이터 수: {len(test_dataset)} ({len(test_dataset)/len(formatted_dataset)*100:.1f}%)")

전체 데이터 수: 4599
학습 데이터 수: 3219 (70.0%)
테스트 데이터 수: 1380 (30.0%)


In [3]:
# Save the test split to disk under the path "test_dataset"
test_dataset.save_to_disk("test_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1380 [00:00<?, ? examples/s]

In [4]:
from transformers import  AutoTokenizer

# Hugging Face model ID whose tokenizer is loaded below
model_id = "NCSOFT/Llama-VARCO-8B-Instruct"

# Load the tokenizer only; no model weights are loaded in this cell.
# NOTE(review): generation later runs "iamjoon/llama3-8b-esg-survey-model";
# presumably it shares this tokenizer's chat template — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
prompt_lst = []
label_lst = []

# Header that opens the assistant turn in the rendered chat text. Everything up
# to and including it becomes the generation prompt; what follows (up to the
# end-of-turn token) is the gold answer.
ASSISTANT_HEADER = '<|start_header_id|>assistant<|end_header_id|>\n'

# NOTE(review): the prompt is cut out of the fully rendered conversation rather
# than rendered with add_generation_prompt=True, so any whitespace the template
# emits after the header stays at the start of the label, not the prompt —
# confirm this matches how the evaluated model was trained.
for messages in test_dataset["messages"]:
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    # Split once and reuse both halves instead of splitting the same string twice.
    parts = text.split(ASSISTANT_HEADER)
    if len(parts) < 2:
        # Fail with a clear message instead of a bare IndexError on parts[1].
        raise ValueError(
            f"assistant header {ASSISTANT_HEADER!r} not found in rendered chat text"
        )

    # Named `prompt`, not `input`: rebinding `input` at cell level would shadow
    # the builtin for the rest of the kernel session.
    prompt = parts[0] + ASSISTANT_HEADER
    label = parts[1].split('<|eot_id|>')[0]

    prompt_lst.append(prompt)
    label_lst.append(label)

In [6]:
import string
from vllm import LLM, SamplingParams

# Build the vLLM engine for the model under evaluation
llm = LLM(model="iamjoon/llama3-8b-esg-survey-model")

INFO 08-07 08:16:52 [__init__.py:239] Automatically detected platform cuda.
INFO 08-07 08:17:00 [config.py:585] This model supports multiple tasks: {'reward', 'score', 'classify', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 08-07 08:17:00 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-07 08:17:02 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='iamjoon/llama3-8b-esg-survey-model', speculative_config=None, tokenizer='iamjoon/llama3-8b-esg-survey-model', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_ba

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-07 08:17:06 [loader.py:447] Loading weights took 2.89 seconds
INFO 08-07 08:17:07 [gpu_model_runner.py:1186] Model loading took 14.9596 GB and 3.493485 seconds
INFO 08-07 08:17:13 [backends.py:415] Using cache directory: /root/.cache/vllm/torch_compile_cache/5fbafbff57/rank_0_0 for vLLM's torch.compile
INFO 08-07 08:17:13 [backends.py:425] Dynamo bytecode transform time: 6.84 s
INFO 08-07 08:17:14 [backends.py:115] Directly load the compiled graph for shape None from the cache
INFO 08-07 08:17:19 [monitor.py:33] torch.compile takes 6.84 s in total
INFO 08-07 08:17:20 [kv_cache_utils.py:566] GPU KV cache size: 418,768 tokens
INFO 08-07 08:17:20 [kv_cache_utils.py:569] Maximum concurrency for 8,192 tokens per request: 51.12x
INFO 08-07 08:17:39 [gpu_model_runner.py:1534] Graph capturing finished in 20 secs, took 0.51 GiB
INFO 08-07 08:17:39 [core.py:151] init engine (profile, create kv cache, warmup model) took 32.96 seconds


In [7]:
# Sanity check: number of prompts built from the test split
len(prompt_lst)

1380

In [8]:
# 긴 프롬프트 샘플 배제
def filter_long_prompts(prompt_lst, label_lst, max_length=8000, length_fn=len):
    """Drop prompt/label pairs whose prompt is longer than ``max_length``.

    By default the length is ``len(prompt)``, i.e. a *character* count, not a
    token count. Pass ``length_fn`` (for example a function that tokenizes the
    prompt and returns the number of tokens) to filter on another measure.

    Args:
        prompt_lst: Sequence of prompt strings.
        label_lst: Sequence of labels, aligned index-by-index with ``prompt_lst``.
        max_length: Largest allowed value of ``length_fn(prompt)`` (inclusive).
        length_fn: Callable mapping a prompt to its length. Defaults to ``len``.

    Returns:
        A tuple ``(filtered_prompts, filtered_labels, filtered_indices)`` where
        ``filtered_indices`` holds the positions of the kept samples in the
        original lists.

    Raises:
        ValueError: If ``prompt_lst`` and ``label_lst`` differ in length.
    """
    # Guard against silently pairing prompts with the wrong labels.
    if len(prompt_lst) != len(label_lst):
        raise ValueError("프롬프트와 라벨의 개수가 일치하지 않습니다.")

    filtered_prompts = []
    filtered_labels = []
    filtered_indices = []

    for i, (prompt, label) in enumerate(zip(prompt_lst, label_lst)):
        if length_fn(prompt) <= max_length:
            filtered_prompts.append(prompt)
            filtered_labels.append(label)
            filtered_indices.append(i)

    # Report how many samples were kept and how many were dropped.
    print(f"전체 {len(prompt_lst)}개 중 {len(filtered_prompts)}개 샘플 유지")
    print(f"{len(prompt_lst) - len(filtered_prompts)}개 샘플 배제됨")

    return filtered_prompts, filtered_labels, filtered_indices

# Usage: keep only prompts of at most 12645 characters (len() on a str counts
# characters, not tokens).
# NOTE(review): the rationale for the 12645 threshold is not shown here — confirm.
filtered_prompt_lst, filtered_label_lst, valid_indices = filter_long_prompts(prompt_lst, label_lst, max_length=12645)
print(len(filtered_prompt_lst))
print(len(filtered_label_lst))

전체 1380개 중 216개 샘플 유지
1164개 샘플 배제됨
216
216


In [9]:
# Sampling parameters for generation
sampling_params = SamplingParams(
    temperature=0,  # zero temperature means greedy sampling per the vLLM SamplingParams docs
    max_tokens=1024,  # upper bound on generated tokens per prompt
    stop=["\n<|end_of_text|>", "<|eot_id|>"],  # stop sequences, given as strings
)

# Generate one batch of completions for all filtered prompts
outputs = llm.generate(filtered_prompt_lst, sampling_params)

Processed prompts: 100%|██████████| 216/216 [01:31<00:00,  2.35it/s, est. speed input: 16554.54 toks/s, output: 20.22 toks/s]


In [10]:
# Turn the generation results into a list of strings: the text of the first
# completion of each request
preds = [output.outputs[0].text for output in outputs]

In [11]:
# Peek at the first five predictions
preds[:5]

['답변: 1) 예', '답변: 4) 많다', '답변: 알 수 없음', '답변: 알 수 없음', '답변: 알 수 없음']

In [12]:
# Peek at the first five gold labels for comparison with the predictions
filtered_label_lst[:5]

['\n답변: 1) 예', '\n답변: 3) 보통이다', '\n답변: 알 수 없음', '\n답변: 알 수 없음', '\n답변: 알 수 없음']

In [13]:
# Sanity check: number of predictions produced
len(preds)

216

In [14]:
import re

def calculate_accuracy(labels_list, predictions_list):
    """
    Exact-match accuracy over the answers that follow the '답변:' marker.

    For each label/prediction pair, the text after the first '답변:' (up to
    the end of that line, surrounding whitespace removed) is extracted from
    both sides and compared for equality. A text without the marker yields an
    empty answer, so two marker-less texts compare equal and count as a match.

    Args:
        labels_list: Gold texts.
        predictions_list: Predicted texts, aligned with ``labels_list``.

    Returns:
        Fraction of pairs whose extracted answers are identical, or 0.0 when
        both lists are empty.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(labels_list) != len(predictions_list):
        raise ValueError("라벨과 예측 결과의 개수가 일치하지 않습니다.")

    def answer_of(raw):
        # `.` does not match newlines, so only the rest of the marker's line is captured.
        found = re.search(r'답변:\s*(.+)', raw.strip())
        return found.group(1).strip() if found else ""

    n_pairs = len(labels_list)
    if n_pairs == 0:
        return 0.0

    n_hits = sum(
        1
        for gold, guess in zip(labels_list, predictions_list)
        if answer_of(gold) == answer_of(guess)
    )
    return n_hits / n_pairs

In [15]:
# Exact-match accuracy of the predictions against the filtered gold labels.
# Original note: "checkpoint-800" — presumably the training checkpoint that was
# evaluated here; confirm.
calculate_accuracy(filtered_label_lst, preds)

0.8240740740740741