# Sentence Transformer: 한국어 임베딩 모델 학습
#### 작성자: 고우주

In [1]:
# 0. (필요시) 설치
# pip install sentence-transformers datasets torch --upgrade

import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader

# 1. 디바이스 설정 (CUDA 사용)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# 2. 사전학습 모델 로드
model_name = 'nlpai-lab/KURE-v1'
model = SentenceTransformer(model_name, device=device)

# 3. KLUE-STS 데이터셋 로드 (train/validation만 제공)
dataset = load_dataset('mteb/KLUE-STS')

# 4. InputExample 리스트 생성 (score → 0~5 범위 → 0~1로 정규화)
def to_input_examples(split: str):
    examples = []
    for item in dataset[split]:
        normalized_score = float(item['score']) / 5.0
        examples.append(
            InputExample(
                texts=[item['sentence1'], item['sentence2']],
                label=normalized_score
            )
        )
    return examples

train_examples = to_input_examples('train')
dev_examples = to_input_examples('validation')

# 5. DataLoader 생성
train_dataloader = DataLoader(
    train_examples, 
    shuffle=True,  
    batch_size=64
)
dev_dataloader = DataLoader(
    dev_examples,   
    shuffle=False, 
    batch_size=64
)

# 6. 손실함수 정의 (CosineSimilarityLoss)
train_loss = losses.CosineSimilarityLoss(model=model)

# 7. 평가기 정의 (validation set에서 Spearman 상관계수 계산)
dev_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    dev_examples, 
    name='sts-dev', 
    write_csv=True
)

# 8. Warmup 스텝 계산 (전체 학습 스텝의 10%)
epochs = 3
total_steps = len(train_dataloader) * epochs
warmup_steps = int(0.1 * total_steps)

# 9. 모델 학습 및 평가
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=epochs,
    warmup_steps=warmup_steps,
    evaluation_steps=1000,       # 1,000스텝마다 validation 평가
    output_path='./fine_tuned_kure',
    use_amp=True                 # mixed-precision optional
)

2025-07-17 17:54:16.047631: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-17 17:54:16.050176: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-17 17:54:16.099512: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


comet_ml version 3.43.0 is installed, but version 3.43.2 or higher is required. Please update comet_ml to the latest version to enable Comet logging with pip install 'comet-ml>=3.43.2'.
Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
comet_ml version 3.43.0 is installed, but version 3.43.2 or higher is required. Please update comet_ml to the latest version to enable Comet logging with pip install 'comet-ml>=3.43.2'.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine
46,No log,No log,0.899343,0.898905
92,No log,No log,0.913797,0.914956
138,No log,No log,0.916483,0.917812


In [4]:
import torch
from sentence_transformers import SentenceTransformer, util

# 1. 디바이스 설정 (CUDA 사용 권장)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# 2. 파인튜닝된 모델 로드
model_path = './fine_tuned_kure'  # 학습 시 output_path와 동일한 경로
model = SentenceTransformer(model_path, device=device)

# 3. 테스트용 문장 입력
sentences1 = [
    "이 문장은 모델 테스트를 위한 첫 번째 문장입니다.",
    "두 번째 문장으로 모델 추론 성능을 확인합니다."
]

sentences2 = [
    "RAG는 생성 AI에서 가장 많이 사용하는 서비스입니다.",
    "청킹, 임베딩, 벡터스토어, 유사검색, 참조 생성 절차를 따릅니다."
]

# 4. 임베딩 계산
#    - convert_to_tensor=True 로 하면 GPU 텐서로 바로 반환됩니다.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# 5. 코사인 유사도 계산
cos_sim1 = util.cos_sim(embeddings1[0], embeddings1[1])
print(f"Cosine similarity: {cos_sim1.item():.4f}")

cos_sim2 = util.cos_sim(embeddings2[0], embeddings2[1])
print(f"Cosine similarity: {cos_sim2.item():.4f}")

# 6. (선택) 개별 임베딩 벡터 확인
print("Embedding1-1 vector:", embeddings1[0])
print("Embedding1-2 vector:", embeddings1[1])

print("Embedding2-1 vector:", embeddings1[0])
print("Embedding2-2 vector:", embeddings1[1])

Using device: cuda
Cosine similarity: 0.4747
Cosine similarity: 0.0488
Embedding1-1 vector: tensor([-0.0189, -0.0196,  0.0428,  ...,  0.0406,  0.0505, -0.0416],
       device='cuda:0')
Embedding1-2 vector: tensor([-0.0876, -0.0315,  0.0454,  ...,  0.0424,  0.0327, -0.0437],
       device='cuda:0')
Embedding2-1 vector: tensor([-0.0189, -0.0196,  0.0428,  ...,  0.0406,  0.0505, -0.0416],
       device='cuda:0')
Embedding2-2 vector: tensor([-0.0876, -0.0315,  0.0454,  ...,  0.0424,  0.0327, -0.0437],
       device='cuda:0')
