In [1]:
import os
import gc
import requests
import json
import pandas as pd
import numpy as np
import PyPDF2
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import random

from openai import OpenAI
from sentence_transformers import SentenceTransformer, losses, InputExample, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.finetuning import generate_qa_embedding_pairs, EmbeddingQAFinetuneDataset
from rank_bm25 import BM25Okapi

In [2]:
usa_documents = SimpleDirectoryReader(input_files=['data/ict_usa_2024.pdf']).load_data()
japan_documents = SimpleDirectoryReader(input_files=['data/ict_japan_2024.pdf']).load_data()
node_parser = SentenceSplitter(chunk_size=512)
nodes_train = node_parser.get_nodes_from_documents(usa_documents)
nodes_valid = node_parser.get_nodes_from_documents(japan_documents)
print(len(nodes_train), len(nodes_valid))

sample_weight = 0.5
if len(nodes_train) > 50:
    selected_nodes_train = random.sample(nodes_train, int(len(nodes_train) * sample_weight))
else:
    selected_nodes_train = nodes_train

if len(nodes_valid) > 50:
    selected_nodes_valid = random.sample(nodes_valid, int(len(nodes_valid) * sample_weight))
else:
    selected_nodes_valid = nodes_valid
print(len(selected_nodes_train), len(selected_nodes_valid))

llm = OpenAI(model="gpt-5-nano", temperature=0.0)

84 76
42 38


In [3]:
DATASET_PATH_TRAIN = "data/usa_train_dataset.json"

if os.path.exists(DATASET_PATH_TRAIN):
    print(f"이미 저장된 데이터셋이 있습니다. '{DATASET_PATH_TRAIN}'를 로드합니다...")
    train_dataset = EmbeddingQAFinetuneDataset.from_json(DATASET_PATH_TRAIN)
    
else:
    print("저장된 데이터셋이 없습니다. 생성을 시작합니다... (시간 소요됨)")
    
    train_dataset = generate_qa_embedding_pairs(
        nodes=selected_nodes_train,
        llm=llm,
        num_questions_per_chunk=2,
        output_path=DATASET_PATH_TRAIN
    )
    print("생성 및 저장 완료!")

print(f"데이터 준비 완료: 쿼리 {len(train_dataset.queries)}개")

print(train_dataset.queries)
print(train_dataset.relevant_docs)

이미 저장된 데이터셋이 있습니다. 'data/usa_train_dataset.json'를 로드합니다...
데이터 준비 완료: 쿼리 84개
{'b3c2606c-0943-41ee-b08a-d79feebfb14d': 'What is the numeric value provided in the context information?', '51429fb0-516a-4a7d-a035-6984f7d236b2': 'Describe the formatting of the context block around the numeric value, including any separators used.', '38a5a9f9-3307-4ce4-8122-6a3294fcf867': '객관식: 다음 중 본 문서에서 무선 허가자들에게 인센티브를 제공하는 주된 목적은 무엇인가?', 'dae4fd20-ae84-41eb-bbfe-164c215208bf': 'A) 도시 지역의 서비스 확대', '9ff4a22d-763e-4d85-8239-594e55e05f27': '문서에 제시된 두 가지 주요 목표를 각각 간단히 설명하시오. (예: AI가 생성한 콘텐츠를 탐지하고 공식 콘텐츠를 인증하기 위한 표준과 모범사례 확립; 중요 소프트웨어의 취약점을 찾아 수정할 수 있는 AI 툴 개발을 위한 프로그램 수립.)', 'ab4a35c3-b515-4896-91f8-3415bfa7fa7a': '다음 중 미국 연방 차원의 개인정보보호 정책에서 우선시되는 조치로 올바른 것을 고르시오.', 'eaf8eb0e-9291-4cb2-9f9d-07deba2e8af5': '- Q1 (다지선다형): 아래 중 문서의 목표에 포함되지 않는 것은 무엇인가?', 'f75dd53a-296b-4aeb-bea5-22c354b8dae8': 'A) AI 관련 의료 분야 위험 보고 체계 수립과 AI 지원 교육 도구 배포 지원', 'da98e0d4-d0c2-49e2-9aea-d8b41044dbc0': '객관식(다지선다형): 다음 중 이 자금의 보조금을 배분

In [4]:
model_id = 'BAAI/bge-m3'
tokenizer = AutoTokenizer.from_pretrained(model_id)

def model_tokenizer(text):
    return tokenizer.tokenize(text)

corpus_ids = list(train_dataset.corpus.keys())
corpus_texts = list(train_dataset.corpus.values())

tokenized_corpus = [model_tokenizer(doc) for doc in tqdm(corpus_texts, desc='Tokenizing')]

bm25 = BM25Okapi(tokenized_corpus)

Tokenizing:   0%|          | 0/42 [00:00<?, ?it/s]

In [5]:
train_examples = []
mining_success_count = 0

for query_id, query_text in tqdm(train_dataset.queries.items(), desc='Mining'):
    pos_doc_ids = train_dataset.relevant_docs[query_id]
    pos_doc_texts = [train_dataset.corpus[p_id] for p_id in pos_doc_ids]

    query_tokens = model_tokenizer(query_text)

    top_n_texts = bm25.get_top_n(query_tokens, corpus_texts, n=15)

    hard_neg_text = None
    for candidate_text in top_n_texts:
        if candidate_text not in pos_doc_texts:
            hard_neg_text = candidate_text
            break
    
    for pos_text in pos_doc_texts:
        if hard_neg_text:
            train_examples.append({'anchor':query_text, 'positive':pos_text, 'negative':hard_neg_text})
            mining_success_count += 1
        else:
            train_examples.append({'anchor':query_text, 'positive':pos_text})

Mining:   0%|          | 0/84 [00:00<?, ?it/s]

In [6]:
train_dataset_hf = Dataset.from_list(train_examples)

In [7]:
DATASET_PATH_VALID = "data/japan_valid_dataset.json"

if os.path.exists(DATASET_PATH_VALID):
    print(f"이미 저장된 데이터셋이 있습니다. '{DATASET_PATH_VALID}'를 로드합니다...")
    valid_dataset = EmbeddingQAFinetuneDataset.from_json(DATASET_PATH_VALID)
    
else:
    print("저장된 데이터셋이 없습니다. 생성을 시작합니다... (시간 소요됨)")
    
    valid_dataset = generate_qa_embedding_pairs(
        nodes=selected_nodes_valid,
        llm=llm,
        num_questions_per_chunk=2,
        output_path=DATASET_PATH_VALID
    )
    print("생성 및 저장 완료!")

print(f"데이터 준비 완료: 쿼리 {len(valid_dataset.queries)}개")

print(valid_dataset.queries)
print(valid_dataset.relevant_docs)

이미 저장된 데이터셋이 있습니다. 'data/japan_valid_dataset.json'를 로드합니다...
데이터 준비 완료: 쿼리 76개
{'2d5608a7-bd91-471b-b446-93bf7f270c21': 'Identify the three nations cited as sources of cyber threats in the document. Then describe the two real-world incidents referenced (SolarWinds hack and Colonial Pipeline ransomware) and summarize their impacts on data security and critical infrastructure.', '2fc72960-f28f-4907-8c56-b1395b91fc05': 'Explain the policy development announced by Japan in June 2023 requiring government contractors to comply with U.S. cybersecurity guidelines. Include who is affected (e.g., think tanks, telecoms, and the 1,000+ firms) and what changes are being made by the Government Cybersecurity Strategy Office and central ministries to strengthen information security for external partners.', 'fdc1aa68-9cab-46e3-83f0-71c5541c1b4e': '아래의 참고 자료 중 AI 정책 및 데이터 보호에 관련된 일본 정부 기관 두 곳과 각 기관의 참고 자료 제목을 매칭하시오.', 'c92dc7dc-2139-4f71-8399-e3d3aa4a449d': '- 個人情報保護委員会 — 「生成AIサービスの利⽤に関する注意喚起等」', '7bf2a

In [8]:
val_queries = valid_dataset.queries
val_corpus = valid_dataset.corpus

val_relavant_docs = {
    q_id:set(doc_ids) for q_id, doc_ids in valid_dataset.relevant_docs.items()
}

In [9]:
evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relavant_docs,
    name='val_evaluator'
)

In [10]:
batch_size = 4
loader = DataLoader(train_examples, batch_size=batch_size, shuffle=True)
model = SentenceTransformer(model_id)
loss = losses.MultipleNegativesRankingLoss(model)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
Load pretrained SentenceTransformer: BAAI/bge-m3


In [11]:
output_dir = '../../../exercisebook_large_data/korean_sentence-embedding-model'

args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    fp16=False,
    bf16=True,
    gradient_checkpointing=True,
    eval_strategy='steps',
    eval_steps=10,
    save_strategy='steps',
    save_steps=10,
    save_total_limit=2,
    logging_steps=10,
    report_to='none'
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset_hf,
    loss=loss,
    evaluator=evaluator
)

trainer.train()
model.save_pretrained(output_dir)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val Evaluator Cosine Accuracy@1,Val Evaluator Cosine Accuracy@3,Val Evaluator Cosine Accuracy@5,Val Evaluator Cosine Accuracy@10,Val Evaluator Cosine Precision@1,Val Evaluator Cosine Precision@3,Val Evaluator Cosine Precision@5,Val Evaluator Cosine Precision@10,Val Evaluator Cosine Recall@1,Val Evaluator Cosine Recall@3,Val Evaluator Cosine Recall@5,Val Evaluator Cosine Recall@10,Val Evaluator Cosine Ndcg@10,Val Evaluator Cosine Mrr@10,Val Evaluator Cosine Map@100
10,0.9857,No log,0.644737,0.776316,0.842105,0.842105,0.644737,0.258772,0.168421,0.084211,0.644737,0.776316,0.842105,0.842105,0.751489,0.721272,0.729133
20,0.8922,No log,0.631579,0.789474,0.842105,0.842105,0.631579,0.263158,0.168421,0.084211,0.631579,0.789474,0.842105,0.842105,0.747545,0.715789,0.7238
30,0.691,No log,0.631579,0.789474,0.842105,0.855263,0.631579,0.263158,0.168421,0.085526,0.631579,0.789474,0.842105,0.855263,0.751925,0.717763,0.724612
40,0.449,No log,0.631579,0.789474,0.842105,0.855263,0.631579,0.263158,0.168421,0.085526,0.631579,0.789474,0.842105,0.855263,0.751925,0.717763,0.724534


INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the val_evaluator dataset in epoch 0.47619047619047616 after 10 steps:
Information Retrieval Evaluation of the model on the val_evaluator dataset in epoch 0.47619047619047616 after 10 steps:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 76
Queries: 76
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 38

Corpus: 38

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@1: 64.47%
Accuracy@1: 64.47%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@3: 77.63%
Accuracy@3: 77.63%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@5: 84.21%
Accuracy@5: 84.21%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy

In [12]:
def evaluate_st(model_id, name, evaluator):
    os.makedirs('/data/embedding_finetuning_results', exist_ok=True)

    model = SentenceTransformer(model_id)
    result = evaluator(model)

    result_df = pd.DataFrame([result]) if isinstance(result, dict) else result
    output_path = f'/data/embedding_finetuning_results/Information-Retrieval-Evaluation_{name}_results.csv'
    result_df.to_csv(output_path, index=False)

    return result

In [13]:
original_result = evaluate_st(model_id='BAAI/bge-m3', name='original', evaluator=evaluator)
finetuned_result = evaluate_st(model_id='../../../exercisebook_large_data/korean_sentence-embedding-model', name='finetuned', evaluator=evaluator)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
Load pretrained SentenceTransformer: BAAI/bge-m3
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the val_evaluator dataset:
Information Retrieval Evaluation of the model on the val_evaluator dataset:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 76
Queries: 76
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 38

Corpus: 38

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@1: 65.79%
Accuracy@1: 65.79%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@3: 77.63%
Accuracy@3: 77.63%
INFO:sentence_

In [14]:
result_df = pd.concat([pd.DataFrame([original_result]), pd.DataFrame([finetuned_result])])
result_df

Unnamed: 0,val_evaluator_cosine_accuracy@1,val_evaluator_cosine_accuracy@3,val_evaluator_cosine_accuracy@5,val_evaluator_cosine_accuracy@10,val_evaluator_cosine_precision@1,val_evaluator_cosine_precision@3,val_evaluator_cosine_precision@5,val_evaluator_cosine_precision@10,val_evaluator_cosine_recall@1,val_evaluator_cosine_recall@3,val_evaluator_cosine_recall@5,val_evaluator_cosine_recall@10,val_evaluator_cosine_ndcg@10,val_evaluator_cosine_mrr@10,val_evaluator_cosine_map@100
0,0.657895,0.776316,0.815789,0.828947,0.657895,0.258772,0.163158,0.082895,0.657895,0.776316,0.815789,0.828947,0.750275,0.724123,0.733022
0,0.631579,0.789474,0.842105,0.855263,0.631579,0.263158,0.168421,0.085526,0.631579,0.789474,0.842105,0.855263,0.751925,0.717763,0.724534


In [15]:
del model
del loader
del evaluator
del loss
gc.collect()
torch.cuda.empty_cache()