In [None]:
import json
import torch
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

# Load the fine-tuned KoBART tokenizer and model from the saved checkpoint.
model_checkpoint = "./kobart_best_model"  # checkpoint directory where the best model was saved
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_checkpoint)
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [2]:
# Move the model to the GPU when CUDA is available; otherwise keep it on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [3]:
# Load the novel data from the JSON file (read the text, then parse it).
with open("unlabeled_books_no_summary.json", "r", encoding="utf-8") as f:
    novels_data = json.loads(f.read())

In [4]:
# Summarization helper.
def generate_summary(description):
    """Generate a summary of ``description`` with the loaded model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        description: Text to summarize. May be ``None`` or blank.

    Returns:
        The decoded summary text, or the placeholder "줄거리 없음" when
        ``description`` is ``None``, empty, or whitespace-only.
    """
    # Nothing to summarize. The explicit None check matters because a null
    # description would otherwise raise AttributeError on .strip().
    if description is None or not description.strip():
        return "줄거리 없음"

    # Tokenize (truncating to at most 1024 tokens) and move the ids to `device`.
    input_ids = tokenizer(
        description, return_tensors="pt", max_length=1024, truncation=True
    ).input_ids.to(device)

    # Beam-search generation with gradient tracking disabled.
    # `temperature` is intentionally not passed: it only applies to sampling
    # (do_sample=True), so with beam search it had no effect on the output.
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids,
            max_length=250,
            num_beams=5,
            early_stopping=True,
        )

    # Decode the first returned sequence, dropping special tokens.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [6]:
# Summarize the description of every novel.
for novel in novels_data:
    # A missing or null description is passed as "" so generate_summary returns
    # its empty-input placeholder instead of the loop failing before anything
    # has been written to disk.
    novel["summary"] = generate_summary(novel.get("description") or "")

# Save the results to a new JSON file (ensure_ascii=False keeps non-ASCII text readable).
output_file = "unlabeled_books_with_summary.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(novels_data, f, ensure_ascii=False, indent=4)

print(f"✅ 요약 완료! 결과가 '{output_file}' 파일에 저장되었습니다.")

✅ 요약 완료! 결과가 'unlabeled_books_with_summary.json' 파일에 저장되었습니다.
