In [1]:
import os
import re
import json
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI

from langchain.schema import Document

from matplotlib import font_manager, rc
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font = font_manager.FontProperties(fname = font_path).get_name()
rc('font', family = font)

from dotenv import load_dotenv
load_dotenv("../keys.env")

openai_api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key

hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
client = OpenAI()
model = "gpt-4o"

In [3]:
def load_jsonl(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

def load_document(path="../dataset/labeled_documents.jsonl"):
    raw_documents = load_jsonl(path)

    documents = []
    for doc in raw_documents:
        doc_id = doc['docid']
        content = doc['content']

        documents.append(Document(page_content=content, metadata={"docid": doc_id}))

    return documents

In [4]:
def clean_json_response(response):
    # 코드 블록(예: ```json, ```) 제거
    cleaned_response = re.sub(r'```(?:json)?', '', response).strip()
    return cleaned_response

def translate_ko2en_doc(document, model:str, client:OpenAI):
    prompt = (
        "당신은 한국어를 영어로 번역하는 전문가입니다."
        "주어진 한국어 문서를 읽고 나서 가장 최고 품질의 영어 문서로 번역해야 합니다."
        "반환하는 형식은 반드시 JSON 포맷이어야 하며, 모든 문자열은 쌍따옴표로 감싸야 합니다. "
        "형식은 다음과 같아야 합니다: "
        '{ "translated_doc": "번역한 문서 내용" }. '

    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role" : "system", "content" : prompt},
            {"role" : "user", "content" : document}
        ],
    )
    
    response = completion.choices[0].message.content
    response = clean_json_response(response)
    
    try:
        json_response = json.loads(response)
    except json.JSONDecodeError:
        return {"error": "Invalid JSON response", "response": response}
    
    return json_response

In [5]:
def process_documents(documents, model, client, output_path='../dataset/en_documents.jsonl'):
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for doc in tqdm(documents):
            ko_content = doc.page_content
            translated = translate_ko2en_doc(ko_content, model, client)
            
            # 문서에 생성한 제목을 추가
            en_content = translated["translated_doc"]
                
            # 새로운 문서 구조
            en_document = {
                "docid": doc.metadata["docid"],
                "content": en_content,
                "ko_content" : ko_content
                }
                
            json.dump(en_document, outfile, ensure_ascii=False)
            outfile.write('\n')

In [6]:
documents = load_document()
process_documents(documents, model, client)

 23%|██▎       | 997/4272 [28:41<1:34:16,  1.73s/it]


KeyError: 'translated_doc'