In [1]:
import os
import re
import json
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI

from langchain.schema import Document

from matplotlib import font_manager, rc
# Resolve the family name of the font file at `font_path` and make it
# matplotlib's default font family (presumably so Korean text renders in
# figures -- TODO confirm; no plotting code is visible in this section).
# NOTE(review): the path is absolute and machine-specific; this likely fails on
# hosts where NanumGothic is not installed at this location -- verify.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font = font_manager.FontProperties(fname = font_path).get_name()
rc('font', family = font)

from dotenv import load_dotenv
# Read credentials from the local env file (kept out of the notebook itself).
load_dotenv("../keys.env")

openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    # Assigning None into os.environ raises an opaque
    # "TypeError: str expected, not NoneType"; fail with a clear message instead.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in ../keys.env or in the environment."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key

# The Hugging Face token is passed through unchanged; what happens when it is
# missing is left to huggingface_hub.login.
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
# Shared OpenAI client and chat model name used by the title-generation cells.
# The client is built with no explicit arguments; presumably it picks up the
# OPENAI_API_KEY environment variable set in the first cell -- TODO confirm.
client = OpenAI()
model = "gpt-4o"

In [3]:
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line), which
        # would otherwise make json.loads fail on an empty string.
        return [json.loads(line) for line in f if line.strip()]

def load_document(path="../dataset/labeled_documents.jsonl"):
    """Load labeled records from a JSONL file and wrap each one in a `Document`.

    Every record must provide the keys ``docid``, ``content``, ``domain``,
    ``label`` and ``question``. ``content`` becomes the page content; the other
    four values are stored in the document's metadata.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        A list of `Document` objects, one per record, in file order.
    """
    field_names = ('docid', 'content', 'domain', 'label', 'question')

    def _as_document(record):
        # Look the fields up in a fixed order so a missing key is reported the
        # same way regardless of how the Document is assembled below.
        doc_id, content, domain, label, question = (record[name] for name in field_names)
        metadata = {"docid": doc_id, "domain": domain, "label": label, "question": question}
        return Document(page_content=content, metadata=metadata)

    return [_as_document(record) for record in load_jsonl(path)]

In [4]:
def clean_json_response(response):
    """Strip Markdown code-fence markers from a model reply.

    Every occurrence of three backticks, optionally followed by ``json``, is
    removed, and the remaining text is trimmed of surrounding whitespace.

    Args:
        response: Raw text returned by the model.

    Returns:
        The text without fence markers or leading/trailing whitespace.
    """
    fence_pattern = re.compile(r'```(?:json)?')
    without_fences = fence_pattern.sub('', response)
    return without_fences.strip()

In [5]:
def generate_title(document, model:str, client:OpenAI):
    """Ask the chat model for a single title describing `document`.

    Args:
        document: Document text, sent as the user message.
        model: Chat model name passed to the completions endpoint.
        client: OpenAI client used to issue the request.

    Returns:
        The parsed JSON object (expected shape ``{"title": "..."}``) on success.
        On failure, a dict ``{"error": <reason>, "response": <raw text or None>}``:
        when the reply has no text, is not valid JSON, or is valid JSON that is
        not an object.
    """
    # System prompt (Korean): act as a language expert, read the document,
    # produce exactly one fitting title, and reply strictly as JSON of the form
    # { "title": "..." } with double-quoted strings.
    prompt = (
        "당신은 주어진 문서를 읽고 이해하여 핵심을 파악하는 언어 전문가입니다."
        "주어진 문서를 읽은 후에 문서에 가장 적합한 제목 하나를 만들어야합니다. "
        "반환하는 형식은 반드시 JSON 포맷이어야 하며, 모든 문자열은 쌍따옴표로 감싸야 합니다. "
        "형식은 다음과 같아야 합니다: "
        '{ "title": "생성한 제목" }. '
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role" : "system", "content" : prompt},
            {"role" : "user", "content" : document}
        ],
    )

    response = completion.choices[0].message.content
    if response is None:
        # The message may carry no text content; the cleaning step below needs
        # a string, so report this as an error instead of raising TypeError.
        return {"error": "Empty response", "response": None}

    response = clean_json_response(response)

    try:
        json_response = json.loads(response)
    except json.JSONDecodeError:
        return {"error": "Invalid JSON response", "response": response}

    if not isinstance(json_response, dict):
        # Valid JSON that is not an object (list, string, number) cannot hold a
        # "title" key; treat it like any other malformed reply.
        return {"error": "Invalid JSON response", "response": response}

    return json_response

In [6]:
def process_documents(documents, model, client, output_path='titled_labeled_domain.jsonl'):
    """Generate a title for every document and write the titled copies as JSONL.

    For each document a title is requested via `generate_title`. When one is
    returned, a record ``{"docid": ..., "content": "제목: <title>\\n\\n<content>"}``
    is appended to `output_path` (the file is overwritten on every call).
    Documents for which no usable title came back are not written; they are
    counted and reported once the loop finishes instead of vanishing silently.

    Args:
        documents: Iterable of objects exposing `page_content` and a `metadata`
            mapping that contains ``"docid"``.
        model: Chat model name forwarded to `generate_title`.
        client: OpenAI client forwarded to `generate_title`.
        output_path: Destination JSONL file.

    Returns:
        None.
    """
    total = 0
    skipped_docids = []

    with open(output_path, 'w', encoding='utf-8') as outfile:
        for doc in tqdm(documents):
            total += 1
            document_content = doc.page_content
            generated_title = generate_title(document_content, model, client)

            # Anything other than a dict carrying "title" is a failed generation.
            if not (isinstance(generated_title, dict) and "title" in generated_title):
                skipped_docids.append(doc.metadata.get("docid"))
                continue

            # Prepend the generated title to the original text.
            titled_content = f"제목: {generated_title['title']}\n\n{document_content}"

            # Output record: only the id and the titled content are kept.
            titled_document = {
                "docid": doc.metadata["docid"],
                "content": titled_content
            }

            # One JSON object per line; keep non-ASCII text readable.
            json.dump(titled_document, outfile, ensure_ascii=False)
            outfile.write('\n')

    if skipped_docids:
        # Surface dropped documents so the output is not mistaken for complete.
        print(
            f"Skipped {len(skipped_docids)} of {total} documents "
            f"without a usable title: {skipped_docids}"
        )

In [7]:
# Load the labeled documents from the default dataset path, then generate a
# title for each one and write the titled copies to a JSONL file.
# Each document triggers one chat-completion request; the recorded run below
# processed 4272 documents in about 49 minutes.
documents = load_document()
process_documents(documents, model, client, output_path='titled_labeled_domain.jsonl')

100%|██████████| 4272/4272 [49:04<00:00,  1.45it/s]  
