In [1]:
import os
import re
import json
import faiss
import warnings
import pandas as pd
import huggingface_hub

from openai import OpenAI

from langchain.schema import Document
from langchain_community.vectorstores.faiss import FAISS

from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_upstage import UpstageEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker


# Set before any tokenizer is used — presumably to avoid the HuggingFace
# tokenizers parallelism warning (TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Hide FutureWarning messages so cell output stays readable.
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
load_dotenv("../keys.env")


def _require_env(name):
    """Return environment variable `name`, raising a clear error if it is unset.

    Assigning None into os.environ fails with "TypeError: str expected, not
    NoneType", which gives no hint about which key is missing.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"{name} is not set; add it to ../keys.env or export it before running this notebook."
        )
    return value


upstage_api_key = _require_env("UPSTAGE_API_KEY")
os.environ['UPSTAGE_API_KEY'] = upstage_api_key

openai_api_key = _require_env("OPENAI_API_KEY")
os.environ['OPENAI_API_KEY'] = openai_api_key

# HF_TOKEN is passed through unchanged (it may be None), exactly as before.
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
# Shared OpenAI client, constructed with no arguments — presumably it reads
# OPENAI_API_KEY from the environment (confirm against the openai SDK docs).
client = OpenAI()
# Chat model name passed to generate_keyword in the cells below.
model = "gpt-4o"

In [3]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        list: One decoded value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Whitespace-only lines (e.g. a stray empty line at the end of the
        # file) are not JSON values; skip them instead of failing on them.
        return [json.loads(line) for line in f if line.strip()]

def load_document(doc_path):
    """Load a JSONL corpus and wrap every record in a `Document`.

    Each record is indexed by 'docid' and 'content'; the content becomes
    `page_content` and the id is stored in `metadata["docid"]`.

    Args:
        doc_path: Path of the JSONL file, forwarded to `load_jsonl`.

    Returns:
        list: One `Document` per record, in file order.
    """
    def _to_document(record):
        # Look up 'docid' before 'content', matching the original access order.
        record_id, text = record['docid'], record['content']
        return Document(page_content=text, metadata={"docid": record_id})

    return [_to_document(record) for record in load_jsonl(doc_path)]

# Load the corpus once; the sampling and labeling cells below read this list.
documents = load_document("../dataset/en_4.0_document.jsonl")

In [4]:
def clean_json_response(response):
    """Strip Markdown code-fence markers from a model reply.

    Every triple-backtick marker is removed, together with a directly
    following 'json' language tag in any letter case, and the result is
    trimmed of surrounding whitespace.

    Args:
        response: Raw reply text, or None when the model returned no text.

    Returns:
        str: The reply without fence markers; an empty string for None.
    """
    if response is None:
        # re.sub raises TypeError on None; treat "no text" as empty text.
        return ""
    # Remove code-fence markers (e.g. ```json, ```).
    return re.sub(r'```(?:json)?', '', response, flags=re.IGNORECASE).strip()

In [9]:
def generate_keyword(document, model:str, client:OpenAI):
    """Ask a chat model for three questions and a one-word domain for a document.

    Despite the name, the prompt requests questions and a domain, not keywords.

    Args:
        document: Document text, sent as the user message.
        model: Chat model name forwarded to the API call.
        client: OpenAI client used to issue the chat completion.

    Returns:
        dict: The parsed JSON object from the model (the prompt asks for the
        keys "question1", "question2", "question3" and "domain"), or
        {"error": <reason>, "response": <raw text>} when the reply is empty,
        is not valid JSON, or is valid JSON but not an object.
    """
    prompt = (
        "당신은 주어진 문서를 읽고 이해하여 핵심을 파악하는 언어 전문가입니다. "
        "주어진 문서를 읽은 후에 문서와 관련성이 높은 질문을 세 가지 생성해야 합니다."
        "추가적으로 해당 문서가 어떤 분야에 해당하는지 하나의 단어로 구성된 도메인을 선정해야 합니다. "
        "반환하는 형식은 반드시 JSON 포맷이어야 하며, 모든 문자열은 쌍따옴표로 감싸야 합니다. "
        "형식은 다음과 같아야 합니다: "
        '{ "question1": "생성한 첫번째 질문", "question2" : "생성한 두번째 질문", "question3" : "생성한 세번째 질문", "domain" : "생성한 도메인"}. '
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role" : "system", "content" : prompt},
            {"role" : "user", "content" : document}
        ],
    )

    response = completion.choices[0].message.content
    if response is None:
        # The SDK types message.content as optional; report a missing reply in
        # the same shape as the other failures instead of crashing in re.sub.
        return {"error": "Empty response", "response": response}

    response = clean_json_response(response)

    try:
        json_response = json.loads(response)
    except json.JSONDecodeError:
        return {"error": "Invalid JSON response", "response": response}

    if not isinstance(json_response, dict):
        # A bare list/string/number is valid JSON but cannot carry the
        # requested keys, so callers doing `"domain" in result` would misread it.
        return {"error": "JSON response is not an object", "response": response}

    return json_response

In [10]:
# Try the prompt on the first document before labeling the whole corpus.
sample_doc = documents[0]
print(sample_doc.page_content)

result = generate_keyword(sample_doc.page_content, model, client)
print(result)

It is important for a healthy person to maintain energy balance in equilibrium. Energy balance signifies the mathematical equivalence of energy intake and energy expenditure. Generally, a healthy person achieves energy balance over a period of 1-2 weeks. During this period, one must regulate energy intake and energy expenditure through a proper diet and appropriate exercise. A diet should include nutritious foods and an intake of adequate calories. Additionally, exercise promotes energy expenditure and strengthens muscles. Maintaining energy balance in this way helps maintain health and prevents issues such as obesity or malnutrition. Therefore, it is crucial for a healthy person to maintain energy balance in equilibrium, which requires adjusting diet and exercise over a period of 1-2 weeks.
{'question1': 'What is energy balance and why is it important for a healthy person?', 'question2': 'How often does a healthy person typically achieve energy balance?', 'question3': 'What are the co

In [11]:
# Keys the prompt asks the model to return for every document.
LABEL_KEYS = ("domain", "question1", "question2", "question3")

labeled_documents = []

# The output file is opened (and truncated) before the loop and one JSON line
# is written per document, so labels that were already generated are on disk
# if a later API call fails part-way through the corpus.
with open("../dataset/en_4.0_processed_documents_queries.jsonl", 'w', encoding='utf-8') as f:
    for idx, doc in enumerate(documents):
        docid = doc.metadata.get('docid')
        content = doc.page_content
        print(f"{idx}\n{docid}\n{content}")

        result = generate_keyword(content, model, client)
        print(f"{result}\n") ## questions and domain

        if not isinstance(result, dict):
            # Anything other than a dict cannot hold the labels; treat as empty.
            result = {}

        # Copy each returned label onto the document; report the missing ones.
        for key in LABEL_KEYS:
            if key in result:
                doc.metadata[key] = result[key]
            else:
                print(f"'{key}' key not found in result for document {doc.metadata['docid']}")

        entry = {
            "docid": doc.metadata["docid"],
            "content": doc.page_content,
        }
        # A missing label becomes None (JSON null) instead of raising KeyError,
        # which previously aborted the loop before anything was saved.
        for key in LABEL_KEYS:
            entry[key] = doc.metadata.get(key)
        labeled_documents.append(entry)

        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')
        f.flush()

0
42508ee0-c543-4338-878e-d98c6babee66
It is important for a healthy person to maintain energy balance in equilibrium. Energy balance signifies the mathematical equivalence of energy intake and energy expenditure. Generally, a healthy person achieves energy balance over a period of 1-2 weeks. During this period, one must regulate energy intake and energy expenditure through a proper diet and appropriate exercise. A diet should include nutritious foods and an intake of adequate calories. Additionally, exercise promotes energy expenditure and strengthens muscles. Maintaining energy balance in this way helps maintain health and prevents issues such as obesity or malnutrition. Therefore, it is crucial for a healthy person to maintain energy balance in equilibrium, which requires adjusting diet and exercise over a period of 1-2 weeks.
{'question1': 'What is energy balance and why is it important for health?', 'question2': 'How can a healthy person achieve energy balance over 1-2 weeks?', 'q