In [1]:
import sys
sys.path.append("/home/pervinco/Upstage_Ai_Lab/Final/IR/src")

import os
import time
import json
import random
import warnings
import anthropic
import threading
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Runtime switches: set TOKENIZERS_PARALLELISM to "false" and hide FutureWarning noise.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
load_dotenv("../keys.env")


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set. Previously a missing key
            surfaced as an opaque ``TypeError`` when ``None`` was assigned
            back into ``os.environ``.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"{name} is not set; add it to ../keys.env or export it before running this notebook."
        )
    return value


# os.getenv reads from os.environ, so writing the same value back (as the
# earlier version did) was a no-op; only the validation is kept.
upstage_api_key = _require_env("UPSTAGE_API_KEY")
openai_api_key = _require_env("OPENAI_API_KEY")
anthropic_api_key = _require_env("ANTHROPIC_API_KEY")

# HF_TOKEN is passed through unchanged (may be None, exactly as before).
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

from config import Args
from data.data import load_document
from dense_retriever.model import load_dense_model
from sparse_retriever.model import load_sparse_model

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
# Project configuration object; the chunking parameters below are read from it.
args = Args()

# Load the pre-processed corpus and print how many documents it holds.
total_documents = load_document(path="../dataset/processed_documents.jsonl")
print(len(total_documents))

# Recursive splitter measuring chunk length with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=args.chunk_size,
    chunk_overlap=args.chunk_overlap,
    length_function=len,
)

4272


In [3]:
# Shared OpenAI client and model name; process_documents reads both as globals.
# NOTE(review): OpenAI() is called without arguments, so it presumably picks up
# OPENAI_API_KEY from the environment prepared in the first cell — confirm.
client = OpenAI()
model = "gpt-4o"

In [4]:
# Korean-language prompt template with {DOCUMENT} and {CHUNK} placeholders that
# asks for a concise Korean description of where the chunk sits in the document.
# NOTE(review): gpt_contextual_retrieval defines its own local English `prompt`,
# so this module-level template is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
prompt = """
<document>
{DOCUMENT}
</document> 
전체 문서에서 발췌한 청크는 다음과 같습니다.
<chunk> 
{CHUNK}
</chunk>

이 청크가 전체 문서의 어떤 맥락에 속하는지 한국어로 간결하게 설명하세요. 청크가 문서의 어떤 부분에서 발췌되었는지에 대한 정보를 제공하고, 청크의 배경 설명을 명확하게 해주세요.

입력 예시:
    건강한 사람이 에너지 균형을 평형 상태로 유지하는 것은 중요합니다.
예시에 대한 설명:
    이 청크는 건강한 생활습관과 관련된 영양학 문서에서 발췌되었으며, 에너지 섭취와 소비의 균형을 유지하는 방법에 대한 설명입니다. 이 설명은 특히 식단과 운동을 통한 에너지 조절의 중요성에 초점을 맞추고 있습니다.
출력 예시:
    이 청크는 영양학과 관련된 2024년 연구 보고서에서 발췌되었습니다. 이 문서에서는 에너지 균형을 유지하는 것이 건강한 생활에 얼마나 중요한지 설명하고 있으며, 특히 1-2주 동안의 에너지 섭취와 소비 조절을 강조하고 있습니다.
"""

In [5]:
def gpt_contextual_retrieval(document, chunk, model: str, client: "OpenAI",
                             max_retries: int = 3, temperature: float = 0.1):
    """Ask a chat model to describe how ``chunk`` fits into ``document``.

    The document and chunk are substituted into an English prompt template and
    sent through ``client.chat.completions.create``.

    Args:
        document: Full document text inserted at ``{DOCUMENT}``.
        chunk: Chunk text inserted at ``{CHUNK}``.
        model: Model name forwarded to the chat completions call.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        max_retries: Total number of attempts before giving up (default 3).
        temperature: Sampling temperature forwarded to the API (default 0.1).

    Returns:
        The content of the first choice's message, or ``None`` when every
        attempt raised (the last error is printed).
    """
    template = """
    <document>
    {DOCUMENT}
    </document> 
    The following chunk is extracted from the entire document:
    <chunk> 
    {CHUNK}
    </chunk>

    Please provide a concise explanation in English of how this chunk fits within the overall context of the document. Include information about where in the document the chunk was extracted from and clarify the background of the chunk.

    Input Example:
        It is important for a healthy person to maintain energy balance in equilibrium.
    Explanation for the example:
        This chunk is extracted from a nutrition-related document about healthy lifestyle, explaining how to maintain a balance between energy intake and expenditure. The explanation focuses on the importance of regulating energy through diet and exercise.

    Output Example:
        This chunk is extracted from a 2024 nutrition research report. The document explains how maintaining energy balance is crucial for a healthy lifestyle, with particular emphasis on regulating energy intake and expenditure over a 1-2 week period.
    """

    prompt = template.format(DOCUMENT=document, CHUNK=chunk)

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                # Send the prompt once. It used to be passed as both the system
                # and the user message, which duplicated the whole document in
                # every request.
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: any failure is retried, then reported as None
            # so a single bad chunk does not abort a long batch run.
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts: {e}")
                return None
            # Exponential backoff with jitter before the next attempt.
            time.sleep(2 ** attempt + random.random())

    # Only reached when max_retries <= 0 (no attempt was made).
    return None


In [6]:
def process_chunk(args):
    """Generate context for one chunk and package it with its document id.

    Args:
        args: A ``(document, chunk, model, client)`` tuple; ``document`` must
            expose ``page_content`` and ``metadata['docid']``.

    Returns:
        ``{"docid": ..., "content": "<chunk>\\n\\n<context>"}``, or ``None`` when
        no context could be generated for the chunk.
    """
    document, chunk, model, client = args
    context = gpt_contextual_retrieval(document.page_content, chunk, model, client)

    # Guard clause: propagate the failure signal unchanged.
    if context is None:
        return None

    record = {}
    record["docid"] = document.metadata['docid']
    record["content"] = f"{chunk}\n\n{context}"
    return record


def process_documents(documents, text_splitter, output_file, max_workers=5):
    """Split every document into chunks, contextualise them and write JSONL.

    Each document's chunks are submitted to a thread pool; one JSON object per
    successfully processed chunk is appended to ``output_file``.

    Args:
        documents: Iterable of objects exposing ``page_content`` (and whatever
            ``process_chunk`` reads from them).
        text_splitter: Object exposing ``split_text(text)`` returning chunks.
        output_file: Path of the JSONL file; opened with mode ``'w'``, so any
            existing file is overwritten.
        max_workers: Size of the thread pool (default 5).

    Notes:
        Reads the module-level ``model`` and ``client`` variables, so those
        must be defined before calling this function.
    """
    dropped = 0
    with open(output_file, 'w', encoding='utf-8') as f:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for document in tqdm(documents, desc="Processing documents"):
                chunks = text_splitter.split_text(document.page_content)
                futures = [executor.submit(process_chunk, (document, chunk, model, client)) for chunk in chunks]

                # Collect in submission order so the file keeps chunks in
                # document order on every run. All futures of this document
                # are awaited before moving on anyway, so this costs nothing.
                for future in futures:
                    result = future.result()
                    if result is None:
                        dropped += 1
                        continue
                    f.write(json.dumps(result, ensure_ascii=False) + '\n')

                # Persist finished documents so an interrupted run keeps them.
                f.flush()

                time.sleep(random.uniform(1, 2))  # random 1-2 s pause between documents

    if dropped:
        # Failed chunks are absent from the output; make that visible.
        print(f"{dropped} chunk(s) produced no result and were not written to {output_file}")

In [7]:
# Full run over the corpus. process_documents opens the target with mode 'w',
# so re-running this cell overwrites the file. The recorded progress bar below
# shows the run took about 6h19m for 4272 documents.
output_file = '../dataset/gpt_contextual_retrieval_documents_en_v3.jsonl'
process_documents(total_documents, text_splitter, output_file)

Processing documents: 100%|██████████| 4272/4272 [6:18:58<00:00,  5.32s/it]  
