In [1]:
import sys
sys.path.append("/home/pervinco/Upstage_Ai_Lab/Final/IR/src")

import os
import time
import json
import random
import warnings
import anthropic
import threading
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Disable tokenizers parallelism and hide FutureWarning messages for the rest of the session.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
load_dotenv("../keys.env")

# API keys this notebook needs. Fail early with a message that names the missing
# variables instead of the opaque TypeError raised by `os.environ[name] = None`.
_REQUIRED_API_KEYS = ("UPSTAGE_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY")
_missing_api_keys = [name for name in _REQUIRED_API_KEYS if os.getenv(name) is None]
if _missing_api_keys:
    raise EnvironmentError(
        "Missing environment variable(s): " + ", ".join(_missing_api_keys)
        + ". Define them in ../keys.env or export them before running this notebook."
    )

# The values already live in os.environ (os.getenv reads from it), so writing
# them back is unnecessary; only the module-level names are needed by later cells.
upstage_api_key = os.environ["UPSTAGE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]
anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]

# HF_TOKEN stays optional here: it is passed to login() exactly as before.
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

from config import Args
from data.data import load_document
from dense_retriever.model import load_dense_model, load_sparse_model


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
# Project configuration: supplies the corpus path and the chunking parameters.
args = Args()

# Anthropic API client used by the contextual-retrieval cells below.
client = anthropic.Anthropic(api_key=anthropic_api_key)

# Splitter that measures chunk size and overlap with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=args.chunk_size,
    chunk_overlap=args.chunk_overlap,
    length_function=len,
)

# Load the corpus and report how many documents it holds.
total_documents = load_document(path=args.doc_file_path)
print(len(total_documents))

4272


In [3]:
# Template for the first content block of the request: the full document text
# wrapped in <document> tags. `{doc_content}` is filled in with str.format().
DOCUMENT_CONTEXT_PROMPT = """
<document>
{doc_content}
</document>
"""

# Template for the second content block: a Korean-language instruction that
# shows the chunk (`{chunk_content}`, filled in with str.format()) and asks for
# a concise Korean description of where the chunk sits in the whole document.
# It includes one example input and one example output.
CHUNK_CONTEXT_PROMPT = """
전체 문서 내에 배치하려는 청크는 다음과 같습니다.
<chunk>
{chunk_content}
</chunk>

이 청크가 전체 문서에서 어떤 맥락에 속하는지 설명하는 간결한 문맥을 한국어로 작성하세요. 답변은 이 청크에 관한 짧고 구체적인 배경 설명을 포함해야 하며, 청크가 문서의 어느 부분에서 나온 것인지에 대한 정보를 제공해야 합니다.

    입력 예시:
        회사의 매출이 전 분기 대비 3% 증가했습니다.

    주어진 청크 예시에서는 '회사'가 어떤 회사를 말하는 것인지, '전 분기'가 정확하게 몇년도 몇분기에 대한 것인지 정보가 포함되어 있지 않습니다. 따라서 당신은 아래 출력 예시처럼 입력되는 청크를 읽고 정보검색에 유용하도록 더 명확한 청크로 재구성해야합니다.

    출력 예시: 
        이 청크는 2023년 2분기에 ACME 회사의 실적을 다룬 SEC 보고서에서 발췌되었습니다. 이전 분기의 수익은 3억 1천 4백만 달러였으며, 회사의 수익은 이전 분기 대비 3% 증가했습니다.
"""

# Running token-usage totals, one counter per usage category, all starting at zero.
token_counts = dict.fromkeys(('input', 'output', 'cache_read', 'cache_creation'), 0)

# Lock to hold while updating token_counts.
token_lock = threading.Lock()

In [4]:
def situate_context(doc: str, chunk: str, max_retries=5):
    """Ask the Anthropic model to describe where ``chunk`` sits inside ``doc``.

    The document is sent as the first content block, marked with
    ``cache_control: ephemeral``; the chunk instruction is sent as the second
    block. Token usage from the response is added to the global
    ``token_counts`` while holding ``token_lock``.

    Args:
        doc: Full text of the source document.
        chunk: Text of the chunk to situate within the document.
        max_retries: Total number of attempts made when the API reports a
            rate limit. Must be at least 1.

    Returns:
        The response object returned by
        ``client.beta.prompt_caching.messages.create`` (not a string); callers
        read the generated text from ``response.content[0].text``.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously the
            function silently returned ``None`` in that case).
        anthropic.RateLimitError: If the last attempt is still rate-limited.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(max_retries):
        try:
            response = client.beta.prompt_caching.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=1024,
                temperature=0.0,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc),
                                "cache_control": {"type": "ephemeral"}
                            },
                            {
                                "type": "text",
                                "text": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk),
                            }
                        ]
                    }
                ],
                extra_headers={"anthropic-beta": "prompt-caching-2024-07-31"}
            )
        except anthropic.RateLimitError:
            if attempt == max_retries - 1:
                raise
            # Exponential backoff (1s, 2s, 4s, ...) plus up to 0.1s of jitter.
            wait_time = (2 ** attempt) + (random.random() * 0.1)
            print(f"Rate limit hit. Waiting for {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)
            continue

        # A missing or None usage field must not discard a successful response,
        # so every counter falls back to 0.
        usage = response.usage
        with token_lock:
            token_counts['input'] += getattr(usage, 'input_tokens', None) or 0
            token_counts['output'] += getattr(usage, 'output_tokens', None) or 0
            token_counts['cache_read'] += getattr(usage, 'cache_read_input_tokens', None) or 0
            token_counts['cache_creation'] += getattr(usage, 'cache_creation_input_tokens', None) or 0
        return response

In [5]:
def process_chunk(document, chunk):
    """Build the output record for one chunk of ``document``.

    Calls ``situate_context`` with the document's ``page_content`` and the
    chunk, then returns a dict with the document's ``docid`` and a ``content``
    string made of the chunk, a blank line, and the generated context text.
    """
    response = situate_context(document.page_content, chunk)
    docid = document.metadata['docid']
    generated_context = response.content[0].text
    return dict(
        docid=docid,
        content="{}\n\n{}".format(chunk, generated_context),
    )

In [6]:
def process_documents(documents, text_splitter, output_file, parallel_threads=5, append=False):
    """Split each document into chunks, contextualize them, and write JSONL.

    For every document the chunks are submitted to a thread pool running
    ``process_chunk``; each successful result is written to ``output_file`` as
    one JSON line, in completion order. A chunk whose processing raises is
    reported and skipped rather than aborting the run.

    Args:
        documents: Iterable of documents exposing ``page_content`` and
            ``metadata['docid']``.
        text_splitter: Object whose ``split_text(str)`` returns the chunks.
        output_file: Path of the JSONL file to write.
        parallel_threads: Number of worker threads for chunk processing.
        append: When True, add to an existing ``output_file`` instead of
            truncating it (useful when resuming). Defaults to False, which
            keeps the original truncate-on-open behaviour.

    Returns:
        dict: ``{"written": int, "failed": int}`` chunk counts for this call.
    """
    file_mode = 'a' if append else 'w'
    written = 0
    failed = 0

    with open(output_file, file_mode, encoding='utf-8') as f:
        with ThreadPoolExecutor(max_workers=parallel_threads) as executor:
            for document in tqdm(documents, desc="Processing documents"):
                chunks = text_splitter.split_text(document.page_content)
                # Remember each future's chunk number so a failure can be traced back.
                future_to_chunk_no = {
                    executor.submit(process_chunk, document, chunk): chunk_no
                    for chunk_no, chunk in enumerate(chunks, start=1)
                }

                for future in as_completed(future_to_chunk_no):
                    try:
                        result = future.result()
                        f.write(json.dumps(result, ensure_ascii=False) + '\n')
                        written += 1
                    except Exception as e:
                        failed += 1
                        # Look the id up defensively: the failure itself may be a missing docid.
                        docid = (getattr(document, 'metadata', None) or {}).get('docid', '<unknown>')
                        chunk_no = future_to_chunk_no[future]
                        print(
                            f"Error processing chunk {chunk_no}/{len(chunks)} "
                            f"of document {docid}: {e!r}"
                        )

                # Push finished documents to disk so a killed kernel loses at most one document.
                f.flush()

                # Pause between documents to ease pressure on the API rate limit.
                time.sleep(random.uniform(1, 2))

    return {"written": written, "failed": failed}

In [None]:
output_file = '../dataset/antropic_contextual_retrieval_documents.jsonl'
progress_file = 'progress.json'
# Results of this run are staged here and merged into output_file at the end,
# so resuming never truncates what earlier runs already produced.
run_output_file = output_file + '.part'

# Load the resume point (index of the last fully processed document).
# Delete progress.json to start over from the first document.
try:
    with open(progress_file, 'r') as f:
        progress = json.load(f)
        start_index = progress['last_processed_index'] + 1
except FileNotFoundError:
    start_index = 0


class _CountingDocuments:
    """Sized iterable that records how many documents have been handed out.

    process_documents() asks for the next document only after it has finished
    the current one, so the count tells us how far the run really got.
    """

    def __init__(self, documents):
        self._documents = documents
        self.handed_out = 0
        self.exhausted = False

    def __len__(self):
        # Lets tqdm keep showing a total.
        return len(self._documents)

    def __iter__(self):
        for document in self._documents:
            self.handed_out += 1
            yield document
        self.exhausted = True

    @property
    def completed(self):
        """Number of documents that were fully processed."""
        if self.exhausted:
            return self.handed_out
        # The most recently handed-out document was still in flight.
        return max(self.handed_out - 1, 0)


pending_documents = _CountingDocuments(total_documents[start_index:])

# Drop a stale staging file left behind by a run that was killed outright.
if os.path.exists(run_output_file):
    os.remove(run_output_file)

try:
    process_documents(pending_documents, text_splitter, run_output_file)
except KeyboardInterrupt:
    print("작업이 중단되었습니다. 진행 상황을 저장합니다.")
finally:
    completed = pending_documents.completed

    # The document that was in flight when the run stopped is redone on resume,
    # so its partial output is left out of the merge to avoid duplicate chunks.
    # Assumes docid values are unique across documents.
    unfinished_docid = None
    if not pending_documents.exhausted and pending_documents.handed_out > 0:
        unfinished_docid = total_documents[start_index + completed].metadata['docid']

    if os.path.exists(run_output_file):
        # A fresh run (start_index == 0) replaces the output; a resumed run appends.
        merge_mode = 'a' if start_index > 0 else 'w'
        with open(run_output_file, 'r', encoding='utf-8') as staged, \
                open(output_file, merge_mode, encoding='utf-8') as merged:
            for line in staged:
                if unfinished_docid is not None and json.loads(line)['docid'] == unfinished_docid:
                    continue
                merged.write(line)
        os.remove(run_output_file)

    # Record what was actually finished, not the size of the whole corpus.
    with open(progress_file, 'w') as f:
        json.dump({'last_processed_index': start_index + completed - 1}, f)

# Print token usage
print(f"Total input tokens: {token_counts['input']}")
print(f"Total output tokens: {token_counts['output']}")
print(f"Total tokens read from cache: {token_counts['cache_read']}")
print(f"Total tokens written to cache: {token_counts['cache_creation']}")

total_tokens = token_counts['input'] + token_counts['cache_read'] + token_counts['cache_creation']
savings_percentage = (token_counts['cache_read'] / total_tokens) * 100 if total_tokens > 0 else 0
print(f"Total input token savings from prompt caching: {savings_percentage:.2f}% of all input tokens used were read from cache.")

Processing documents:   0%|          | 20/4272 [02:05<8:24:36,  7.12s/it]

Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:   0%|          | 21/4272 [02:16<9:56:14,  8.42s/it]

Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:   2%|▏         | 103/4272 [12:19<7:35:03,  6.55s/it]

Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:   3%|▎         | 108/4272 [13:11<10:13:12,  8.84s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:   3%|▎         | 128/4272 [15:49<8:15:59,  7.18s/it] 

Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:   3%|▎         | 136/4272 [16:58<7:57:46,  6.93s/it] 

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.10 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:   3%|▎         | 142/4272 [17:47<7:41:07,  6.70s/it] 

Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:   3%|▎         | 144/4272 [18:06<9:05:20,  7.93s/it]

Rate limit hit. Waiting for 1.09 seconds before retry.
Rate limit hit. Waiting for 1.10 seconds before retry.
Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 2.03 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:   4%|▎         | 159/4272 [20:03<7:04:07,  6.19s/it] 

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:   4%|▍         | 192/4272 [24:31<9:00:31,  7.95s/it] 

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:   5%|▍         | 205/4272 [26:20<7:44:35,  6.85s/it] 

Rate limit hit. Waiting for 1.01 seconds before retry.


Processing documents:   5%|▌         | 231/4272 [29:24<5:46:37,  5.15s/it] 

Rate limit hit. Waiting for 1.03 seconds before retry.
Rate limit hit. Waiting for 1.07 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.07 seconds before retry.
Rate limit hit. Waiting for 2.09 seconds before retry.
Rate limit hit. Waiting for 2.00 seconds before retry.


Processing documents:   5%|▌         | 232/4272 [29:49<12:22:06, 11.02s/it]

Rate limit hit. Waiting for 1.04 seconds before retry.
Rate limit hit. Waiting for 1.10 seconds before retry.


Processing documents:   6%|▌         | 252/4272 [32:12<7:08:59,  6.40s/it] 

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:   6%|▌         | 254/4272 [32:34<9:32:06,  8.54s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:   6%|▌         | 260/4272 [33:10<5:44:14,  5.15s/it] 

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:   6%|▋         | 271/4272 [34:27<6:25:41,  5.78s/it]

Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:   8%|▊         | 329/4272 [41:52<9:20:01,  8.52s/it] 

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:   8%|▊         | 332/4272 [42:20<9:33:18,  8.73s/it] 

Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:   8%|▊         | 340/4272 [43:30<8:24:54,  7.70s/it] 

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:   8%|▊         | 347/4272 [44:39<10:23:47,  9.54s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:   8%|▊         | 349/4272 [45:01<11:21:16, 10.42s/it]

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:   9%|▊         | 369/4272 [47:28<7:27:08,  6.87s/it] 

Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.
Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 2.01 seconds before retry.


Processing documents:   9%|▉         | 378/4272 [48:51<7:25:11,  6.86s/it] 

Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:   9%|▉         | 391/4272 [50:24<7:35:40,  7.04s/it]

Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  10%|▉         | 416/4272 [53:20<8:19:11,  7.77s/it]

Rate limit hit. Waiting for 1.04 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 2.03 seconds before retry.


Processing documents:  11%|█         | 456/4272 [58:19<7:06:08,  6.70s/it] 

Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:  11%|█         | 462/4272 [59:08<7:17:53,  6.90s/it]

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:  11%|█         | 464/4272 [59:30<9:06:50,  8.62s/it] 

Rate limit hit. Waiting for 1.05 seconds before retry.


Processing documents:  12%|█▏        | 495/4272 [1:03:17<6:26:36,  6.14s/it] 

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  12%|█▏        | 531/4272 [1:07:51<6:17:10,  6.05s/it]

Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  13%|█▎        | 536/4272 [1:08:34<7:07:46,  6.87s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:  13%|█▎        | 542/4272 [1:09:34<9:27:57,  9.14s/it] 

Rate limit hit. Waiting for 1.01 seconds before retry.


Processing documents:  13%|█▎        | 555/4272 [1:11:01<6:41:30,  6.48s/it]

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  13%|█▎        | 576/4272 [1:13:51<8:47:37,  8.57s/it]

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:  14%|█▍        | 617/4272 [1:18:47<5:54:46,  5.82s/it] 

Rate limit hit. Waiting for 1.00 seconds before retry.
Rate limit hit. Waiting for 1.10 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  15%|█▌        | 654/4272 [1:23:25<7:45:42,  7.72s/it] 

Rate limit hit. Waiting for 1.10 seconds before retry.


Processing documents:  16%|█▌        | 668/4272 [1:25:12<7:21:19,  7.35s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 2.02 seconds before retry.
Rate limit hit. Waiting for 2.07 seconds before retry.


Processing documents:  16%|█▌        | 671/4272 [1:25:51<9:44:49,  9.74s/it] 

Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 2.00 seconds before retry.


Processing documents:  16%|█▌        | 680/4272 [1:27:14<7:04:00,  7.08s/it] 

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:  16%|█▌        | 684/4272 [1:27:46<7:06:48,  7.14s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.
Rate limit hit. Waiting for 2.01 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.
Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 4.06 seconds before retry.
Rate limit hit. Waiting for 2.05 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.
Rate limit hit. Waiting for 1.10 seconds before retry.


Processing documents:  16%|█▋        | 701/4272 [1:30:26<6:23:32,  6.44s/it] 

Rate limit hit. Waiting for 1.03 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  17%|█▋        | 706/4272 [1:31:21<8:26:46,  8.53s/it] 

Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  17%|█▋        | 722/4272 [1:33:17<5:57:22,  6.04s/it] 

Rate limit hit. Waiting for 1.00 seconds before retry.


Processing documents:  17%|█▋        | 726/4272 [1:33:56<8:08:12,  8.26s/it]

Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  17%|█▋        | 742/4272 [1:35:44<5:46:17,  5.89s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.00 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  18%|█▊        | 757/4272 [1:37:51<7:26:21,  7.62s/it]

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  18%|█▊        | 758/4272 [1:38:03<8:38:51,  8.86s/it]

Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:  18%|█▊        | 778/4272 [1:40:45<8:38:37,  8.91s/it]

Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:  18%|█▊        | 785/4272 [1:41:42<6:29:14,  6.70s/it] 

Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:  18%|█▊        | 788/4272 [1:42:09<7:20:11,  7.58s/it]

Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 1.03 seconds before retry.
Rate limit hit. Waiting for 1.05 seconds before retry.


Processing documents:  19%|█▊        | 792/4272 [1:43:05<11:03:48, 11.45s/it]

Rate limit hit. Waiting for 1.09 seconds before retry.
Rate limit hit. Waiting for 1.07 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:  19%|█▉        | 808/4272 [1:45:01<6:41:24,  6.95s/it] 

Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:  19%|█▉        | 809/4272 [1:47:32<48:22:46, 50.29s/it]

Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:  19%|█▉        | 813/4272 [1:54:48<84:40:04, 88.12s/it]

Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:  19%|█▉        | 815/4272 [1:58:51<97:03:56, 101.08s/it] 

Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:  19%|█▉        | 816/4272 [2:01:48<118:47:56, 123.75s/it]

Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:  19%|█▉        | 820/4272 [2:07:58<92:17:53, 96.26s/it]  

Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:  19%|█▉        | 822/4272 [2:12:26<106:09:15, 110.77s/it]

Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  19%|█▉        | 824/4272 [2:17:49<123:49:34, 129.28s/it]

Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:  19%|█▉        | 827/4272 [2:23:49<111:43:13, 116.75s/it]

Rate limit hit. Waiting for 1.03 seconds before retry.
Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  19%|█▉        | 830/4272 [2:29:27<104:19:32, 109.11s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.05 seconds before retry.


Processing documents:  20%|█▉        | 837/4272 [2:40:05<75:42:38, 79.35s/it]  

Rate limit hit. Waiting for 1.07 seconds before retry.
Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  20%|█▉        | 838/4272 [2:42:22<92:12:30, 96.67s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.


Processing documents:  20%|█▉        | 844/4272 [2:51:51<86:52:11, 91.23s/it]  

Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.01 seconds before retry.


Processing documents:  20%|█▉        | 845/4272 [2:54:07<99:32:09, 104.56s/it]

Rate limit hit. Waiting for 1.05 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:  20%|█▉        | 848/4272 [3:00:26<108:38:28, 114.23s/it]

Rate limit hit. Waiting for 1.08 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.
Rate limit hit. Waiting for 1.03 seconds before retry.


Processing documents:  20%|█▉        | 850/4272 [3:05:02<115:39:34, 121.68s/it]

Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:  20%|█▉        | 851/4272 [3:08:10<134:31:05, 141.56s/it]

Rate limit hit. Waiting for 1.10 seconds before retry.
Rate limit hit. Waiting for 1.06 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.


Processing documents:  20%|█▉        | 852/4272 [3:11:05<144:01:51, 151.61s/it]

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:  20%|█▉        | 853/4272 [3:13:08<135:43:03, 142.90s/it]

Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  20%|██        | 857/4272 [3:18:16<82:42:37, 87.19s/it]  

Rate limit hit. Waiting for 1.06 seconds before retry.


Processing documents:  20%|██        | 859/4272 [3:22:00<92:31:27, 97.59s/it] 

Rate limit hit. Waiting for 1.05 seconds before retry.


Processing documents:  20%|██        | 861/4272 [3:25:50<98:34:57, 104.05s/it] 

Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.10 seconds before retry.
Rate limit hit. Waiting for 1.09 seconds before retry.
Rate limit hit. Waiting for 2.04 seconds before retry.


Processing documents:  20%|██        | 862/4272 [3:29:47<136:12:49, 143.80s/it]

Rate limit hit. Waiting for 1.03 seconds before retry.
Rate limit hit. Waiting for 1.01 seconds before retry.
Rate limit hit. Waiting for 1.02 seconds before retry.


Processing documents:  20%|██        | 864/4272 [3:34:05<124:33:50, 131.58s/it]

Rate limit hit. Waiting for 1.07 seconds before retry.


Processing documents:  20%|██        | 865/4272 [3:36:08<122:05:02, 129.00s/it]

Rate limit hit. Waiting for 1.03 seconds before retry.
Rate limit hit. Waiting for 1.04 seconds before retry.


Processing documents:  20%|██        | 866/4272 [3:38:15<121:36:24, 128.53s/it]

Rate limit hit. Waiting for 1.05 seconds before retry.


Processing documents:  20%|██        | 870/4272 [3:44:41<95:27:47, 101.02s/it] 

In [None]:
# OpenAI client and model name used by the GPT-based contextual retrieval cells.
# No api_key argument is passed; presumably the SDK picks up OPENAI_API_KEY
# from the environment -- TODO confirm.
# NOTE(review): this rebinds the global name `client`, which situate_context()
# also reads and which was an anthropic.Anthropic instance until now. Re-running
# the Anthropic cells after this one will call the wrong client; re-run the
# setup cell first or give the two clients distinct names.
client = OpenAI()
model = "gpt-3.5-turbo"  # alternative: "gpt-4o"

# Alternative backend: Upstage Solar through the OpenAI-compatible client.
# client = OpenAI(
#     api_key=upstage_api_key,
#     base_url="https://api.upstage.ai/v1/solar"
# )
# model = "solar-pro"

In [None]:
def gpt_contextual_retrieval(document, chunk, model: str, client: "OpenAI"):
    """Ask a chat model to describe where ``chunk`` sits inside ``document``.

    Args:
        document: Full document text, or an object exposing the text as
            ``page_content`` (in which case only that text is put in the
            prompt, not the object's repr).
        chunk: Text of the chunk to situate within the document.
        model: Name of the chat model to call.
        client: Client exposing ``chat.completions.create``.

    Returns:
        The content of the first choice's message in the completion.
    """
    # A plain string has no `page_content`, so it is used as-is.
    document_text = getattr(document, "page_content", document)

    prompt = (
    """
    <document>
    {DOCUMENT}
    </document> 

    전체 문서 내에 배치하려는 청크는 다음과 같습니다.

    <chunk> 
    {CHUNK}
    </chunk>
    
    이 청크가 전체 문서에서 어떤 맥락에 속하는지 설명하는 간결한 문맥을 한국어로 작성하세요. 답변은 이 청크에 관한 짧고 구체적인 배경 설명을 포함해야 하며, 청크가 문서의 어느 부분에서 나온 것인지에 대한 정보를 제공해야 합니다.
    
    입력 예시:
        회사의 매출이 전 분기 대비 3% 증가했습니다.

    예시에 대한 설명:
        주어진 청크 예시에서는 '회사'가 어떤 회사를 말하는 것인지, '전 분기'가 정확하게 몇년도 몇분기에 대한 것인지 정보가 포함되어 있지 않습니다. 따라서 당신은 아래 출력 예시처럼 입력되는 청크를 읽고 정보검색에 유용하도록 더 명확한 청크로 재구성해야합니다.

    출력 예시: 
        이 청크는 2023년 2분기에 ACME 회사의 실적을 다룬 SEC 보고서에서 발췌되었습니다. 이전 분기의 수익은 3억 1천 4백만 달러였으며, 회사의 수익은 이전 분기 대비 3% 증가했습니다.
    """
    ).format(DOCUMENT=document_text, CHUNK=chunk)

    # Send the prompt once. It used to go out as both the system and the user
    # message, which put the whole document in the request twice.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.1
    )

    return completion.choices[0].message.content


In [None]:
# Contextualize every chunk of every document with the chat model and write one
# JSON line per chunk: {"docid": ..., "content": "<chunk>\n\n<generated context>"}.
with open('../dataset/gpt_contextual_retrieval_documents.jsonl', 'w', encoding='utf-8') as f:
    for doc_idx, document in enumerate(total_documents):
        print("=" * 50)
        print(doc_idx)
        print(f"docid : {document.metadata['docid']}")
        print(f"page_content : {document.page_content}")

        chunks = text_splitter.split_text(document.page_content)
        print(f"num of chunks : {len(chunks)}\n")

        # A separate counter name keeps the chunk number from overwriting the document index.
        for chunk_idx, chunk in enumerate(chunks, start=1):
            print(f"chunk {chunk_idx} : {chunk}")
            # Pass the document text, not the Document object, so the prompt
            # holds the text itself rather than the object's repr and metadata.
            result = gpt_contextual_retrieval(document.page_content, chunk, model, client)
            print(f"output : {result}\n")

            result_with_id = {
                "docid": document.metadata['docid'],
                "content": f"{chunk}\n\n{result}"
            }
            f.write(json.dumps(result_with_id, ensure_ascii=False) + '\n')