In [1]:
import sys
sys.path.append("/data/ephemeral/home/Upstage_Ai_Lab/Final/IR/src")

import os
import time
import json
import random
import warnings
import anthropic
import threading
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI
from itertools import product
from scipy.spatial.distance import cosine
from concurrent.futures import ThreadPoolExecutor, as_completed

from langchain.retrievers import EnsembleRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
load_dotenv("../keys.env")

upstage_api_key = os.getenv("UPSTAGE_API_KEY")
os.environ['UPSTAGE_API_KEY'] = upstage_api_key

openai_api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key

anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
os.environ['ANTHROPIC_API_KEY'] = anthropic_api_key

hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

from config import Args
from data.data import load_document, load_query, chunking

from sparse_retriever.model import load_sparse_model
from dense_retriever.model import load_dense_model, load_hf_encoder, load_upstage_encoder, load_openai_encoder, load_voyage_encoder

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /data/ephemeral/home/.cache/huggingface/token
Login successful


In [2]:
args = Args()

documents_data_path = "../dataset/gpt_contextual_retrieval_documents_v3.jsonl"
questions_data_path = "../dataset/processed_documents_queries.jsonl"

documents = load_document(documents_data_path)
questions = load_query(questions_data_path)

random.shuffle(questions)
questions = questions[:100]

In [3]:
retriever_weights_list = [[0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5]]
ensemble_weights_list = [[0.3, 0.3, 0.4], [0.2, 0.2, 0.6], [0.1, 0.1, 0.8]]

In [4]:
def calc_map(gt, pred):    
    sum_average_precision = 0    
    for j in pred:        
        if gt[j["eval_id"]]:            
            hit_count = 0            
            sum_precision = 0            
            for i,docid in enumerate(j["topk"][:3]):                
                if docid in gt[j["eval_id"]]:                    
                    hit_count += 1                    
                    sum_precision += hit_count/(i+1)            
            average_precision = sum_precision / hit_count if hit_count > 0 else 0        
        else:            
            average_precision = 0 if j["topk"] else 1        
        sum_average_precision += average_precision    
    return sum_average_precision/len(pred)

In [5]:
folder_path = f"./index_files/{args.encoder_method}"
os.makedirs(folder_path, exist_ok=True)

In [7]:
best_map = 0
best_params = {}

args.faiss_index_file = None
args.ensemble_models = [
    {'type': 'hf', 'name': "BAAI/bge-m3"},
    {'type': 'hf', 'name': "intfloat/multilingual-e5-large"},
    {'type': 'upstage', 'name': "solar-embedding-1-large-query"}
]

# ensemble encoders 준비
ensemble_encoders = []
for model_info in args.ensemble_models:
    model_type = model_info.get('type', 'hf')  # 기본값은 'hf'
    model_name = model_info['name']
    if model_type == 'hf':
        encoder = load_hf_encoder(model_name, args.model_kwargs, args.encode_kwargs)
    elif model_type == 'upstage':
        encoder = load_upstage_encoder(model_name)
    elif model_type == 'voyage':
        encoder = load_voyage_encoder(model_name)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    ensemble_encoders.append(encoder)

# 각 파라미터 조합에 대해 검색 및 평가 수행
for retriever_weights, ensemble_weights in product(retriever_weights_list, ensemble_weights_list):
    # 문서 chunking
    # chunk_documents = chunking(args, documents)

    # sparse retriever 로드
    sparse_retriever = load_sparse_model(documents, args.src_lang)
    sparse_retriever.k = 10

    # FAISS 인덱스 설정
    folder_path = f"/data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/{args.encoder_method}/{args.upstage_model_name}-cs{args.chunk_size}-co{args.chunk_overlap}"
    if not os.path.exists(folder_path):
        print("None")
        args.faiss_index_file = None
    else:
        print("Load")
        args.faiss_index_file = folder_path

    # dense retriever 로드
    dense_retriever = load_dense_model(args, documents).as_retriever(search_kwargs={"k": 10})

    # 앙상블 retriever 설정
    retriever = EnsembleRetriever(
        retrievers=[sparse_retriever, dense_retriever],
        weights=args.retriever_weights,
        search_type="mrr"
    )

    # 정답 레이블 생성
    gt = {}
    for question in questions:
        query, question_id = question['query'], question['metadata']['docid']
        gt[question_id] = [question_id]
    
    # 예측 결과 수집
    pred = []
    for question in tqdm(questions):
        query, question_id = question['query'], question['metadata']['docid']

        # 쿼리 임베딩 계산 및 가중치 적용
        query_embeddings = []
        for idx, encoder in enumerate(ensemble_encoders):
            query_embedding = encoder.embed_query(query)
            query_embeddings.append((query_embedding, ensemble_weights[idx]))
        
        # 검색 수행
        search_result = retriever.invoke(query)
        # for result in search_result:
            # print(result)
        
        combined_scores = []
        for doc in search_result:
            combined_similarity = 0
            for idx, (query_embedding, weight) in enumerate(query_embeddings):
                # 문서 임베딩 가져오기
                doc_embedding_key = f'embedding_{args.ensemble_models[idx]["name"]}'
                doc_embedding = doc.metadata.get(doc_embedding_key) or ensemble_encoders[idx].embed_query(doc.page_content)
                
                # 유사도 계산 및 가중치 적용
                similarity = 1 - cosine(query_embedding, doc_embedding)
                combined_similarity += weight * similarity
            combined_scores.append((doc, combined_similarity))
        
        # top-k 결과 수집
        topk_result = [doc.metadata.get('docid') for doc, _ in sorted(combined_scores, key=lambda x: x[1], reverse=True)]
        
        pred.append({
            "eval_id": question_id,
            "topk": topk_result
        })
    
    # MAP 계산
    mean_average_precision = calc_map(gt, pred)
    print(f"Parameters: chunk_size={args.chunk_size}, chunk_overlap={args.chunk_overlap}, retriever_weights={retriever_weights}, ensemble_weights={ensemble_weights}")
    print(f"Mean Average Precision (MAP): {mean_average_precision}\n")
    
    # 최적의 MAP 값과 파라미터 저장
    if mean_average_precision > best_map:
        best_map = mean_average_precision
        best_params = {
            'chunk_size': args.chunk_size,
            'chunk_overlap': args.chunk_overlap,
            'retriever_weights': retriever_weights,
            'ensemble_weights': ensemble_weights
        }


Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:32<00:00,  9.92s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.2, 0.8], ensemble_weights=[0.3, 0.3, 0.4]
Mean Average Precision (MAP): 0.9525

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:27<00:00,  9.87s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.2, 0.8], ensemble_weights=[0.2, 0.2, 0.6]
Mean Average Precision (MAP): 0.9475

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:23<00:00,  9.83s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.2, 0.8], ensemble_weights=[0.1, 0.1, 0.8]
Mean Average Precision (MAP): 0.955

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:31<00:00,  9.91s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.3, 0.7], ensemble_weights=[0.3, 0.3, 0.4]
Mean Average Precision (MAP): 0.9525

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:11<00:00,  9.72s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.3, 0.7], ensemble_weights=[0.2, 0.2, 0.6]
Mean Average Precision (MAP): 0.9475

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:05<00:00,  9.66s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.3, 0.7], ensemble_weights=[0.1, 0.1, 0.8]
Mean Average Precision (MAP): 0.955

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:21<00:00,  9.82s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.4, 0.6], ensemble_weights=[0.3, 0.3, 0.4]
Mean Average Precision (MAP): 0.9525

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:20<00:00,  9.80s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.4, 0.6], ensemble_weights=[0.2, 0.2, 0.6]
Mean Average Precision (MAP): 0.9475

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [16:23<00:00,  9.83s/it]


Parameters: chunk_size=0, chunk_overlap=0, retriever_weights=[0.4, 0.6], ensemble_weights=[0.1, 0.1, 0.8]
Mean Average Precision (MAP): 0.955

Load
FAISS 인덱스 로드 중: /data/ephemeral/home/Upstage_Ai_Lab/Final/IR/notebooks/index_files/upstage/solar-embedding-1-large-passage-cs0-co0
FAISS 인덱스 로드 완료, 총 문서 수: 24799


 36%|███▌      | 36/100 [05:47<10:18,  9.66s/it]


KeyboardInterrupt: 

In [None]:
print(f"Best Mean Average Precision (MAP): {best_map}")
print(f"Best Parameters:")
print(f"  chunk_size: {best_params['chunk_size']}")
print(f"  chunk_overlap: {best_params['chunk_overlap']}")
print(f"  retriever_weights: {best_params['retriever_weights']}")
print(f"  ensemble_weights: {best_params['ensemble_weights']}")