In [None]:
import sys
sys.path.append("/home/pervinco/Upstage_Ai_Lab/Final/IR/src")

import os
import re
import json
import faiss
import random
import warnings
import pandas as pd
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI
from itertools import product

from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores.faiss import FAISS

from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_upstage import UpstageEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker

from config import Args
from data.data import load_document, load_query, chunking
from dense_retriever.model import load_dense_model
from sparse_retriever.model import load_sparse_model

# Disable HuggingFace tokenizers parallelism and hide FutureWarning noise
# emitted by the imported libraries.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
# load_dotenv exports the entries of the env file into os.environ, so the keys
# only need to be read back afterwards, not re-assigned.
load_dotenv("../keys.env")

upstage_api_key = os.getenv("UPSTAGE_API_KEY")
openai_api_key = os.getenv('OPENAI_API_KEY')

# Fail early with an actionable message instead of the opaque
# "TypeError: str expected, not NoneType" that assigning None to os.environ raises.
missing_api_keys = [
    key_name
    for key_name, key_value in (
        ("UPSTAGE_API_KEY", upstage_api_key),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if key_value is None
]
if missing_api_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_api_keys)
        + ". Define them in ../keys.env or export them before starting the kernel."
    )

# NOTE(review): when HF_TOKEN is unset, login() presumably falls back to an
# interactive prompt — confirm this is acceptable for unattended runs.
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

In [3]:
# Project configuration object; the grid search further down overwrites its
# chunk_size, chunk_overlap and retriever_weights attributes for every run.
args = Args()
# Maximum number of questions kept per language for evaluation
# (the shuffled question lists are truncated to this length).
num_questions = 1000

In [5]:
# Each JSONL file is read twice: once for its documents, once for its queries.
en_document_path = "../dataset/en_4.0_processed_documents_queries.jsonl"
ko_document_path = "../dataset/processed_documents_queries.jsonl"

en_documents = load_document(en_document_path)
en_questions = load_query(en_document_path)

ko_documents = load_document(ko_document_path)
ko_questions = load_query(ko_document_path)

# Use a dedicated, seeded RNG so the evaluation subset is identical on every
# "Restart & Run All"; otherwise MAP values from different runs are not
# comparable. A private Random instance leaves the global random state alone.
question_sampling_seed = 42
question_rng = random.Random(question_sampling_seed)

question_rng.shuffle(en_questions)
en_questions = en_questions[:num_questions]

question_rng.shuffle(ko_questions)
ko_questions = ko_questions[:num_questions]

In [None]:
def calc_map(gt, pred, k=3):
    """Compute the Mean Average Precision of ranked retrieval results.

    Args:
        gt: Mapping from evaluation id to the collection of relevant document
            ids. An empty collection marks a query for which nothing should
            be retrieved.
        pred: List of dicts, each with an "eval_id" key and a "topk" key
            holding the ranked list of retrieved document ids.
        k: Number of leading "topk" entries that are scored. Defaults to 3.

    Returns:
        The average precision averaged over all entries of ``pred``, or 0.0
        when ``pred`` is empty.

    Raises:
        KeyError: If an "eval_id" in ``pred`` is not present in ``gt``.
    """
    # Nothing to score: avoid dividing by zero below.
    if not pred:
        return 0.0

    sum_average_precision = 0
    for entry in pred:
        relevant_ids = gt[entry["eval_id"]]
        if relevant_ids:
            hit_count = 0
            sum_precision = 0
            for rank, docid in enumerate(entry["topk"][:k], start=1):
                if docid in relevant_ids:
                    hit_count += 1
                    # Precision at the rank of this hit.
                    sum_precision += hit_count / rank
            # Normalised by the number of hits found, not by len(relevant_ids).
            average_precision = sum_precision / hit_count if hit_count > 0 else 0
        else:
            # No relevant document exists: only an empty result list is correct.
            average_precision = 0 if entry["topk"] else 1
        sum_average_precision += average_precision
    return sum_average_precision / len(pred)

In [None]:
# Value ranges explored by the grid search.
chunk_sizes = [50, 100, 200, 300, 400]
chunk_overlaps = [0, 25, 50, 100]
retriever_weights_list = [[0.5, 0.5], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3], [0.3, 0.7]]

# Number of candidates requested from each individual retriever.
retriever_k = 5

# Track the best MAP seen so far and the parameters that produced it.
best_map = 0
best_params = {}

# Ground-truth labels: the only relevant document of a question is the one
# referenced by its metadata docid, which also serves as the evaluation id.
# This does not depend on the grid, so it is built once.
gt = {
    question['metadata']['docid']: [question['metadata']['docid']]
    for question in ko_questions
}

# The sparse retriever indexes the unchunked documents, so it is independent of
# every grid parameter and only needs to be built once.
sparse_retriever = load_sparse_model(ko_documents)
sparse_retriever.k = retriever_k

# Outer loop: chunking parameters (these determine the dense index).
for chunk_size, chunk_overlap in product(chunk_sizes, chunk_overlaps):
    # The overlap has to be smaller than the chunk itself. LangChain text
    # splitters raise ValueError when chunk_overlap > chunk_size.
    # NOTE(review): assumes `chunking` is built on such a splitter — confirm.
    if chunk_overlap >= chunk_size:
        print(f"Skipping chunk_size={chunk_size}, chunk_overlap={chunk_overlap}: "
              f"overlap must be smaller than chunk size\n")
        continue

    args.chunk_size = chunk_size
    args.chunk_overlap = chunk_overlap

    # Chunk the documents and build the dense index once per chunking setting;
    # the ensemble weights below do not influence either step.
    chunk_documents = chunking(args, ko_documents)
    dense_retriever = load_dense_model(args, chunk_documents).as_retriever(
        search_kwargs={"k": retriever_k}
    )

    # Inner loop: ensemble weights (cheap, reuses both retrievers).
    for retriever_weights in retriever_weights_list:
        args.retriever_weights = retriever_weights

        # EnsembleRetriever fuses the ranked lists of its retrievers using the
        # given weights; it has no `search_type` option, so none is passed.
        retriever = EnsembleRetriever(
            retrievers=[sparse_retriever, dense_retriever],
            weights=retriever_weights,
        )

        # Collect the predictions.
        pred = []
        for question in tqdm(ko_questions):
            query, question_id = question['query'], question['metadata']['docid']

            search_result = retriever.invoke(query)

            # The sparse side returns whole documents and the dense side returns
            # chunks, so one docid can show up several times. Keep only the
            # first (best-ranked) occurrence so MAP is scored per document.
            topk_result = list(dict.fromkeys(
                result.metadata.get('docid') for result in search_result
            ))

            pred.append({
                "eval_id": question_id,
                "topk": topk_result
            })

        # Compute MAP for this configuration.
        mean_average_precision = calc_map(gt, pred)
        print("Parameters:")
        print(f"  -chunk_size={chunk_size}")
        print(f"  -chunk_overlap={chunk_overlap}")
        print(f"  -retriever_weights={retriever_weights}\n")
        print(f"Mean Average Precision (MAP): {mean_average_precision}\n")

        # Remember the best MAP and its parameters.
        if mean_average_precision > best_map:
            best_map = mean_average_precision
            best_params = {
                'chunk_size': chunk_size,
                'chunk_overlap': chunk_overlap,
                'retriever_weights': retriever_weights
            }

In [None]:
# Report the winning configuration. best_params is still empty when no run
# scored a MAP above 0, so guard the lookups instead of raising KeyError.
print(f"Best Mean Average Precision (MAP): {best_map}")
if best_params:
    print("Best Parameters:")
    print(f"  chunk_size: {best_params['chunk_size']}")
    print(f"  chunk_overlap: {best_params['chunk_overlap']}")
    print(f"  retriever_weights: {best_params['retriever_weights']}")
else:
    print("No parameter combination achieved a MAP above 0; no best parameters to report.")
