In [1]:
import sys
sys.path.append("/home/pervinco/Upstage_Ai_Lab/Final/IR")

import os
import re
import json
import faiss
import random
import warnings
import pandas as pd
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI

from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores.faiss import FAISS

from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_upstage import UpstageEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker

from src.sparse_retriever.kiwi_bm25 import KiwiBM25Retriever

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
load_dotenv("../keys.env")

upstage_api_key = os.getenv("UPSTAGE_API_KEY")
os.environ['UPSTAGE_API_KEY'] = upstage_api_key

openai_api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key

hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
class Args:
    retrieval_debug = False
    llm_model = "ollama"
    
    src_lang = "ko"
    if src_lang == "en":
        eval_file_path = "../dataset/eval.jsonl" ## "../dataset/en_eval.jsonl" --> 성능이 별로임.
        doc_file_path = "../dataset/en_4.0_document.jsonl" ## "../dataset/processed_documents.jsonl"
    else:
        eval_file_path = "../dataset/eval.jsonl"
        doc_file_path = "../dataset/processed_documents.jsonl"

    output_path = "./outputs/output.csv"

    ## sparse or dense or ensemble
    doc_method = "dense"
    encoder_method = "huggingface" ## huggingface, upstage, openai
    retriever_weights = [0.4, 0.6] ## [sparse, dense] ## BEST [0.4, 0.6]

    ## HuggingFace
    hf_model_name = "intfloat/multilingual-e5-large-instruct"
    model_kwargs = {"device": "cuda:0"}
    encode_kwargs = {"normalize_embeddings": False,
                     "clean_up_tokenization_spaces": True}
    
    ## Upstage
    upstage_model_name = "solar-embedding-1-large-passage"
    faiss_index_file = "./index_files/upstage-faiss.npy"
    
    ## OpenAI
    openai_model_name = "text-embedding-3-large"

    ## chunking
    chunking = True
    chunk_method = "recursive" ## recursive, semantic
    semantic_chunk_method = "upstage"
    chunk_size = 320
    chunk_overlap = 80

In [3]:
args = Args()

In [4]:
def load_jsonl(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]


def load_document(file_path):
    raw_documents = load_jsonl(file_path)

    documents = []
    for doc in raw_documents:
        doc_id = doc['docid']
        content = doc['content']
        documents.append(Document(page_content=content, metadata={"docid": doc_id}))
    
    return documents

def load_query(file_path):
    raw_queries = load_jsonl(file_path)

    queries = []
    for query in raw_queries:
        doc_id = query['docid']

        for i in range(1, 4):
            queries.append({"query": query[f'question{i}'], "metadata": {"docid": doc_id}})
    
    return queries


def score_normalizer(val: float) -> float:
    return 1 / (1 + val)

def load_upstage_encoder(model_name):
    encoder = UpstageEmbeddings(model=model_name)

    return encoder

def load_openai_encoder(model_name):
    encoder = OpenAIEmbeddings(model=model_name)

    return encoder

def load_hf_encoder(model_name, model_kwargs, encode_kwargs):
    encoder = HuggingFaceEmbeddings(model_name=model_name,
                                    model_kwargs=model_kwargs,
                                    encode_kwargs=encode_kwargs)
    
    return encoder

def load_hf_reranker(model_name, retriever):
    reranker = HuggingFaceCrossEncoder(model_name=model_name)
    compressor = CrossEncoderReranker(model=reranker, top_n=3)
    compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever.as_retriever(search_kwargs={"k": 10}))

    return compression_retriever

def load_sparse_model(documents):
    from konlpy.tag import Okt
    okt = Okt()
    def tokenize(text):
        tokens = okt.morphs(text)
        return tokens

    retriever = KiwiBM25Retriever.from_documents(documents)
    retriever = BM25Retriever.from_documents(documents, tokenizer=tokenize)
    
    return retriever


def load_dense_model(args, documents):
    if args.encoder_method == "huggingface":
        encoder = load_hf_encoder(args.hf_model_name, args.model_kwargs, args.encode_kwargs)
        print(f"Embedding Model : {args.hf_model_name}")

    elif args.encoder_method == "upstage":
        encoder = load_upstage_encoder(args.upstage_model_name)
        print(f"Embedding Model : {args.upstage_model_name}")

    elif args.encoder_method == "openai":
        encoder = load_openai_encoder(args.openai_model_name)
        print(f"Embedding Model : {args.openai_model_name}")

    index = faiss.IndexFlatL2(len(encoder.embed_query("hello world")))
    vector_store = FAISS(
        embedding_function=encoder,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        relevance_score_fn=score_normalizer
    )
    vector_store.add_documents(documents=documents)
    retriever = vector_store

    # faiss.write_index(index, f"./index_files/{args.encoder_method}-faiss.npy")
    print(f"FAISS 인덱스에 추가된 문서 수: {index.ntotal}")

    return retriever

def calc_map(gt, pred):    
    sum_average_precision = 0    
    for j in pred:        
        if gt[j["eval_id"]]:            
            hit_count = 0            
            sum_precision = 0            
            for i,docid in enumerate(j["topk"][:3]):                
                if docid in gt[j["eval_id"]]:                    
                    hit_count += 1                    
                    sum_precision += hit_count/(i+1)            
            average_precision = sum_precision / hit_count if hit_count > 0 else 0        
        else:            
            average_precision = 0 if j["topk"] else 1        
        sum_average_precision += average_precision    
    return sum_average_precision/len(pred)


def chunking(args, documents):
    if args.chunk_method == "recursive":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            length_function=len,
            is_separator_regex=False
        )
    elif args.chunk_method == "semantic":
        if args.semantic_chunk_method == "huggingface":
            encoder = load_hf_encoder(args.hf_model_name, args.model_kwargs, args.encode_kwargs)
        elif args.semantic_chunk_method == "upstage":
            encoder = load_upstage_encoder(args.upstage_model_name)
        elif args.semantic_chunk_method == "openai":
            encoder = load_openai_encoder(args.openai_model_name)

        text_splitter = SemanticChunker(encoder)

    return text_splitter.split_documents(documents)

In [5]:
en_document_path = "../dataset/en_4.0_processed_documents_queries.jsonl"
ko_document_path = "../dataset/processed_documents_queries.jsonl"

en_documents = load_document(en_document_path)
en_questions = load_query(en_document_path)

random.shuffle(en_questions)
en_questions = en_questions[:500]

ko_documents = load_document(ko_document_path)
ko_questions = load_query(ko_document_path)

random.shuffle(ko_questions)
ko_questions = ko_questions[:500]

## Sparse

In [6]:
sparse_retriever = load_sparse_model(ko_documents)

In [7]:
gt = {}
for question in ko_questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(ko_questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = sparse_retriever.invoke(query)
    
    topk_result = []
    for result in search_result:
        topk_result.append(result.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:01<00:00, 373.55it/s]

Mean Average Precision (MAP): 0.782





## Dense

In [8]:
dense_retriever = load_dense_model(args, ko_documents).as_retriever(search_kwargs={"k": 3})

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4272


In [9]:
gt = {}
for question in ko_questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(ko_questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = dense_retriever.invoke(query)
    
    topk_result = []
    for result in search_result:
        topk_result.append(result.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:04<00:00, 108.37it/s]

Mean Average Precision (MAP): 0.856





## Hybrid

In [10]:
sparse_retriever = load_sparse_model(ko_documents)
sparse_retriever.k = 5

dense_retriever = load_dense_model(args, ko_documents).as_retriever(search_kwargs={"k": 5})

print(args.retriever_weights)
retriever = EnsembleRetriever(
    retrievers=[sparse_retriever, dense_retriever],
    weights=args.retriever_weights,
    search_type="similarity_score_threshold" ## "mmr"
)

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4272
[0.4, 0.6]


In [11]:
gt = {}
for question in ko_questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(ko_questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = retriever.invoke(query)
    
    topk_result = []
    for result in search_result:
        topk_result.append(result.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:06<00:00, 78.95it/s]

Mean Average Precision (MAP): 0.8676666666666666





## Chunking + Sparse

In [30]:
args.chunk_size = 320
args.chunk_overlap = 80

chunk_documents = chunking(args, ko_documents)

In [31]:
sparse_retriever = load_sparse_model(chunk_documents)

In [32]:
gt = {}
for question in ko_questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(ko_questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = sparse_retriever.invoke(query)
    
    topk_result = []
    for result in search_result:
        topk_result.append(result.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:01<00:00, 357.98it/s]

Mean Average Precision (MAP): 0.7736666666666665





## Chunking + Dense

In [50]:
args.chunk_size = 100
args.chunk_overlap = 50

chunk_documents = chunking(args, ko_documents)

In [51]:
dense_retriever = load_dense_model(args, chunk_documents).as_retriever(search_kwargs={"k": 3})

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799


In [52]:
gt = {}
for question in ko_questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(ko_questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = dense_retriever.invoke(query)
    
    topk_result = []
    for result in search_result:
        topk_result.append(result.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:11<00:00, 45.10it/s]

Mean Average Precision (MAP): 0.8984999999999993





## Chunking + Hybrid

In [59]:
args.chunk_size = 100
args.chunk_overlap = 50
args.retriever_weights = [0.4, 0.6] ## sparse_weights, dense_weights

chunk_documents = chunking(args, ko_documents)

In [60]:
sparse_retriever = load_sparse_model(ko_documents)
sparse_retriever.k = 5

dense_retriever = load_dense_model(args, chunk_documents).as_retriever(search_kwargs={"k": 5})

print(args.retriever_weights)
retriever = EnsembleRetriever(
    retrievers=[sparse_retriever, dense_retriever],
    weights=args.retriever_weights,
    search_type="similarity_score_threshold" ## "mmr"
)

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799
[0.4, 0.6]


In [61]:
gt = {}
for question in ko_questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(ko_questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = retriever.invoke(query)
    
    topk_result = []
    for result in search_result:
        topk_result.append(result.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:12<00:00, 40.06it/s]

Mean Average Precision (MAP): 0.8969999999999994





In [62]:
from itertools import product
from tqdm import tqdm

# 파라미터 값의 범위를 설정합니다.
chunk_sizes = [50, 100, 200, 300, 400]
chunk_overlaps = [0, 25, 50, 100]
retriever_weights_list = [[0.5, 0.5], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3], [0.3, 0.7]]

# 최적의 MAP 값을 추적하기 위한 변수 초기화
best_map = 0
best_params = {}

# 모든 파라미터 조합에 대해 실험을 수행합니다.
for chunk_size, chunk_overlap, retriever_weights in product(chunk_sizes, chunk_overlaps, retriever_weights_list):
    args.chunk_size = chunk_size
    args.chunk_overlap = chunk_overlap
    args.retriever_weights = retriever_weights
    
    # 문서 조각화
    chunk_documents = chunking(args, ko_documents)
    
    # Sparse 및 Dense 리트리버 로드
    sparse_retriever = load_sparse_model(ko_documents)
    sparse_retriever.k = 5
    
    dense_retriever = load_dense_model(args, chunk_documents).as_retriever(search_kwargs={"k": 5})
    
    # 앙상블 리트리버 설정
    retriever = EnsembleRetriever(
        retrievers=[sparse_retriever, dense_retriever],
        weights=args.retriever_weights,
        search_type="similarity_score_threshold"  # 또는 "mmr"
    )
    
    # 정답 레이블 생성
    gt = {}
    for question in ko_questions:
        query, question_id = question['query'], question['metadata']['docid']
        gt[question_id] = [question_id]
    
    # 예측 결과 수집
    pred = []
    for question in tqdm(ko_questions):
        query, question_id = question['query'], question['metadata']['docid']
        
        search_result = retriever.invoke(query)
        
        topk_result = []
        for result in search_result:
            topk_result.append(result.metadata.get('docid'))
        
        pred.append({
            "eval_id": question_id,
            "topk": topk_result
        })
    
    # MAP 계산
    mean_average_precision = calc_map(gt, pred)
    print(f"Parameters: chunk_size={chunk_size}, chunk_overlap={chunk_overlap}, retriever_weights={retriever_weights}")
    print(f"Mean Average Precision (MAP): {mean_average_precision}\n")
    
    # 최적의 MAP 값과 파라미터 저장
    if mean_average_precision > best_map:
        best_map = mean_average_precision
        best_params = {
            'chunk_size': chunk_size,
            'chunk_overlap': chunk_overlap,
            'retriever_weights': retriever_weights
        }

# 최적의 파라미터 출력
print(f"Best Mean Average Precision (MAP): {best_map}")
print(f"Best Parameters:")
print(f"  chunk_size: {best_params['chunk_size']}")
print(f"  chunk_overlap: {best_params['chunk_overlap']}")
print(f"  retriever_weights: {best_params['retriever_weights']}")


Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 15773


100%|██████████| 500/500 [00:10<00:00, 49.68it/s]


Parameters: chunk_size=100, chunk_overlap=0, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8258333333333329

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 15773


100%|██████████| 500/500 [00:10<00:00, 50.00it/s]


Parameters: chunk_size=100, chunk_overlap=0, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7783333333333332

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 15773


100%|██████████| 500/500 [00:09<00:00, 50.06it/s]


Parameters: chunk_size=100, chunk_overlap=0, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8246666666666662

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 15773


100%|██████████| 500/500 [00:10<00:00, 49.68it/s]


Parameters: chunk_size=100, chunk_overlap=0, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7793333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 15773


100%|██████████| 500/500 [00:10<00:00, 49.73it/s]


Parameters: chunk_size=100, chunk_overlap=0, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8246666666666664

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799


100%|██████████| 500/500 [00:13<00:00, 37.86it/s]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8478333333333329

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799


100%|██████████| 500/500 [00:13<00:00, 38.17it/s]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.781

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799


100%|██████████| 500/500 [00:12<00:00, 40.14it/s]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8969999999999994

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799


100%|██████████| 500/500 [00:12<00:00, 39.78it/s]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7803333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 24799


100%|██████████| 500/500 [00:12<00:00, 38.93it/s]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8998333333333327

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 187938


100%|██████████| 500/500 [01:01<00:00,  8.19it/s]


Parameters: chunk_size=100, chunk_overlap=100, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8491666666666663

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 187938


100%|██████████| 500/500 [00:59<00:00,  8.34it/s]


Parameters: chunk_size=100, chunk_overlap=100, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7823333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 187938


100%|██████████| 500/500 [00:59<00:00,  8.37it/s]


Parameters: chunk_size=100, chunk_overlap=100, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8856666666666666

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 187938


100%|██████████| 500/500 [01:00<00:00,  8.29it/s]


Parameters: chunk_size=100, chunk_overlap=100, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7803333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 187938


100%|██████████| 500/500 [01:01<00:00,  8.17it/s]


Parameters: chunk_size=100, chunk_overlap=100, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8881666666666668

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 8995


100%|██████████| 500/500 [00:08<00:00, 57.77it/s]


Parameters: chunk_size=200, chunk_overlap=0, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8153333333333328

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 8995


100%|██████████| 500/500 [00:08<00:00, 61.15it/s]


Parameters: chunk_size=200, chunk_overlap=0, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7766666666666667

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 8995


100%|██████████| 500/500 [00:08<00:00, 60.55it/s]


Parameters: chunk_size=200, chunk_overlap=0, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.7604999999999994

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 8995


100%|██████████| 500/500 [00:08<00:00, 61.50it/s]


Parameters: chunk_size=200, chunk_overlap=0, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.778

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 8995


100%|██████████| 500/500 [00:08<00:00, 61.26it/s]


Parameters: chunk_size=200, chunk_overlap=0, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.7609999999999996

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 9694


100%|██████████| 500/500 [00:08<00:00, 60.19it/s]


Parameters: chunk_size=200, chunk_overlap=50, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8284999999999996

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 9694


100%|██████████| 500/500 [00:08<00:00, 59.59it/s]


Parameters: chunk_size=200, chunk_overlap=50, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7790000000000001

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 9694


100%|██████████| 500/500 [00:08<00:00, 60.16it/s]


Parameters: chunk_size=200, chunk_overlap=50, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8338333333333324

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 9694


100%|██████████| 500/500 [00:08<00:00, 57.98it/s]


Parameters: chunk_size=200, chunk_overlap=50, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7793333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 9694


100%|██████████| 500/500 [00:08<00:00, 57.65it/s]


Parameters: chunk_size=200, chunk_overlap=50, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8353333333333326

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 11341


100%|██████████| 500/500 [00:09<00:00, 54.42it/s]


Parameters: chunk_size=200, chunk_overlap=100, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8324999999999996

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 11341


100%|██████████| 500/500 [00:08<00:00, 56.67it/s]


Parameters: chunk_size=200, chunk_overlap=100, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7796666666666667

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 11341


100%|██████████| 500/500 [00:08<00:00, 57.30it/s]


Parameters: chunk_size=200, chunk_overlap=100, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8568333333333322

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 11341


100%|██████████| 500/500 [00:08<00:00, 56.35it/s]


Parameters: chunk_size=200, chunk_overlap=100, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7803333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 11341


100%|██████████| 500/500 [00:08<00:00, 56.35it/s]


Parameters: chunk_size=200, chunk_overlap=100, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8581666666666659

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6486


100%|██████████| 500/500 [00:07<00:00, 67.03it/s]


Parameters: chunk_size=300, chunk_overlap=0, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8146666666666663

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6486


100%|██████████| 500/500 [00:07<00:00, 67.39it/s]


Parameters: chunk_size=300, chunk_overlap=0, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7843333333333332

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6486


100%|██████████| 500/500 [00:07<00:00, 64.57it/s]


Parameters: chunk_size=300, chunk_overlap=0, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.7259999999999996

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6486


100%|██████████| 500/500 [00:07<00:00, 66.63it/s]


Parameters: chunk_size=300, chunk_overlap=0, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7853333333333332

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6486


100%|██████████| 500/500 [00:07<00:00, 64.02it/s]


Parameters: chunk_size=300, chunk_overlap=0, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.725833333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6532


100%|██████████| 500/500 [00:07<00:00, 66.28it/s]


Parameters: chunk_size=300, chunk_overlap=50, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8306666666666662

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6532


100%|██████████| 500/500 [00:07<00:00, 63.76it/s]


Parameters: chunk_size=300, chunk_overlap=50, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7846666666666666

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6532


100%|██████████| 500/500 [00:07<00:00, 66.90it/s]


Parameters: chunk_size=300, chunk_overlap=50, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8176666666666662

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6532


100%|██████████| 500/500 [00:07<00:00, 63.11it/s]


Parameters: chunk_size=300, chunk_overlap=50, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7836666666666666

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6532


100%|██████████| 500/500 [00:07<00:00, 66.08it/s]


Parameters: chunk_size=300, chunk_overlap=50, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8173333333333328

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6630


100%|██████████| 500/500 [00:07<00:00, 66.20it/s]


Parameters: chunk_size=300, chunk_overlap=100, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8348333333333327

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6630


100%|██████████| 500/500 [00:07<00:00, 65.67it/s]


Parameters: chunk_size=300, chunk_overlap=100, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.7863333333333332

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6630


100%|██████████| 500/500 [00:07<00:00, 66.10it/s]


Parameters: chunk_size=300, chunk_overlap=100, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8413333333333329

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6630


100%|██████████| 500/500 [00:07<00:00, 66.04it/s]


Parameters: chunk_size=300, chunk_overlap=100, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7853333333333332

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 6630


100%|██████████| 500/500 [00:07<00:00, 66.20it/s]


Parameters: chunk_size=300, chunk_overlap=100, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8411666666666663

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4886


100%|██████████| 500/500 [00:07<00:00, 71.08it/s]


Parameters: chunk_size=400, chunk_overlap=0, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.835333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4886


100%|██████████| 500/500 [00:07<00:00, 70.91it/s]


Parameters: chunk_size=400, chunk_overlap=0, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.8006666666666665

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4886


100%|██████████| 500/500 [00:06<00:00, 71.58it/s]


Parameters: chunk_size=400, chunk_overlap=0, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.7873333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4886


100%|██████████| 500/500 [00:06<00:00, 71.73it/s]


Parameters: chunk_size=400, chunk_overlap=0, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.8039999999999999

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4886


100%|██████████| 500/500 [00:07<00:00, 68.80it/s]


Parameters: chunk_size=400, chunk_overlap=0, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.7843333333333333

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4899


100%|██████████| 500/500 [00:07<00:00, 69.00it/s]


Parameters: chunk_size=400, chunk_overlap=50, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8468333333333331

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4899


100%|██████████| 500/500 [00:07<00:00, 68.05it/s]


Parameters: chunk_size=400, chunk_overlap=50, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.8049999999999999

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4899


100%|██████████| 500/500 [00:07<00:00, 70.47it/s]


Parameters: chunk_size=400, chunk_overlap=50, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8436666666666667

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4899


100%|██████████| 500/500 [00:07<00:00, 69.71it/s]


Parameters: chunk_size=400, chunk_overlap=50, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.8029999999999999

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4899


100%|██████████| 500/500 [00:07<00:00, 68.16it/s]


Parameters: chunk_size=400, chunk_overlap=50, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8405

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4913


100%|██████████| 500/500 [00:07<00:00, 70.70it/s]


Parameters: chunk_size=400, chunk_overlap=100, retriever_weights=[0.5, 0.5]
Mean Average Precision (MAP): 0.8464999999999998

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4913


100%|██████████| 500/500 [00:07<00:00, 68.12it/s]


Parameters: chunk_size=400, chunk_overlap=100, retriever_weights=[0.6, 0.4]
Mean Average Precision (MAP): 0.8029999999999999

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4913


100%|██████████| 500/500 [00:07<00:00, 71.35it/s]


Parameters: chunk_size=400, chunk_overlap=100, retriever_weights=[0.4, 0.6]
Mean Average Precision (MAP): 0.8438333333333332

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4913


100%|██████████| 500/500 [00:07<00:00, 70.85it/s]


Parameters: chunk_size=400, chunk_overlap=100, retriever_weights=[0.7, 0.3]
Mean Average Precision (MAP): 0.7999999999999999

Embedding Model : intfloat/multilingual-e5-large-instruct
FAISS 인덱스에 추가된 문서 수: 4913


100%|██████████| 500/500 [00:07<00:00, 70.46it/s]

Parameters: chunk_size=400, chunk_overlap=100, retriever_weights=[0.3, 0.7]
Mean Average Precision (MAP): 0.8416666666666667

Best Mean Average Precision (MAP): 0.8998333333333327
Best Parameters:
  chunk_size: 100
  chunk_overlap: 50
  retriever_weights: [0.3, 0.7]





: 