In [1]:
import sys
sys.path.append("/home/pervinco/Upstage_Ai_Lab/Final/IR/src")

import os
import time
import json
import random
import warnings
import anthropic
import threading
import numpy as np
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI
from itertools import product
from scipy.spatial.distance import cosine
from concurrent.futures import ThreadPoolExecutor, as_completed

from langchain.retrievers import EnsembleRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
load_dotenv("../keys.env")

upstage_api_key = os.getenv("UPSTAGE_API_KEY")
os.environ['UPSTAGE_API_KEY'] = upstage_api_key

openai_api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key

anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
os.environ['ANTHROPIC_API_KEY'] = anthropic_api_key

hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

from config import Args
from rankgpt.ranker import permutation_pipeline
from data.data import load_document, load_query, chunking

from sparse_retriever.model import load_sparse_model
from dense_retriever.model import load_dense_model, load_hf_encoder, load_upstage_encoder, load_openai_encoder, load_voyage_encoder

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [2]:
args = Args()

data_path = "../dataset/processed_documents_queries.jsonl"
documents = load_document(data_path)
questions = load_query(data_path)

random.shuffle(questions)
questions = questions[:100]

In [3]:
chunk_sizes = [100]
chunk_overlaps = [50]
retriever_weights_list = [[0.3, 0.7]]
ensemble_weights_list = [[0.3, 0.3, 0.4], [0.2, 0.2, 0.6], [0.1, 0.1, 0.8]]

In [4]:
def calc_map(gt, pred):    
    sum_average_precision = 0    
    for j in pred:        
        if gt[j["eval_id"]]:            
            hit_count = 0            
            sum_precision = 0            
            for i,docid in enumerate(j["topk"][:3]):                
                if docid in gt[j["eval_id"]]:                    
                    hit_count += 1                    
                    sum_precision += hit_count/(i+1)            
            average_precision = sum_precision / hit_count if hit_count > 0 else 0        
        else:            
            average_precision = 0 if j["topk"] else 1        
        sum_average_precision += average_precision    
    return sum_average_precision/len(pred)

In [6]:
folder_path = f"./index_files/{args.encoder_method}"
os.makedirs(folder_path, exist_ok=True)

In [6]:
best_map = 0
best_params = {}

args. chunking = True
args.chunk_size = 100
args.chunk_overlap = 50
args.faiss_index_file = "/home/pervinco/Upstage_Ai_Lab/Final/IR/src/index_files/upstage/solar-embedding-1-large-passage-cs100-co50"
args.ensemble_models = [
    {'type': 'hf', 'name': "BAAI/bge-m3"},
    {'type': 'hf', 'name': "intfloat/multilingual-e5-large"},
    {'type': 'upstage', 'name': "solar-embedding-1-large-query"}
]

# ensemble encoders 준비
ensemble_encoders = []
for model_info in args.ensemble_models:
    model_type = model_info.get('type', 'hf')  # 기본값은 'hf'
    model_name = model_info['name']
    if model_type == 'hf':
        encoder = load_hf_encoder(model_name, args.model_kwargs, args.encode_kwargs)
    elif model_type == 'upstage':
        encoder = load_upstage_encoder(model_name)
    elif model_type == 'voyage':
        encoder = load_voyage_encoder(model_name)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    ensemble_encoders.append(encoder)

# 각 파라미터 조합에 대해 검색 및 평가 수행
for ensemble_weights in ensemble_weights_list:
    print(ensemble_weights)

    # 문서 chunking
    chunk_documents = chunking(args, documents)

    # sparse retriever 로드
    sparse_retriever = load_sparse_model(documents, args.src_lang)
    sparse_retriever.k = 10

    # # FAISS 인덱스 설정
    # folder_path = f"../index_files/{args.encoder_method}/{args.upstage_model_name}-cs{args.chunk_size}-co{args.chunk_overlap}"
    # if not os.path.exists(folder_path):
    #     args.faiss_index_file = None
    # else:
    #     args.faiss_index_file = folder_path

    # dense retriever 로드
    dense_retriever = load_dense_model(args, chunk_documents).as_retriever(search_kwargs={"k": 10})

    # 앙상블 retriever 설정
    retriever = EnsembleRetriever(
        retrievers=[sparse_retriever, dense_retriever],
        weights=args.retriever_weights,
        search_type="mrr"
    )

    # 정답 레이블 생성
    gt = {}
    for question in questions:
        query, question_id = question['query'], question['metadata']['docid']
        gt[question_id] = [question_id]
    
    # 예측 결과 수집
    pred = []
    for question in tqdm(questions):
        query, question_id = question['query'], question['metadata']['docid']

        # 쿼리 임베딩 계산 및 가중치 적용
        query_embeddings = []
        for idx, encoder in enumerate(ensemble_encoders):
            query_embedding = encoder.embed_query(query)
            query_embeddings.append((query_embedding, ensemble_weights[idx]))
        
        # 검색 수행
        results = retriever.invoke(query)
        sorted_results = sorted(results, key=lambda x: x.metadata['score'], reverse=True)
        search_result = [(result, result.metadata['score']) for result in sorted_results[:20]]

        combined_scores = []
        for doc, _ in search_result:
            combined_similarity = 0
            for idx, (query_embedding, weight) in enumerate(query_embeddings):
                doc_embedding_key = f'embedding_{args.ensemble_models[idx]["name"]}'
                doc_embedding = doc.metadata.get(doc_embedding_key) or ensemble_encoders[idx].embed_query(doc.page_content)
                similarity = 1 - cosine(query_embedding, doc_embedding)
                combined_similarity += weight * similarity
            combined_scores.append((doc, combined_similarity))
        
        # top-k 결과 수집
        topk_result = [doc.metadata.get('docid') for doc, _ in sorted(combined_scores, key=lambda x: x[1], reverse=True)]
        
        pred.append({
            "eval_id": question_id,
            "topk": topk_result
        })
    
    # MAP 계산
    mean_average_precision = calc_map(gt, pred)
    print(f"Parameters: chunk_size={args.chunk_size}, chunk_overlap={args.chunk_overlap}, retriever_weights={args.retriever_weights}, ensemble_weights={ensemble_weights}")
    print(f"Mean Average Precision (MAP): {mean_average_precision}\n")
    
    # 최적의 MAP 값과 파라미터 저장
    if mean_average_precision > best_map:
        best_map = mean_average_precision
        best_params = {
            'chunk_size': args.chunk_size,
            'chunk_overlap': args.chunk_overlap,
            'retriever_weights': args.retriever_weights,
            'ensemble_weights': ensemble_weights
        }


[0.3, 0.3, 0.4]
FAISS 인덱스 로드 중: /home/pervinco/Upstage_Ai_Lab/Final/IR/src/index_files/upstage/solar-embedding-1-large-passage-cs100-co50
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [18:24<00:00, 11.05s/it]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.3, 0.7], ensemble_weights=[0.3, 0.3, 0.4]
Mean Average Precision (MAP): 0.9416666666666665

[0.2, 0.2, 0.6]
FAISS 인덱스 로드 중: /home/pervinco/Upstage_Ai_Lab/Final/IR/src/index_files/upstage/solar-embedding-1-large-passage-cs100-co50
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [18:40<00:00, 11.20s/it]


Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.3, 0.7], ensemble_weights=[0.2, 0.2, 0.6]
Mean Average Precision (MAP): 0.9466666666666668

[0.1, 0.1, 0.8]
FAISS 인덱스 로드 중: /home/pervinco/Upstage_Ai_Lab/Final/IR/src/index_files/upstage/solar-embedding-1-large-passage-cs100-co50
FAISS 인덱스 로드 완료, 총 문서 수: 24799


100%|██████████| 100/100 [19:33<00:00, 11.73s/it]

Parameters: chunk_size=100, chunk_overlap=50, retriever_weights=[0.3, 0.7], ensemble_weights=[0.1, 0.1, 0.8]
Mean Average Precision (MAP): 0.9425






In [7]:
print(f"Best Mean Average Precision (MAP): {best_map}")
print(f"Best Parameters:")
print(f"  chunk_size: {best_params['chunk_size']}")
print(f"  chunk_overlap: {best_params['chunk_overlap']}")
print(f"  retriever_weights: {best_params['retriever_weights']}")
print(f"  ensemble_weights: {best_params['ensemble_weights']}")

Best Mean Average Precision (MAP): 0.9466666666666668
Best Parameters:
  chunk_size: 100
  chunk_overlap: 50
  retriever_weights: [0.3, 0.7]
  ensemble_weights: [0.2, 0.2, 0.6]


In [10]:
# RankGPT reranking 함수 추가
def format_rerank_results(query, top_docs, docs, scores):
    # top_docs는 docid로 구성되어 있으므로, docid를 기준으로 문서를 찾음
    formatted_hits = []
    for idx, docid in enumerate(top_docs):
        # docid로 문서를 검색하여 해당 문서의 인덱스를 찾아냄
        matching_doc = next((doc for doc in docs if doc['metadata']['docid'] == docid), None)
        if matching_doc:
            formatted_hits.append({
                'idx': idx, 
                'content': matching_doc['content'], 
                'score': scores[idx]
            })
    result = {
        'query': query,
        'hits': formatted_hits
    }
    return result

def reranking(query, top_docs, docs, top_k):
    new_item = permutation_pipeline(
        item=format_rerank_results(query, top_docs, docs, scores=[0]*len(top_docs)),  # 초기 점수로 0 사용
        rank_start=0, 
        rank_end=len(top_docs), 
        model_name='gpt-3.5-turbo', 
        api_key=openai_api_key  # 여기에 실제 API 키를 입력
    )
    reranked_doc_indices = [hit['idx'] for hit in new_item['hits']]
    reranked_scores = [hit['score'] for hit in new_item['hits']]
    
    return reranked_doc_indices[:top_k], reranked_scores[:top_k]

In [11]:
best_map = 0
best_params = {}

args. chunking = True
args.chunk_size = 100
args.chunk_overlap = 50
args.faiss_index_file = "/home/pervinco/Upstage_Ai_Lab/Final/IR/src/index_files/upstage/solar-embedding-1-large-passage-cs100-co50"
args.ensemble_models = [
    {'type': 'hf', 'name': "BAAI/bge-m3"},
    {'type': 'hf', 'name': "intfloat/multilingual-e5-large"},
    {'type': 'upstage', 'name': "solar-embedding-1-large-query"}
]

# ensemble encoders 준비
ensemble_encoders = []
for model_info in args.ensemble_models:
    model_type = model_info.get('type', 'hf')  # 기본값은 'hf'
    model_name = model_info['name']
    if model_type == 'hf':
        encoder = load_hf_encoder(model_name, args.model_kwargs, args.encode_kwargs)
    elif model_type == 'upstage':
        encoder = load_upstage_encoder(model_name)
    elif model_type == 'voyage':
        encoder = load_voyage_encoder(model_name)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    ensemble_encoders.append(encoder)

# 각 파라미터 조합에 대해 검색 및 평가 수행
for ensemble_weights in ensemble_weights_list:
    print(ensemble_weights)

    # 문서 chunking
    chunk_documents = chunking(args, documents)

    # sparse retriever 로드
    sparse_retriever = load_sparse_model(documents, args.src_lang)
    sparse_retriever.k = 10

    # # FAISS 인덱스 설정
    # folder_path = f"../index_files/{args.encoder_method}/{args.upstage_model_name}-cs{args.chunk_size}-co{args.chunk_overlap}"
    # if not os.path.exists(folder_path):
    #     args.faiss_index_file = None
    # else:
    #     args.faiss_index_file = folder_path

    # dense retriever 로드
    dense_retriever = load_dense_model(args, chunk_documents).as_retriever(search_kwargs={"k": 10})

    # 앙상블 retriever 설정
    retriever = EnsembleRetriever(
        retrievers=[sparse_retriever, dense_retriever],
        weights=args.retriever_weights,
        search_type="mrr"
    )

    # 정답 레이블 생성
    gt = {}
    for question in questions:
        query, question_id = question['query'], question['metadata']['docid']
        gt[question_id] = [question_id]
    
    # 예측 결과 수집
    pred = []
    for question in tqdm(questions):
        query, question_id = question['query'], question['metadata']['docid']

        # 쿼리 임베딩 계산 및 가중치 적용
        query_embeddings = []
        for idx, encoder in enumerate(ensemble_encoders):
            query_embedding = encoder.embed_query(query)
            query_embeddings.append((query_embedding, ensemble_weights[idx]))
        
        # 검색 수행
        results = retriever.invoke(query)
        sorted_results = sorted(results, key=lambda x: x.metadata['score'], reverse=True)
        search_result = [(result, result.metadata['score']) for result in sorted_results[:20]]

        docs = [{'content': result.page_content, 'metadata': result.metadata} for result in results]

        combined_scores = []
        for doc, _ in search_result:
            combined_similarity = 0
            for idx, (query_embedding, weight) in enumerate(query_embeddings):
                doc_embedding_key = f'embedding_{args.ensemble_models[idx]["name"]}'
                doc_embedding = doc.metadata.get(doc_embedding_key) or ensemble_encoders[idx].embed_query(doc.page_content)
                similarity = 1 - cosine(query_embedding, doc_embedding)
                combined_similarity += weight * similarity
            combined_scores.append((doc, combined_similarity))
        
        # RankGPT reranking 수행
        reranked_doc_indices, reranked_scores = reranking(query, [doc.metadata['docid'] for doc, _ in combined_scores], docs, top_k=3)
        
        # top-k 결과 수집
        topk_result = [reranked_doc_indices[i] for i in range(len(reranked_doc_indices))]
        
        pred.append({
            "eval_id": question_id,
            "topk": topk_result
        })
    
    # MAP 계산
    mean_average_precision = calc_map(gt, pred)
    print(f"Parameters: chunk_size={args.chunk_size}, chunk_overlap={args.chunk_overlap}, retriever_weights={args.retriever_weights}, ensemble_weights={ensemble_weights}")
    print(f"Mean Average Precision (MAP): {mean_average_precision}\n")
    
    # 최적의 MAP 값과 파라미터 저장
    if mean_average_precision > best_map:
        best_map = mean_average_precision
        best_params = {
            'chunk_size': args.chunk_size,
            'chunk_overlap': args.chunk_overlap,
            'retriever_weights': args.retriever_weights,
            'ensemble_weights': ensemble_weights
        }


[0.3, 0.3, 0.4]
FAISS 인덱스 로드 중: /home/pervinco/Upstage_Ai_Lab/Final/IR/src/index_files/upstage/solar-embedding-1-large-passage-cs100-co50
FAISS 인덱스 로드 완료, 총 문서 수: 24799


  3%|▎         | 3/100 [00:44<23:57, 14.82s/it]


KeyboardInterrupt: 

In [None]:
print(f"Best Mean Average Precision (MAP): {best_map}")
print(f"Best Parameters:")
print(f"  chunk_size: {best_params['chunk_size']}")
print(f"  chunk_overlap: {best_params['chunk_overlap']}")
print(f"  retriever_weights: {best_params['retriever_weights']}")
print(f"  ensemble_weights: {best_params['ensemble_weights']}")