In [2]:
import sys
sys.path.append("/home/pervinco/Upstage_Ai_Lab/Final/IR")

import os
import re
import json
import faiss
import random
import warnings
import pandas as pd
import huggingface_hub

from tqdm import tqdm
from openai import OpenAI

from langchain.schema import Document
from langchain_community.vectorstores.faiss import FAISS

from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_upstage import UpstageEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker

from src.sparse_retriever.kiwi_bm25 import KiwiBM25Retriever

# Silence the HuggingFace tokenizers fork warning and noisy FutureWarnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=FutureWarning)

from dotenv import load_dotenv
# Load the API keys stored in ../keys.env into the process environment.
load_dotenv("../keys.env")

# Fail early with a readable message. Previously a missing key surfaced as
# "TypeError: str expected, not NoneType" from the os.environ assignment.
_missing_keys = [name for name in ("UPSTAGE_API_KEY", "OPENAI_API_KEY") if os.getenv(name) is None]
if _missing_keys:
    raise EnvironmentError(
        f"Missing environment variable(s): {', '.join(_missing_keys)}. "
        "Define them in ../keys.env or export them before starting the kernel."
    )

# The values are already in os.environ at this point, so writing them back is unnecessary.
upstage_api_key = os.environ["UPSTAGE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pervinco/.cache/huggingface/token
Login successful


In [3]:
# OpenAI client and chat model name; neither is referenced by the retrieval
# cells visible in this notebook.
client = OpenAI()
model = "gpt-4o"


# JSONL files read by load_document / load_query below (each record is expected
# to carry 'docid', 'content' and 'question1'..'question3').
en_document_path = "../dataset/en_4.0_processed_documents_queries.jsonl"
ko_document_path = "../dataset/processed_documents_queries.jsonl"

In [4]:
def score_normalizer(val: float) -> float:
    """Convert a distance-like value into a relevance score, 1 / (1 + val).

    Passed as ``relevance_score_fn`` to the FAISS stores in this notebook.
    A value of 0 maps to 1.0 and larger values map to smaller scores.
    """
    denominator = 1 + val
    return 1 / denominator

def load_upstage_encoder(model_name):
    """Return an ``UpstageEmbeddings`` instance for the given model name."""
    return UpstageEmbeddings(model=model_name)

def load_openai_encoder(model_name):
    """Return an ``OpenAIEmbeddings`` instance for the given model name."""
    return OpenAIEmbeddings(model=model_name)

def load_hf_encoder(model_name, model_kwargs, encode_kwargs):
    """Return a ``HuggingFaceEmbeddings`` instance.

    Args:
        model_name: Model identifier forwarded as ``model_name``.
        model_kwargs: Dict forwarded as ``model_kwargs`` (e.g. the device).
        encode_kwargs: Dict forwarded as ``encode_kwargs``.
    """
    hf_options = {
        "model_name": model_name,
        "model_kwargs": model_kwargs,
        "encode_kwargs": encode_kwargs,
    }
    return HuggingFaceEmbeddings(**hf_options)

def load_hf_reranker(model_name, retriever, top_n=3, k=10):
    """Wrap a vector store with a HuggingFace cross-encoder reranking stage.

    Args:
        model_name: Cross-encoder model identifier for ``HuggingFaceCrossEncoder``.
        retriever: Object exposing ``as_retriever(search_kwargs=...)``
            (the FAISS vector stores built in this notebook do).
        top_n: Number of documents the reranker keeps. Defaults to 3, the
            value that was previously hard-coded.
        k: Number of candidates fetched from the base retriever before
            reranking. Defaults to 10, the value that was previously hard-coded.

    Returns:
        A ``ContextualCompressionRetriever`` combining the base retriever and
        the cross-encoder reranker.
    """
    reranker = HuggingFaceCrossEncoder(model_name=model_name)
    compressor = CrossEncoderReranker(model=reranker, top_n=top_n)
    base_retriever = retriever.as_retriever(search_kwargs={"k": k})

    return ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever)

In [5]:
def load_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the ``.jsonl`` file.

    Returns:
        A list with one parsed JSON value per non-empty line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def load_document(file_path):
    """Load a JSONL file and wrap each record as a LangChain ``Document``.

    Each record's 'content' becomes ``page_content`` and its 'docid' is kept
    in ``metadata["docid"]``. Order follows the file.
    """
    return [
        Document(metadata={"docid": record['docid']}, page_content=record['content'])
        for record in load_jsonl(file_path)
    ]

def load_query(file_path):
    """Load the evaluation queries stored alongside each document record.

    Every record contributes three entries (keys 'question1'..'question3'),
    each shaped as ``{"query": <text>, "metadata": {"docid": <record docid>}}``.
    """
    queries = []
    for record in load_jsonl(file_path):
        source_id = record['docid']
        queries.extend(
            {"query": record[f'question{n}'], "metadata": {"docid": source_id}}
            for n in (1, 2, 3)
        )

    return queries


In [6]:
en_documents = load_document(en_document_path)
en_questions = load_query(en_document_path)

# Use a private, seeded RNG so the 500-question evaluation sample (and the MAP
# values computed from it) is identical after every kernel restart.
random.Random(42).shuffle(en_questions)
en_questions = en_questions[:500]

In [7]:
ko_documents = load_document(ko_document_path)
ko_questions = load_query(ko_document_path)

# Use a private, seeded RNG so the 500-question evaluation sample (and the MAP
# values computed from it) is identical after every kernel restart.
random.Random(42).shuffle(ko_questions)
ko_questions = ko_questions[:500]

In [8]:
def calc_map(gt, pred, k=3):
    """Compute Mean Average Precision over a list of predictions.

    Args:
        gt: Mapping from eval id to the list of relevant docids. An empty list
            means "no document should be retrieved" for that id.
        pred: List of dicts with keys "eval_id" and "topk" (ranked docids).
        k: Only the first ``k`` entries of "topk" are scored. Defaults to 3,
            the cutoff that was previously hard-coded.

    Returns:
        The mean of the per-prediction average precision, or 0.0 when ``pred``
        is empty (this used to raise ZeroDivisionError).

    Raises:
        KeyError: If a prediction's "eval_id" is missing from ``gt``.
    """
    if not pred:
        return 0.0

    sum_average_precision = 0
    for entry in pred:
        relevant = gt[entry["eval_id"]]
        topk = entry["topk"]
        if relevant:
            hit_count = 0
            sum_precision = 0
            for rank, docid in enumerate(topk[:k], start=1):
                if docid in relevant:
                    hit_count += 1
                    # Precision at this rank, accumulated only at hit positions.
                    sum_precision += hit_count / rank
            average_precision = sum_precision / hit_count if hit_count > 0 else 0
        else:
            # Nothing is relevant: returning no results is the only correct answer.
            average_precision = 0 if topk else 1
        sum_average_precision += average_precision
    return sum_average_precision / len(pred)

## KiwiBM25

In [8]:
# Sparse baseline: build the project-local KiwiBM25Retriever over the Korean documents.
# `retrieval` is rebound to a FAISS vector store by the dense-model cells further down.
retrieval = KiwiBM25Retriever.from_documents(ko_documents)

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    search_result = retrieval.search_with_score(item['query'])

    # Only the docids matter for MAP; the per-document score stored in the
    # metadata was fetched but never used, so that lookup has been dropped.
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [doc.metadata.get('docid') for doc in search_result],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

## HuggingFace - intfloat/multilingual-e5-large-instruct

### 한국어

In [8]:
# Dense retrieval with multilingual-e5-large-instruct over the Korean documents.
encoder = load_hf_encoder(
    "intfloat/multilingual-e5-large-instruct",
    {"device": "cuda:0"},
    {"normalize_embeddings": False, "clean_up_tokenization_spaces": True},
)

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=ko_documents)

retrieval = vector_store

4272개 documents 임베딩 시간 22.5초

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    # Hits whose relevance score falls below 0.6 are filtered out by the store.
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3, score_threshold=0.6)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

### 영어

In [10]:
# Dense retrieval with multilingual-e5-large-instruct over the English documents.
encoder = load_hf_encoder(
    "intfloat/multilingual-e5-large-instruct",
    {"device": "cuda:0"},
    {"normalize_embeddings": False, "clean_up_tokenization_spaces": True},
)

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=en_documents)

retrieval = vector_store

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in en_questions}

pred = []
for item in tqdm(en_questions):
    # Hits whose relevance score falls below 0.6 are filtered out by the store.
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3, score_threshold=0.6)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

500개 쿼리 처리시간 4초

mAP : 0.85

## HuggingFace - BAAI/bge-m3

### 한국어

In [None]:
# Dense retrieval with BAAI/bge-m3 over the Korean documents.
encoder = load_hf_encoder(
    'BAAI/bge-m3',
    {"device": "cuda:0"},
    {"normalize_embeddings": False, "clean_up_tokenization_spaces": True},
)

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=ko_documents)

retrieval = vector_store

4272개 documents 임베딩 시간 22.5초

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

In [24]:
# Dense retrieval with the klue-sroberta (MNR continued-learning) model over the Korean documents.
encoder = load_hf_encoder(
    'bespin-global/klue-sroberta-base-continue-learning-by-mnr',
    {"device": "cuda:0"},
    {"normalize_embeddings": False, "clean_up_tokenization_spaces": True},
)

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=ko_documents)

retrieval = vector_store

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:02<00:00, 183.07it/s]

Mean Average Precision (MAP): 0.8266666666666664





## OpenAI - text-embedding-3-large

### 한국어

In [12]:
# Dense retrieval with OpenAI text-embedding-3-large over the Korean documents.
encoder = load_openai_encoder("text-embedding-3-large")

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=ko_documents)

retrieval = vector_store

4272개 documents 임베딩 시간 37초

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

500개 쿼리 처리시간 5분

mAP : 0.85

### 영어

In [14]:
# Dense retrieval with OpenAI text-embedding-3-large over the English documents.
encoder = load_openai_encoder("text-embedding-3-large")

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=en_documents)

retrieval = vector_store

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in en_questions}

pred = []
for item in tqdm(en_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

## Upstage - solar-embedding-1-large-passage

### 한국어

In [8]:
# Dense retrieval with Upstage solar-embedding-1-large-passage over the Korean documents.
encoder = load_upstage_encoder("solar-embedding-1-large-passage")

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=ko_documents)

retrieval = vector_store

In [9]:
# Persist only the raw FAISS index object. The docstore and the
# index_to_docstore_id mapping held by `vector_store` are not written here.
# NOTE(review): the file is produced by faiss.write_index, not by numpy, so
# the ".npy" suffix is misleading.
faiss.write_index(index, "faiss_index.npy")

In [10]:
# Load the FAISS index back from disk (written by faiss.write_index).
index = faiss.read_index("faiss_index.npy")

# The saved file holds vectors only. Re-creating the store with an empty
# docstore and an empty index_to_docstore_id leaves every search hit
# unresolvable, so rebuild both from ko_documents. This relies on the index
# having been filled by a single add_documents(ko_documents) call, so that
# row i of the index corresponds to ko_documents[i].
if index.ntotal != len(ko_documents):
    raise ValueError(
        f"faiss_index.npy holds {index.ntotal} vectors but ko_documents has "
        f"{len(ko_documents)} entries; the index was built from a different corpus."
    )

docstore_ids = [str(row) for row in range(index.ntotal)]

# Re-create the vector store around the loaded index.
vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(dict(zip(docstore_ids, ko_documents))),
    index_to_docstore_id=dict(enumerate(docstore_ids)),
    relevance_score_fn=score_normalizer
)

# Use the reloaded store for retrieval.
retrieval = vector_store


4272개 documents 임베딩 시간 12분 17초...

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

500개 쿼리 처리시간 4분 16초

mAP : 0.9073

### 영어

In [18]:
# Dense retrieval with Upstage solar-embedding-1-large-passage over the English documents.
encoder = load_upstage_encoder("solar-embedding-1-large-passage")

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=en_documents)

retrieval = vector_store

In [None]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in en_questions}

pred = []
for item in tqdm(en_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

## Ollama

In [10]:
from langchain_community.embeddings import OllamaEmbeddings

# Dense retrieval with embeddings served by Ollama (llama3-instruct-8b) over the Korean documents.
encoder = OllamaEmbeddings(model="llama3-instruct-8b")

# Embed one probe string to learn the vector width needed by the flat L2 index.
embedding_dim = len(encoder.embed_query("파이썬"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=encoder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    relevance_score_fn=score_normalizer,
)
vector_store.add_documents(documents=ko_documents)

retrieval = vector_store

In [11]:
# Each question carries the docid of the record it came from; that docid is
# treated as its only relevant document.
gt = {item['metadata']['docid']: [item['metadata']['docid']] for item in ko_questions}

pred = []
for item in tqdm(ko_questions):
    hits = retrieval.similarity_search_with_relevance_scores(item['query'], k=3)
    pred.append({
        "eval_id": item['metadata']['docid'],
        "topk": [hit_doc.metadata.get('docid') for hit_doc, _ in hits],
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 500/500 [00:25<00:00, 19.96it/s]

Mean Average Precision (MAP): 0.10533333333333335



