In [1]:
import os
import sys
sys.path.append("/data/ephemeral/home/Upstage_Ai_Lab/Final/IR/src")

import json
import random
import huggingface_hub

from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv("../keys.env")

upstage_api_key = os.getenv("UPSTAGE_API_KEY")
os.environ['UPSTAGE_API_KEY'] = upstage_api_key

openai_api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key

voyage_api_key = os.getenv('VOYAGE_API_KEY')
os.environ['VOYAGE_API_KEY'] = voyage_api_key

hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

from langchain.schema import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_experimental.text_splitter import SemanticChunker

from config import Args
from data.data import load_document, chunking, load_query
from sparse_retriever.model import load_sparse_model
from dense_retriever.model import load_dense_model, load_upstage_encoder, load_hf_encoder

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


  from .autonotebook import tqdm as notebook_tqdm


Token is valid (permission: read).
Your token has been saved to /data/ephemeral/home/.cache/huggingface/token
Login successful


In [2]:
os.makedirs("../dataset/test", exist_ok=True)

In [3]:
args = Args()
args.encoder_method = "hugginface"
args.hf_model_name = "dragonkue/bge-m3-ko"
args.doc_file_path = "../dataset/processed_documents_queries.jsonl"
args.faiss_index_file = None

args.chunk_size = 100
args.chunk_overlap = 50

documents = load_document(args.doc_file_path)
print(len(documents))

questions = load_query(args.doc_file_path)
print(len(questions))

random.shuffle(questions)
questions = questions[:1000]

4272
12816


In [4]:
embedder = load_hf_encoder(args.hf_model_name, args.model_kwargs, args.encode_kwargs)

In [5]:
rec_chunks = chunking(args, documents)
print(len(rec_chunks))

24799


In [6]:
def calc_map(gt, pred):    
    sum_average_precision = 0    
    for j in pred:        
        if gt[j["eval_id"]]:            
            hit_count = 0            
            sum_precision = 0            
            for i,docid in enumerate(j["topk"][:3]):                
                if docid in gt[j["eval_id"]]:                    
                    hit_count += 1                    
                    sum_precision += hit_count/(i+1)            
            average_precision = sum_precision / hit_count if hit_count > 0 else 0        
        else:            
            average_precision = 0 if j["topk"] else 1        
        sum_average_precision += average_precision    
    return sum_average_precision/len(pred)

In [7]:
vector_store = FAISS.from_documents(rec_chunks, embedder)

In [8]:
gt = {}
for question in questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = vector_store.similarity_search_with_relevance_scores(query, k=3)
    
    topk_result = []
    for doc, score in search_result:
        topk_result.append(doc.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 1000/1000 [00:33<00:00, 29.98it/s]

Mean Average Precision (MAP): 0.9207500000000008





In [9]:
splitter = SemanticChunker(embedder, breakpoint_threshold_type="percentile")

perc_chunks = splitter.split_documents(documents)
print(len(perc_chunks))

8540


In [10]:
with open("../dataset/test/percentile_chunk_documents.jsonl", 'w', encoding='utf-8') as f:
    for doc in perc_chunks:
        id = doc.metadata['docid']
        content = doc.page_content
        
        json.dump({"docid":id, "content":content}, f, ensure_ascii=False)
        f.write("\n")

In [11]:
vector_store = FAISS.from_documents(perc_chunks, embedder)

In [12]:
gt = {}
for question in questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = vector_store.similarity_search_with_relevance_scores(query, k=3)
    
    topk_result = []
    for doc, score in search_result:
        topk_result.append(doc.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 1000/1000 [00:21<00:00, 46.48it/s]

Mean Average Precision (MAP): 0.9175000000000014





In [13]:
splitter = SemanticChunker(embedder, breakpoint_threshold_type="standard_deviation")

std_chunks = splitter.split_documents(documents)
print(len(std_chunks))

4272


In [14]:
with open("../dataset/test/std_chunk_documents.jsonl", 'w', encoding='utf-8') as f:
    for doc in std_chunks:
        id = doc.metadata['docid']
        content = doc.page_content
        
        json.dump({"docid":id, "content":content}, f, ensure_ascii=False)
        f.write("\n")

In [15]:
vector_store = FAISS.from_documents(std_chunks, embedder)

In [16]:
gt = {}
for question in questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = vector_store.similarity_search_with_relevance_scores(query, k=3)
    
    topk_result = []
    for doc, score in search_result:
        topk_result.append(doc.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 1000/1000 [00:17<00:00, 55.91it/s]

Mean Average Precision (MAP): 0.9230000000000003





In [17]:
splitter = SemanticChunker(embedder, breakpoint_threshold_type="interquartile")

inter_chunks = splitter.split_documents(documents)
print(len(inter_chunks))

5741


In [18]:
with open("../dataset/test/interquartile_chunk_documents.jsonl", 'w', encoding='utf-8') as f:
    for doc in inter_chunks:
        id = doc.metadata['docid']
        content = doc.page_content
        
        json.dump({"docid":id, "content":content}, f, ensure_ascii=False)
        f.write("\n")

In [19]:
vector_store = FAISS.from_documents(inter_chunks, embedder)

In [20]:
gt = {}
for question in questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = vector_store.similarity_search_with_relevance_scores(query, k=3)
    
    topk_result = []
    for doc, score in search_result:
        topk_result.append(doc.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 1000/1000 [00:19<00:00, 51.99it/s]

Mean Average Precision (MAP): 0.9199166666666674





In [21]:
splitter = SemanticChunker(embedder, breakpoint_threshold_type="gradient")

grad_chunks = splitter.split_documents(documents)
print(len(grad_chunks))

8505


In [22]:
with open("../dataset/test/gradient_chunk_documents.jsonl", 'w', encoding='utf-8') as f:
    for doc in grad_chunks:
        id = doc.metadata['docid']
        content = doc.page_content
        
        json.dump({"docid":id, "content":content}, f, ensure_ascii=False)
        f.write("\n")

In [23]:
vector_store = FAISS.from_documents(grad_chunks, embedder)

In [24]:
gt = {}
for question in questions:
    query, question_id = question['query'], question['metadata']['docid']
    gt[question_id] = [question_id]

pred = []
for question in tqdm(questions):
    query, question_id = question['query'], question['metadata']['docid']
    
    search_result = vector_store.similarity_search_with_relevance_scores(query, k=3)
    
    topk_result = []
    for doc, score in search_result:
        topk_result.append(doc.metadata.get('docid'))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

100%|██████████| 1000/1000 [00:21<00:00, 46.83it/s]

Mean Average Precision (MAP): 0.9045000000000014



