In [1]:
import os
import sys
# NOTE(review): machine-specific absolute path; the project's `src` directory
# must exist at this location for the local imports further down to resolve.
sys.path.append("/data/ephemeral/home/Upstage_Ai_Lab/Final/IR/src")

import json
import random
import huggingface_hub

from tqdm import tqdm
from dotenv import load_dotenv

# Load the .env file BEFORE initialising cassio: `cassio.init(auto=True)` takes
# its connection parameters from environment variables and raises
# "ValueError: Insufficient parameters to connect" when none are present.
# Presumably the Cassandra / Astra DB settings live in keys.env alongside the
# API keys -- if they do not, export them before starting the kernel.
load_dotenv("../keys.env")

import cassio
cassio.init(auto=True)


def _require_env(name):
    """Return environment variable `name`, failing with a readable message if it is unset."""
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "add it to ../keys.env or export it before running this notebook."
        )
    return value


# The values are read straight from os.environ (populated by load_dotenv or the
# shell), so writing them back into os.environ is unnecessary. A missing key
# still stops the notebook here, but with a clear message instead of a TypeError.
upstage_api_key = _require_env("UPSTAGE_API_KEY")
openai_api_key = _require_env("OPENAI_API_KEY")
voyage_api_key = _require_env("VOYAGE_API_KEY")

# HF_TOKEN is passed through unchanged (it may be None, exactly as before).
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(hf_token)

from langchain.schema import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_experimental.text_splitter import SemanticChunker

from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.graph_vectorstores.links import add_links
from langchain_community.graph_vectorstores import CassandraGraphVectorStore
from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor

from config import Args
from data.data import load_document, chunking, load_query
from sparse_retriever.model import load_sparse_model
from dense_retriever.model import load_dense_model, load_upstage_encoder, load_hf_encoder

ValueError: Insufficient parameters to connect

In [None]:
# Experiment configuration: HuggingFace encoder, no pre-built FAISS index.
args = Args()
# NOTE(review): "hugginface" looks like a misspelling of "huggingface". It is
# left unchanged because the value Args / the project code compares against is
# not visible from this notebook -- confirm and fix in both places if needed.
args.encoder_method = "hugginface"
args.hf_model_name = "dragonkue/bge-m3-ko"
args.doc_file_path = "../dataset/processed_documents_queries.jsonl"
args.faiss_index_file = None

args.chunk_size = 100
args.chunk_overlap = 50

# Documents and questions are both read from the same JSONL file.
documents = load_document(args.doc_file_path)
print(len(documents))

questions = load_query(args.doc_file_path)
print(len(questions))

# Evaluate on a fixed-size random sample of the questions. A dedicated, seeded
# generator makes the sample (and therefore every MAP value reported below)
# reproducible across kernel restarts without touching the global random state.
QUESTION_SAMPLE_SEED = 42
NUM_EVAL_QUESTIONS = 1000

random.Random(QUESTION_SAMPLE_SEED).shuffle(questions)
questions = questions[:NUM_EVAL_QUESTIONS]

In [None]:
# Split the loaded documents into chunks with the project's `chunking` helper
# (driven by `args`, which carries chunk_size / chunk_overlap set above) and
# report how many chunks were produced.
rec_chunks = chunking(args, documents)
print(len(rec_chunks))

In [None]:
def calc_map(gt, pred, k=3):
    """Compute mean average precision (MAP) over the top-`k` predictions.

    Args:
        gt: Mapping from eval id to a collection of relevant document ids.
            A falsy entry (e.g. an empty list) marks a query that has no
            relevant document.
        pred: Sequence of dicts, each with an "eval_id" key and a "topk" key
            holding the ranked list of predicted document ids.
        k: Rank cutoff; only the first `k` entries of "topk" are scored.
            Defaults to 3, the cutoff that used to be hard-coded.

    Returns:
        The mean of the per-query average precision, or 0.0 when `pred` is
        empty.

    Raises:
        KeyError: If an "eval_id" in `pred` is missing from `gt`.
    """
    # An empty prediction list used to fail with ZeroDivisionError.
    if len(pred) == 0:
        return 0.0

    sum_average_precision = 0
    for j in pred:
        relevant = gt[j["eval_id"]]
        if relevant:
            hit_count = 0
            sum_precision = 0
            # Ids already credited as a hit. A list (not a set) keeps the
            # function usable with unhashable ids; it holds at most k items.
            credited = []
            for rank, docid in enumerate(j["topk"][:k], start=1):
                # A relevant id that shows up again (e.g. several chunks of
                # the same document) is not a new hit; counting it again
                # would inflate the average precision.
                if docid in relevant and docid not in credited:
                    credited.append(docid)
                    hit_count += 1
                    sum_precision += hit_count / rank
            # Normalised by the number of hits found within the cutoff.
            average_precision = sum_precision / hit_count if hit_count > 0 else 0
        else:
            # No relevant document exists: returning nothing is the correct
            # answer (score 1), returning anything is wrong (score 0).
            average_precision = 0 if j["topk"] else 1
        sum_average_precision += average_precision
    return sum_average_precision / len(pred)

In [None]:
# Create the embedding model through the project's `load_hf_encoder` helper,
# using the model name and the model/encode kwargs carried on `args`, then
# build a FAISS vector store from the chunks with that embedder.
embedder = load_hf_encoder(args.hf_model_name, args.model_kwargs, args.encode_kwargs)
vector_store = FAISS.from_documents(rec_chunks, embedder)

In [None]:
# Retrieval evaluation against the current `vector_store`.
# Each question's only relevant document is the one whose docid is stored in
# the question's metadata, so the ground truth maps that docid to itself.
top_k = 3

gt = {
    question['metadata']['docid']: [question['metadata']['docid']]
    for question in questions
}

pred = []
for question in tqdm(questions):
    query = question['query']
    question_id = question['metadata']['docid']

    # The scores were never used, so plain similarity_search is sufficient.
    retrieved_docs = vector_store.similarity_search(query, k=top_k)

    # Presumably several chunks of one document share the same docid: keep only
    # the first (best-ranked) occurrence so the ranking is over documents and a
    # repeated docid is not scored as several hits.
    topk_result = list(dict.fromkeys(
        doc.metadata.get('docid') for doc in retrieved_docs
    ))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")

In [None]:
# Annotate every chunk with the links produced by a KeyBERT-based link
# extractor. `add_links` attaches them to the chunk object itself, so
# `rec_chunks` is modified in place.
extractor = KeybertLinkExtractor()

for chunk in rec_chunks:
    chunk_links = extractor.extract_one(chunk)
    add_links(chunk, chunk_links)

In [None]:
# Build the Cassandra graph vector store from the chunks, not from the raw
# `documents`: the keyword links were attached to `rec_chunks`, so indexing
# `documents` would store no links at all and would also index a different
# corpus than the FAISS store that was built from the same chunks.
# Note: this rebinds `vector_store`, replacing the FAISS store created earlier.
vector_store = CassandraGraphVectorStore.from_documents(
    embedding=embedder,
    documents=rec_chunks,
)

In [None]:
# Retrieval evaluation against the current `vector_store` (at this point the
# Cassandra graph vector store).
# Each question's only relevant document is the one whose docid is stored in
# the question's metadata, so the ground truth maps that docid to itself.
top_k = 3

gt = {
    question['metadata']['docid']: [question['metadata']['docid']]
    for question in questions
}

pred = []
for question in tqdm(questions):
    query = question['query']
    question_id = question['metadata']['docid']

    # The scores were never used, so plain similarity_search is sufficient.
    # NOTE(review): it is also the safer call here -- confirm whether the graph
    # store implements the relevance-score variant at all.
    retrieved_docs = vector_store.similarity_search(query, k=top_k)

    # Keep only the first (best-ranked) occurrence of each docid so that a
    # document retrieved more than once is not scored as several hits.
    topk_result = list(dict.fromkeys(
        doc.metadata.get('docid') for doc in retrieved_docs
    ))

    pred.append({
        "eval_id": question_id,
        "topk": topk_result
    })

mean_average_precision = calc_map(gt, pred)
print(f"Mean Average Precision (MAP): {mean_average_precision}")