# Experiment
- Hybrid search
- Doc split

Reference: https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble

In [1]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
import sys
import pandas as pd
from collections import Counter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
sys.path.append("../")
from src.config import Configuration
from src.prepare.data_load import DocDataLoader

In [2]:
# Load the major-document corpus. `db` is the collection the BM25 retriever
# is built from in the "Hybrid Retriever" cell.
# NOTE(review): presumably a list of LangChain Document objects, since it is
# passed to BM25Retriever.from_documents — confirm in src.prepare.data_load.
doc_loader = DocDataLoader()
db = doc_loader.load_major_docs_full()

From (44, 4) to 44


# Config

In [3]:
# Project configuration object; later cells read the Gemini API token and the
# Elasticsearch connection from it.
conf = Configuration()

# --- Embedding model and the Elasticsearch index it is paired with ---------
# Alternative setup (swap both lines together):
# MODEL = "sentence-transformers/LaBSE"
# INDEX = "labse-major"
MODEL = "models/embedding-001"
INDEX = "gemini-major"

# --- Retrieval settings -----------------------------------------------------
# Number of documents requested from each individual retriever.
K = 1
# Ensemble weights, in the order [BM25, Elasticsearch].
WEIGHTS = [0.5, 0.5]
# Distance strategy handed to ElasticsearchStore.
DIST = "EUCLIDEAN_DISTANCE"

# --- Run description --------------------------------------------------------
# Passed to the evaluation helpers alongside the retriever so each result can
# be traced back to the settings above.
META = {
    "model": MODEL,
    "bm25-weight": WEIGHTS[0],
    "elastic-weight": WEIGHTS[1],
    "index": INDEX,
    "top k": K,
    "distance": DIST,
    "technique": f"{MODEL} + Hybrid (full text at BM25)",
}

# Hybrid Retriever

In [4]:
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model for the dense (vector) half of the hybrid search.
# Alternative (LaBSE through the HuggingFace Inference API); switch MODEL and
# INDEX in the config cell as well when enabling it:
# embeddings = HuggingFaceInferenceAPIEmbeddings(
#     api_key=conf.load_hg_token(),
#     model_name=MODEL)
embeddings = GoogleGenerativeAIEmbeddings(
    model=MODEL,
    google_api_key=conf.load_gemini_token(),
)

# Sparse half: BM25 retriever built from the loaded documents (`db`).
bm25_retriever = BM25Retriever.from_documents(db)
bm25_retriever.k = K

# Dense half: vector search against the Elasticsearch index named by INDEX.
# (The original comment called this a FAISS retriever; no FAISS store is
# created in this notebook.)
elastic_vector_search = ElasticsearchStore(
    es_connection=conf.load_elasticsearch_connection(),
    index_name=INDEX,
    embedding=embeddings,
    distance_strategy=DIST,
)
es_retriever = elastic_vector_search.as_retriever(search_kwargs={"k": K})

# Weights are given per retriever, in the same order as `retrievers` below.
# Check the count up front so a config typo fails here instead of partway
# through an evaluation run.
assert len(WEIGHTS) == 2, (
    "WEIGHTS must hold exactly one weight per retriever: [BM25, Elasticsearch]"
)

# Hybrid retriever: combines the BM25 and Elasticsearch results using WEIGHTS.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, es_retriever],
    weights=WEIGHTS,
)


  from .autonotebook import tqdm as notebook_tqdm


# Test

In [5]:
# `tests` is resolved from the repository root, which the import cell added
# to sys.path via sys.path.append("../").
from tests.major_eval import MajorBlogPostEvaluation
# NOTE(review): the name `eval` shadows Python's built-in `eval` for the rest
# of the notebook. Later cells call `eval.eval_sample` / `eval.eval_public`,
# so renaming it requires updating those cells in the same change.
eval = MajorBlogPostEvaluation(root_path="../", save_path="./results")

!ls ../data/test_major

docs.csv		    private_test_case.csv  sample_test_case.csv
hard_private_test_case.csv  public_test_case.csv


In [6]:
# Run the sample evaluation with the hybrid retriever; META carries the run
# settings from the config cell.
# NOTE(review): presumably the first argument names the report file created
# under the evaluator's save_path ("./results") — confirm in tests.major_eval.
eval.eval_sample(
    "sample_hybrid_fulltext.txt",
    ensemble_retriever,
    META,
)

In [6]:
# Run the public evaluation with the hybrid retriever; META carries the run
# settings from the config cell.
# NOTE(review): presumably the first argument names the report file created
# under the evaluator's save_path ("./results") — confirm in tests.major_eval.
eval.eval_public(
    "gemini_major_hybrid_public_fulltext.txt",
    ensemble_retriever,
    META,
)

In [10]:
# Private-test evaluation, counting a hit within the top `top` results.
#
# NOTE(review): `test_major_private` and `write_info` are not defined in any
# visible cell of this notebook, so this cell fails on a fresh kernel
# (Restart & Run All) unless they are defined elsewhere — restore or import
# them before relying on this cell.
# NOTE(review): the output file name says "labse" while the config cell
# currently selects the Gemini embedding model/index — confirm the intended
# target file before appending new results to it.
top = 1

# Evaluate with `top` itself (the original passed a literal 1 here), so the
# "Find Top" line written below always matches what was actually evaluated.
c, t = test_major_private(ensemble_retriever, top)

# Compute the score before opening the file: if the evaluation or the
# division fails (e.g. t == 0), nothing has been appended yet and the results
# file is not left with a truncated record.
score = c / t

with open("./results/labse_major_hybrid_bm25full_private.txt", "a") as f:
    write_info(f)
    f.write(f"Find Top: {top}\n")
    f.write(f"Correct: {c}\n")
    f.write(f"Total: {t}\n")
    f.write(f"Score: {score}\n")

In [11]:

# Private-test evaluation, counting a hit within the top `top` results.
#
# NOTE(review): `test_major_private` and `write_info` are not defined in any
# visible cell of this notebook, so this cell fails on a fresh kernel
# (Restart & Run All) unless they are defined elsewhere — restore or import
# them before relying on this cell.
# NOTE(review): the output file name says "labse" while the config cell
# currently selects the Gemini embedding model/index — confirm the intended
# target file before appending new results to it.
top = 2

c, t = test_major_private(ensemble_retriever, top)

# Compute the score before opening the file: if the evaluation or the
# division fails (e.g. t == 0), nothing has been appended yet and the results
# file is not left with a truncated record.
score = c / t

with open("./results/labse_major_hybrid_bm25full_private.txt", "a") as f:
    write_info(f)
    f.write(f"Find Top: {top}\n")
    f.write(f"Correct: {c}\n")
    f.write(f"Total: {t}\n")
    f.write(f"Score: {score}\n")