# Experiment
- Hybrid search
- Doc split

Reference: https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble

In [1]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
import sys
import pandas as pd
from collections import Counter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
sys.path.append("../")
from src.config import Configuration
from src.prepare.data_load import DocDataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "major" documents through the project loader.
# `size` / `overlap` are presumably the chunk length and chunk overlap used
# when splitting documents — TODO confirm against DocDataLoader.
# The same two values are recorded in META in the Config cell.
doc_loader = DocDataLoader()
db = doc_loader.load_major_docs(
    overlap=20,
    size=460,
)

From (44, 4) to Counter({2: 37, 1: 6, 3: 1})


# Config

In [2]:
# Project configuration object; the cells below ask it for the Hugging Face
# token and the Elasticsearch connection.
conf = Configuration()

# Ensemble weights, in retriever order: [BM25, Elasticsearch].
WEIGHTS = [0.5, 0.5]
# Number of documents requested from each retriever.
K = 1

# Elasticsearch index to query. Switch INDEX and MODEL together.
INDEX = "labse-major"
# INDEX = "gemini-major"

# Embedding model identifier.
MODEL = "sentence-transformers/LaBSE"
# MODEL = "models/embedding-001"

# Distance strategy handed to the Elasticsearch vector store.
DIST = "EUCLIDEAN_DISTANCE"

# Document split parameters reported with the results. Keep these in sync
# with the size/overlap values passed to `load_major_docs` in the loading cell.
DOC_SIZE = 460
DOC_OVERLAP = 20

# Metadata passed to every evaluation call so each result file records the
# configuration that produced it.
META = {
    "model": MODEL,
    "doc_size": DOC_SIZE,
    "doc_overlap": DOC_OVERLAP,
    "bm25-weight": WEIGHTS[0],
    "elastic-weight": WEIGHTS[1],
    "index": INDEX,
    "top k": K,
    "distance": DIST,
    "technique": f"{MODEL} + Hybrid",
}

# Hybrid Retriever

In [3]:
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# NOTE: this cell needs `db` from the document-loading cell and `conf`, MODEL,
# INDEX, DIST, K, WEIGHTS from the Config cell; it raises NameError if those
# cells have not been run in the current kernel.

# Embedding model served through the Hugging Face Inference API.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name=MODEL,
    api_key=conf.load_hg_token(1),
)

# Gemini alternative (use together with the commented-out MODEL / INDEX
# values in the Config cell):
# embeddings = GoogleGenerativeAIEmbeddings(
#     model=MODEL, google_api_key=conf.load_gemini_token())

# Keyword side: BM25 retriever built from the documents in `db`, limited to K hits.
bm25_retriever = BM25Retriever.from_documents(db)
bm25_retriever.k = K

# Vector side: Elasticsearch store queried with the embeddings above.
elastic_vector_search = ElasticsearchStore(
    index_name=INDEX,
    embedding=embeddings,
    distance_strategy=DIST,
    es_connection=conf.load_elasticsearch_connection(),
)
es_retriever = elastic_vector_search.as_retriever(search_kwargs=dict(k=K))

# Ensemble of both retrievers; WEIGHTS is ordered to match `retrievers`.
ensemble_retriever = EnsembleRetriever(
    weights=WEIGHTS,
    retrievers=[bm25_retriever, es_retriever],
)



NameError: name 'db' is not defined

# Hybrid retriever from the project code (`HybridGeminiRAG`)

In [4]:
# Build the hybrid retriever through the project's own RAG class instead of
# wiring the retrievers by hand.
import sys

# Make the parent directory importable. The guard keeps re-runs of this cell
# from appending duplicate "../" entries to sys.path.
if "../" not in sys.path:
    sys.path.append("../")

from src.rag.hybrid_rag import HybridGeminiRAG
from src.service.provider import ProviderService

services = ProviderService()

# Look up the "major" category configuration once and reuse it below.
major = services.get_categories().major
rag = HybridGeminiRAG(
    provider=services,
    rag_config=major,
    update_notification_func=lambda x: x,  # identity function as the callback
)

# NOTE: this rebinds `ensemble_retriever`. The evaluation cells below use
# whichever retriever was bound last in the kernel.
ensemble_retriever = rag.ensemble_retriever

From (44, 5) to 44


# Test

In [10]:
from src.tests.major_eval import MajorBlogPostEvaluation
eval = MajorBlogPostEvaluation(root_path="../", save_path="./results")

!ls ../data/test_major

docs.csv		    private_test_case.csv  sample_test_case.csv
hard_private_test_case.csv  public_test_case.csv


# LaBSE

In [9]:
# Evaluate the current `ensemble_retriever` on the sample test cases.
# The first argument is presumably the results file name — TODO confirm.
eval.eval_sample(
    "sample_hybrid_docsplit.txt",
    ensemble_retriever,
    META,
)

In [5]:
# Evaluate the current `ensemble_retriever` on the public test cases.
# The first argument is presumably the results file name — TODO confirm.
eval.eval_public(
    "labse_major_hybrid_public_docsplit.txt",
    ensemble_retriever,
    META,
)

In [11]:
# Evaluate the current `ensemble_retriever` on the hard private test cases.
# The first argument is presumably the results file name — TODO confirm.
eval.eval_private_hard(
    "labse_major_hybrid_hard_private_docsplit.txt",
    ensemble_retriever,
    META,
)

# Gemini

In [6]:
# Evaluate the current `ensemble_retriever` on the public test cases (Gemini run).
# NOTE(review): META is built from MODEL / INDEX in the Config cell, which as
# written select LaBSE. Switch to the commented-out Gemini values there and
# re-run the Config and retriever cells before this one, so the saved metadata
# matches the retriever — confirm.
eval.eval_public(
    "gemini_major_hybrid_public_docsplit.txt",
    ensemble_retriever,
    META,
)

In [7]:
# Evaluate the current `ensemble_retriever` on the hard private test cases (Gemini run).
# NOTE(review): META is built from MODEL / INDEX in the Config cell, which as
# written select LaBSE. Switch to the commented-out Gemini values there and
# re-run the Config and retriever cells before this one, so the saved metadata
# matches the retriever — confirm.
eval.eval_private_hard(
    "gemini_major_hybrid_hard_private_docsplit.txt",
    ensemble_retriever,
    META,
)