# Experiment
- Hybrid search
- Doc split

Reference: https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble

In [1]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
import sys
import pandas as pd
from collections import Counter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
sys.path.append("../")
from src.config import Configuration
from src.prepare.data_load import DocDataLoader

# Config

In [2]:
# Project configuration object; later cells read the Hugging Face token and
# the Elasticsearch connection from it.
conf = Configuration()

# --- Embedding model / index pair (switch both together) ---
MODEL = "sentence-transformers/LaBSE"
# MODEL = "models/embedding-001"

INDEX = "labse-major"
# INDEX = "gemini-major"

# --- Retrieval settings ---
DIST = "EUCLIDEAN_DISTANCE"
K = 1

# Description of this run, handed to every evaluation call below.
# NOTE(review): doc_size / doc_overlap are hard-coded here; presumably they
# mirror the chunking used when the index was built — confirm.
META = {
    "model": MODEL,
    "doc_size": 460,
    "doc_overlap": 20,
    "index": INDEX,
    "top k": K,
    "distance": DIST,
    "technique": f"Vector: {MODEL}",
}

# Retriever

In [5]:
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client for MODEL, authenticated with the token returned by
# conf.load_hg_token(0).
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name=MODEL,
    api_key=conf.load_hg_token(0),
)

# Vector store over the existing Elasticsearch index named by INDEX, using the
# distance strategy configured in DIST.
elastic_vector_search = ElasticsearchStore(
    index_name=INDEX,
    embedding=embeddings,
    distance_strategy=DIST,
    es_connection=conf.load_elasticsearch_connection(),
)

# Retriever that asks the store for the top-K hits per query.
es_retriever = elastic_vector_search.as_retriever(
    search_kwargs={"k": K},
)

# Test

In [6]:
from tests.major_eval import MajorBlogPostEvaluation

# Evaluation helper used by every cell below.
# NOTE(review): the name `eval` shadows the Python built-in; it is kept because
# the following cells call `eval.eval_*`.
eval = MajorBlogPostEvaluation(
    save_path="./results",
    root_path="../",
)

!ls ../data/test_major

docs.csv		    private_test_case.csv  sample_test_case.csv
hard_private_test_case.csv  public_test_case.csv


# LaBSE

In [6]:
# Sample test cases, Elasticsearch vector retriever, run described by META.
eval.eval_sample(
    "sample_vector_l2_docsplit.txt",
    es_retriever,
    META,
)

In [7]:
# Public test cases, Elasticsearch vector retriever, run described by META.
eval.eval_public(
    "labse_major_vector_l2_public_docsplit.txt",
    es_retriever,
    META,
)

In [7]:
# Hard private test cases, Elasticsearch vector retriever, run described by META.
eval.eval_private_hard(
    "labse_major_vector_l2_hard_private_docsplit.txt",
    es_retriever,
    META,
)

# Gemini

In [6]:
# Public test cases with the hybrid (ensemble) retriever.
# NOTE(review): `ensemble_retriever` is not defined in any cell visible in this
# notebook; it must be created before this cell runs or the call raises
# NameError. META as configured in the Config cell describes the LaBSE vector
# run — confirm MODEL / INDEX / "technique" were switched for this Gemini run.
eval.eval_public(
    "gemini_major_hybrid_public_docsplit.txt",
    ensemble_retriever,
    META,
)

In [7]:
# Hard private test cases with the hybrid (ensemble) retriever.
# NOTE(review): `ensemble_retriever` is not defined in any cell visible in this
# notebook; it must be created before this cell runs or the call raises
# NameError. META as configured in the Config cell describes the LaBSE vector
# run — confirm MODEL / INDEX / "technique" were switched for this Gemini run.
eval.eval_private_hard(
    "gemini_major_hybrid_hard_private_docsplit.txt",
    ensemble_retriever,
    META,
)