In [1]:
from fastembed import SparseTextEmbedding
from qdrant_client import QdrantClient, models
import os
import json
from langchain_openai import AzureOpenAIEmbeddings
from qdrant_client import QdrantClient
# Shared Qdrant client used by hybrid_search; it talks to the instance at localhost:6333.
# NOTE(review): timeout=300 is presumably in seconds — confirm against the qdrant-client docs.
client = QdrantClient("http://localhost:6333", timeout=300)

In [2]:
# Azure OpenAI settings read later by encode_dense_query.
# Fill in a value here to override the environment; leave it empty to keep
# whatever is already exported in the shell. Previously this cell overwrote
# existing credentials with empty strings unconditionally.
_AZURE_SETTINGS = {
    "OPENAI_API_KEY": "",
    "AZURE_OPENAI_ENDPOINT": "",
    "OPENAI_API_VERSION": "",
    "OPENAI_API_TYPE": "",
}

for _name, _value in _AZURE_SETTINGS.items():
    if _value:
        # An explicit value in the notebook wins over the environment.
        os.environ[_name] = _value
    else:
        # Keep a pre-existing value; only fall back to "" when the variable is
        # missing, so later os.environ[...] lookups still find the key.
        os.environ.setdefault(_name, "")

In [3]:
# Sample ad-hoc queries kept for quick manual testing (unused; the real queries come from the DATASET file):
#queries = ["How do I become a good computer science engineer?", "What causes nuclear reactions in the Sun?"]
# Default sparse embedding model name handed to fastembed's SparseTextEmbedding by encode_sparse_query.
MODEL_NAME = "prithivida/Splade_PP_en_v1"
# Alternative sparse models (swap in by uncommenting one):
#MODEL_NAME= "Qdrant/bm42-all-minilm-l6-v2-attentions"
#MODEL_NAME = "Qdrant/bm25"
# Name passed as `using=` for the sparse prefetch in hybrid_search.
# NOTE(review): this is "bm42" while MODEL_NAME is a SPLADE model — confirm it matches the
# vector name and encoder actually used when the collection was indexed.
SPARSE_TYPE = "bm42"
DATASET = "quora"
COLLECTION_NAME = "quora_collection"

# DATASET is rebound here from the dataset name to the path of its queries file;
# load_queries opens this path.
DATASET = f"/datasets/{DATASET}/queries.jsonl"

In [4]:
def load_queries(path=None):
    """Load queries from a JSON Lines file into an ``{_id: text}`` dict.

    Args:
        path: Path of the JSONL file to read. When omitted, the module-level
            ``DATASET`` path is used, so existing ``load_queries()`` calls
            behave as before.

    Returns:
        dict mapping each row's ``"_id"`` to its ``"text"``. If an ``_id``
        occurs more than once, the last row wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row has no ``"_id"`` or ``"text"`` field.
    """
    if path is None:
        path = DATASET

    queries = {}
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which can mis-decode non-ASCII query text.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; skip them instead of crashing in json.loads.
            if not line.strip():
                continue
            row = json.loads(line)
            queries[row["_id"]] = row["text"]
    return queries

In [5]:
# Query id -> query text for every row of the DATASET file; iterated by the search loop below.
queries = load_queries()

In [6]:

# Lazily created AzureOpenAIEmbeddings client shared by every call to
# encode_dense_query. Re-running this cell resets it to None, so changed
# environment settings are picked up on the next call.
_DENSE_EMBEDDER = None


def encode_dense_query(query):
    """Encode a query into a dense vector with Azure OpenAI embeddings.

    The embeddings client is built once, on first use, from the
    ``AZURE_OPENAI_ENDPOINT``, ``OPENAI_API_VERSION`` and ``OPENAI_API_KEY``
    environment variables, then reused. Previously a new client was built
    for every single query.

    Args:
        query: The query text to embed.

    Returns:
        The value returned by ``AzureOpenAIEmbeddings.embed_query`` for
        ``query``.

    Raises:
        KeyError: if one of the required environment variables is not set
            when the client is first created.
    """
    global _DENSE_EMBEDDER
    if _DENSE_EMBEDDER is None:
        _DENSE_EMBEDDER = AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            openai_api_version=os.environ["OPENAI_API_VERSION"],
            openai_api_key=os.environ["OPENAI_API_KEY"],
        )
    return _DENSE_EMBEDDER.embed_query(query)

In [8]:
# Loaded SparseTextEmbedding models keyed by model name, so each model is
# constructed once rather than on every query.
_SPARSE_MODELS = {}


def encode_sparse_query(query, model_name= MODEL_NAME):
    """Encode a query into a Qdrant sparse vector with a fastembed model.

    Args:
        query: The query text to embed.
        model_name: Name of the sparse model handed to
            ``SparseTextEmbedding`` (e.g. a BM25, SPLADE or BM42 model).
            Defaults to ``MODEL_NAME``.

    Returns:
        A ``models.SparseVector`` built from the indices and values of the
        first embedding produced by ``model.query_embed(query)``.
    """
    model = _SPARSE_MODELS.get(model_name)
    if model is None:
        # Constructing the model is the expensive step; cache it per name.
        model = _SPARSE_MODELS[model_name] = SparseTextEmbedding(model_name)

    # query_embed yields one embedding per input; only the first is needed.
    embedding = next(iter(model.query_embed(query)))
    return models.SparseVector(
        values=embedding.values.tolist(),
        indices=embedding.indices.tolist(),
    )

In [9]:

def hybrid_search(sparse_vector, dense_vector, limit=10):
    """Run a hybrid (sparse + dense) query against ``COLLECTION_NAME``.

    Two prefetch queries are issued, one on the ``SPARSE_TYPE`` vector and
    one on the ``"openai"`` vector, and their results are combined with
    ``models.Fusion.RRF``.

    Args:
        sparse_vector: Query for the sparse prefetch, e.g. the
            ``models.SparseVector`` returned by ``encode_sparse_query``.
        dense_vector: Query for the dense prefetch, e.g. the embedding
            returned by ``encode_dense_query``.
        limit: Number of candidates requested from each prefetch and number
            of fused results requested. Defaults to 10, the value that was
            previously hard-coded for the prefetches.
            NOTE(review): the fused query previously passed no limit and
            relied on the client default (10 per qdrant-client docs) —
            confirm for the installed version.

    Returns:
        The response object from ``client.query_points``; callers read the
        hits from its ``points`` attribute.
    """
    sparse_prefetch = models.Prefetch(
        query=sparse_vector,
        using=SPARSE_TYPE,
        limit=limit,
    )
    dense_prefetch = models.Prefetch(
        query=dense_vector,
        using="openai",
        limit=limit,
    )
    return client.query_points(
        collection_name=COLLECTION_NAME,
        prefetch=[sparse_prefetch, dense_prefetch],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
    )

In [26]:
# Run the hybrid search for every loaded query and write one JSON line per
# query: {"query_id": ..., "results": [{"doc_id": ..., "score": ...}, ...]}.
with open("./quora_bm42_hyb.jsonl", "w") as hybrid_out:
    for idx, text in queries.items():
        # Encode the query both ways, then fuse the two retrievals.
        sparse_vector = encode_sparse_query(text)
        dense_vector = encode_dense_query(text)
        search_result = hybrid_search(sparse_vector, dense_vector)

        results = []
        for point in search_result.points:
            results.append({"doc_id": point.id, "score": point.score})

        hybrid_output = {"query_id": idx, "results": results}
        hybrid_out.write(f"{json.dumps(hybrid_output)}\n")