<a href="https://colab.research.google.com/github/patharepremkumar/Generative-AI-Indepth-Basic-to-Advance1/blob/main/Hybrid_Search_in_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [6]:

# Sample documents
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [7]:
query="keyword-based search"

In [8]:
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [9]:
preprocess_documents=[preprocess_text(doc) for doc in documents]
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [10]:
preprocessed_query = preprocess_text(query)
preprocessed_query

'keywordbased search'

In [11]:
vector=TfidfVectorizer()

In [12]:
x = vector.fit_transform(preprocess_documents)

In [13]:
x.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [14]:
query_embedding=vector.transform([preprocessed_query])

In [15]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [16]:
similarities = cosine_similarity(x, query_embedding)
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [17]:
np.argsort(similarities,axis=0)

array([[0],
       [2],
       [3],
       [1]])

In [18]:

#Ranking
ranked_indices=np.argsort(similarities,axis=0)[::-1].flatten()

In [19]:
ranked_documents = [documents[i] for i in ranked_indices]

In [20]:

# Output the ranked documents
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


In [21]:
#https://huggingface.co/sentence-transformers

In [22]:

document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [23]:

# Sample search query (represented as a dense vector)
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [24]:

# Calculate cosine similarity between query and documents
similarities = cosine_similarity(document_embeddings, query_embedding)

In [25]:
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [26]:
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()

In [27]:

# Output the ranked documents
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [28]:
!pip install pypdf
!pip install langchain_community

Collecting pypdf
  Downloading pypdf-5.0.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-5.0.0-py3-none-any.whl (292 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/292.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.0.0
Collecting langchain_community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.4.0,>=0.3.1 (from langchain_community)
  Downloading langchain-0.3.1-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.6 (from langchain_community)
  Downloading langchain_core-0.3.6-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain_

In [32]:
doc_path="/content/Retrieval-Augmented-Generation-for-NLP.pdf"

In [33]:
from langchain_community.document_loaders import PyPDFLoader

In [34]:
loader=PyPDFLoader(doc_path)

In [35]:
docs=loader.load()

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [37]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)

In [38]:
chunks = splitter.split_documents(docs)

In [39]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [40]:
HF_TOKEN="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"

In [41]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [42]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.31.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.6.6-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.27.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_pr

In [43]:
from langchain.vectorstores import Chroma

In [44]:
vectorstore=Chroma.from_documents(chunks,embeddings)

In [45]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [46]:
query="what is retrival?"
vectorstore_retreiver.get_relevant_documents(query)

  vectorstore_retreiver.get_relevant_documents(query)


[Document(metadata={'page': 4, 'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf'}, page_content='retriever is initialized using DPR’s retriever, which uses retrieval supervision on Natural Questions\nand TriviaQA. RAG compares favourably to the DPR QA system, which uses a BERT-based “cross-'),
 Document(metadata={'page': 4, 'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf'}, page_content='Wikipedia, or whether there is not enough information to decide. The task requires retrieving\nevidence from Wikipedia relating to the claim and then reasoning over this evidence to classify'),
 Document(metadata={'page': 2, 'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf'}, page_content='the complete sequence . Technically, it treats the retrieved document as a single latent variable that\nis marginalized to get the seq2seq probability p(y|x)via a top-K approximation. Concretely, the')]

In [50]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [51]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [52]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [53]:
keyword_retriever.k =  3

In [54]:
query="what is retrival?"
keyword_retriever.get_relevant_documents(query)

[Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 6}, page_content='what currency\nneeded in\nscotlandBART The currency needed in Scotland is Pound sterling.\nRAG-T Pound is the currency needed in Scotland.\nRAG-S The currency needed in Scotland is the pound sterling.'),
 Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 6}, page_content='RAG-T The middle ear is the portion of the ear internal to the eardrum.\nRAG-S The middle ear includes the tympanic cavity and the three ossicles.\nwhat currency\nneeded in'),
 Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 2}, page_content='2.4 Training\nWe jointly train the retriever and generator components without any direct supervision on what')]

In [55]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])

In [56]:
ensemble_retriever.get_relevant_documents(query)

[Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 6}, page_content='what currency\nneeded in\nscotlandBART The currency needed in Scotland is Pound sterling.\nRAG-T Pound is the currency needed in Scotland.\nRAG-S The currency needed in Scotland is the pound sterling.'),
 Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 6}, page_content='RAG-T The middle ear is the portion of the ear internal to the eardrum.\nRAG-S The middle ear includes the tympanic cavity and the three ossicles.\nwhat currency\nneeded in'),
 Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 2}, page_content='2.4 Training\nWe jointly train the retriever and generator components without any direct supervision on what'),
 Document(metadata={'page': 4, 'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf'}, page_content='retriever is initialized using DPR’s retriever, which uses retri

## Mixing vector search and keyword search for Hybrid search
hybrid_score = (1 — alpha) * sparse_score + alpha * dense_score

In [57]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [73]:
#!pip install accelerate
!pip install -U bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.0


In [74]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

In [75]:
# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    model_name: Name or path of the model to be loaded.
    return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )
    return model

In [76]:

# initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    model_name: Name or path of the model for tokenizer initialization.
    return: Initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    tokenizer.bos_token_id = 1  # Set beginning of sentence token id
    return tokenizer

In [77]:
tokenizer = initialize_tokenizer(model_name)

In [78]:
model = load_quantized_model(model_name)

ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [79]:

pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

NameError: name 'model' is not defined

In [80]:

llm = HuggingFacePipeline(pipeline=pipeline)

  llm = HuggingFacePipeline(pipeline=pipeline)


In [82]:
from langchain.chains import RetrievalQA

In [83]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [84]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [85]:
response1 = normal_chain.invoke("What is Abstractive Question Answering?")

TypeError: unhashable type: 'list'

In [None]:

response1