<a href="https://colab.research.google.com/github/patharepremkumar/Generative-AI-Indepth-Basic-to-Advance1/blob/main/Hybrid_Search_in_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [3]:

# Sample corpus for the sparse (TF-IDF) keyword-search demo: four short sentences.
# NOTE: the misspelling "containig" is left untouched because the recorded outputs
# in this notebook were produced from exactly this text.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [4]:
# Search phrase: ranked against the sample documents with TF-IDF and later passed to the BM25 retriever.
query = "keyword-based search"

In [6]:
import re
def preprocess_text(text):
    """Normalise a string for keyword matching.

    The text is lower-cased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed.
    Hyphens are removed too, so "keyword-based" becomes "keywordbased".

    Args:
        text: The raw string to clean.

    Returns:
        The lower-cased string with punctuation stripped; whitespace is kept as-is.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

In [8]:
# Clean every sample document with preprocess_text, then display the result.
preprocess_documents = list(map(preprocess_text, documents))
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [9]:
# Apply the same cleaning to the query as to the documents so their tokens line up.
preprocessed_query = preprocess_text(query)
preprocessed_query

'keywordbased search'

In [10]:
# TF-IDF vectoriser created with no arguments, i.e. scikit-learn's default settings.
vector=TfidfVectorizer()

In [12]:
# Fit the vectoriser on the cleaned documents and encode them: one TF-IDF row per document.
x = vector.fit_transform(preprocess_documents)

In [15]:
# Inspect the TF-IDF weights of the first document as a dense array.
x.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [16]:
# Encode the query with the already-fitted vectoriser (transform only, no refit),
# so the query vector uses the same vocabulary columns as the documents.
query_embedding=vector.transform([preprocessed_query])

In [17]:
# Inspect the sparse query vector as a dense array.
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [20]:
# Cosine similarity of each document row against the query vector.
# The result is a column: one score per document, in the order of `documents`.
similarities = cosine_similarity(x, query_embedding)
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [28]:
# Document indices sorted by ascending similarity (least similar first).
np.argsort(similarities,axis=0)

array([[0],
       [2],
       [3],
       [1]])

In [26]:

# Ranking: document indices ordered from most to least similar to the query.
# argsort is ascending, so the flattened order is reversed.
ranked_indices = np.argsort(similarities, axis=0).ravel()[::-1]

In [27]:
# Map the ranked indices back to the original (uncleaned) document strings.
ranked_documents = [documents[doc_idx] for doc_idx in ranked_indices]

In [29]:

# Output the ranked documents, best match first; i+1 turns the 0-based position into a 1-based rank.
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


In [30]:
# Reference for dense embedding models: https://huggingface.co/sentence-transformers

In [31]:

# Hand-written 5-dimensional dense vectors, one row per document (three documents).
# Presumably illustrative values rather than real model output — confirm.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [32]:

# Sample search query (represented as a dense vector), shaped (1, 5) to match the document rows.
# NOTE: this rebinds `query_embedding`, which previously held the TF-IDF query vector.
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [33]:

# Calculate cosine similarity between query and documents (one score per document row).
# NOTE: this rebinds `similarities`, which previously held the TF-IDF scores.
similarities = cosine_similarity(document_embeddings, query_embedding)

In [34]:
# Display the dense-vector similarity scores.
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [35]:
# Row indices of the dense embeddings ordered from most to least similar to the query.
ranked_indices = np.argsort(similarities, axis=0).ravel()[::-1]

In [36]:

# Output the ranked documents, best match first.
# Both the rank (i+1) and the document label (idx+1) are shifted from 0-based to 1-based.
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [44]:
#!pip install pypdf
#!pip install langchain_community

In [45]:
# Location of the PDF to index.
# NOTE(review): hard-coded absolute path under /content, so the file must already
# exist there (e.g. uploaded to the Colab runtime) before the loader cells run.
doc_path="/content/Retrieval-Augmented-Generation-for-NLP.pdf"

In [46]:
from langchain_community.document_loaders import PyPDFLoader

In [47]:
# PDF loader bound to the file at doc_path.
loader=PyPDFLoader(doc_path)

In [49]:
# Read the PDF into Document objects; the retrieved results carry 'source' and 'page' metadata.
docs=loader.load()

In [51]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [52]:
# Splitter configured for small chunks (chunk_size=200) that overlap by 30 (chunk_overlap=30).
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)

In [53]:
# Split the loaded PDF documents into chunks; these are what both retrievers index.
chunks = splitter.split_documents(docs)

In [54]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [55]:
import os
import warnings

# Hugging Face Inference API token, read from the HF_TOKEN environment variable so that
# no credential is stored in the notebook. Set it before running this cell, for example:
#     import getpass; os.environ["HF_TOKEN"] = getpass.getpass("HF token: ")
# SECURITY: the token that used to be hard-coded in this cell has been published with
# the notebook and must be treated as compromised; revoke it in the Hugging Face settings.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if not HF_TOKEN:
    # Warn rather than raise so the cell itself still runs; requests made with an
    # empty token are likely to be rejected by the API.
    warnings.warn("HF_TOKEN is not set; Hugging Face Inference API calls are likely to fail.")

In [56]:
# Embedding client for the BAAI/bge-base-en-v1.5 model via the Hugging Face Inference API,
# authenticated with HF_TOKEN.
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [58]:
#!pip install chromadb

In [59]:
from langchain.vectorstores import Chroma

In [60]:
# Embed every chunk and index it in a Chroma vector store.
# NOTE(review): the recorded run failed here with `KeyError: 0`. Presumably the inference
# API returned an error payload instead of a list of vectors (e.g. rejected token or
# model not ready) — confirm by calling embeddings.embed_documents on a single string.
vectorstore=Chroma.from_documents(chunks,embeddings)

KeyError: 0

In [61]:
# Dense retriever over the vector store, asking for k=3 results per query.
# NOTE: in the recorded run this raised NameError because `vectorstore` was never
# created (the cell that builds it failed).
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

NameError: name 'vectorstore' is not defined

In [63]:
#!pip install rank_bm25

In [64]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [66]:
# Sparse keyword retriever (BM25) built directly from the chunks; no embeddings are involved.
keyword_retriever = BM25Retriever.from_documents(chunks)

In [67]:
# Return 3 documents per query, the same k requested from the vector-store retriever.
keyword_retriever.k =  3

In [72]:
# Fetch the top-k BM25 matches for the query.
# `invoke` is the current retriever entry point; it replaces the deprecated
# `get_relevant_documents`, which triggers a LangChain deprecation warning.
keyword_retriever.invoke(query)

  keyword_retriever.get_relevant_documents(query)


[Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 11}, page_content='anthology/2020.acl-main.228 .\n[23] Jeff Johnson, Matthijs Douze, and Hervé Jégou. Billion-scale similarity search with gpus. arXiv'),
 Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 12}, page_content='id=Hyg0vbWC- .\n[37] Yury A. Malkov and D. A. Yashunin. Efﬁcient and robust approximate nearest neighbor search'),
 Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 3}, page_content='token likelihood, hence we cannot solve it with a single beam search. Instead, we run beam search for')]

In [69]:
# Ensemble built from the BM25 retriever only; with no weights given, the printed
# fields below show weights == [1.0].
# NOTE(review): with a single member this is not yet a hybrid search. The dense
# `vectorstore_retreiver` is presumably meant to be added to `retrievers` once the
# vector store can be built — confirm.
ensemble_retriever = EnsembleRetriever(retrievers=[keyword_retriever])

In [71]:
# Iterating the retriever yields (field name, value) pairs; print each one to inspect its configuration.
for field in ensemble_retriever:
    print(field)

('name', None)
('tags', None)
('metadata', None)
('retrievers', [BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7ea94c659600>, k=3)])
('weights', [1.0])
('c', 60)
('id_key', None)
