#### Retrievers Practice

In [None]:
# Point every Hugging Face cache location at a single directory.
# NOTE(review): these variables are presumably read when the HF libraries are
# imported, so this block should stay ahead of the `huggingface_hub` and
# `transformers` imports below — confirm before reordering.
import os
cache_dir = 'D:/Development/ML/Deep Learning/GenAI/.hf_cache'  # NOTE(review): absolute, machine-specific path
os.environ['HF_HOME'] = cache_dir
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = cache_dir
os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)  # create the cache directory if it does not exist yet

# HF login: load variables from a .env file, then authenticate with HF_TOKEN.
# os.getenv returns None when HF_TOKEN is unset, so login() then receives None.
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv()
login(os.getenv("HF_TOKEN"))

# Print the cache path reported by transformers as a check on the overrides above.
import transformers
print(transformers.file_utils.default_cache_path)

from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline

# Embedding and LLM Model Setup
# Embedding model handed to the vector store further down.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Text-generation pipeline; generation settings travel through pipeline_kwargs.
generation_settings = {"max_new_tokens": 500, "temperature": 0.2}
llm_model = HuggingFacePipeline.from_model_id(
    model_id="google/gemma-3-1b-it",
    task="text-generation",
    pipeline_kwargs=generation_settings,
)

In [None]:
import hashlib

from langchain_community.retrievers import WikipediaRetriever
from langchain_community.vectorstores import Chroma
from langchain_classic.schema import Document
from langchain_classic.retrievers.multi_query import MultiQueryRetriever
from langchain_classic.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_classic.retrievers.document_compressors import LLMChainExtractor

Wikipedia Retriever

In [None]:
# WikipediaRetriever's configuration fields are `lang` and `top_k_results`.
# The previous `language=` / `top_k=` keywords do not match any field, so they
# were dropped without an error and the retriever fell back to its default of
# three results.
wiki_retriever = WikipediaRetriever(
    lang="en",
    top_k_results=2,
)

query = "What is the most used technology in AI?"
# Stored under its own name so it is not confused with the hand-written
# `docs` list that the vector store section defines.
wiki_docs = wiki_retriever.invoke(query)
for i, doc in enumerate(wiki_docs):
    print(f"Document {i+1}:\n{doc.page_content}\n")

Vector Store Retriever

In [None]:
# Documents
# Each entry is (passage text, topic label); the Document objects are built
# from this table in the same order.
passages = [
    (
        "Nervous System is a complex network of nerves and cells that carry messages to and from the brain and spinal cord to various parts of the body.",
        "Human Biology",
    ),
    (
        "Nervous System consists of Brain, Spinal Cord, and Nerves.",
        "Human Biology",
    ),
    (
        "Circulatory System is responsible for the flow of blood, nutrients, oxygen, hormones, and other gases to and from cells.",
        "Human Biology",
    ),
    (
        "Deep Neural Networks are a type of artificial neural network with multiple layers between the input and output layers.",
        "Artificial Intelligence",
    ),
    (
        "In RAG Application, Deep Neural Networks act as the Brain of the system, processing and understanding the information retrieved from various sources.",
        "Artificial Intelligence",
    ),
]

docs = [
    Document(page_content=text, metadata={"topic": topic})
    for text, topic in passages
]

In [None]:
# Using the ChromaDB from VectorStoresPractice.ipynb
vectorDB = Chroma(
    persist_directory="./files/chroma_db",
    collection_name='neural_networks',
    embedding_function=embedding_model
)

# The collection is persisted on disk. Adding documents without explicit ids
# stores a brand-new copy of every document each time this cell runs, and the
# k=2 retrievers below then return the same passage more than once.
# Ids derived from the text turn the insert into an upsert, so re-running the
# cell leaves the collection unchanged.
# Copies stored under auto-generated ids by earlier runs are not removed.
doc_ids = [
    hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
    for doc in docs
]
vectorDB.add_documents(docs, ids=doc_ids)

In [None]:
# Similarity-search retriever over the collection, limited to two results.
vdb_retriever = vectorDB.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

In [None]:
# Run one question through the similarity retriever and list what came back.
query = "What is Nervous System?"
res1 = vdb_retriever.invoke(query)

similarity_report = (
    f"Document {i+1}:\n{doc.page_content}\n"
    for i, doc in enumerate(res1)
)
for entry in similarity_report:
    print(entry)

MMR Retriever (Maximal Marginal Relevance)

In [None]:
# MMR retriever: same collection, search_type "mmr" with k=2 and fetch_k=4.
mmr_settings = {"k": 2, "fetch_k": 4}
mmr_retriever = vectorDB.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_settings,
)

# Same question as the similarity example, so the two outputs can be compared.
query2 = "What is Nervous System?"
res2 = mmr_retriever.invoke(query2)

mmr_report = (
    f"Document {i+1}:\n{doc.page_content}\n"
    for i, doc in enumerate(res2)
)
for entry in mmr_report:
    print(entry)

In [None]:
# A second question sent through the MMR retriever.
query3 = "What is Neural?"
res3 = mmr_retriever.invoke(query3)

neural_report = (
    f"Document {i+1}:\n{doc.page_content}\n"
    for i, doc in enumerate(res3)
)
for entry in neural_report:
    print(entry)

Multi Query Retriever

In [None]:
# Multi-query retriever built from the LLM on top of a k=2 similarity retriever.
multi_query_base = vectorDB.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=multi_query_base,
    llm=llm_model,
)

In [None]:
# Send one question through the plain similarity retriever and then through the
# multi-query retriever, printing both result sets with a separator between them.
query4 = "What is Nervous System and its components?"

res4_single = vdb_retriever.invoke(query4)
single_report = (
    f"Document {i+1}:\n{doc.page_content}\n"
    for i, doc in enumerate(res4_single)
)
for entry in single_report:
    print(entry)

print("**************************")

res4_multi = multi_query_retriever.invoke(query4)
multi_report = (
    f"Document {i+1}:\n{doc.page_content}\n"
    for i, doc in enumerate(res4_multi)
)
for entry in multi_report:
    print(entry)

Contextual Compression Retriever

In [None]:
# Contextual compression: a k=4 similarity retriever whose results are passed
# through an LLM-backed extractor.
compression_base = vectorDB.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)
llm_extractor = LLMChainExtractor.from_llm(llm_model)

contx_retriever = ContextualCompressionRetriever(
    base_retriever=compression_base,
    base_compressor=llm_extractor,
)

In [None]:
# Reuse query4 with the compression retriever and list the returned documents.
contx_res = contx_retriever.invoke(query4)

compressed_report = (
    f"Document {i+1}:\n{doc.page_content}\n"
    for i, doc in enumerate(contx_res)
)
for entry in compressed_report:
    print(entry)