In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline

import os 
from dotenv import load_dotenv
load_dotenv(override=True)

os.environ["HUGGINGFACE_API_KEY"] = os.getenv("HUGGINGFACE_API_KEY")
hf_token = os.getenv("HUGGINGFACE_API_KEY")

In [3]:
# Load & Split Document
loader = PyPDFLoader("D:/AI_ML_DL/papers/attention-is-all-you-need-Paper.pdf")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = loader.load_and_split(text_splitter)

In [None]:
# Initialize Embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create Vector Store
vector_db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./chroma_db_simple_rag"
)

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  return forward_call(*args, **kwargs)


In [None]:
# Initialize LLM (LLaMA-2 7B)
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import torch
import accelerate

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    device_map="auto", 
    torch_dtype=torch.float16,
    use_auth_token=hf_token
)

# model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# tokenizer = AutoTokenizer.from_pretrained(model_id,
#                                           use_auth_token = hf_token)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map="auto",
#     torch_dtype=torch.float16,
#     use_auth_token = hf_token
# )

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    temperature = 0.7,
    do_sample=True,
    top_p=0.95,
    repetition_penalty=1.1
)
llm = HuggingFacePipeline(pipeline=pipe)

# Build RAG Chain with Query Expansion
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_db.as_retriever(
        search_type="mmr", 
        search_kwargs={"k": 4}),
    chain_type="stuff",
    return_source_documents=True
)




tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

In [None]:
# Advanced Query Handling
def enhanced_rag(question):
    # Step 1: Query Rewriting
    rewrite_prompt = f"""
    Given the original query: {question}
    Generate 2 improved versions considering potential ambiguities.
    Output format: ["query1", "query2"]
    """
    rewritten = llm(rewrite_prompt)
    
    # Step 2: Multi-Query Retrieval
    all_results = []
    for q in eval(rewritten):
        all_results.extend(vector_db.similarity_search(q, k=2))
    
    # Step 3: Rerank with Cross-Encoder
    from sentence_transformers import CrossEncoder
    reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    pairs = [(question, doc.page_content) for doc in all_results]
    scores = reranker.predict(pairs)
    ranked_docs = [doc for _, doc in sorted(zip(scores, all_results), reverse=True)][:5]
    
    # Step 4: LLM Synthesis
    context = "\n\n".join(d.page_content for d in ranked_docs)
    response = llm(f"Context: {context}\n\nQuestion: {question}\nAnswer:")
    return response, ranked_docs

# Usage    
answer, sources = enhanced_rag("explain attention mechanism")

  rewritten = llm(rewrite_prompt)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


IndentationError: unexpected indent (<string>, line 2)

In [None]:
answer