In [2]:
# Install dependencies
!pip install faiss-cpu sentence-transformers transformers langchain streamlit accelerate -q

from transformers import pipeline
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
import torch, os, shutil

You should consider upgrading via the 'D:\LLM Projects\RAG\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [3]:


# 1. Load PDF and Split
def load_and_split_document(path, chunk_size=1000, chunk_overlap=200):
    """Load a PDF and split its pages into overlapping text chunks.

    Args:
        path: Location of the PDF; passed unchanged to ``PyPDFLoader``.
        chunk_size: Target maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The chunk documents produced by the splitter's ``split_documents``.
    """
    # Imported here so the function is self-contained and does not rely on
    # changes to the file-level import list.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    pages = PyPDFLoader(path).load()

    # CharacterTextSplitter cuts only on its single separator ("\n\n" by
    # default). Text extracted from PDF pages seldom contains blank lines, so
    # whole pages would be emitted as one oversized chunk. The recursive
    # splitter falls back to "\n", " " and finally single characters, which
    # keeps chunks within ``chunk_size``.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)

# 2. Create Vector Store
def create_vector_store(docs, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed ``docs`` and index them in a new FAISS vector store.

    Args:
        docs: Iterable of documents to index (for example the output of
            ``load_and_split_document``).
        model_name: Hugging Face identifier of the embedding model. Use the
            same model when the saved index is loaded again.

    Returns:
        The store built by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``docs`` contains no documents.
    """
    docs = list(docs)
    if not docs:
        # Checked before the embedding model is loaded: an empty input would
        # otherwise fail deep inside the FAISS constructor with an opaque
        # error (e.g. a PDF that yielded no extractable text).
        raise ValueError("Cannot create a vector store from an empty document list.")

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(docs, embeddings)

# 3. Save vector store
def save_vectorstore(vectorstore, folder="faiss_index"):
    """Persist ``vectorstore`` to ``folder``, replacing any previous contents.

    The store is first written to a sibling staging directory and only swapped
    into place once ``save_local`` has succeeded, so a failed save does not
    destroy an index that already exists at ``folder``.

    Args:
        vectorstore: Object exposing ``save_local(path)``.
        folder: Destination directory for the saved index.

    Raises:
        Whatever ``vectorstore.save_local`` raises; the previous contents of
        ``folder`` are left untouched in that case.
    """
    target = os.path.abspath(folder)
    staging = f"{target}.staging"

    # Clear leftovers from an earlier interrupted run.
    if os.path.exists(staging):
        shutil.rmtree(staging)

    try:
        vectorstore.save_local(staging)
    except BaseException:
        # Drop the partial output, keep the old index, and re-raise unchanged.
        shutil.rmtree(staging, ignore_errors=True)
        raise

    # The new index is complete on disk; now it is safe to discard the old one.
    if os.path.exists(target):
        shutil.rmtree(target)
    os.replace(staging, target)

# 4. Load vector store
def load_vectorstore(folder="faiss_index", model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Load a FAISS vector store previously written to ``folder``.

    Args:
        folder: Directory containing the saved index.
        model_name: Hugging Face identifier of the embedding model. It should
            match the model the index was built with, otherwise query vectors
            will not be comparable to the stored ones.

    Returns:
        The store returned by ``FAISS.load_local``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # SECURITY: allow_dangerous_deserialization=True permits unpickling of the
    # saved store. Pickle can execute arbitrary code, so only load index
    # folders this application wrote itself -- never one from an untrusted
    # source.
    return FAISS.load_local(folder, embeddings, allow_dangerous_deserialization=True)

# 5. Load HuggingFace LLM
def get_hf_pipeline():
    """Build the flan-t5-base text2text pipeline wrapped for LangChain.

    The pipeline is placed on CUDA device 0 when ``torch`` reports CUDA as
    available and on the CPU (device ``-1``) otherwise.

    Returns:
        A ``HuggingFacePipeline`` wrapping the transformers pipeline.
    """
    checkpoint = "google/flan-t5-base"

    # transformers convention: a non-negative index selects a GPU, -1 the CPU.
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    generator = pipeline(
        "text2text-generation",
        model=checkpoint,
        tokenizer=checkpoint,
        max_length=512,
        device=device_index,
    )
    return HuggingFacePipeline(pipeline=generator)

# 6. QA Chain
def build_qa_chain(vectorstore):
    """Create a "stuff" RetrievalQA chain over ``vectorstore``.

    Args:
        vectorstore: Store exposing ``as_retriever()``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` using the LLM from
        ``get_hf_pipeline`` and the store's default retriever.
    """
    # The model is created first, then the retriever, as in the original.
    language_model = get_hf_pipeline()
    doc_retriever = vectorstore.as_retriever()
    return RetrievalQA.from_chain_type(
        llm=language_model,
        retriever=doc_retriever,
        chain_type="stuff",
    )

# Example (Uncomment to Test)
# docs = load_and_split_document("/content/sample.pdf")
# vs = create_vector_store(docs)
# save_vectorstore(vs)
# qa = build_qa_chain(load_vectorstore())
# print(qa.run("What is this document about?"))
