<a href="https://colab.research.google.com/github/marvelcodeX/Q-A-over-Documents-using-Langchain/blob/main/Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [None]:
!pip install langchain langchain-community #for different kinds of docs
!pip install faiss-cpu #facebook AI similiarity search
!pip install pypdf python-docx
!pip install sentence-transformers #beacuse dataset is probably txt
!pip install transformers

# Upload a document (PDF, DOCX, or TXT)

In [None]:
from google.colab import files

# Opens the browser file picker; the result is a mapping keyed by uploaded file name.
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): stop with a clear message here
# instead of an IndexError, so later cells are not run without a document.
if not uploaded:
  raise ValueError("No file was uploaded. Re-run this cell and choose a PDF, DOCX, or TXT file.")

# Only the first uploaded file is used by the rest of the notebook.
file_path = next(iter(uploaded))
print("Uploaded file path:", file_path)

# Load the document with LangChain's loaders and split it into smaller chunks

In [None]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader

try:
  # Standalone splitter package used by current LangChain releases.
  from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
  # Fall back to the legacy import path on installs without that package.
  from langchain.text_splitter import RecursiveCharacterTextSplitter

# Match the extension case-insensitively so names such as "REPORT.PDF" or
# "Notes.DOCX" are not silently handed to the plain-text loader.
lowered_path = file_path.lower()
if lowered_path.endswith(".pdf"):
  loader = PyPDFLoader(file_path)
elif lowered_path.endswith(".docx"):
  loader = Docx2txtLoader(file_path)
else:
  # Anything else is treated as plain text.
  loader = TextLoader(file_path)

docs = loader.load()

# Split into 500-character chunks; consecutive chunks share 100 characters so
# text that straddles a boundary is still present in one piece somewhere.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
documents = splitter.split_documents(docs)

print(f"Total Chunks: {len(documents)}")

# Create embeddings and store in FAISS vector database

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# A document with no extractable text (e.g. a scanned PDF) yields zero chunks;
# report that here rather than failing inside the vector-store build.
if not documents:
  raise ValueError("The document produced no text chunks, so there is nothing to index.")

# all-MiniLM-L6-v2 is a general sentence-transformers model, so it is loaded through
# the generic HuggingFaceEmbeddings wrapper. The BGE-specific wrapper adds a BGE
# retrieval instruction to each query, which this model is not meant to receive.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed every chunk and build the FAISS index from the vectors.
vectorstore = FAISS.from_documents(documents, embeddings)

# Load FLAN-T5 model for text generation

In [None]:
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

# Hugging Face text2text-generation pipeline around FLAN-T5 Large,
# created with max_length=512.
flan_pipeline = pipeline(task="text2text-generation", model="google/flan-t5-large", max_length=512)

# Wrap the pipeline so that LangChain can use it as the `llm` of a chain.
llm = HuggingFacePipeline(pipeline=flan_pipeline)

# Build Retrieval-QA system

In [None]:
try:
  from langchain.chains import RetrievalQA
except ImportError:
  # Fallback for installs where the legacy chains are provided by the separate
  # langchain-classic package instead of `langchain.chains`.
  from langchain_classic.chains import RetrievalQA

# "stuff" chain: the 3 most similar chunks are retrieved from the vector store
# and placed together into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff"
)

query = "Give me a short summary of the document"
# `invoke` replaces the deprecated `run`; RetrievalQA takes its question under
# the "query" key and returns the generated answer under "result".
print(qa.invoke({"query": query})["result"])

# Interactive Q&A loop

In [None]:
# Keep answering questions until the user types "exit" (any case, surrounding
# whitespace ignored) or the input stream is closed / interrupted.
while True:
  try:
    q = input("Ask a question (or 'exit'): ").strip()
  except (EOFError, KeyboardInterrupt):
    # Leave the loop quietly instead of ending the cell with a traceback.
    break
  if q.lower() == "exit":
    break
  if not q:
    # Blank line: ask again rather than sending an empty query to the chain.
    continue
  # `invoke` replaces the deprecated `run`; the answer is under the "result" key.
  print("Answer: ", qa.invoke({"query": q})["result"])