<a href="https://colab.research.google.com/github/nemat-al/QA_Doc_Bot/blob/main/QA_doc_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Live bot on Hugging Face Spaces: https://huggingface.co/spaces/nemat-al/QA_bot

In [None]:
pip install langchain_community
pip install lang chain_community
pip install gradio
pip install chromadb
pip install pypdf

In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
import gradio as gr
from langchain.llms import HuggingFacePipeline
import chromadb.api
from langchain_community.embeddings import HuggingFaceEmbeddings

# Clear chromadb's shared system-client cache once, at cell/module load time.
# NOTE(review): presumably this avoids stale in-memory Chroma client state when
# the cell is re-run in the same kernel — confirm against the chromadb docs.
chromadb.api.client.SharedSystemClient.clear_system_cache()
_LLM_CACHE = None  # holds the wrapped pipeline after the first get_llm() call


def get_llm():
    """Return the LangChain LLM backed by ``google/flan-t5-base``.

    The Hugging Face ``text2text-generation`` pipeline is built on the first
    call and reused afterwards, so the model is not reloaded for every query.

    Returns:
        A ``HuggingFacePipeline`` wrapping the transformers pipeline.
    """
    global _LLM_CACHE
    if _LLM_CACHE is None:
        model_id = 'google/flan-t5-base'
        hf_pipeline = pipeline("text2text-generation", model=model_id, tokenizer=model_id)
        # Wrap the pipeline so LangChain chains can call it as an LLM.
        _LLM_CACHE = HuggingFacePipeline(pipeline=hf_pipeline)
    return _LLM_CACHE

_EMBEDDING_CACHE = None  # holds the embedding model after the first call


def hugging_face_embedding():
    """Return the sentence-embedding model used to index document chunks.

    The ``sentence-transformers/all-mpnet-base-v2`` model is instantiated on
    the first call and reused afterwards, so it is not reloaded per query.

    Returns:
        A ``HuggingFaceEmbeddings`` instance.
    """
    global _EMBEDDING_CACHE
    if _EMBEDDING_CACHE is None:
        model_name = "sentence-transformers/all-mpnet-base-v2"
        _EMBEDDING_CACHE = HuggingFaceEmbeddings(model_name=model_name)
    return _EMBEDDING_CACHE

def document_loader(file):
    """Load an uploaded PDF with ``PyPDFLoader``.

    Args:
        file: The uploaded file. May be a path string, an ``os.PathLike``,
            or an object that exposes the path through a ``name`` attribute
            (for example a temporary-file wrapper).

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns for the file.

    Raises:
        ValueError: If ``file`` is ``None``, i.e. nothing was uploaded.
    """
    import os

    if file is None:
        raise ValueError("No PDF file was provided.")

    # Path-like values are used directly; `pathlib.Path.name` is only the
    # basename, so `.name` must not be consulted for them.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = os.fspath(file)
    else:
        pdf_path = file.name

    loader = PyPDFLoader(pdf_path)
    loaded_document = loader.load()
    return loaded_document

def text_splitter(data):
    """Split loaded documents into chunks for embedding.

    Uses a ``RecursiveCharacterTextSplitter`` with 500-character chunks,
    a 50-character overlap, and ``len`` as the length measure.

    Args:
        data: Documents to split, passed to ``split_documents``.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 50,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

def vector_database(chunks):
    """Embed document chunks and index them in a fresh Chroma collection.

    Args:
        chunks: Document chunks to embed and store.

    Returns:
        The ``Chroma`` vector store built from ``chunks``.
    """
    import uuid

    embedding_model = hugging_face_embedding()
    # Use a unique collection per call. Without an explicit name every call
    # targets Chroma's default collection, so chunks indexed for earlier
    # uploads could be returned when querying a later document.
    collection_name = f"pdf-{uuid.uuid4().hex}"
    vectordb = Chroma.from_documents(
        chunks,
        embedding_model,
        collection_name=collection_name,
    )
    return vectordb

def retriever(file):
    """Build a retriever over the contents of an uploaded PDF.

    The file is loaded, split into chunks, indexed in a vector store, and
    the store is then exposed through ``as_retriever()``.

    Args:
        file: The uploaded PDF, as accepted by ``document_loader``.

    Returns:
        The retriever obtained from the vector store.
    """
    loaded_pages = document_loader(file)
    store = vector_database(text_splitter(loaded_pages))
    return store.as_retriever()

def retriever_qa(file, query):
    """Answer a question about an uploaded PDF.

    Builds a "stuff"-type ``RetrievalQA`` chain from the LLM and a retriever
    over the file, then runs the query through it.

    Args:
        file: The uploaded PDF, as accepted by ``retriever``.
        query: The user's question.

    Returns:
        The chain's response to ``query``.
    """
    chain_options = dict(
        llm=get_llm(),
        chain_type="stuff",
        retriever=retriever(file),
        return_source_documents=False,  # only the answer text is needed
    )
    qa_chain = RetrievalQA.from_chain_type(**chain_options)
    return qa_chain.run(query)

# UI components: a single-PDF upload (delivered as a file path), a two-line
# question box, and a text box for the answer.
pdf_upload = gr.File(label="Upload PDF File", file_count="single", file_types=['.pdf'], type="filepath")
question_box = gr.Textbox(label="Input Query", lines=2, placeholder="Type your question here...")
answer_box = gr.Textbox(label="Output")

# Wire the components to retriever_qa(file, query).
rag_application = gr.Interface(
    fn=retriever_qa,
    inputs=[pdf_upload, question_box],
    outputs=answer_box,
    title="RAG Chatbot",
    description="Upload a PDF document and ask any question. The chatbot will try to answer using the provided document.",
)

# Start the app with debug mode enabled.
rag_application.launch(debug=True)
