# Open-Source RAG with LLaMa 13B (4 bits for less GPU memory), Faiss, HuggingFace and Langchain or with OpenAI

In this PoC we'll build an open-source RAG solution using **Llama-13b-chat** with HuggingFace embeddings and Faiss (vector DB), all orchestrated by LangChain. Alternatively, the solution can be parametrized to use OpenAI.

In terms of the structure of the solution, the main UI lives in
 `RAG_QAw_Parametrization.ipynb`, which imports all the parametrization (which model, temperature, chain...) from `Parametrization.ipynb` and the core RAG functions from `RAGQA.ipynb`. `RAGQA.ipynb` also imports `Parametrization.ipynb`.   

**Retrieval Augmented Generation (RAG)** is an advanced Natural Language Processing (NLP) technique that combines both retrieval and generation elements to enhance AI language models' capabilities.

You must first request access to Llama 2 models via [this form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) (access is typically granted within a few hours).

---

🚨 I suggest running this in Google Colab by going to **Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4**. This should be included within the free tier of Colab.

---

We start by doing a `pip install` of all required libraries.

In [None]:
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0 \
  pypdf \
  faiss-cpu \
  Docx2txt \
  gradio \
  openai \
  import-ipynb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.9 MB/s[0m 

# Import modules

## Importing other ipynb (Colab) notebooks

*Texto em itálico*


In [None]:
#drive.mount("/content/drive", force_remount=True)
#/content/drive/MyDrive/Colab Notebooks

Mounted at /content/drive


In [None]:
#authorize Colab to access your Google Drive account, only necessary when running this module, not when importing
#from google.colab import drive
#drive.mount('/content/drive')

#Google Drive will be mounted at /content/drive

#directory where your IPYNB file is located
#%cd /content/drive/MyDrive/Colab Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import (
     LLMChain, ConversationalRetrievalChain
)

from langchain import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.memory import ConversationBufferWindowMemory
from langchain.vectorstores import FAISS

import gradio as gr

#%run Parametrization.ipynb
import import_ipynb

import Parametrization as pr

importing Jupyter notebook from Parametrization.ipynb


# Global Variables

In [None]:
#Global Variables with access functions
# Vector store holding the indexed knowledge base; stays None until
# vectorstore_write() builds it.
vectorstore = None
# Flag set through KnowledgeUploaded_write(); starts out False (nothing uploaded yet).
KnowledgeUploaded = False

## Access functions to global variables for solution parametrization


In [None]:
#Access functions to global variables
def vectorstore_read():
    """Return the module-level vector store (``None`` until one has been built)."""
    current_store = vectorstore
    return current_store

def vectorstore_write(docs, embeddings):
    """Build a FAISS index from ``docs`` and publish it as the module-level store.

    Args:
        docs: Documents handed to ``FAISS.from_documents``.
        embeddings: Embedding model handed to ``FAISS.from_documents``.

    Returns:
        The newly built FAISS vector store (also stored in ``vectorstore``).
    """
    global vectorstore

    built_index = FAISS.from_documents(docs, embeddings)
    vectorstore = built_index
    return built_index

def vectorstore_docs(query):
    """Run a similarity search for ``query`` against the current vector store.

    Args:
        query: Text passed to the store's ``similarity_search``.

    Returns:
        Whatever ``similarity_search`` returns for ``query``.
    """
    return vectorstore_read().similarity_search(query)


def KnowledgeUploaded_read():
    """Return the current value of the module-level ``KnowledgeUploaded`` flag."""
    flag = KnowledgeUploaded
    return flag

def KnowledgeUploaded_write(new_value):
    """Set the module-level ``KnowledgeUploaded`` flag and return the stored value.

    Args:
        new_value: Value to store in ``KnowledgeUploaded``.

    Returns:
        The value that was just stored.
    """
    global KnowledgeUploaded

    KnowledgeUploaded = stored = new_value
    return stored

## Create the knowledge base

Create a knowledge base from documents (pdf, docx and txt types) located in the same directory, holding the specific knowledge of the virtual assistant (Question/Answer and Chat).


In [None]:
#Create a knowledge base with documents (type pdf; docx and txt)
def process_knowledge (pathFiles):
    """Load the uploaded files, chunk them and index them in the vector store.

    Args:
        pathFiles: Iterable of uploaded-file objects exposing the file path as
            ``.name`` (as delivered by the Gradio file component), or ``None``
            / empty when nothing was selected.

    Returns:
        A status message: ``"docs uploaded in Vector DB"`` on success, or an
        explanatory message when there was nothing to index.
    """
    # Nothing selected in the UI: report it instead of failing on iteration.
    if not pathFiles:
        return "No files uploaded"

    # Suffixes (lower-case) mapped to the loader class that handles them.
    # NOTE(review): ".doc" is routed to Docx2txtLoader exactly as before;
    # confirm that loader can actually parse legacy .doc files.
    loaders_by_suffix = (
        ((".pdf",), PyPDFLoader),
        ((".docx", ".doc"), Docx2txtLoader),
        ((".txt",), TextLoader),
    )

    documents = []
    for file in pathFiles:
        file2Treat = file.name  # obtain filename path (from Gradio)
        lowered = file2Treat.lower()  # match extensions case-insensitively
        for suffixes, loader_cls in loaders_by_suffix:
            if lowered.endswith(suffixes):
                documents.extend(loader_cls(file2Treat).load())
                break
        # Files with any other extension are skipped.

    if not documents:
        return "No supported documents found (pdf, docx, txt)"

    # Split the documents into smaller chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)
    if not docs:
        # Building an index from zero chunks would fail; keep the previous state.
        return "No supported documents found (pdf, docx, txt)"

    # FAISS index, so that ConversationalRetrievalChain can use it as retriever.
    vectorstore_write(docs, pr.embeddings_read())
    KnowledgeUploaded_write(True)
    return "docs uploaded in Vector DB"

## Prompt Bullets

Returns a prompt for a customer support chatbot that answers questions using
information extracted from our knowledge base, with the answer formatted in bullets or not.

In [None]:
def promt_bullets (bullets):
    """Return the prompt template for the customer-support assistant.

    The template contains the ``{context}`` and ``{human_input}`` placeholders.

    Args:
        bullets: When truthy, the template asks for the final answer in
            bullets; otherwise it asks for a plain final answer.

    Returns:
        The prompt template string.
    """
    # Honour the caller's argument rather than re-reading the global setting.
    if bullets:
        return """You are a exceptional customer support virtual assistant having a conversation with a human.

        Given the following context information and a question, create a final answer in bullets.

        {context}

        Human: {human_input}
        Virtual Assistant:"""

    return """You are a exceptional customer support virtual assistant having a conversation with a human.

        Given the following context information and a question, create a final answer.

        {context}

        Human: {human_input}
        Virtual Assistant:"""

## Question Answer (Q/A)

Question answering (Q/A) over the knowledge base: takes the query (question) as input and returns the answer, using the RAG technique.

In [None]:
def question_answer(query):
    """Answer ``query`` from the uploaded knowledge base.

    The chain used depends on ``pr.qachain2work_read()``:

    * ``pr.LOADQA``: ``load_qa_chain`` ("stuff") with the custom prompt from
      ``promt_bullets`` and the documents returned by ``vectorstore_docs``.
    * ``pr.RETRIEVALQA``: ``RetrievalQA`` ("stuff") using the vector store as
      retriever, with the chain's default prompt.

    Args:
        query: The user's question.

    Returns:
        The answer text, ``"Upload knowledge please"`` when no vector store
        exists yet, or a message naming the chain type when it is not one of
        the two supported values.
    """
    vectorst = vectorstore_read()
    if vectorst is None:
        # No knowledge base has been indexed yet.
        print("Upload knowledge please")
        return "Upload knowledge please"

    chain_kind = pr.qachain2work_read()

    if chain_kind == pr.LOADQA:
        # Only this path uses the custom prompt and the pre-fetched documents,
        # so both are prepared here rather than unconditionally.
        template = promt_bullets(pr.bulletprompt2work_read())
        prompt = PromptTemplate(
            input_variables=["human_input", "context"], template=template
        )
        docs = vectorstore_docs(query)
        chain = load_qa_chain(
            pr.llm_read(), chain_type="stuff", prompt=prompt,  # temperature inside llm
            verbose=pr.verbose2work_read()
        )  # no memory
        result = chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)
        return result["output_text"]

    if chain_kind == pr.RETRIEVALQA:
        from langchain.chains import RetrievalQA

        # RetrievalQA only takes the query as input; using a custom prompt
        # would require going lower-level (e.g. load_qa_chain) or changing
        # the chain's parameters, so the default prompt is used here.
        rag_pipeline = RetrievalQA.from_chain_type(
            llm=pr.llm_read(),
            chain_type='stuff',
            retriever=vectorst.as_retriever(),
            return_source_documents=True,
            verbose=pr.verbose2work_read()
        )
        result = rag_pipeline(query)
        return result["result"]

    # Knowledge exists but the configured chain is not one handled above:
    # say so instead of wrongly asking the user to upload knowledge.
    message = f"Unsupported QA chain type: {chain_kind!r}"
    print(message)
    return message

# User Interface (UI) for knowledge, QA and Chat

Creates the UI for uploading the specific knowledge base, for Q/A, and for the chat function with memory.


In [None]:
# Close any Gradio interfaces that are still running before building a new one.
gr.close_all()

# Custom stylesheet passed to gr.Blocks(css=CSS): the container fills the
# viewport height and the element with id "chatbot" grows and scrolls.
CSS ="""
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 100vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto;}
"""

# Three-tab Gradio app: knowledge upload, single-shot Q/A, and chat.
with gr.Blocks(css=CSS) as app_vassistant_qa:
    gr.Markdown("# Virtual Assistant, with specific knowledge,  for Question Answering ")

    # Tab 1: upload the documents that form the knowledge base.
    with gr.Tab(label = "Upload specific Knowledge Base"):
        DirUploaded = gr.File(label="Upload files of Knowledge", type="file", file_count="multiple") # file_types = ["text"];  file_count = "directory"
        btnProcessKnowledge = gr.Button("Upload Knowledge Base")  # Submit button side by side!
        msgUpload = gr.Textbox(label="Knowledge Base Uploaded?")

    # Tab 2: one question in, one answer out.
    with gr.Tab(label = "Question Answering"):
        with gr.Row():  # only visible when btn.click
            with gr.Column(scale=2):
                question = gr.Textbox(label="Your Question")  # Question
                btnProcessQuestion = gr.Button("Submit Question")
            with gr.Column(scale=4, min_width=50):
                output = gr.Textbox(label="Answer", lines=12)

    # Tab 3: chat over the knowledge base; the chat window holds the history.
    with gr.Tab(label="Virtual Assistant with specific knowledge, in chat format"):
        chatbot = gr.Chatbot(elem_id="chatbot")
        msg = gr.Textbox(label="Human message")
        msgNeedSpecificKnowledge = gr.Textbox(label="Need to Upload Specific Knowledge?")
        clear = gr.ClearButton([msg, chatbot, msgNeedSpecificKnowledge])

        # Q/A in chat format with memory
        def respond(query, chat_history_G):
            """Answer one chat turn using the knowledge base and prior turns.

            Args:
                query: The human message just submitted.
                chat_history_G: The chatbot's history as (human, ai) pairs.

            Returns:
                A tuple ``(new_textbox_value, updated_history, status)`` where
                the textbox value is always ``""`` and ``status`` asks for a
                knowledge upload when none has happened yet.
            """
            from langchain.schema import AIMessage, HumanMessage

            # Guard against a missing history so iteration/append below is safe.
            chat_history_G = chat_history_G or []

            if not KnowledgeUploaded_read():
                return "", chat_history_G, "Need to Upload Specific Knowledge Base"

            # The chain is re-created on every turn because the parametrization
            # and the knowledge base may have changed since the previous turn.
            # Memory is not attached to the chain: history is passed explicitly.
            # The same chain is built whatever the configured QA chain type is.
            qachain = ConversationalRetrievalChain.from_llm(
                llm=pr.llm_read(),  # has got temperature
                retriever=vectorstore_read().as_retriever(),
                verbose=pr.verbose2work_read(),
            )

            # Convert the (human, ai) pairs into LangChain message objects.
            history_langchain_format = []
            for human, ai in chat_history_G:
                history_langchain_format.append(HumanMessage(content=human))
                history_langchain_format.append(AIMessage(content=ai))

            # No input_documents needed: the chain retrieves from the vector store.
            result = qachain({"question": query,
                              "chat_history": history_langchain_format})

            chat_history_G.append((query, result["answer"]))
            return "", chat_history_G, ""

        msg.submit(respond, [msg, chatbot], [msg, chatbot, msgNeedSpecificKnowledge])

    # Wire the buttons of the first two tabs to their handlers.
    btnProcessKnowledge.click(fn=process_knowledge, inputs=DirUploaded, outputs=msgUpload)  # output: gradio components nop
    btnProcessQuestion.click(fn=question_answer, inputs=question, outputs=output)