# Book saga Q&A with Retrieval Augmented Generation

## Install and import dependencies

In [None]:
# Core stack: model runtime, LangChain, embeddings, FAISS, PDF parsing and the Gradio UI.
# NOTE(review): these packages are installed unpinned with -U, so a fresh run may pull
# releases that no longer match the import paths used below — consider pinning versions.
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu pypdf gradio
# Pinned versions of the quantization/fine-tuning helpers.
# NOTE(review): peft and trl are not imported in the visible cells — confirm they are needed.
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7
# Playwright setup commands.
# NOTE(review): playwright and html2text are not used in the visible cells — confirm they are needed.
!playwright install
!playwright install-deps

In [None]:
import os
from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  BitsAndBytesConfig,
  AutoConfig,
  pipeline
)
import torch
import gradio as gr

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

import nest_asyncio
nest_asyncio.apply()

## Set up LLM

Mistral-7B-Instruct-v0.2 is an instruction-tuned version of the Mistral 7B generative text model, a Large Language Model (LLM) with 7 billion parameters. The Mistral 7B architecture leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost.

In [None]:
# Hugging Face Hub id of the instruction-tuned Mistral 7B checkpoint used by the notebook
model_name='mistralai/Mistral-7B-Instruct-v0.2'

# Model configuration fetched from the Hub (kept available for inspection)
model_config = AutoConfig.from_pretrained(
    model_name,
)

# trust_remote_code is left at its default (disabled): enabling it allows Python code
# shipped in the Hub repository to run locally, which is not needed for an
# architecture that the transformers library supports natively.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

We will use a quantized version of Mistral 7B. Although some precision is lost, this approach lets us run the model using less memory.

In [None]:
# --- 4-bit quantization settings (bitsandbytes) ---
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for the 4-bit base model
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name into the matching torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Collect the options first, then build the quantization config from them
quantization_options = dict(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
# Load the pre-trained causal language model, quantized according to bnb_config
model_load_kwargs = {"quantization_config": bnb_config}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

In [None]:
# Define text generation pipeline (from transformers library)
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # Sampling must be switched on explicitly; without do_sample=True the
    # temperature below is ignored and generation falls back to greedy decoding.
    do_sample=True,
    temperature=0.2,          # low temperature keeps answers close to the most likely tokens
    repetition_penalty=1.1,   # mild penalty against repeated tokens
    # NOTE(review): the full text (prompt + completion) is returned here —
    # confirm the LangChain wrapper below strips the prompt as expected.
    return_full_text=True,
    max_new_tokens=1000,      # upper bound on the length of each generated answer
)

# Expose the transformers pipeline as a LangChain LLM so it can be used in chains
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

## Generate vector store index

In [None]:
# Define documents directory (change for your personal directory)
DOCUMENT_DIR = "/content/drive/MyDrive/Projects/RAG/Documents/Twilight-Saga/"

# Fail fast with a clear message when the folder is missing (for example when
# the drive holding it has not been mounted), instead of failing later while
# the vector index is being built.
if not os.path.isdir(DOCUMENT_DIR):
    raise FileNotFoundError(
        f"Document directory not found: {DOCUMENT_DIR!r}. "
        "Mount your storage or update DOCUMENT_DIR before running this cell."
    )

# Load pages from the books
loader = PyPDFDirectoryLoader(DOCUMENT_DIR)
docs = loader.load()
if not docs:
    raise ValueError(f"No PDF pages could be loaded from {DOCUMENT_DIR!r}.")

# Chunk text
# NOTE(review): CharacterTextSplitter splits on its separator, so resulting chunks
# may be longer than chunk_size — verify the actual chunk lengths.
text_splitter = CharacterTextSplitter(chunk_size=100,
                                      chunk_overlap=25)
chunked_documents = text_splitter.split_documents(docs)

We will use the BAAI/bge-base-en-v1.5 embedding model to generate embeddings for both the documents and the queries.

In [None]:
# Embedding model used when building the index
embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')

# Embed every chunk and store the resulting vectors in a FAISS index
db = FAISS.from_documents(chunked_documents, embedding_model)

# Retriever view over the index, consumed by the RAG chain
retriever = db.as_retriever()

## Generate RAG chain

In [None]:
# Raw prompt text. It is kept under its own name so that `prompt_template`
# always refers to the PromptTemplate object and never to a plain string.
# Fixes relative to the previous wording: the fourth book is "Breaking Dawn",
# and the idiom is "Bear in mind".
prompt_text = """
### [INST] Instruction: Answer the question about Twilight saga \
delimited by triple backticks. \
Base your answer solely on the context delimited by triple backticks. \
Bear in mind that the publishing order of the books was: Twilight, New Moon, \
Eclipse and Breaking Dawn. \
Answer directly to the question in a polite and narrative style.

### QUESTION:
 ```{question} ```


### CONTEXT:
 ```{context} ```

[/INST]"""


# Create prompt template with the two placeholders filled in by the RAG chain
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_text,
)


In [None]:
# Chain that fills the prompt template and sends it to the Mistral LLM
llm_chain = LLMChain(prompt=prompt_template, llm=mistral_llm)

# Inputs of the LLM chain: the retriever supplies "context", while the
# incoming query is passed through unchanged as "question"
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = rag_inputs | llm_chain


## RAG Interface

In [None]:
# Preprocess output
def print_source_docs(output):
  """Build a comma-separated citation string for the retrieved documents.

  Despite its name, nothing is printed; the string is returned.

  Args:
    output: Mapping with a 'context' entry holding an iterable of documents.
      Each document exposes a ``metadata`` dict with a 'source' (file path)
      and a 'page' entry.

  Returns:
    A string such as ``'Twilight (pg. 12), New Moon (pg. 3)'``, or an empty
    string when the context is empty. Identical citations are listed once,
    in first-seen order.

  Raises:
    KeyError: If 'context', 'source' or 'page' is missing.
  """
  citations = []
  for document in output['context']:
    # Drop the directory and only the final extension, so a dotted file name
    # such as 'New.Moon.pdf' keeps its full stem instead of being cut at the first dot.
    file_name = os.path.basename(document.metadata['source'])
    stem = os.path.splitext(file_name)[0]
    citations.append(f"{stem} (pg. {document.metadata['page']})")

  # dict.fromkeys removes repeated citations while preserving their order
  return ', '.join(dict.fromkeys(citations))

def rag(text):
  """Answer a question with the RAG chain and append the cited sources.

  Args:
    text: The user's question.

  Returns:
    The generated answer followed by ' Sources: ' and the citation string
    built by print_source_docs. For an empty or whitespace-only question a
    short hint is returned instead, without running retrieval or generation.
  """
  # Skip the retrieval + generation round trip when there is nothing to answer
  if text is None or not text.strip():
    return 'Please enter a question.'

  output = rag_chain.invoke(text)
  answer = output['text']
  sources = print_source_docs(output)

  return answer + ' Sources: ' + sources

In [None]:
# Build the textbox-in / textbox-out UI around rag(), then start the Gradio app
demo = gr.Interface(fn=rag, inputs='textbox', outputs='textbox')
demo.launch();