[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/martin-ku-hku/ai-sharing/blob/main/beyond-chatgpt-how-poor-people-recreate-chatgpt/falcon_rag_with_pdf.ipynb)

Make sure that you use a GPU runtime in the Colab environment!

# RAG with PDF

Experiment: Load a PDF document into a local Chroma vector DB, then perform RAG with the vector DB and an LLM (with 4-bit quantization).

## Installing libraries

In [None]:
!pip install -qU transformers sentence-transformers datasets accelerate einops langchain xformers bitsandbytes gradio pypdf tiktoken chromadb lark gradio

## Loading a PDF

In [None]:
!wget -O testing.pdf https://d2tic4wvo1iusb.cloudfront.net/production/documents/guidance/Cognitive_science_approaches_in_the_classroom_-_A_review_of_the_evidence.pdf?v=1691716109

In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
loader = PyPDFLoader('testing.pdf')
pages = loader.load()
len(pages)

In [None]:
page = pages[10]
print(page.metadata)

## Splitting document

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
chunk_size = 1500
chunk_overlap = 150

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)

In [None]:
texts = text_splitter.split_documents(pages)
print(len(texts))

In [None]:
for text in texts:
    print(text.page_content)
    break

## Initialize the HuggingFace Embedding Pipeline

In [None]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Pick the current CUDA device when one is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

In [None]:
# Sentence-transformers checkpoint used to embed both the chunks and the queries.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# The same device is passed for loading the model and for encoding;
# texts are encoded in batches of 32.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

## Vector store

In [None]:
from langchain.vectorstores import Chroma

In [None]:
persist_directory = '../vdb/chroma'

In [None]:
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embed_model,
    persist_directory=persist_directory
)

In [None]:
print(vectordb._collection.count())

In [None]:
vectordb.persist()

## Similarity search

In [None]:
question = "What is spaced practice in the classroom?"

In [None]:
retrieved = vectordb.similarity_search(question, k=3)

In [None]:
for doc in retrieved:
  print(doc.metadata)

In [None]:
for doc in retrieved:
  print(doc.page_content)

## MMR search

In [None]:
mmr_retrieved = vectordb.max_marginal_relevance_search(question, k=3, fetch_k=5)

In [None]:
for doc in mmr_retrieved:
  print(doc.metadata)

In [None]:
for doc in mmr_retrieved:
  print(doc.page_content)

## HuggingFace Pipeline

In [None]:
from torch import cuda, bfloat16
import transformers

# Hugging Face Hub id of the causal language model to load.
model_id = 'bigscience/bloom-7b1'

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
)


In [None]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True
)

model.eval()

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

In [None]:
# Text-generation pipeline around the quantized model.
# Decoding is deterministic (greedy). This is requested explicitly with
# do_sample=False rather than temperature=0.0, which is not a valid sampling
# temperature and is ignored (or warned about) when sampling is off.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects full text
    do_sample=False,
    max_new_tokens=128,
    repetition_penalty=1.1,
)

## Load the pipeline in LangChain

In [None]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(
    pipeline=generate_text,
)

## Question answering with the document

In [None]:
from langchain.chains import RetrievalQA

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(search_type='mmr')
)

In [None]:
question = "What are the practices that teachers might use to manage cognitive load?"

In [None]:
result = qa_chain({'query': question})

In [None]:
result['result']

In [None]:
# no RAG, use the LLM only
llm(question)

## With extra prompt

In [None]:
from langchain.prompts import PromptTemplate

In [None]:
# Prompt text for the QA chain. `{context}` and `{question}` are placeholders
# filled in later; the instructions, context, question, and answer cue each
# sit on their own line.
template = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say you don't know, "
    "don't try to make up an answer. "
    "Keep the answer as concise as possible. "
    'Always say "thanks for asking" at the end of the answer.\n'
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(search_type="mmr"),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [None]:
question = "What are the practices that teachers might use to manage cognitive load?"

In [None]:
result = qa_chain({'query': question})

In [None]:
result['result']

In [None]:
# no RAG, use LLM only
llm(question)