In [None]:
from typing import List

# 1. Document loader

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# arXiv PDF URLs of the papers to load; the trailing comment names each paper.
pdf_files: List[str]  = [
    "https://arxiv.org/pdf/2312.16862.pdf", # TinyGPT-V
    "https://arxiv.org/pdf/2308.10792.pdf", # Instruction Tuning
    "https://arxiv.org/pdf/2401.10020.pdf", # Self-Reward
    "https://arxiv.org/pdf/2201.11903.pdf", # Chain-of-Thought Prompting
    "https://arxiv.org/pdf/2401.18059v1.pdf", # RAPTOR
    "https://arxiv.org/pdf/2307.09288.pdf", # LLama 2
    "https://arxiv.org/pdf/2302.13971.pdf", # LLama 1
]

In [None]:
def remove_non_utf8_characters(text: str) -> str:
    """Return ``text`` without the characters that cannot be encoded as UTF-8.

    For a Python ``str`` the characters rejected by the UTF-8 codec are lone
    surrogate code points; every other character is kept unchanged and in its
    original order.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only the UTF-8-encodable characters of ``text``.
    """
    # One encode/decode round trip lets the codec drop the offending characters,
    # instead of testing each character and growing a string one piece at a time.
    return text.encode("utf-8", errors="ignore").decode("utf-8")

In [None]:
# Load every PDF listed in pdf_files and gather the resulting documents in one flat list.
doc_loaded = []

for pdf_file in pdf_files:
    # NOTE(review): extract_images=True presumably makes the loader also process
    # images embedded in the PDF — confirm against the loader documentation.
    docs = PyPDFLoader(pdf_file, extract_images=True).load()
    for doc in docs:
        # Overwrite the text in place with a copy stripped of characters that
        # cannot be encoded as UTF-8.
        doc.page_content = remove_non_utf8_characters(doc.page_content)
    doc_loaded.extend(docs)

In [None]:
print(f"Total number of page loaded: {len(doc_loaded)}")

# 2. Document split

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Splitter settings: chunk size 500 with no overlap between consecutive chunks.
# NOTE(review): chunk_size is presumably measured in characters (the splitter's
# default length function) — confirm.
chunk_size = 500
chunk_overlap = 0
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
# Break the loaded documents into chunks with the configured splitter.
doc_splitted = splitter.split_documents(doc_loaded)

In [None]:
print(f"Total number of chunks: {len(doc_splitted)}")

# 3. Vector database

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# No arguments are passed, so the embedding class uses whatever model it defaults to.
# NOTE(review): consider passing an explicit model name so reruns embed with the same model.
embedding_model = HuggingFaceEmbeddings()

In [None]:
from langchain_chroma import Chroma
# Build a Chroma vector store from the chunks using the embedding model above
# (no persist directory argument is given here).
chroma_db = Chroma.from_documents(doc_splitted, embedding=embedding_model)

# 4. Retriever

In [None]:
# Expose the vector store as a retriever: similarity search with k set to 10.
retriever = chroma_db.as_retriever(
    search_type="similarity", 
    search_kwargs={
        "k": 10
    }
)

# 5. LLM 

In [None]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    low_cpu_mem_usage=True
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
max_new_token = 1024

# Sampling settings for generation.
gen_kwargs = {
    "temperature": 0.9
}

# Generation options are given to the transformers pipeline itself, the same way
# max_new_tokens already is, so they are used on every call. Temperature only
# matters when sampling is on, hence do_sample=True.
# NOTE(review): the output is now sampled rather than greedy; seed torch if the
# answers must be reproducible.
model_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=max_new_token,
    # The tokenizer's EOS token doubles as the padding token.
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    **gen_kwargs
)

# model_kwargs is kept so the wrapper is constructed exactly as before.
# NOTE(review): the LangChain wrapper does not appear to forward model_kwargs to
# generation when a ready-made pipeline is passed in, which is why the settings
# are also applied on the pipeline above — confirm against the installed version.
llm = HuggingFacePipeline(
    pipeline=model_pipeline,
    model_kwargs=gen_kwargs
)

# 5. RAG prompt

In [None]:
# NOTE(review): this install is unpinned and sits mid-notebook; consider pinning
# a version and moving it to a setup cell at the top.
%pip install langchainhub

In [25]:
from langchain import hub

# Fetch the "rlm/rag-prompt" prompt template through the LangChain hub client.
prompt = hub.pull("rlm/rag-prompt")

In [42]:
prompt.messages[0].prompt.input_variables

['context', 'question']

In [44]:
print(prompt.messages[0].prompt.template)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:


# 6. String output parser

In [64]:
from langchain_core.output_parsers import StrOutputParser

# Output parser instance used as the last step of the chain.
str_parser = StrOutputParser()

# 6. RAG Chain

In [65]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line.
        An empty iterable gives an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [56]:
from langchain_core.runnables import RunnablePassthrough

# Chain layout: the input feeds two branches — the retriever, whose results are
# flattened to text by format_docs, fills "context", and RunnablePassthrough
# fills "question". The resulting mapping is piped through the prompt, the LLM
# and finally the string output parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | str_parser
)

In [57]:
# Run the whole chain once on a sample question; `output` is post-processed below.
user_question = "What is Instruction Tuning?"

output = rag_chain.invoke(user_question)

# 7. Post-processing

In [66]:
import re

In [71]:
def extract_answer(
        text_response: str,
        pattern: str = r"Answer:\s*(.*)",
        flags: int = re.DOTALL,
    ) -> str:
    """Pull the answer text out of a raw model response.

    The first place where ``pattern`` matches is used, and the content of its
    first capture group is returned with surrounding whitespace stripped.

    Args:
        text_response: Full text returned by the chain.
        pattern: Regular expression with one capture group around the answer.
        flags: ``re`` flags applied to ``pattern``. The default ``re.DOTALL``
            lets ``.`` span line breaks, so an answer that runs over several
            lines is returned in full rather than cut at its first newline.
            Pass ``0`` for single-line matching.

    Returns:
        The extracted answer, or ``"Answer not found."`` if ``pattern`` does
        not match anywhere in ``text_response``.
    """
    match = re.search(pattern, text_response, flags)
    if match is None:
        return "Answer not found."
    return match.group(1).strip()

In [73]:
# Reduce the raw chain output to the answer text and display it as the cell result.
res = extract_answer(output)
res

"Instruction tuning is a methodology for fine-tuning large language models (LLMs) using instruction datasets. It involves constructing a dataset consisting of instructions and their corresponding outputs. The benefits of instruction tuning include bridging the gap between the next-word prediction objective of LLMs and the users' objective of instruction following, and allowing for more controllable and predictable model behavior. The process typically involves instruction dataset construction, instruction tuning, and evaluation of the instruction-tuned models."