In [None]:
!pip install --upgrade --quiet PyPDF2 transformers datasets accelerate bitsandbytes sentence-transformers langchain langchain-huggingface langchain_community langchain_experimental faiss-cpu trl peft

In [None]:
import os
from typing import Any, Dict, List
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import datasets
import torch
import numpy as np
import random
import pandas as pd
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
import json
from glob import glob
import PyPDF2
from math import ceil
from tqdm import tqdm
import shutil
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
import time
from openai import OpenAI

General global variables:

In [None]:
# Hugging Face model id loaded by `initialize_llm` (generates the questions and CoT answers)
LLM_PATH = "meta-llama/Meta-Llama-3-8B-Instruct"
# sentence-transformers model id used by the embedder (semantic chunking and FAISS retrieval)
EMBEDDER_PATH = "sentence-transformers/all-MiniLM-L6-v2"

# NOTE: the name is misspelled ("SENTENTENCE") but other cells reference it under this exact spelling
SENTENTENCE_TRANSFORMER_BATCH_SIZE = 64 # TUNE THIS VARIABLE depending on the size of your embedding model and GPU mem available

In [None]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub (presumably needed because LLM_PATH is a gated model — confirm)
notebook_login() 

RAFT hyperparameters:

In [None]:
# NOTE: `get_chunks` computes the number of chunks as ceil(len(text) / CHUNK_SIZE), where `text` is a
# Python string, so this is an approximate chunk size in characters rather than tokens.
CHUNK_SIZE = 512 # approximate size of each chunk, in characters
# each data point holds the source chunk plus this many distractors; the RAG retriever uses k=4 to match
NUM_DISTRACT_DOCS = 3 # number of distractor docs per data point

# Initializing models

The first step is to create a simple function to load our models:

In [None]:
def initialize_llm():
    """
    Load the generator LLM (`LLM_PATH`) with 4-bit quantization and wrap it for LangChain.

    Returns:
        tuple: `(tokenizer, llm)` where `tokenizer` is the tokenizer loaded from `LLM_PATH`
        and `llm` is a `HuggingFacePipeline` around a text-generation pipeline.
    """
    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

    # generation stops on the regular EOS token or on the "<|eot_id|>" token
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    # load model with quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
    )

    model = AutoModelForCausalLM.from_pretrained(
        LLM_PATH,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # setup pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,

        ## model hyperparameters
        max_new_tokens=200,
        do_sample=True,
        temperature=0.001, # lower temp for more precise generation based on context
        top_p=0.9,
    )

    # connect with a langchain object
    llm = HuggingFacePipeline(pipeline=pipe)

    return tokenizer, llm

def initialize_embedder():
    """
    Build the LangChain embedding wrapper around the `EMBEDDER_PATH` model.

    The model is placed on CUDA, multi-process encoding is enabled, embeddings are
    normalized and encoded in batches of `SENTENTENCE_TRANSFORMER_BATCH_SIZE`.

    Returns:
        HuggingFaceEmbeddings: the configured embedder.
    """
    encode_options = {
        "normalize_embeddings": True,
        "batch_size": SENTENTENCE_TRANSFORMER_BATCH_SIZE,
    }

    return HuggingFaceEmbeddings(
        model_name=EMBEDDER_PATH,
        model_kwargs={"device": "cuda"},
        multi_process=True,
        encode_kwargs=encode_options,
    )

# RAFT implementation

We first retrieve our documents and break them down into semantic chunks of roughly `CHUNK_SIZE` characters each:

In [None]:
def get_chunks(data_paths, embedder, return_raw=True):
    """
    Extract the text of every PDF in `data_paths` and split it into semantic chunks.

    The number of chunks requested per PDF is ceil(len(text) / CHUNK_SIZE), i.e. the
    target chunk size is measured in characters of the extracted text.

    Args:
        data_paths: iterable of paths to PDF files.
        embedder: embeddings object handed to `SemanticChunker`.
        return_raw: if True, return the chunk texts as strings; otherwise return the
            `Document` objects produced by the splitter.

    Returns:
        list: the chunks of all PDFs, in the order of `data_paths`. PDFs from which no
        text could be extracted contribute no chunks.
    """
    chunks = [] # final list of all chunks from all papers

    for data_path in tqdm(data_paths):
        with open(data_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # a page may yield no text at all (e.g. image-only pages); treat that as ""
            text = "".join((page.extract_text() or "") for page in reader.pages)

        if not text.strip():
            # nothing extractable (e.g. a scanned PDF): skip rather than chunking empty text
            continue

        num_chunks = ceil(len(text) / CHUNK_SIZE)

        text_splitter = SemanticChunker(embedder, number_of_chunks=num_chunks)
        chunks_paper = text_splitter.create_documents([text])

        if return_raw:
            # get raw text instead of `Document` format
            chunks_paper = [chunk.page_content for chunk in chunks_paper]

        chunks.extend(chunks_paper)

    return chunks

Then we can work on creating the RAFT dataset. We'll first be defining the helper functions that will interact with the LLM:

In [None]:
def call_llm(llm: HuggingFacePipeline, tokenizer: AutoTokenizer, system_prompt: str, user_prompt: str) -> str:
    """
    Run one system + user exchange through `llm` and return only the generated completion.

    Args:
        llm: LangChain pipeline wrapper; `llm.invoke(prompt)` must return the generated text.
        tokenizer: tokenizer whose chat template is used to render the messages.
        system_prompt: content of the system message.
        user_prompt: content of the user message.

    Returns:
        str: the model output with the rendered prompt removed from its start, if present.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
    )

    raw_output = llm.invoke(prompt)

    # Only strip the prompt when the output really starts with it. Whether the wrapper
    # returns "prompt + completion" or just the completion depends on its version and
    # settings; slicing blindly would cut off the beginning of a bare completion.
    if raw_output.startswith(prompt):
        raw_output = raw_output[len(prompt):]
    return raw_output


def strip_str(s: str) -> str:
    """
    Trim an LLM-produced line down to the span that starts at its first letter.

    The result starts at the first alphabetic character and ends one character after
    the last alphabetic character (so a trailing '?' or '.' is kept). When the string
    contains fewer than two letters the right bound stays at the end of the string.
    """
    alpha_positions = [pos for pos, ch in enumerate(s) if ch.isalpha()]

    start = alpha_positions[0] if alpha_positions else 0
    # the right bound only moves once a second letter has been seen
    last = alpha_positions[-1] if len(alpha_positions) > 1 else len(s) - 1

    stop = min(last + 2, len(s))
    return s[start:stop]


def generate_questions(llm: HuggingFacePipeline, tokenizer: AutoTokenizer, chunk: Any, x: int = 5) -> list[str]:
    """
    Ask the LLM for `x` questions that can be answered from `chunk`.

    Used when the input document is of general types `pdf`, `json`, or `txt`.

    Returns:
        list[str]: one cleaned question per non-empty output line (lines without any
        letter are dropped). The number of questions is whatever the LLM produced.
    """
    # system prompt: task description followed by the output-format constraints
    instructions = [
        f"You are a synthetic question-answer pair generator. Given a chunk of context about some topic(s), generate {x} example questions a user could ask and would be answered using information from the chunk. For example, if the given context was a Wikipedia paragraph about the United States, an example question could be 'How many states are in the United States?'",
        f"The questions should be able to be answered in a few words or less. Do not start your answer with something like 'Here are {x} example questions that could be asked based on the given context:', just write the questions.",
    ]
    system_prompt = "\n".join(instructions)

    # the chunk itself is the user message
    queries_raw = call_llm(llm, tokenizer, system_prompt, str(chunk))

    # one candidate question per line, trimmed of numbering/bullets
    cleaned = (strip_str(line) for line in queries_raw.split('\n'))
    return [candidate for candidate in cleaned if any(ch.isalpha() for ch in candidate)]


def generate_cot(llm: HuggingFacePipeline, tokenizer: AutoTokenizer, question: str, context: Any) -> str | None:
    """
    Produce a chain-of-thought answer to `question` grounded in `context`.

    The prompt asks for step-by-step reasoning, quotes wrapped in
    ##begin_quote## / ##end_quote##, and a final line tagged with "<ANSWER>:".

    Returns:
        The raw completion returned by `call_llm`.
    """
    # prompt template; the placeholders are filled in below
    cot_template = """
        Question: {question}\nContext: {context}\n
        Answer this question using the information given in the context above. Here is things to pay attention to:
        - First provide step-by-step reasoning on how to answer the question.
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
        You MUST begin your final answer with the tag "<ANSWER>:".
    """

    return call_llm(
        llm,
        tokenizer,
        "You are a helpful question answerer who can provide an answer given a question and relevant context.",
        cot_template.format(question=question, context=str(context)),
    )

This allows us to write a function to process each chunk. For each chunk we'll create $\{Q, A, D\}$ triplets and add them to the dataset. Here, $Q$ is the question, $A$ is the answer and $D$ is the set of documents (inside of $D$ we have a set of documents from which the accurate answer can be deduced, called *oracle documents*, and also a set of documents that do not contain context relevant to the answer, called *distractor documents*):

In [None]:
def add_chunk_to_dataset(
    chunks,
    chunk,
    llm,
    tokenizer,
    x=5, # number of questions for each chunk
    num_distract=3, # number of distractor docs
    p=0.8 # probability of including oracle docs

):
    """
    Generate RAFT data points for `chunk` and append them to the global dataset `ds`.

    For every question generated from `chunk`, a data point is built with the oracle
    chunk plus up to `num_distract` distractor chunks. With probability 1 - `p` the
    oracle is swapped for another distractor. The CoT answer is always generated from
    the oracle chunk.

    Args:
        chunks: list of all chunks; distractors are drawn from it.
        chunk: the oracle chunk; must be an element of `chunks`.
        llm, tokenizer: passed through to `generate_questions` / `generate_cot`.
        x: number of questions requested for the chunk.
        num_distract: number of distractor documents per data point (capped at the
            number of other chunks available).
        p: probability of keeping the oracle document in the context.

    Side effects:
        Creates or extends the module-level `ds` dataset.
    """
    global ds

    # get index of current chunk
    i = chunks.index(chunk)

    # every other chunk is a distractor candidate; never ask for more than exist
    # (random.sample raises ValueError when the sample is larger than the population)
    candidates = [j for j in range(len(chunks)) if j != i]
    n_distract = min(num_distract, len(candidates))

    # generate questions
    questions = generate_questions(llm, tokenizer, chunk, x)

    for q in questions:
        # oracle first, then the distractor docs
        distractor_ids = random.sample(candidates, n_distract)
        docs = [chunk] + [chunks[j] for j in distractor_ids]

        # keep the oracle with probability `p`; otherwise replace it with a distractor,
        # preferring one that is not already part of this data point
        oracle = random.uniform(0, 1) < p
        if not oracle and candidates:
            unused = [j for j in candidates if j not in distractor_ids] or candidates
            docs[0] = chunks[random.choice(unused)]
        random.shuffle(docs)

        # model instruction: all documents followed by the question
        instruction = "".join("<DOCUMENT>" + str(doc) + "</DOCUMENT>\n" for doc in docs) + q

        datapoint = {
            # NOTE: the counter is relative to the current `ds`, so ids restart whenever `ds` is reset
            "id": f"seed_task_{0 if not ds else ds.num_rows}",
            "type": "general",
            "question": q,
            "context": {
                "title": [["placeholder_title"] * len(docs)],
                "sentences": [docs],
            },
            "oracle_context": chunk,
            # the answer is always derived from the oracle chunk
            "cot_answer": generate_cot(llm, tokenizer, q, chunk),
            "instruction": instruction,
        }

        # add to dataset
        if not ds:
            # init ds: `from_dict` expects one list of values per column
            ds = datasets.Dataset.from_dict({key: [value] for key, value in datapoint.items()})
        else:
            ds = ds.add_item(datapoint)

Then we can create a function that'll create the RAFT dataset for all of our data:

In [None]:
def append_extension(path: str, extension: str) -> str:
    """Return `path` ending in `.extension`, appending the suffix only when it is missing."""
    dotted = "." + extension
    return path if path.endswith(dotted) else path + dotted


def raft_dataset(chunks, llm, tokenizer, filename_checkpoint="checkpoint.txt", dir_out="raftdataset/"):
    """
    Build the RAFT dataset for all `chunks`, saving intermediate shards so a run can resume.

    Every `N` chunks the accumulated global `ds` is written to
    `dir_out + "-checkpoints-<i>"` and reset. The index of the chunk being processed is
    written to `filename_checkpoint`; on restart, processing resumes at the start of the
    block of `N` chunks that contains that index. At the end all shards are concatenated,
    saved to `dir_out + "-checkpoints-last"` and exported as JSON lines, and the
    checkpoint file and intermediate shards are removed.

    Args:
        chunks: list of all chunks.
        llm, tokenizer: passed through to `add_chunk_to_dataset`.
        filename_checkpoint: path of the text file holding the last processed index.
        dir_out: output path prefix (shard names are appended to it).

    Raises:
        ValueError: if no shard was produced, so there is nothing to concatenate.
    """
    N = 15 # every 15 chunks, save checkpoint

    global ds
    ds = None # dataset (it is saved periodically allowing the script to recover)
    start = 0
    os.makedirs(dir_out, exist_ok=True)

    # directory in which the shard directories live
    shard_root = os.path.dirname(dir_out) or "."

    # if we already have a checkpoint (last index), continue from it
    if os.path.exists(filename_checkpoint):
        with open(filename_checkpoint, 'r') as f:
            content = f.read().strip()
        # an empty file means the previous run died before writing an index
        start = int(content) if content else 0

    # generate the dataset by iterating thru each chunk
    for i in tqdm(range((start//N)*N, len(chunks))):
        # save checkpoint
        with open(filename_checkpoint, 'w') as f:
            f.write(str(i))

        # update dataset with current chunk
        add_chunk_to_dataset(chunks, chunks[i], llm, tokenizer, num_distract=NUM_DISTRACT_DOCS)

        # `ds` is still None when no question was generated for the whole block
        if (i+1) % N == 0 and ds is not None:
            ds.save_to_disk(dir_out + "-checkpoints-" + str(i))
            ds = None

    if ds is not None:
        ds.save_to_disk(dir_out + "-checkpoints-" + str(i))

    def _shard_index(name):
        # numeric suffix of "...-checkpoints-<i>"; unknown suffixes sort first
        suffix = name.rsplit("-", 1)[-1]
        return int(suffix) if suffix.isdigit() else -1

    # intermediate shards only: a "-checkpoints-last" left by an earlier run is not
    # an input, otherwise its rows would be duplicated in the result
    shard_names = sorted(
        (
            name for name in os.listdir(shard_root)
            if "-checkpoints-" in name
            and not name.endswith("-checkpoints-last")
            and os.path.isdir(os.path.join(shard_root, name))
        ),
        key=_shard_index,
    )

    # concatenate all of the datasets, in chunk order
    ds_list = []
    for name in shard_names:
        shard_dir = os.path.join(shard_root, name)
        for f in sorted(os.listdir(shard_dir)):
            if f.endswith(".arrow"):
                ds_list.append(datasets.Dataset.from_file(os.path.join(shard_dir, f)))

    if not ds_list:
        raise ValueError("no dataset shards were produced, nothing to concatenate")

    ds = datasets.concatenate_datasets(ds_list)

    # export and save them in hf and jsonl format
    ds.save_to_disk(dir_out + "-checkpoints-last")
    # NOTE: with a `dir_out` ending in "/" this writes a file literally named ".jsonl" inside it
    ds.to_json(append_extension(dir_out, "jsonl"))

    # remove checkpoint and intermediate datasets
    if os.path.exists(filename_checkpoint):
        os.remove(filename_checkpoint)
    for name in shard_names:
        shutil.rmtree(os.path.join(shard_root, name))

# Generating a RAFT dataset


We can start by initializing our embedder:

In [None]:
# load the sentence-embedding model; it is passed to `get_chunks` for semantic chunking
embedder = initialize_embedder()

And then creating the chunks:

In [None]:
# sorted so that the chunk order is the same on every run: the checkpoint written by
# `raft_dataset` is an index into `chunks`, and `glob` does not guarantee an order
PAPERS_PATHS = sorted(glob("papers/*.pdf"))

# raw text chunks (`return_raw` defaults to True)
chunks = get_chunks(PAPERS_PATHS, embedder)

Now we can initialize our LLM and then run it over the chunks to create the RAFT dataset:

In [None]:
# load the generator LLM, then build the dataset (written under "raftdataset/" by default)
tokenizer, llm = initialize_llm()
raft_dataset(chunks, llm, tokenizer)

# Training with RAFT

With the RAFT dataset created, we can use a simple finetuning strategy to tune our model, following the best practices highlighted here: https://ai.meta.com/blog/raft-llama-retrieval-augmented-generation-supervised-fine-tuning-microsoft/

In [None]:
# load dataset saved by `raft_dataset` (default `dir_out`) under "-checkpoints-last"
# NOTE(review): assumes the saved dataset consists of a single arrow file — confirm for larger datasets
ds = datasets.Dataset.from_file("raftdataset/-checkpoints-last/data-00000-of-00001.arrow")

We do a small amount of processing to the dataset:

In [None]:
def prepare_ds(item):
    """
    Add the training `text` field to a dataset row.

    The text wraps the row's `instruction` in [INST] ... [/INST] markers and appends
    its `cot_answer`, delimited by <s> and </s>. The row is modified in place and returned.
    """
    item["text"] = "<s>[INST] {} [/INST] {}</s>".format(item["instruction"], item["cot_answer"])
    return item

# build the `text` column and drop the columns that are not needed for training
# (no train/val split is made here; the whole dataset is later passed as `train_dataset`)
cols_to_remove = ["id", "type", "question", "context", "oracle_context"]
ds = ds.map(prepare_ds, num_proc=4, remove_columns=cols_to_remove)
ds

And then we can train the model:

In [None]:
def initialize_llm_trainable(path="mistralai/Mistral-7B-Instruct-v0.2", cache_dir="/workspace"):
    """
    Load a causal LM and its tokenizer from `path`, with the model quantized to 4-bit.

    Args:
        path: model id or local directory to load from.
        cache_dir: cache directory used for the model weights.

    Returns:
        tuple: `(tokenizer, model)`.
    """
    # tokenizer: pad with the EOS token, on the right
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 quantization, bfloat16 compute, no double quantization
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
    )

    model = AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization,
        device_map="auto",
        cache_dir=cache_dir,
    )

    return tokenizer, model

In [None]:
# load the model to fine-tune, using the defaults of `initialize_llm_trainable`
model_tokenizer, model = initialize_llm_trainable()

# supervised fine-tuning settings: train on the `text` column built by `prepare_ds`
sft_config = SFTConfig(
    dataset_text_field="text",
    output_dir="tmp_finetuning",
    max_seq_length=1024,
    per_device_train_batch_size=2
)

# LoRA adapter settings
peft_config = LoraConfig(
    r=128,
    lora_alpha=64,
    lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE(review): `model_tokenizer` is not passed to the trainer, so the trainer presumably
# resolves a tokenizer on its own — confirm that it matches `model_tokenizer`'s padding setup
trainer = SFTTrainer(
    model,
    train_dataset=ds,
    args=sft_config,
    peft_config=peft_config
)

trainer.train()
# save the trained model under "finetuned"
trainer.save_model("finetuned")

# Creating a simple RAG pipeline

Now we can develop a simple RAG pipeline that uses the chunks in our data and the RAFT-finetuned model to actually come up with answers to our questions. We can use the same chunking function as before to process the PDFs:

In [None]:
embedder = initialize_embedder()
# sorted so the documents are indexed in the same order on every run (`glob` guarantees none)
PAPERS_PATHS = sorted(glob("papers/*.pdf"))

# this time we'll be getting the chunks in a langchain `Document` format
chunks = get_chunks(PAPERS_PATHS, embedder, return_raw=False)

And then add them to a vector database together with their encodings:

In [None]:
# index the chunks with the `embedder` created in the previous cell; it has exactly the
# configuration that was previously rebuilt inline here (same model, device, multi-process
# and encode settings), so a second embedding model does not need to be loaded
db = FAISS.from_documents(chunks, embedder)

retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4} # same amount of documents for each point the RAFT dataset
)

Then we can load the finetuned model and prepare it to work with our RAG framework:

In [None]:
# load the model
# NOTE(review): this is a hard-coded absolute path to one specific intermediate checkpoint, while the
# training cell writes checkpoints to the relative "tmp_finetuning" and the final model to "finetuned";
# confirm this checkpoint exists on the machine running the notebook and is the one intended
tokenizer_tuned, model_tuned = initialize_llm_trainable(path="/workspace/tmp_finetuning/checkpoint-2500")

# create the HuggingFace pipeline
pipe = pipeline(
    'text-generation',
    model=model_tuned,
    tokenizer=tokenizer_tuned,

    ## hyperparameters
    max_new_tokens=1024, # RAFT models like to talk a lot apparently
    do_sample=True,
    temperature=0.001,
    top_p=0.9,
)

# LangChain wrapper, so the model can be called through `.invoke(prompt)`
hf_pipeline = HuggingFacePipeline(pipeline=pipe)

Which allows us to define a simple RAG function that we can use to search through the database for relevant documents given a question and return an answer:

In [None]:
def perform_rag(question, hf_pipeline, tokenizer, retriever):
    """
    Answer `question` with retrieval-augmented generation.

    Args:
        question: the question text; it is also used as the retrieval query.
        hf_pipeline: object whose `invoke(prompt)` returns the generated text.
        tokenizer: tokenizer whose chat template is used to render the prompt.
        retriever: object whose `invoke(question)` returns documents exposing `page_content`.

    Returns:
        str: the generated text with the rendered prompt removed from its start, if present.
    """
    # search for the relevant documents
    docs = retriever.invoke(question)

    # prepare the prompt for the model (following the RAFT instruction format)
    formatted_docs = "".join(f"<DOCUMENT>{doc.page_content}</DOCUMENT>\n" for doc in docs)
    user_prompt = f"{formatted_docs}{question}"

    # process the prompt to have the correct template
    messages = [
        {"role": "user", "content": user_prompt}
    ]

    prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
    )

    # call the model
    raw_output = hf_pipeline.invoke(prompt)

    # Only strip the prompt when the output really starts with it. Whether the wrapper
    # returns "prompt + completion" or just the completion depends on its version and
    # settings; slicing blindly would cut off the beginning of a bare completion.
    if raw_output.startswith(prompt):
        return raw_output[len(prompt):]
    return raw_output

We can now load our QA dataset:

In [None]:
# load the evaluation questions; the loop below reads the "Question" column
df = pd.read_csv("Advanced_Medical_QA.csv")
# rename the reference answers so they are distinguishable from the model answer columns added later
df = df.rename({"Answer": "True Answer"}, axis=1)
df

And finally ask the questions to our LLM:

In [None]:
def extract_answer(text):
    """
    Pull the final answer out of a chain-of-thought response.

    Returns the stripped text between the first '<ANSWER>: ' tag and the end of that
    line. If the tag does not occur, the whole `text` is returned unchanged.
    """
    _, tag, remainder = text.partition('<ANSWER>: ')

    # tag not found: fall back to the full response
    if not tag:
        return text

    # keep only what is left of the tag's line
    first_line = remainder.split('\n', 1)[0]
    return first_line.strip()

# full chain-of-thought responses and the short answers extracted from them
answers_raft_cot = []
answers_raft = []

# one RAG call per question with the fine-tuned model
for q in tqdm(df["Question"]):
    answer = perform_rag(q, hf_pipeline, tokenizer_tuned, retriever)
    answers_raft_cot.append(answer)
    # falls back to the full response when no '<ANSWER>: ' tag is present
    answers_raft.append(extract_answer(answer))

df["Answers RAFT CoT"] = answers_raft_cot
df["Answers RAFT"] = answers_raft

In [None]:
# persist the RAFT answers (this file is written again after the baseline answers are added)
df.to_csv("RAFT_answers.csv", index=False)

Additionally, we can ask the same questions to the base model that we used (which wasn't finetuned with RAFT):

In [None]:
# load the baseline model (default `path` of `initialize_llm_trainable`, i.e. not fine-tuned)
tokenizer_baseline, model_baseline = initialize_llm_trainable()

# create the HuggingFace pipeline
pipe_baseline = pipeline(
    'text-generation',
    model=model_baseline,
    tokenizer=tokenizer_baseline,

    ## hyperparameters
    # NOTE(review): the fine-tuned pipeline uses max_new_tokens=1024; confirm the different limit is intended
    max_new_tokens=512,
    do_sample=True,
    temperature=0.001,
    top_p=0.9,
)

# LangChain wrapper, so the model can be called through `.invoke(prompt)`
hf_pipeline_baseline = HuggingFacePipeline(pipeline=pipe_baseline)

In [None]:
# same questions and retriever as before, answered by the baseline model
answers_baseline = []

for q in tqdm(df["Question"]):
    answer = perform_rag(q, hf_pipeline_baseline, tokenizer_baseline, retriever)
    # the full response is stored; no answer extraction is applied here
    answers_baseline.append(answer)

df["Answers Baseline"] = answers_baseline

In [None]:
# overwrite the earlier export so the file also contains the "Answers Baseline" column
df.to_csv("RAFT_answers.csv", index=False)