# WP4: Evaluating RAG

* Put the contents of RAG_demo in here, along with any other relevant work - to make a single file people can run
* **this will replace the rag_demo.ipynb and model_eval.ipynb files**

## Setup

Includes Colab-specific setup.

If you are running this notebook in Colab, please uncomment and run the following cell as well.

In [None]:
# Colab-only setup: uncomment the three commands below before running them.
# Clone the GitHub repo from the URL below to access the relevant files
#!git clone https://github.com/nhsengland/ds_251_RAG
# Set the working directory to the main repo folder
#%cd ds_251_RAG
# Install / upgrade the listed Python packages in the kernel's environment
#%pip install --upgrade --quiet anthropic transformers langchain_community bitsandbytes langchain accelerate tensorflow==2.15 chromadb unstructured sentence-transformers faiss-cpu

In [None]:
#import relevant modules
import getpass
import glob
import os

import pandas as pd
import toml
from tqdm import tqdm

import src.models as models

# Read the project settings from config.toml in the working directory.
config = toml.load("config.toml")

# In DEV mode, point the persist directory at its "/dev" sub-folder.
if config['DEV_MODE']:
    config['PERSIST_DIRECTORY'] = config['PERSIST_DIRECTORY'] + "/dev"

# Initialise RAG Pipelines

In [None]:
# Pipeline built from the configured embedding model and persist directory,
# with the model type explicitly set to 'phi2'.
rag_pipeline_phi2 = models.RagPipeline(
    config['EMBEDDING_MODEL'],
    config['PERSIST_DIRECTORY'],
    model_type='phi2',
)

In [None]:
# Never paste the API key into the notebook (it would end up in version control
# and in saved outputs). Reuse ANTHROPIC_API_KEY when it is already set in the
# environment; otherwise prompt for it without echoing the input.
if not os.environ.get('ANTHROPIC_API_KEY'):
    os.environ['ANTHROPIC_API_KEY'] = getpass.getpass('Anthropic API key: ')

# No model_type is passed here, so RagPipeline falls back to its default model
# (presumably the Anthropic one, given the key above - confirm in src/models).
rag_pipeline_anthropic = models.RagPipeline(config['EMBEDDING_MODEL'], config['PERSIST_DIRECTORY'])

# Load documents into the vectorstore

In [None]:
# Add documents if there are none. Outside DEV mode the documents are loaded on
# every run; in DEV mode a non-empty vectorstore is left as it is.
stored_documents = rag_pipeline_anthropic.vectorstore.get()['documents']
if len(stored_documents) == 0 or not config['DEV_MODE']:
    rag_pipeline_anthropic.load_documents()

# Loading the CogStack questions / answers

Link to the CogStack QA data: https://raw.githubusercontent.com/CogStack/OpenGPT/main/data/nhs_uk_full/prepared_generated_data_for_nhs_uk_qa.csv

In [None]:
# Read the pre-processed CogStack question / answer pairs from the repo
cogstack_qa = pd.read_csv('src/model_eval/cogstack_qa_data_process.csv')

# Draw five rows at random; the fixed random_state keeps the sample reproducible
sample_qa = cogstack_qa.sample(n=5, random_state=999)
sample_qa

# Examples with RAG on and RAG off

Testing Phi-2 Model with RAG off

In [None]:
# Use the first sampled question for all of the RAG on / off examples
question = sample_qa['question'].values[0]

# Call the pipeline's LLM directly with an Instruction / Output style prompt
prompt_rag_off = 'Instruction: {} \n\n Output:'.format(question)
result_rag_off_phi2 = rag_pipeline_phi2.llm(prompt_rag_off)
print(result_rag_off_phi2)

Testing Phi-2 Model with RAG on

In [None]:
# Same question, this time answered through the pipeline with retrieval enabled
result_rag_on_phi2 = rag_pipeline_phi2.answer_question(
    question,
    model_type='phi2',
    rag=True,
)
print(result_rag_on_phi2)

Testing Anthropic model with RAG off

In [None]:
# Ask the same question with retrieval switched off (rag=False)
result_rag_off_anthropic = rag_pipeline_anthropic.answer_question(
    question,
    rag=False,
)
print(result_rag_off_anthropic)

Testing Anthropic model with RAG on

In [None]:
# Ask the same question with retrieval switched on (rag=True)
result_rag_on_anthropic = rag_pipeline_anthropic.answer_question(
    question,
    rag=True,
)
print(result_rag_on_anthropic)

# Evaluating the Responses

Generate responses for both models for each of the sample questions with RAG turned on

In [None]:
# Text that precedes the generated answer in the phi2 output, and how many
# characters to drop from the start of that marker: the 6-letter marker itself
# plus the two characters that follow it.
OUTPUT_MARKER = 'Output'
OUTPUT_SKIP = 8

phi2_responses = []
phi2_references = []

for cogstack_q in sample_qa['question']:
    #run question prompt through LLM with RAG turned on
    result = rag_pipeline_phi2.answer_question(cogstack_q, rag=True)

    #separate by word: the last two words are taken as the reference and
    #everything before them as the generated response
    words = result.split()
    body_words = words[:-2]
    llm_result = ' '.join(body_words)
    #with two words or fewer there is no response body, so no reference is kept
    llm_ref = ' '.join(words[-2:]) if body_words else ''

    #keep only the text after the 'Output' marker; str.find returns -1 instead
    #of raising when the marker is absent (always the case for an empty
    #response), so the text is then left unchanged rather than crashing
    marker_pos = llm_result.find(OUTPUT_MARKER)
    if marker_pos != -1:
        llm_result = llm_result[marker_pos + OUTPUT_SKIP:]

    #append generated response and corresponding reference
    phi2_responses.append(llm_result)
    phi2_references.append(llm_ref)

In [None]:
anthropic_responses = []
anthropic_references = []

# Only the question column is needed here, so iterate it directly instead of
# pulling the unused answer / short_reference values out of every row.
for cogstack_q in sample_qa['question']:
    #run question prompt through LLM with RAG turned on
    result = rag_pipeline_anthropic.answer_question(cogstack_q, rag=True)

    #separate by word (split once): the last two words are taken as the
    #reference and everything before them as the generated response
    words = result.split()
    body_words = words[:-2]

    #with two words or fewer there is no response body; both values are then ''
    anthropic_responses.append(' '.join(body_words))
    anthropic_references.append(' '.join(words[-2:]) if body_words else '')

In [None]:
#add the responses and references to the dataframe
sample_qa['phi2_response'] = phi2_responses
sample_qa['phi2_reference'] = phi2_references
sample_qa['anthropic_response'] = anthropic_responses
sample_qa['anthropic_reference'] = anthropic_references

# Retrieval Runtime