## Offline RAG Evaluation

## Load documents with IDs

In [5]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'

# Bound the wait and fail loudly on an HTTP error instead of trying to parse
# an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Keep only the ground-truth questions that belong to the course under evaluation.
df_ground_truth = pd.read_csv("ground-truth-data.csv")
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

ground_truth

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Are the course videos live or pre-recorded?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'When can I start watching the course videos?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'Are the live office hours sessions recorded?',
  'cours

In [6]:
# Look-up table from document id to the full document record.
doc_idx = dict((entry['id'], entry) for entry in documents)
# Spot-check: show the answer text stored under one known id.
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Index Data 

In [7]:

from sentence_transformers import SentenceTransformer

# Sentence-embedding model. The same `model` instance is used below to encode
# the documents, the queries, and the answers being compared.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [8]:
from tqdm.auto import tqdm

# One embedding per FAQ entry, computed over the question and the answer text
# joined by a single space. Order follows `documents`.
vectors = [
    model.encode(doc['question'] + ' ' + doc['text'])
    for doc in tqdm(documents)
]

vectors

  0%|          | 0/948 [00:00<?, ?it/s]

[array([-2.47415155e-02,  1.25250602e-02,  1.86790694e-02, -2.44595762e-03,
        -6.29659668e-02, -1.47210283e-03, -9.39880908e-02, -7.70689100e-02,
        -2.42589619e-02, -2.65898975e-03, -3.13756429e-02,  2.58985274e-02,
         1.27631414e-04,  1.16566743e-03,  5.24663134e-03, -2.75483485e-02,
         1.25110904e-02, -1.48042306e-01,  3.98083031e-02, -7.59840431e-03,
         8.63100868e-03, -1.18886102e-02, -2.55145021e-02,  3.24880704e-02,
         2.93856785e-02,  1.58540793e-02,  2.06908286e-02, -2.25544106e-02,
         5.48142344e-02,  9.73005779e-03,  2.40406785e-02, -7.03245401e-02,
         8.46367776e-02,  5.13699763e-02, -8.63981433e-03,  5.17791174e-02,
         4.36919406e-02, -4.95563708e-02,  4.17246073e-02,  8.60650837e-02,
        -2.01458335e-02, -9.76670086e-02, -4.44233418e-02,  4.23669778e-02,
         1.40459687e-01, -4.54987446e-03, -9.79093183e-03, -5.90606667e-02,
         1.28404899e-02,  2.20494475e-02, -1.69678461e-02, -7.27972761e-02,
        -3.5

In [None]:
import numpy as np

# Convert the list of per-document embeddings into one 2-D array
# (one row per document, in the same order as `documents`).
vectors = np.array(vectors)

vectors

array([[-0.02474152,  0.01252506,  0.01867907, ..., -0.02614966,
        -0.03173675, -0.05508649],
       [-0.01585098,  0.01600035,  0.02664407, ...,  0.05027685,
        -0.04893355, -0.00346255],
       [-0.07135362, -0.05488089,  0.01031577, ..., -0.04985882,
         0.0098863 , -0.0816684 ],
       ...,
       [ 0.08542997, -0.00225559,  0.02360311, ...,  0.05132207,
        -0.05994945, -0.01875461],
       [ 0.00214935, -0.01339732,  0.06816499, ..., -0.04418039,
        -0.01629386,  0.03407   ],
       [ 0.08494177,  0.0288297 ,  0.02374815, ..., -0.01251399,
         0.00368627, -0.04804292]], shape=(948, 384), dtype=float32)

In [None]:
from minsearch import VectorSearch

# Vector index over the document embeddings. 'course' is registered as a
# keyword field; the retrieval helper filters on it via filter_dict.
vindex = VectorSearch(keyword_fields=['course'])
# `vectors` was built by iterating over `documents`, so row i belongs to documents[i].
vindex.fit(vectors, documents)

## Retrieval

In [None]:

def minsearch_vector_search(vector, course, num_results=5):
    """Search the vector index for the entries closest to `vector` within one course.

    Args:
        vector: Query embedding to search with.
        course: Value that the 'course' field of returned documents must match.
        num_results: Maximum number of hits to request from the index.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        Whatever `vindex.search` returns for this query.
    """
    return vindex.search(
        vector,
        filter_dict={'course': course},
        num_results=num_results,
    )

def question_text_vector(q):
    """Embed the question of a query record and retrieve matching documents.

    Args:
        q: Mapping with a 'question' string and the 'course' to search in.

    Returns:
        The result of `minsearch_vector_search` for the encoded question.
    """
    # Read both fields up front so a malformed record fails before any encoding work.
    query_text, course_name = q['question'], q['course']
    return minsearch_vector_search(model.encode(query_text), course_name)

In [None]:
# Try retrieval on one hand-written query.
question_text_vector({
    'question': 'Are sessions recorded if I miss one?',
    'course': 'machine-learning-zoomcamp',
})

## The RAG Flow

In [None]:
def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The user's question text.
        search_results: Iterable of documents, each with 'section', 'question'
            and 'text' keys.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # Each retrieved entry becomes one block terminated by a blank line.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

In [None]:
from openai import OpenAI

# Shared API client used by llm(). It is constructed without arguments, so
# credentials presumably come from the environment — TODO confirm.
client = OpenAI()

def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def rag(query: dict, model='gpt-4o') -> str:
    """Answer a query record with the retrieve -> prompt -> generate flow.

    Args:
        query: Record with at least 'question' and 'course' keys.
        model: Name of the chat model that writes the answer.

    Returns:
        The answer text returned by `llm`.
    """
    retrieved = question_text_vector(query)
    return llm(build_prompt(query['question'], retrieved), model=model)

## Cosine Similarity Metric

In [None]:
# The two labels were swapped: the text starting "Everything is recorded" is the
# FAQ entry stored in doc_idx['5170565b'], i.e. the reference answer.
answer_orig = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'
# Paraphrased answer that is scored against the reference.
answer_llm = 'Yes, sessions are recorded if you miss one. Everything is recorded, allowing you to catch up on any missed content. Additionally, you can ask questions in advance for office hours and have them addressed during the live stream. You can also ask questions in Slack.'

v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

# The dot product is symmetric, so fixing the labels does not change the score.
# NOTE(review): it only equals cosine similarity if the model returns unit-length
# embeddings — confirm for this model.
v_llm.dot(v_orig)

np.float32(0.7591172)

In [None]:
# Generate an answer for every ground-truth question and store it next to the
# reference answer from the source document, keyed by position in `ground_truth`.
answers = {}

for i, rec in enumerate(tqdm(ground_truth)):
    # No "already answered" guard is needed: `answers` starts empty on every run
    # of this cell and each index is visited exactly once.
    answer_llm = rag(rec)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    answers[i] = {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [None]:
# Turn the index-keyed answers into a list aligned with `ground_truth`.
# Positions without an answer stay None.
results_gpt4o = [None] * len(ground_truth)

for position, answer in answers.items():
    # Ground-truth fields win on key collisions, matching copy() followed by update().
    results_gpt4o[position] = {**answer, **ground_truth[position]}

In [None]:
import pandas as pd

import os

df_gpt4o = pd.DataFrame(results_gpt4o)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('data', exist_ok=True)
df_gpt4o.to_csv('data/results-gpt4o.csv', index=False)

## Evaluation with GPT-3.5 Turbo


In [None]:
# Run the RAG flow on one ground-truth record with gpt-3.5-turbo as the answering model.
rag(ground_truth[10], model='gpt-3.5-turbo')


In [None]:
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

# Executor with up to 6 worker threads; map_progress below takes such a pool as its first argument.
pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    """Apply `f` to every element of `seq` on `pool`, showing a progress bar.

    Args:
        pool: Executor exposing `submit(fn, arg)` that returns a future.
        seq: Sized collection of inputs (its `len` sets the bar total).
        f: Callable applied to each element.

    Returns:
        List of results in the same order as `seq`. An exception raised by
        `f` propagates when its result is collected.
    """
    with tqdm(total=len(seq)) as progress:
        def advance(_finished):
            # Called once per finished task, regardless of completion order.
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(advance)
            pending.append(task)

        # Collect in submission order so the output lines up with the input.
        collected = [task.result() for task in pending]

    return collected

In [None]:
def process_record(rec, model='gpt-3.5-turbo'):
    """Answer one ground-truth record and pair the answer with its reference text.

    Args:
        rec: Ground-truth record with 'question', 'course' and 'document' keys.
        model: Name of the chat model that writes the answer. Defaults to
            'gpt-3.5-turbo', the value that was previously hard-coded, so
            existing single-argument callers behave the same.

    Returns:
        Dict with the generated answer ('answer_llm'), the reference answer
        from the source document ('answer_orig'), and the record's
        'document', 'question' and 'course'.

    Raises:
        KeyError: If the record's document id is not present in `doc_idx`.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

## LLM-as-a-Judge

In [None]:
# Judge prompt, filled with str.format using the {answer_orig}, {question} and
# {answer_llm} fields of an evaluated record. The doubled braces around the
# JSON example are str.format escapes and render as single braces.
prompt_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [None]:
# Fixed-seed sample of 50 answered records to send to the judge model.
df_samples = df_gpt4o.sample(n=50, random_state=1)
samples = df_samples.to_dict(orient="records")

# One raw judge response per sampled record, in sample order.
evaluations = [
    llm(prompt_template.format(**record), model='gpt-4o-mini')
    for record in tqdm(samples)
]

In [None]:
import json

# Parse every raw judge response; json.loads raises on any response that is not valid JSON.
json_evaluations = [json.loads(raw_eval) for raw_eval in evaluations]

df_evaluations = pd.DataFrame(json_evaluations)
# Number of judged answers per relevance label.
df_evaluations['Relevance'].value_counts()


In [None]:
# Show the answers the judge labelled as not relevant.
df_evaluations.loc[df_evaluations['Relevance'] == 'NON_RELEVANT']