# Offline RAG Evaluation

## Load Documents with IDs

In [12]:
import json
# Load the FAQ records; later cells read each record's `id`, `question` and `text`.
# The encoding is stated explicitly because the text contains non-ASCII
# characters and the default for `open` depends on the platform locale.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [13]:
# New list holding the loaded records; the record dicts themselves are
# shared with `docs_raw`, not copied.
documents = list(docs_raw)

## Load Ground Truth Dataset

In [14]:
import pandas as pd

In [15]:
# Ground-truth table: one generated question per row, with the course it
# belongs to and the id of the document it was generated from.
df = pd.read_csv('ground_truth_dataset.csv')

In [16]:
df.head(5)

Unnamed: 0,question,course,document
0,When will the course start?,data-engineering-zoomcamp,c02e79ef
1,How do I subscribe to the course Google Calendar?,data-engineering-zoomcamp,c02e79ef
2,What should I do before the course starts?,data-engineering-zoomcamp,c02e79ef
3,How can I join the course Telegram channel?,data-engineering-zoomcamp,c02e79ef
4,Where should I register and join the channel i...,data-engineering-zoomcamp,c02e79ef


In [17]:
# Restrict the evaluation set to a single course.
is_ml_course = df['course'] == 'machine-learning-zoomcamp'
df = df[is_ml_course]

In [18]:
df.count()

question    1776
course      1776
document    1776
dtype: int64

In [20]:
# One dict per row, keyed by column name.
ground_truth = df.to_dict('records')

In [21]:
ground_truth[0]

{'question': 'What is the purpose of the document?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [22]:
# Lookup table from document id to the full record.
doc_idx = {}
for record in documents:
    doc_idx[record['id']] = record

print(doc_idx['0227b872']['text'])

Machine Learning Zoomcamp FAQ
The purpose of this document is to capture frequently asked technical questions.
We did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:
Data Engineering Zoomcamp FAQ
In the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo
work


## Index Data

In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sentence-embedding model shared by the indexing and query code below.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name_or_path=model_name)



In [5]:
len(model.encode('please encode this word'))

384

In [7]:
from elasticsearch import Elasticsearch
# Client for the local Elasticsearch node.
es_client = Elasticsearch('http://127.0.0.1:9200')

index_name = 'semantic_course_question'

# Field mappings: analysed text fields, the course as an exact-match keyword,
# and a 384-dimensional indexed vector compared with cosine similarity.
field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "question_text_vector": {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine",
    },
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

# Drop any previous copy of the index (no error if it is absent), then
# create it again with the settings above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'semantic_course_question'})

In [8]:
from tqdm.auto import tqdm

In [23]:
# Embed "question + text" for every record, store the vector on the record
# itself, and send the record to Elasticsearch.
for doc in tqdm(documents):
    combined_text = doc['question'] + ' ' + doc['text']
    doc['question_text_vector'] = model.encode(combined_text)
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:52<00:00, 17.91it/s]


## Retrieval

In [103]:
def vector_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Return the nearest documents to ``vector``, restricted to one course.

    Args:
        field: Name of the indexed vector field to search.
        vector: Query embedding to compare against ``field``.
        course: Course identifier; only documents of this course are searched.
        k: Number of nearest neighbours to return.
        num_candidates: Number of candidates the approximate search considers.

    Returns:
        A list of dicts holding the ``text``, ``section``, ``question`` and
        ``course`` of each hit, in the order Elasticsearch returned them.
    """
    # The course restriction lives inside the kNN clause. Supplying it as a
    # top-level `query` next to `knn` asks Elasticsearch for a hybrid search
    # that combines both result sets, so neighbours from other courses and
    # unrelated documents of the same course could both be returned.
    query_setting = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {"term": {"course": course}},
    }
    response = es_client.search(index=index_name, knn=query_setting, size=k)

    # Keep only the fields needed downstream (the stored vector is dropped).
    wanted_fields = ('text', 'section', 'question', 'course')
    return [
        {name: hit['_source'].get(name) for name in wanted_fields}
        for hit in response['hits']['hits']
    ]

In [104]:
def question_text_vector_knn(q):
    """Embed the question of ``q`` and search the ``question_text_vector`` field.

    Args:
        q: Mapping with a ``question`` string and a ``course`` identifier.

    Returns:
        Whatever ``vector_search_knn`` returns for that embedding and course.
    """
    question, course = q['question'], q['course']
    query_vector = model.encode(question)
    return vector_search_knn('question_text_vector', query_vector, course)

In [105]:
question_text_vector_knn(dict(question='this the class recorded?',course='machine-learning-zoomcamp'))

[{'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'section': 'General course-related questions',
  'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'How to visualize the predictions per classes after training a neural net\nSolution description\nclasses, predictions = zip(*dict(zip(classes, predictions)).items())\nplt.figure(figsize=(12, 3))\nplt.bar(classes, predictions)\nLuke',
  'section': 'Miscellaneous',
  'question': 'Chart for classes and predictions',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data eng

## RAG Flow

In [106]:
from openai import OpenAI
import sys,os,os.path

In [108]:
# Read the OpenAI key from the environment; raises KeyError when the
# OPEN_AI_API_KEY variable is not set.
api_key = os.environ['OPEN_AI_API_KEY']

In [109]:
def build_prompt(query,search_results):
    """Build the LLM prompt for ``query`` from the retrieved documents.

    Args:
        query: The user's question as plain text.
        search_results: Iterable of dicts, each with ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The prompt string: instructions, the question, then one
        ``section / question / answer`` entry per retrieved document.
    """
    prompt_template = """
    You are Course Assistant. You answer the question based on context. Use only fact the CONTEXT for answering the question.
    
    QUESTION: {question}
    CONTEXT : 
    {context}
    """.strip()
    context = "".join(
        f"section: {doc['section']}\nquestion:{doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    # Insert the formatted context string; passing `search_results` here
    # would put the Python repr of the raw result list into the prompt.
    return prompt_template.format(question=query, context=context).strip()

In [110]:
def llm_open_api(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text for the chat model.

    Returns:
        The message content of the first choice in the completion.
    """
    messages = [{'role': 'user', "content": prompt}]
    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model='gpt-3.5-turbo-16k',
        messages=messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [111]:
def rag(query:dict) -> str:
    """Answer a ground-truth record: retrieve, build the prompt, ask the LLM.

    Args:
        query: Mapping with ``question`` and ``course`` keys.

    Returns:
        The answer text produced by the LLM.
    """
    retrieved = question_text_vector_knn(query)
    return llm_open_api(build_prompt(query['question'], retrieved))

In [112]:
ground_truth[10]

{'question': 'What happens if I am unable to attend a session?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [113]:
rag(ground_truth[10])

"If you are unable to attend a session, everything is recorded, so you won't miss anything. You can ask your questions for office hours in advance and they will be covered during the live stream. Additionally, you can always ask questions in Slack."

In [115]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Cosine Similarity Metrics

In [116]:
# Worked example: compare one generated answer with its reference answer.
answer_llm = (
    "If you are unable to attend a session, everything is recorded, so you won't miss anything. "
    "You can ask your questions for office hours in advance and they will be covered during the live stream. "
    "Additionally, you can always ask questions in Slack."
)
answer_original = (
    'Everything is recorded, so you won’t miss anything. '
    'You will be able to ask your questions for office hours in advance and we will cover them during the live stream. '
    'Also, you can always ask questions in Slack.'
)

# Embed both answers with the same model and take their dot product.
v_llm, v_original = model.encode(answer_llm), model.encode(answer_original)
v_llm.dot(v_original)

np.float32(0.87166506)

In [None]:
# Generate an LLM answer for every ground-truth question.
#
# The dict is created only when it does not exist yet, so re-running this cell
# after an interruption resumes where it stopped. Re-creating it on every
# execution would make the `i in answers` check below a no-op.
# To start over, run `answers = {}` in a separate cell first.
if 'answers' not in globals():
    answers = {}

# Wrap the list itself (not the enumerate iterator) so tqdm can show the total.
for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers:
        continue  # already answered in an earlier run
    answer_llm = rag(rec)
    doc_id = rec['document']
    answer_original = doc_idx[doc_id]['text']
    answers[i] = {
        'answer_llm': answer_llm,
        'answer_original': answer_original,
        'document': doc_id,
    }

In [122]:
# Persist the collected answers as indented JSON.
with open('answers.json', mode='wt') as out_file:
    json.dump(obj=answers, fp=out_file, indent=2)

In [123]:
len(answers)

1195