#### Load documents with ID

In [1]:
import requests
import pandas as pd
import json

In [2]:
# Download the FAQ documents (each already carrying an `id`) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
# NOTE(review): `?raw=1` is presumably what makes GitHub return the file itself
# rather than the HTML page for the blob -- confirm if the URL ever changes.
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors here instead of as a confusing JSON decode error below.
docs_response.raise_for_status()

documents = docs_response.json()

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [4]:
# Read the local copy of the documents file and show its first record.
# The text contains non-ASCII characters (curly quotes), so the encoding is pinned
# to UTF-8 instead of relying on the platform default.
with open('document_with_ids.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

#### Load ground truth data

In [5]:
# Load the ground-truth questions and keep only the machine-learning-zoomcamp rows.
df_ground_truth = (
    pd.read_csv('ground_truth_data.csv')
    .loc[lambda frame: frame.course == 'machine-learning-zoomcamp']
)
# One dict per row (question, course, document), convenient for plain Python loops.
ground_truth = df_ground_truth.to_dict(orient='records')

In [6]:
ground_truth[10]

{'question': 'What happens if I cannot attend a class session?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [7]:
# Index the documents by id so a ground-truth record can be mapped back to its source text.
docs_idx = {document['id']: document for document in documents}
# Sanity check with the document id referenced by ground_truth[10].
docs_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

### Index data

In [8]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Sentence-transformers model used for every embedding in this notebook: documents
# at index time, questions at query time, and answers in the similarity check.
# Note: `llm()` and `rag()` take a parameter that is also called `model`, but there it
# is an OpenAI model name and has nothing to do with this embedding model.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [10]:
# Connect to a local Elasticsearch node and (re)create the index that stores the FAQ
# documents together with their embedding.
es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # `course` is used in a `term` filter by elastic_search_knn, hence `keyword`.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of "question + text", filled in by the indexing loop.
            # `dims` has to equal the length of the vectors returned by `model.encode`
            # -- presumably 384 for this model; TODO confirm if the model is changed.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"
# Drop any existing index first so this cell can be re-run from scratch;
# `ignore_unavailable=True` is there so a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index = index_name, body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [11]:
from tqdm import tqdm

# Embed every document and send it to Elasticsearch, one index request per document.
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    # The vector covers the question and the answer text together, joined by a space.
    # NOTE: the vector is added to the dict in place, so `documents` (and `docs_idx`,
    # which holds the same dict objects) carry the embedding from here on.
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    es_client.index(index= index_name, document= doc)

100%|███████████████████████████████████████████████████████████| 948/948 [01:02<00:00, 15.23it/s]


In [12]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search on the FAQ index and return the matching documents.

    Args:
        field: Name of the vector field to search.
        vector: Query embedding compared against ``field``.
        course: Only documents whose ``course`` equals this value are considered.
        k: Number of nearest neighbours to return. Defaults to 5, the value that
            was previously hard-coded.
        num_candidates: Number of candidates Elasticsearch examines before picking
            the top ``k``. Defaults to 10000, the value that was previously hard-coded.

    Returns:
        list[dict]: The ``_source`` of every hit (text, section, question, course, id),
        in the order Elasticsearch returned them.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        # Restrict the neighbour search to a single course.
        "filter": {
            "term": {
                "course": course
            }
        },
    }
    search_query = {
        "knn": knn,
        # Leave the stored vector out of the response; only the readable fields are needed.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_result = es_client.search(
        index=index_name,
        body=search_query,
    )

    return [hit['_source'] for hit in es_result['hits']['hits']]

def question_text_vector_knn(q):
    """Embed the question of ``q`` and look up its nearest FAQ entries in the same course.

    Args:
        q: Mapping with the keys ``"question"`` and ``"course"``.

    Returns:
        Whatever ``elastic_search_knn`` returns for the ``question_text_vector`` field.
    """
    # Read both keys before encoding so a malformed record fails before any model work.
    question, course = q["question"], q["course"]
    query_vector = model.encode(question)
    return elastic_search_knn("question_text_vector", query_vector, course)

In [13]:
# Smoke test of the retrieval step with a hand-written question for one course.
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

#### The RAG flow

In [14]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved FAQ entries.

    Args:
        query: The question text inserted after ``QUESTION:``.
        search_results: Iterable of dicts with the keys ``section``, ``question``
            and ``text``; each one becomes a block in the ``CONTEXT`` part.

    Returns:
        str: The formatted prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )
    # Each entry ends with a blank line; the final strip() removes the trailing one.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
        for doc in search_results
    )
    return template.format(question=query, context=context).strip()

In [15]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# NOTE(review): load_dotenv() presumably copies the entries of a local .env file into
# the process environment, so the API key does not have to appear in the notebook.
load_dotenv()

# os.getenv returns None when OPENAI_API_KEY is not set; that None is then handed to
# the client as-is.
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key = api_key)

def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the text of the first choice.

    Args:
        prompt: Full prompt text, sent as the only message of the conversation.
        model: Name of the chat model to call.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [16]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer one ground-truth style record: retrieve context, build the prompt, ask the LLM.

    Args:
        query: Dict with ``question`` and ``course`` keys; both are used for retrieval,
            only ``question`` goes into the prompt.
        model: Chat model name forwarded to ``llm`` (not the embedding model).

    Returns:
        The answer text produced by ``llm``.
    """
    retrieved_docs = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved_docs), model=model)

In [17]:
ground_truth[10]

{'question': 'What happens if I cannot attend a class session?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [18]:
rag(ground_truth[10])

'If you cannot attend a class session, everything is recorded, so you won’t miss anything. You will have the opportunity to ask questions during office hours in advance, and those questions will be addressed during the live stream. Additionally, you can always ask questions in Slack.'

In [19]:
docs_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

### Cosine Similarity Metric

In [20]:
# Worked example: compare one generated answer with the original FAQ answer by
# embedding both with the sentence-transformers model.
llm_answer = 'If you cannot attend a class session, everything is recorded, so you won’t miss anything. You can ask your questions in advance for office hours, and those will be covered during the live stream. Additionally, you can always ask questions in Slack.'
doc_answer = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

v_llm = model.encode(llm_answer)
v_doc = model.encode(doc_answer)

# NOTE(review): a plain dot product equals the cosine similarity only when both vectors
# have unit length -- presumably true for this model's output; TODO confirm, otherwise
# divide by the two norms.
v_llm.dot(v_doc)

np.float32(0.7919637)

In [23]:
# Generated answers keyed by the record's position in `ground_truth`; the evaluation
# loop skips positions already present here. Re-running this cell discards them.
answers = {}

In [None]:
answers

In [None]:
# Generate an LLM answer for every ground-truth question and pair it with the
# original FAQ answer of the document the question was derived from.
for i, rec in enumerate(tqdm(ground_truth)):
    # Positions already stored in `answers` are left alone, so an interrupted run can
    # be resumed without calling the LLM again for them.
    if i not in answers:
        answer_llm = rag(rec)
        doc_id = rec['document']
        orig_document = docs_idx[doc_id]
        answer_orig = orig_document['text']

        answers[i] = dict(
            answer_llm=answer_llm,
            answer_orig=answer_orig,
            document=doc_id,
        )

In [31]:
# One row per generated answer, in the insertion order of `answers`; columns are
# answer_llm, answer_orig and document.
df_gpt_4o_mini = pd.DataFrame(list(answers.values()))

In [32]:
# Spot-check a few rows; the fixed seed makes the same rows show up on every re-run.
df_gpt_4o_mini.sample(n=5, random_state=42)

Unnamed: 0,answer_llm,answer_orig,document
77,"Yes, you can begin the course materials before...",The course is available in the self-paced mode...,636f55d5
28,"If you unsubscribed from the newsletter, you w...","The process is automated now, so you should re...",6ba259b1
13,"Yes, you can ask questions outside of the live...","Everything is recorded, so you won’t miss anyt...",5170565b
34,"Yes, there are options for extra credits that ...","Approximately 4 months, but may take more if y...",67e2fd13
47,"If you miss the midterm project, you may still...","Yes, it's possible. See the previous answer.",1d644223
