# Load ground truth

In [None]:
import json

# JSON is UTF-8 by specification; pin the encoding so the load does not depend
# on the platform's locale default (the FAQ text contains non-ASCII characters).
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_input:
    documents = json.load(f_input)

In [None]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [None]:
import pandas as pd
# Load the ground-truth question set from CSV.
df_ground_truth = pd.read_csv("ground-truth-data.csv")
# Keep only the rows for the machine-learning-zoomcamp course.
df_ground_truth = df_ground_truth[df_ground_truth.course == "machine-learning-zoomcamp"]
# One dict per row (keyed by column name) for easy iteration in later cells.
ground_truth = df_ground_truth.to_dict(orient="records")

In [None]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [None]:
# Map each document's 'id' to its full record so ground-truth entries can be resolved by id.
doc_idx = {d['id']: d for d in documents}
# Look up the document referenced by ground_truth[10] shown above.
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

# Index data

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same instance encodes documents at indexing
# time and questions/answers in later cells.
model_name = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch instance at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

# Index definition: the FAQ fields plus three 384-dimensional dense-vector
# fields, each indexed with cosine similarity.
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword type: `course` is used in a term filter at query time.
            "course": {"type": "keyword"},
            "id":{"type": "keyword"},
            # NOTE(review): question_vector and text_vector are declared here but
            # no visible cell populates them; only question_text_vector is written.
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity":"cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity":"cosine"
            },
            # Embedding of the question and answer text combined (filled by the indexing loop).
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity":"cosine"
            },
        }
    }
}

index_name = "course_questions"

# Drop any existing index first so re-running this cell starts from an empty index.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [None]:
from tqdm.auto import tqdm
# Embed every document and index it, one request per document.
for doc in tqdm(documents):
    questions = doc['question']
    text = doc['text']

    # Embed question and answer text together as a single string.
    # Note: the vector is stored on the shared `documents` dicts in place.
    doc['question_text_vector'] = model.encode(questions + ' ' + text)

    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:14<00:00, 12.65it/s]


# Retrieval

In [None]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search against the FAQ index.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against ``field``.
        course: Course identifier; only documents whose ``course`` matches
            are returned.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Size of the candidate pool passed to Elasticsearch.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them, limited to the text, section, question,
        course and id fields.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {"term": {"course": course}},
    }

    search_query = {
        "knn": knn,
        # Only request the FAQ fields; the stored vectors are not needed downstream.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

def question_text_vector_knn(q):
    """Retrieve FAQ entries for a question via the combined question+text vector.

    ``q`` is a mapping with ``'question'`` and ``'course'`` keys. The question
    is embedded with the module-level ``model`` and searched against the
    ``question_text_vector`` field, restricted to the given course.
    """
    question_text, course_name = q['question'], q['course']
    query_embedding = model.encode(question_text)
    return elastic_search_knn('question_text_vector', query_embedding, course_name)



In [None]:
question_text_vector_knn(dict(
    question = "Are session recorded if i miss one?",
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks and Deep Learning',
  'text': "Problem description\nThe accuracy and the loss are both still the same or nearly the same while training.\nSolution description\nIn the homework, you should set class_mode='binary' while reading the data.\nAlso, problem occurs when you choose the wrong optimizer, batch size, or learning rate\nAdded by Ekaterina Kutovaia",
  'id': '7d11d5ce'},
 {'question': 'Will I get a certificate if I missed the midterm project?',
  'course': 'machine-learning-zoomcamp',
  'section'

# The RAG flow

In [None]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Each entry in ``search_results`` must provide ``section``, ``question`` and
    ``text``; the entries are rendered in order into the CONTEXT part of the
    template. Returns the filled-in prompt with surrounding whitespace removed.
    """
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT does not provide sufficient information, politely indicate that the information is not available.
    Consider any relevant keywords or phrases in the QUESTION to tailor the answer.


    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    # One rendered block per retrieved entry, concatenated in retrieval order.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()

In [None]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
client = MistralClient()

def llm(prompt, model="mistral-large-latest"):
    """Send a single-turn user prompt to Mistral and return the reply text.

    Args:
        prompt: Full prompt text, sent as one user message.
        model: Name of the Mistral chat model to call. Defaults to
            "mistral-large-latest", the value that was previously hard-coded,
            so existing ``llm(prompt)`` calls behave as before.

    Returns:
        The content of the first choice's message.
    """
    chat_response = client.chat(
        model=model,
        messages=[ChatMessage(role="user", content=prompt)],
    )

    return chat_response.choices[0].message.content

In [None]:
def rag(query: dict, model="mistral-large-latest") -> str:
    """Answer a ground-truth style query with retrieval-augmented generation.

    ``query`` must carry ``'question'`` and ``'course'``. Matching FAQ entries
    are retrieved with ``question_text_vector_knn``, turned into a prompt by
    ``build_prompt`` and sent to ``llm``; the LLM's reply is returned.

    NOTE(review): ``model`` is accepted but not forwarded to ``llm``, so it
    currently has no effect on which model answers.
    """
    retrieved = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved))

In [None]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [None]:
rag(ground_truth[10])

'Yes, sessions are recorded if you miss one. Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [None]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

# Cosine similarity metric

In [None]:
# Compare the RAG answer for ground_truth[10] with the original FAQ answer.
# The two labels were swapped: the FAQ text is the original answer, the
# "Yes, sessions are recorded..." text is what the LLM produced.
answer_orig = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'
answer_llm = 'Yes, sessions are recorded if you miss one. Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

# The dot product is symmetric, so correcting the labels does not change the score.
# NOTE(review): a dot product equals cosine similarity only for unit-length
# embeddings — confirm that this model normalizes its output.
result = float(v_llm.dot(v_orig))
result

0.8014881610870361

In [None]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [None]:
len(ground_truth)

1830

In [None]:
# Generate an LLM answer for every ground-truth record, keyed by record position.
# Only create the dict when it does not exist yet: resetting it on every run
# made the `i in answers` guard below dead code, so re-running this cell after a
# failure (e.g. an API quota error) repeated every call that had already succeeded.
if 'answers' not in globals():
    answers = {}

for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers:
        # Already answered in a previous run of this cell.
        continue

    answer_llm = rag(rec)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    answers[i] = {
        'answer_llm' : answer_llm,
        'answer_orig' : answer_orig,
        'document' : doc_id,
        'question' : rec['question'],
        'course' : rec['course'],
    }

 87%|███████████████████████████████████████████████████████████████████████████████████████████████▍              | 1588/1830 [2:27:08<22:25,  5.56s/it]


MistralAPIException: Status: 403. Message: {"message":"Inactive subscription or usage limit reached"}

In [None]:
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool``, showing a progress bar.

    Args:
        pool: Executor exposing ``submit`` (e.g. a ``ThreadPoolExecutor``).
        seq: Sized sequence of inputs; ``len(seq)`` sets the progress-bar total.
        f: Callable invoked as ``f(el)`` for each element.

    Returns:
        List of results in the same order as ``seq``.

    Raises:
        Any exception raised by ``f`` is re-raised by ``future.result()`` for
        the first failing element in submission order.
    """
    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            # Tick the bar as each task finishes, regardless of completion order.
            future.add_done_callback(lambda _done: progress.update())
            futures.append(future)

        # Collect in submission order so the results line up with ``seq``.
        results = [future.result() for future in futures]

    return results

In [None]:
def process_record(rec):
    """Run the RAG flow for one ground-truth record and pair it with the reference answer.

    ``rec`` must provide ``'question'``, ``'course'`` and ``'document'``; the
    latter is the id used to look up the reference text in ``doc_idx``.

    Returns a dict with the LLM answer, the reference answer, the document id,
    the question and the course.
    """
    llm_answer = rag(rec, model="mistral-large-latest")

    reference_id = rec['document']
    reference_text = doc_idx[reference_id]['text']

    return dict(
        answer_llm=llm_answer,
        answer_orig=reference_text,
        document=reference_id,
        question=rec['question'],
        course=rec['course'],
    )

In [None]:
process_record(ground_truth[10])

In [None]:
results_mistral = map_progress(pool, ground_truth, process_record)

In [None]:
import os

df_mistral = pd.DataFrame(results_mistral)
# to_csv does not create missing directories; make sure the target exists so
# the expensive results are not lost to an OSError at save time.
os.makedirs('data', exist_ok=True)
df_mistral.to_csv('data/results-mistral.csv', index=False)

In [None]:
!head data/result-mistral.csv

# Cosine similarity
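Each ground-truth question `Q` is linked to the FAQ answer `A` it refers to. The RAG flow answers `Q` with `A'`, and the metric is the cosine similarity between the embeddings of `A` and `A'`.

```
A  -> Q -> A'

cosine(A, A')
```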

In [None]:
results_mistral = df_mistral.to_dict(orient='records')

In [None]:
results_mistral[0]

In [None]:
record = results_mistral[0]

In [None]:
def compute_similarity(record):
    """Score how close the LLM answer is to the original answer.

    Encodes ``record['answer_llm']`` and ``record['answer_orig']`` with the
    module-level ``model`` and returns the dot product of the two embeddings.
    """
    reference, generated = record['answer_orig'], record['answer_llm']
    llm_vec, orig_vec = model.encode(generated), model.encode(reference)
    return llm_vec.dot(orig_vec)

In [None]:
# Score every record; `similarity` stays aligned with the order of results_mistral
# so it can be attached to df_mistral as a column afterwards.
similarity = []

for record in tqdm(results_mistral):
    sim = compute_similarity(record)
    similarity.append(sim)

In [None]:
df_mistral['cosine'] = similarity

In [None]:
df_mistral['cosine'].describe()

In [None]:
df_mistral.iloc[3].to_dict()

In [None]:
!pip install seaborn

In [None]:
# `sns` was used without ever being imported, which raises NameError on a fresh kernel.
import seaborn as sns

# Distribution of the per-record similarity scores.
sns.displot(df_mistral['cosine'])