In [37]:
import minsearch

In [38]:
import json

In [39]:
with open('documents.json', 'rt') as f_in:
    doc_raw = json.load(f_in)

In [40]:
documents = []

for course_dict in doc_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [41]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [42]:
index.fit(documents)

<minsearch.Index at 0x17402b210>

In [43]:
import ollama

In [44]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}
    results = index.search(
        query = query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )
    return results

In [45]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. You're given the QUESTION from a student and some CONTEXT about the course.
Your task is to answer the QUESTION based on the provided CONTEXT. Use only the facts from the CONTEXT when answering the question.
The user does not know about the CONTEXT. Only you do, so don't mention the CONTEXT. Reply as if the knowledge comes from you.
If the CONTEXT does not contain the answer, respond with "I don't know."

CONTEXT: {context}

QUESTION: {query}
""".strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(
        query=query,
        context=context
    ).strip()

    return prompt

In [46]:
def llm(prompt):
    response = ollama.chat(model='llama3:8b-instruct-q5_K_M', messages=[
        {
            'role': 'user',
            'content': prompt,
        },
    ])
    return response['message']['content']

In [47]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    reply = llm(prompt)
    return reply

In [48]:
query = 'The course has already started. Can I still enroll?'
rag(query)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [49]:
from elasticsearch import Elasticsearch

In [50]:
es_client = Elasticsearch('http://localhost:9200')

In [51]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(
    index=index_name,
    body=index_settings
)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/sLBmhGSdSJSV-M_RS38npA] already exists')

In [52]:
from tqdm.auto import tqdm

In [53]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 730.18it/s]


In [54]:
query = "I just found out about the course. Can I still join?"

In [55]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [56]:
response = es_client.search(index=index_name, body=search_query)

In [62]:
result_docs = []

for hit in response['hits']['hits']:
    result_docs.append(hit['_source'])