In [4]:
import minsearch

In [5]:
import json

In [6]:
# Load the raw course data. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default
# (which can mis-decode non-ASCII characters, e.g. on Windows).
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)
    

In [7]:
# Flatten the per-course groups into a single list. Every document is tagged
# in place with the name of the course it came from (so the entries inside
# docs_raw gain a 'course' key as well).
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc.update(course=course_dict['course'])
    documents.extend(course_dict['documents'])

In [9]:
# "question", "text" and "section" are registered as free-text fields;
# "course" is registered as a keyword field (search() filters on it).
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [11]:
# Fit the index on the flattened documents. The cell output shows that fit()
# returns a minsearch.Index (presumably the same object — confirm).
index.fit(documents)

<minsearch.Index at 0x79b4098c0d90>

In [1]:
from openai import OpenAI

In [2]:
# No credentials are passed explicitly — presumably the client picks up the
# API key from the environment (e.g. OPENAI_API_KEY); confirm before running.
client = OpenAI()

In [41]:
def search(query):
    """Search the minsearch index for documents relevant to ``query``.

    Matches in the ``question`` field are boosted by 3.0 and matches in the
    ``section`` field by 0.5. Results are filtered to the
    'data-engineering-zoomcamp' course and limited to five entries.

    Args:
        query: The user's question text.

    Returns:
        Whatever ``index.search`` returns, passed through unchanged.
    """
    boost = {'question': 3.0, 'section': 0.5}

    # Use the function argument. The earlier version passed a global named
    # `q`, which is never defined in this notebook, so the call either raised
    # NameError or silently searched for a stale value left in the kernel.
    results = index.search(
        query=query,
        boost_dict=boost,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        num_results=5
    )
    return results

In [49]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved documents.

    Each document contributes a "section / question / answer" stanza to the
    CONTEXT part of the prompt. Leading and trailing whitespace of the final
    prompt is stripped.

    Args:
        query: The user's question, inserted after "QUESTION:".
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys.

    Returns:
        The formatted prompt string.
    """
    # The instruction text is spelled out line by line so that the embedded
    # indentation and trailing spaces are visible rather than hidden inside a
    # triple-quoted block.
    template = (
        "You are a course teaching assistant. Answer the QUESTION based on the CONTEXT. "
        "Use ONLY the facts from the CONTEXT to answer the QUESTION. \n"
        "        If the CONTEXT doesn't have the answer, answer NONE.\n"
        "        \n"
        "        QUESTION: {question}\n"
        "        \n"
        "        CONTEXT: {context}"
    )

    stanzas = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(stanzas)).strip()

In [50]:
def llm(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` using the
    "gpt-3.5-turbo" model; the content of the first returned choice is
    handed back to the caller.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        messages=[user_message],
        model="gpt-3.5-turbo",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [54]:
# NOTE(review): within the visible notebook, rag() is only defined in a later
# cell (In [78]), while this cell was executed as In [54]. A fresh
# top-to-bottom run would fail here unless rag is defined earlier — confirm
# and reorder the cells.
query = "How do I run kafka?"
answer = rag(query)
answer

'None.'

In [56]:
from elasticsearch import Elasticsearch

In [57]:
# Client for an Elasticsearch node addressed at http://localhost:9200
# (presumably a local instance that must already be running — TODO confirm).
es_client = Elasticsearch('http://localhost:9200')

In [60]:
# Name of the Elasticsearch index that holds the course documents.
index_name = "course-questions"

# One shard, no replicas. The three free-text fields share the "text" mapping,
# while "course" is mapped as a keyword (elastic_search() filters on it with
# a term query).
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [62]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [65]:
# Send the documents to Elasticsearch one request at a time; tqdm wraps the
# iterable only to display a progress bar.
# NOTE(review): no document id is passed, so re-running this cell presumably
# adds duplicate entries to the index — confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)
                    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:27<00:00, 34.44it/s]


In [67]:
# Sample user question; a later cell passes it to rag(query).
query = "I just discovered the course. Can I still join?"

In [77]:
def elastic_search(query):
    """Fetch the stored documents that best match ``query`` from Elasticsearch.

    Runs a ``multi_match`` query of type "best_fields" over the question
    (boosted with ``^3``), text and section fields, filtered to the
    'data-engineering-zoomcamp' course, requesting five hits.

    Args:
        query: The user's question text.

    Returns:
        A list with the ``_source`` value of every returned hit, in the order
        the response lists them.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": "data-engineering-zoomcamp"}}
    request_body = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    reply = es_client.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in reply['hits']['hits']]

In [78]:
def rag(query):
    """Answer ``query`` by chaining retrieval, prompt building and the LLM.

    The documents returned by ``elastic_search`` are turned into a prompt by
    ``build_prompt``, and the result of ``llm`` on that prompt is returned.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [79]:
# Run the full pipeline on the current `query`; the returned answer string is
# displayed as the cell output.
rag(query)

"Yes, you can still join the course even after the start date. It is mentioned that even if you don't register, you are still eligible to submit the homeworks."