# Import Necessary Functions and Libraries

In [1]:
import minsearch
import json
# Load the raw documents file. JSON is UTF-8 by specification, so pin the
# encoding instead of relying on the platform default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course groups into a single list of records, tagging each
# record with the course it came from. New dicts are built so that docs_raw is
# left untouched and the cell gives the same result every time it is run.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [2]:
# Build the minsearch index over the flattened documents. `question`, `text`
# and `section` are registered as text fields and `course` as a keyword field
# (the field that search() filters on).
index = minsearch.Index(
    # An ordered list (like keyword_fields) rather than a set, so the field
    # order is deterministic from run to run.
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)
index.fit(documents)

# Per-field boost weights. NOTE: search() defines its own local copy of these
# values, so this module-level name is informational only.
boost = {'question': 3.0, 'section': 0.5}

In [3]:
def search(query):
    """Return the top 5 minsearch hits for ``query`` in the data-engineering course.

    Args:
        query: Free-text question looked up in the module-level ``index``.

    Returns:
        Whatever ``index.search`` returns for the request; ``build_prompt``
        iterates it as a sequence of document dicts.
    """
    boost = {'question': 3.0, 'section': 0.5}
    return index.search(
        query=query,
        boost_dict=boost,
        # The key has to be 'course', the keyword field the index was built
        # with. The earlier 'couse' spelling named no indexed field, so the
        # course restriction was never applied to the intended field.
        filter_dict={'course': 'data-engineering-zoomcamp'},
        num_results=5,
    )
def build_prompt(query,search_results):
    """Assemble the LLM prompt from the user question and the retrieved documents.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of dicts, each providing the ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The filled-in prompt string with leading/trailing whitespace removed.

    Raises:
        KeyError: If a result is missing one of the three keys.
    """
    prompt_template = """
    You are Course Assistant. You answer the question based on context. Use only fact the CONTEXT for answering the question.
    
    QUESTION: {question}
    CONTEXT : 
    {context}
    """.strip()
    # One "section / question / answer" entry per retrieved document.
    context = "".join(
        f"section: {doc['section']}\nquestion:{doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    # Substitute the formatted context string. Passing the raw search_results
    # list here would embed its Python repr in the prompt instead.
    return prompt_template.format(question=query, context=context).strip()
def llm_open_api(prompt):
    """Send ``prompt`` as one user message to gpt-3.5-turbo-16k and return the reply text.

    A fresh ``OpenAI`` client is built from the module-level ``api_key`` on
    every call. Note that this function is defined again later in the
    notebook; once that cell runs, the later definition is the one in effect.
    """
    user_message = {'role':'user',"content":prompt}
    completion = OpenAI(api_key=api_key).chat.completions.create(
        model='gpt-3.5-turbo-16k',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [4]:
from openai import OpenAI
import sys,os,os.path

In [15]:
# Read the OpenAI key from the environment so it is never hardcoded in the
# notebook; raises KeyError if OPEN_AI_API_KEY is not set.
api_key = os.environ['OPEN_AI_API_KEY']
# Module-level client. Note that llm_open_api builds its own client on each
# call rather than using this one.
client = OpenAI(api_key=api_key)

In [16]:
def llm_open_api(prompt, model='gpt-3.5-turbo-16k'):
    """Send ``prompt`` to the OpenAI chat completions API and return the reply text.

    Args:
        prompt: Full prompt text, sent as a single ``user`` message.
        model: Chat model name. Defaults to the model this notebook was
            originally hard-wired to, so existing ``llm_open_api(prompt)``
            calls behave as before.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # A client is created per call from the module-level api_key.
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [17]:
def rag(query):
    """Answer ``query`` end to end: minsearch retrieval, prompt assembly, OpenAI call.

    Returns whatever ``llm_open_api`` returns for the assembled prompt.
    """
    retrieved = search(query)
    return llm_open_api(build_prompt(query, retrieved))

In [18]:
# Run a sample question through the minsearch + OpenAI pipeline; the cell displays the returned answer.
rag('the course already started. Can I still enroll the course?')

"Yes, you can still enroll in the course even if it has already started. However, if you join the course late, you may not be able to submit some of the homeworks. In order to receive a certificate, you will need to submit 2 out of 3 course projects and review 3 peers' projects by the deadline. So, if you join the course later and manage to work on two projects, you will still be eligible for a certificate."

# Use Ollama to Run Your LLM

In [19]:
from openai import OpenAI

In [20]:
def llm_ollama(prompt, model='phi3'):
    """Send ``prompt`` to the local Ollama server through the OpenAI client and return the reply text.

    Args:
        prompt: Full prompt text, sent as a single ``user`` message.
        model: Name of the model to request. Defaults to ``'phi3'``, the value
            this function was originally hard-wired to, so existing
            ``llm_ollama(prompt)`` calls behave as before.

    Returns:
        The ``content`` of the first choice in the response.
    """
    client = OpenAI(
        base_url='http://localhost:11434/v1/',
        # Presumably a placeholder value, since the endpoint is a local
        # server -- confirm against the Ollama setup in use.
        api_key='ollama',
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [21]:
def rag_ollama(query):
    """Answer ``query`` with minsearch retrieval and the Ollama-served model.

    Returns whatever ``llm_ollama`` returns for the assembled prompt.
    """
    retrieved = search(query)
    return llm_ollama(build_prompt(query, retrieved))

In [27]:
# Same sample question, answered through the minsearch + Ollama pipeline; the cell displays the returned answer.
rag_ollama('the course already started. Can I still enroll the course?')

'As per your query regarding joining a course that has started, based on this context: "Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace." and considering it is about Data Engineering Zoom Camp, even if a certain date details such as year or starting time are missing from the provided text. However to enroll in any particular session of this specific \'Data-Engineering-Zoomcamp\' after its start isn\'t explicitly mentioned within context thus further information can be required for accurate answer regarding it. For general course based questions related queries, here you go:\nYes - even if a certain date is unavailable/missing from the provided text in our context as per this quote "yes-evenifyoudon’ttregisteryoushouldbeeligibletosubmitthematerialsofcourse"\nNo (based on above general course answer) – for specific enrolling questions like after starting or timings, further details can be required from the official course sourc

# Search with Elasticsearch

In [28]:
from elasticsearch import Elasticsearch

In [29]:
# Create the Elasticsearch client, pointed at the node on 127.0.0.1:9200.
es_client = Elasticsearch('http://127.0.0.1:9200')

In [30]:
# Fetch basic cluster information; useful as a quick check that the node is reachable.
es_client.info()

ObjectApiResponse({'name': '1d49b5fa33b3', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'zOAQjRsURd2hiFNlYGTbVQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [31]:
# Index definition: a single shard with no replicas, full-text mappings for
# the three searchable fields, and `course` as a keyword (the field that
# elastic_query filters on with a term query).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}
index_name = 'course_question'

# Drop any index left over from an earlier run so this cell can be re-run:
# creating an index that already exists is rejected by Elasticsearch.
# ignore_unavailable=True makes the delete a no-op on the first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_question'})

In [32]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Index the documents one request at a time. tqdm (imported in the preceding
# cell) wraps the iterable so the loop shows progress instead of running silently.
# NOTE(review): no explicit document ids are passed, so re-running this cell
# against an already-populated index presumably adds the documents again.
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

In [34]:
def elastic_query(query):
    """Search the Elasticsearch index for ``query`` and return the matching documents.

    The request asks for 5 hits, matches ``query`` against ``question``
    (boosted ``^3``), ``text`` and ``section``, and filters on the
    ``data-engineering-zoomcamp`` course.

    Returns:
        A list with the ``_source`` of every hit in the response.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {"term": {"course": "data-engineering-zoomcamp"}}
    search_query = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }
    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [38]:
def rag_elastic_ollama(query):
    """Answer ``query`` with Elasticsearch retrieval and the Ollama-served model.

    Returns whatever ``llm_ollama`` returns for the assembled prompt.
    """
    retrieved = elastic_query(query)
    return llm_ollama(build_prompt(query, retrieved))

In [40]:
# Define the question in this cell too: `query` is otherwise only assigned in
# a cell further down, so on a fresh kernel (Restart & Run All) this line
# raised NameError.
query = 'the course already started. Can I still enroll the course?'
search_results = elastic_query(query)

In [39]:
# Sample question, answered through the Elasticsearch + Ollama pipeline; the cell displays the returned answer.
query = 'the course already started. Can I still enroll the course?'
rag_elastic_ollama(query)

"Based on your question, it is possible that you could still enroll in a started Data Engineering Zoom Camp. However, there are considerations due to project deadlines and individual comfort with prerequisites. It might be advisable for potential learners to review pre-course materials like dependencies setup instructions first (including Google Cloud account and software installs) as well assortment of course syllabus before the officially start date, in order to better prepare themselves prior enrollment into the self-paced mode.\nIt's also important for potential learners who opt not finish/participate in courses within allotted deadlines and requirements by the endorsing community help maintain course integrity - such as ensuring sDocker containers exit codes would not impact their access to answers or materials shared on channels like Slack, FAQ documentations etc. There's even a potential for learners themselves to contribute back (like improvements in documentation/instrucción) 