# Import Necessary Function and Library

In [1]:
import minsearch
import json
with open('documents.json','rt') as f_in:
    docs_raw = json.load(f_in)
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [2]:
index = minsearch.Index(
    text_fields = {'question','text','section'},
    keyword_fields=['course']
)
index.fit(documents)
boost = {'question':3.0, 'section':0.5}

In [3]:
def search(query):
    boost = {'question':3.0, 'section':0.5}
    search_results = index.search(
        query=query,
        boost_dict = boost,
        filter_dict = {'couse':'data-engineering-zoomcamp'},
        num_results=5
    )
    return search_results
def build_prompt(query,search_results):
    prompt_template = """
    You are Course Assistant. You answer the question based on context. Use only fact the CONTEXT for answering the question.
    
    QUESTION: {question}
    CONTEXT : 
    {context}
    """.strip()
    context = ""
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion:{doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query,context=search_results).strip()
    return prompt
def llm_open_api(prompt):
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(model='gpt-3.5-turbo-16k',messages = [{'role':'user',"content":prompt}])
    return response.choices[0].message.content

In [4]:
from openai import OpenAI
import sys,os,os.path

In [7]:
api_key = os.environ['OPEN_AI_API_KEY']
client = OpenAI(api_key=api_key)

In [8]:
def llm_open_api(prompt):
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(model='gpt-3.5-turbo-16k',messages = [{'role':'user',"content":prompt}])
    return response.choices[0].message.content

In [9]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query,search_results)
    answer = llm_open_api(prompt)
    return answer

In [10]:
rag('the course already started. Can I still enroll the course?')

"Yes, you can still enroll in the course. You won't be able to submit some of the homeworks, but you can still participate in the course and be eligible for a certificate by completing the required projects and peer reviews."

# Use Ollama to Run Your LLM

In [11]:
from openai import OpenAI

In [12]:
def llm_ollama(prompt):
    client = OpenAI(
    base_url = 'http://localhost:11434/v1/',api_key = 'ollama',)
    response = client.chat.completions.create(model='phi3',messages = [{'role':'user',"content":prompt}])
    return response.choices[0].message.content

In [13]:
def rag_ollama(query):
    search_results = search(query)
    prompt = build_prompt(query,search_results)
    answer = llm_ollama(prompt)
    return answer

In [14]:
rag_ollama('the course already started. Can I still enroll the course?')

'Yes, though late enrollment does have some constraints depending on whether you want a certificate or not. If your main aim is to get certified in machine learning and data science skills that were taught during the course I would advise registering before it officially begins as there could be assignments worth several points only given out at specific rounds of lectures which are then graded after each round, so late enrollees may miss some opportunity for more grades. This information applies to the "machine-learning-zoomcamp". If your priority is getting certified or pass a particular segment in machine learning and AI that you couldn\'t previously due to other commitments now being free from this responsibility, then I recommend joining late as these courses tend not too penalizing for belated enrollees.\nThe "data-engineering-zoomcamp" course officially starts on 15th Jan 2024 at 9:00pm (GMT). If you are able to start working from this date but don\'t want official credit, the m

# Search with ElasticSearch

In [25]:
from elasticsearch import Elasticsearch

In [26]:
es_client = Elasticsearch('http://127.0.0.1:9200')

In [30]:
es_client.info()

ObjectApiResponse({'name': '84c0be41c71a', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'pAnxmPdYTNuanGeuXY3gaQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [32]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
index_name = 'course_question_new'
es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_question_new'})

In [33]:
from tqdm.auto import tqdm

In [35]:
for doc in documents:
    es_client.index(index=index_name,document=doc)

In [36]:
def elastic_query(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name,body=search_query)
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [37]:
def rag_elastic_ollama(query):
    search_results = elastic_query(query)
    prompt = build_prompt(query,search_results)
    answer = llm_ollama(prompt)
    return answer

In [38]:
query = 'the course already started. Can I still enroll the course?'
search_results = elastic_query(query)
print(search_results)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}, {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}, {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to se

In [39]:
query = 'the course already started. Can I still enroll the course?'
rag_elastic_ollama(query)

"Based on the provided context, it seems that you are inquiring about joining a Data Engineering YouTube Course hosted by Google offered via Zoom Camp. The relevant information for this question is:\n\n1. Yes, even if you don’t register, you will still likely be able to submit homeworks with deadlines approaching towards the course end as there's guidance provided in context that 'you can also continue working on your final project.' However, make sure not to leave everything for last minute and consider joining/registering for clarity.\n2. It is essential first-hand knowledge from within each specific course provider should be sought before making an enrolment as most courses will have policies regarding joinings post the start of a class, this was exemplified in context: 'Yes, even if you don’t register' which can suggest potential implications for joining late.\n3. The suggested preparations include installing and setting up dependencies like Google Cloud Account, SDK; Python 3 inst