In [1]:
import os
import sys 
import json

## Project root. Set the PROJECT_DIR environment variable to point the
## notebook at another checkout; otherwise the original workspace path is used.
PROJECT_DIR = os.environ.get("PROJECT_DIR") or "/mnt/workspace/__ing/llming/DTC/course"

## Make the project's `utils` package importable. The membership check keeps
## sys.path from accumulating duplicate entries when this cell is re-run.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)


from utils.utils.environment import initialize_env_variables
from utils.rag import minsearch
from utils.utils.data import load_json_document
from utils.rag.query import (
    search,
    build_prompt,
    llm,
    rag
)
from utils.rag.elasticsearch import (
    create_elasticsearch_client,
    search_elasticsearch_indecis,
    load_index_settings,
    create_elasticsearch_index,
    remove_elasticsearch_index,
    index_documents
)
from openai import OpenAI

# Initialize environment variables
# (the cell output reports they are read from the project's .env file).
# This must run before the client is built below.
initialize_env_variables()

# OpenAI client shared by every LLM call in this notebook.
# NOTE(review): presumably picks up its API key from the environment
# initialized above -- confirm against the .env contents.
client = OpenAI()

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/course/.env


# Query & Documents

In [7]:
## Sample user question reused by every retrieval / QA cell below
query = "The course has already started, can I still enroll?"

In [8]:
# FAQ corpus location, relative to the project root.
document_path = f'{PROJECT_DIR}/data/1/documents.json'

documents = load_json_document(document_path)

# Preview two records. The loop variable is a real name rather than `_`:
# in IPython `_` holds the previous cell output, and assigning to it
# stops IPython from updating it for the rest of the session.
for doc in documents[10:12]:
    print(doc, end="\n\n")

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.', 'section': 'General course-related questions', 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?', 'course': 'data-engineering-zoomcamp'}

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.", 'section': 'General course-related questions', 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?', 'course': 'data-engineering-zoomcamp'}



# QA

## 1. No Context

In [4]:
# Baseline: send the bare question to the model with no retrieved context.
response = client.chat.completions.create(
    messages=[dict(role="user", content=query)],
    model='gpt-4o',
)

# Only the text of the first completion choice is shown.
first_choice = response.choices[0]
print(first_choice.message.content)

It's often possible to enroll in a course even after it has started, but this can depend on several factors including the policies of the institution offering the course, the specific course itself, and how far along the course is. Here are some steps you can take to inquire about late enrollment:

1. **Contact the Instructor or Department:**
   - Reach out to the course instructor or the department offering the course. Explain your situation and ask if late enrollment is an option.

2. **Check the Enrollment Policies:**
   - Review the enrollment policies on the institution's website. Some institutions have official add/drop periods during which you can still register for courses without penalty.

3. **Administrative Office:**
   - Visit or contact the registrar's office or the admissions office. They can provide detailed information on late enrollment policies and help you with the process.

4. **Prepare Your Case:**
   - Be prepared to explain why you are enrolling late and how you 

## 2. RAG using minsearch


!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [5]:
# In-memory minsearch index over the FAQ records.
# `course` is registered as a keyword field; it is the field used in
# `filter_dict` further down.
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])

# Left as the last expression so the fitted index repr is displayed.
index.fit(documents)

<utils.rag.minsearch.Index at 0x7feb805f99d0>

In [6]:
## Per-field weights relative to `text`: a match in `question` counts
## three times as much, a match in `section` half as much.
boost = dict(question=3.0, section=0.5)

## Restrict the search to documents of this course
filter_dict = dict(course='data-engineering-zoomcamp')

## How many documents to retrieve
num_results = 5

In [7]:
# Retrieve the top `num_results` FAQ records for the question.
search_results = search(query, index, filter_dict, boost, num_results)

# Named loop variable instead of `_`, so IPython's last-output variable
# is not overwritten by this preview loop.
for result in search_results:
    print(result, end="\n\n")

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}

{'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'}

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The

In [8]:
# Prompt template file, relative to the project root.
prompt_template_path = f"{PROJECT_DIR}/prompts/course_qa.txt"

# Combine the question and the retrieved records into the final prompt,
# then print it for inspection.
prompt = build_prompt(query, search_results, prompt_template_path)
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: The course has already started, can I still enroll?

CONTEXT: 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related 

In [9]:
# Model used for the context-grounded answer.
model = 'gpt-4o'

# Send the assembled prompt to the model and show the reply.
response = llm(client, prompt, model)
print(response)

Yes, you can still enroll even after the course has started. You are eligible to submit the homeworks, but be mindful of the deadlines for the final projects, as it is important not to leave everything until the last minute.


In [10]:
# End-to-end RAG call against the minsearch index; no `search_context`
# is passed here.
rag_params = {
    "query": query,
    "index": index,
    "filter_dict": filter_dict,
    "boost": boost,
    "num_results": num_results,
    "prompt_template_path": prompt_template_path,
    "client": client,
    "model": model,
}

response = rag(**rag_params)
print(response)

Yes, you can still enroll in the course even after it has started. You are eligible to submit the homework assignments, but keep in mind that there will be deadlines for turning in the final projects, so it's advisable not to delay your work until the last minute.


## 3. RAG using Elastic Search

### 3.1 Client & Index Creation

In [2]:
# Elasticsearch endpoint.
host, port = "localhost", 9200

# Target index name and the JSON file holding its settings.
index_name = "course-questions"
index_settings_path = f"{PROJECT_DIR}/config/elasticsearch/course_qa_index_settings.json"
index_settings = load_index_settings(index_settings_path)

# Connect last, after the settings file has been loaded.
es_client = create_elasticsearch_client(host, port)

Connected to Elasticsearch


In [3]:
# Remove any existing index of this name before it is created again below.
remove_elasticsearch_index(es_client, index_name)

Found no index with name course-questions, nothing to remove.


In [4]:
# Create the index using the settings loaded from the JSON config.
create_elasticsearch_index(es_client, index_name, index_settings)

Successfully created index course-questions.


In [5]:
# List the indices visible to the client, one per line. A named loop
# variable is used so IPython's `_` (last output) is not overwritten.
for es_index_name in search_elasticsearch_indecis(es_client):
    print(es_index_name)

course-questions


### 3.2 Indexing Documents

In [9]:
# Index the FAQ records held in `documents` into the `index_name` index.
index_documents(es_client, index_name, documents)

0it [00:00, ?it/s]

Successfully indexed 948/948 documents in index course-questions


### 3.3 Querying

In [10]:
# RAG with Elasticsearch as the retrieval backend.
search_context = 'elasticsearch'
model = 'gpt-4o'

# Retrieval settings, redefined here so this section does not depend on
# the minsearch section having been run.
boost = dict(question=3.0, section=0.5)
filter_dict = dict(course='data-engineering-zoomcamp')
num_results = 5
prompt_template_path = f"{PROJECT_DIR}/prompts/course_qa.txt"

rag_params = {
    "es_client": es_client,
    "query": query,
    "index_name": index_name,
    "filter_dict": filter_dict,
    "boost": boost,
    "num_results": num_results,
    "prompt_template_path": prompt_template_path,
    "client": client,
    "model": model,
    "search_context": search_context,
}

response = rag(**rag_params)
print(response)

Yes, you can still enroll in the course even after it has started. However, be mindful of the deadlines for submitting final projects to ensure you don’t leave everything to the last minute.


In [11]:
# RAG with minsearch selected explicitly through `search_context`.
search_context = 'minsearch'
model = 'gpt-4o'

# Retrieval settings, repeated so the cell does not depend on earlier cells.
boost = dict(question=3.0, section=0.5)
filter_dict = dict(course='data-engineering-zoomcamp')
num_results = 5
prompt_template_path = f"{PROJECT_DIR}/prompts/course_qa.txt"

# Rebuild and fit the in-memory index over the FAQ records.
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])
index.fit(documents)

rag_params = {
    "query": query,
    "index": index,
    "filter_dict": filter_dict,
    "boost": boost,
    "num_results": num_results,
    "prompt_template_path": prompt_template_path,
    "client": client,
    "model": model,
    "search_context": search_context,
}

response = rag(**rag_params)
print(response)

Yes, you can still enroll in the course even after it has started. You are eligible to submit the homework assignments, but keep in mind that there will be deadlines for turning in the final projects, so it's advisable not to leave everything until the last minute.


In [5]:
# Negative example: an unsupported `search_context` makes `rag` raise
# SearchContextWrongValueError. The error is caught and printed so the
# demonstration stays visible without aborting "Restart & Run All".
search_context = 'NOT_A_SEARCH'
model = 'gpt-4o'
boost = {'question': 3.0, 'section': 0.5}
filter_dict={'course': 'data-engineering-zoomcamp'}
num_results = 5
prompt_template_path = f"{PROJECT_DIR}/prompts/course_qa.txt"

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

rag_params = dict(
    query=query,
    index=index,
    filter_dict=filter_dict,
    boost=boost,
    num_results=num_results,
    prompt_template_path=prompt_template_path,
    client=client,
    model=model,
    search_context=search_context
)

try:
    response = rag(**rag_params)
except Exception as exc:
    # The error class is not imported in this notebook, so it is matched
    # by name; any other failure is re-raised rather than hidden.
    if type(exc).__name__ != "SearchContextWrongValueError":
        raise
    print(f"{type(exc).__name__}: {exc}")
else:
    print(response)

SearchContextWrongValueError: Parameter search_context value must be in ['minsearch', 'elasticsearch'] or None