In [13]:
import sys
import os

## Root project directory. Export PROJECT_DIR before starting the kernel to
## override it; otherwise the original default location below is used.
PROJECT_DIR = os.environ.get("PROJECT_DIR", "/mnt/workspace/__ing/llming/DTC/course")
## Guarded append: re-running this cell must not stack duplicate sys.path entries.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.huggingface.huggingface import setup_hf_cache_dir
from utils.utils.data import load_json_document

from utils.rag.elasticsearch import (
    create_elasticsearch_client,
    search_elasticsearch_indecis,
    load_index_settings,
    create_elasticsearch_index,
    remove_elasticsearch_index,
    index_documents,
)

from utils.rag.query import (
    rag,
)

from transformers import T5Tokenizer, T5ForConditionalGeneration




## Point the Hugging Face cache at <PROJECT_DIR>/hf_cache.
## NOTE(review): this runs after `from transformers import ...` above. If the
## helper works by setting environment variables, confirm they are still
## honoured once transformers has already been imported.
hf_cache_dir = os.path.join(PROJECT_DIR, "hf_cache")
setup_hf_cache_dir(hf_cache_dir)

HuggingFace cache directory has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache


# Query & Documents

In [17]:
## User question that is sent through the RAG pipeline further down.
query = "The course has already started, can I still enroll?"

In [9]:
## Location of the documents file inside the project tree (built with
## os.path.join, like the other project paths in this notebook).
document_path = os.path.join(PROJECT_DIR, "data", "1", "documents.json")

documents = load_json_document(document_path)

## Preview two entries as a sanity check. A named loop variable is used instead
## of `_`, because assigning `_` at notebook top level shadows IPython's
## "last output" variable for the rest of the session.
for doc in documents[10:12]:
    print(doc, end="\n\n")

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.', 'section': 'General course-related questions', 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?', 'course': 'data-engineering-zoomcamp'}

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.", 'section': 'General course-related questions', 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?', 'course': 'data-engineering-zoomcamp'}



# Elasticsearch Client

In [10]:
## Connection settings for the Elasticsearch node.
host, port = "localhost", 9200

## Target index and the settings file used to create it.
index_name = "course-questions"
index_settings_path = f"{PROJECT_DIR}/config/elasticsearch/course_qa_index_settings.json"
index_settings = load_index_settings(index_settings_path)

## Connect, then remove and recreate the index before loading documents.
es_client = create_elasticsearch_client(host, port)
remove_elasticsearch_index(es_client, index_name)
create_elasticsearch_index(es_client, index_name, index_settings)

## Load the documents into the freshly created index.
index_documents(es_client, index_name, documents)

Connected to Elasticsearch
Successfully removed index course-questions.
Successfully created index course-questions.


0it [00:00, ?it/s]

Successfully indexed 948/948 documents in index course-questions


# HuggingFace models

## 1. flan-t5-small

In [18]:
## Model identifier handed to `rag` as `model`.
model_name = "google/flan-t5-small"

## Retrieval settings.
## NOTE(review): `boost` presumably holds per-field weights and `filter_dict`
## restricts results to one course -- confirm against utils.rag.query.rag.
search_context = 'elasticsearch'
boost = {'question': 3.0, 'section': 0.5}
filter_dict = {'course': 'data-engineering-zoomcamp'}
num_results = 5
prompt_template_path = os.path.join(PROJECT_DIR, "prompts/course_qa.txt")

## Every argument for `rag`, gathered in one mapping and unpacked as keywords.
rag_params = {
    "es_client": es_client,
    "query": query,
    "index_name": index_name,
    "filter_dict": filter_dict,
    "boost": boost,
    "num_results": num_results,
    "prompt_template_path": prompt_template_path,
    "model": model_name,
    "search_context": search_context,
}

response = rag(**rag_params)
print(response)

Registration is just to gauge interest before the start date.
