In [1]:
import sys
import os

## Root project dir: export the PROJECT_DIR environment variable to override
## it, or replace the fallback path below.
PROJECT_DIR = os.environ.get(
    "PROJECT_DIR", "/mnt/workspace/__ing/llming/DTC/course"
)
## Guarded append: re-running this cell must not stack duplicate sys.path
## entries.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.huggingface import (
    setup_hf_cache_dir,
    setup_transformers_cache_dir,
    setup_sentence_transformers_cache_dir,
    vectorize_sentences
)
from utils.utils import load_json_document

from utils.elasticsearch import (
    create_elasticsearch_client,
    search_elasticsearch_indecis,
    load_index_settings,
    create_elasticsearch_index,
    remove_elasticsearch_index,
    index_documents,
    knn_elastic_search,
    get_index_mapping,
)

from utils.query import rag

## Resolve every cache location first, then hand each one to its setup helper.
## NOTE(review): presumably this has to run before the transformers /
## sentence_transformers imports that follow, so those libraries pick up the
## configured locations -- confirm against utils.huggingface.
hf_home_dir = os.path.join(PROJECT_DIR, "hf_cache")
transformers_cache_dir = os.path.join(PROJECT_DIR, "hf_cache/transformers_cache")
## The sentence-transformers cache is pointed at the same directory as the
## transformers cache.
sentence_transformers_cache_dir = os.path.join(PROJECT_DIR, "hf_cache/transformers_cache")

## HF_HOME
setup_hf_cache_dir(hf_home_dir)
## TRANSFORMERS_CACHE
setup_transformers_cache_dir(transformers_cache_dir)
## SENTENCE_TRANSFORMERS_HOME
setup_sentence_transformers_cache_dir(sentence_transformers_cache_dir)

from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from openai import OpenAI

HuggingFace cache directory
($HF_HOME) has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache

HuggingFace transformers cache directory 
($TRANSFORMERS_CACHE) has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache/transformers_cache

HuggingFace sentenct transformers cache directory
($SENTENCE_TRANSFORMERS_HOME) has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache/transformers_cache



# Query & Documents

In [2]:
## Natural-language question that is embedded and searched for further down.
query = "How many hours per week?"

In [3]:
## Location of the documents JSON inside the project directory.
document_path = f'{PROJECT_DIR}/data/1/documents.json'

documents = load_json_document(document_path)

## Preview two sample records, separated by a blank line.
## A named loop variable is used instead of `_`: in IPython `_` refers to the
## previous cell output, and assigning to it shadows that shortcut.
for doc in documents[10:12]:
    print(doc, end="\n\n")

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.', 'section': 'General course-related questions', 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?', 'course': 'data-engineering-zoomcamp'}

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.", 'section': 'General course-related questions', 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?', 'course': 'data-engineering-zoomcamp'}



# Elasticsearch Client

In [4]:
## Connection settings for the Elasticsearch instance.
host, port = "localhost", 9200

## Target index and the JSON settings used when the index is (re)created.
index_name = "course-questions"
index_settings_path = os.path.join(
    PROJECT_DIR,
    "config/elasticsearch/course_qa_vec_index_settings.json",
)
index_settings = load_index_settings(index_settings_path)

## Set to True to drop, recreate and re-populate the index in the cells below.
recreate_index = False

es_client = create_elasticsearch_client(host, port)
## Last expression of the cell, so its value is rendered as the cell output.
search_elasticsearch_indecis(es_client)

Connected to Elasticsearch


['course-questions']

In [5]:
## Only when a rebuild was requested: remove the existing index, then create
## it again from the loaded index settings. The order matters (remove first).
if recreate_index:
    remove_elasticsearch_index(es_client, index_name)
    create_elasticsearch_index(es_client, index_name, index_settings)

# Sentence Transformer

In [6]:
## Sentence-embedding model, selected by name; it is used for both the
## documents and the query.
embedding_model_name = "all-mpnet-base-v2"
model = SentenceTransformer(embedding_model_name)

In [7]:
## Document field passed to the vectorizer; the resulting embedding is read
## back below from the "<field>_vector" key.
field_to_embed = "text"

## The documents are only embedded when the index is being rebuilt.
if recreate_index:
    vectorized_documents = vectorize_sentences(
        model, documents, field=field_to_embed
    )

    ## Sanity-check the first record: its keys and the embedding length.
    first_vectorized = vectorized_documents[0]
    print("New Fields:", first_vectorized.keys())
    ## len() is the idiomatic spelling of a direct .__len__() call.
    print("Embedding shape:", len(first_vectorized[f"{field_to_embed}_vector"]))

# Index Vectorized Documents

In [8]:
## Only when a rebuild was requested: index the vectorized documents.
## `vectorized_documents` is defined under the same `recreate_index` guard in
## an earlier cell.
if recreate_index:
    index_documents(es_client, index_name, vectorized_documents)

In [9]:
## Print a header, then the mapping reported for the index.
print("Index Mapping:")
index_mapping = get_index_mapping(es_client, index_name)
print(index_mapping)

Index Mapping:
{'course': 'keyword', 'question': 'text', 'section': 'text', 'text': 'text', 'text_vector': 'dense_vector'}


# Query

In [10]:
## Encode the question with the same model that is used to vectorize the
## documents; the result is passed to the kNN search as `query_vector`.
query_vector = model.encode(query)

In [11]:
## Search inputs: a section filter, the vector field to search on, and the
## k / result-count settings forwarded to the helper.
## NOTE(review): how `k` and `num_results` are interpreted is decided inside
## utils.elasticsearch.knn_elastic_search -- confirm there.
filter_dict = {"section": "General course-related questions"}
field = f"{field_to_embed}_vector"
k = 5
num_results = 1

## Keyword arguments collected once so the same search can be reused.
knn_search_params = {
    "es_client": es_client,
    "index_name": index_name,
    "query_vector": query_vector,
    "filter_dict": filter_dict,
    "k": k,
    "field": field,
    "num_results": num_results,
}

## Last expression of the cell, so the hits are rendered as the cell output.
knn_elastic_search(**knn_search_params)


[{'_index': 'course-questions',
  '_id': 'COkP8JAB_XI7s67Kbb22',
  '_score': 11.718761,
  '_source': {'text': 'Around ~10 hours per week. Timur Kamaliev did a detailed analysis of how much time students of the previous cohort needed to spend on different modules and projects. Full article',
   'section': 'General course-related questions',
   'question': 'How much time do I need for this course?',
   'course': 'machine-learning-zoomcamp'}}]

# DUMP