In [1]:
import sys
import os
import pandas as pd
import pickle

## Root project dir: set the PROJECT_DIR environment variable to override,
## or edit the fallback path below.
PROJECT_DIR = os.environ.get("PROJECT_DIR", "/mnt/workspace/__ing/llming/DTC/course")
## Guard the append so re-running this cell in the same kernel does not add
## duplicate entries to sys.path.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.huggingface import (
    setup_hf_cache_dir,
    setup_transformers_cache_dir,
    setup_sentence_transformers_cache_dir,
    vectorize_sentences,
)
from utils.utils import (
    initialize_env_variables,
    load_json_document,
    id_documents,
)

from utils.elasticsearch import (
    elastic_search,
    create_elasticsearch_client,
    search_elasticsearch_indecis,
    load_index_settings,
    create_elasticsearch_index,
    remove_elasticsearch_index,
    index_documents,
    knn_elastic_search,
    get_index_mapping,
    
)

from utils.query import (
    search,
    build_prompt,
    llm,
    rag,
)

from utils.groundtruth import generate_questions_using_openai

from utils.evaluate import (
    calculate_relevance,
    hit_rate,
    mrr
)

from utils import minsearch

from utils.ollama import (
    embed_documents,
    get_embedding,
)

## Point the HuggingFace caches at directories inside the project.
## HF_HOME
setup_hf_cache_dir(os.path.join(PROJECT_DIR, "hf_cache"))
## TRANSFORMERS_CACHE
setup_transformers_cache_dir(os.path.join(PROJECT_DIR, "hf_cache/transformers_cache"))
## SENTENCE_TRANSFORMERS_HOME (same directory as the transformers cache)
setup_sentence_transformers_cache_dir(os.path.join(PROJECT_DIR, "hf_cache/transformers_cache"))

from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from openai import OpenAI

## Load the environment variables for this project (the recorded output
## reports the .env file under PROJECT_DIR).
initialize_env_variables(PROJECT_DIR)
## Default OpenAI client.
## NOTE(review): presumably picks up its API key from the environment
## initialised on the previous line - confirm.
client = OpenAI()

HuggingFace cache directory
($HF_HOME) has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache

HuggingFace transformers cache directory 
($TRANSFORMERS_CACHE) has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache/transformers_cache

HuggingFace sentenct transformers cache directory
($SENTENCE_TRANSFORMERS_HOME) has been set to: /mnt/workspace/__ing/llming/DTC/course/hf_cache/transformers_cache

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/course/.env


# Query & Documents

In [2]:
## Example user question used for the search demos below.
query = "How many hours per week?"

In [3]:
## Load the course Q&A documents and preview two of them.
document_path = f'{PROJECT_DIR}/data/1/documents.json'

documents = load_json_document(document_path)

## Use a real loop-variable name: binding `_` would shadow IPython's
## "last output" variable for the rest of the session.
for doc in documents[10:12]:
    print(doc, end="\n\n")

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.', 'section': 'General course-related questions', 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?', 'course': 'data-engineering-zoomcamp'}

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.", 'section': 'General course-related questions', 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?', 'course': 'data-engineering-zoomcamp'}



# Elasticsearch Client

In [2]:
## Connection details and index configuration.
host, port = "localhost", 9200
index_name = "course-questions"

## Settings file used when (re)creating the index.
index_settings_path = os.path.join(
    PROJECT_DIR,
    "config/elasticsearch/course_qa_vec_index_settings.json",
)
index_settings = load_index_settings(index_settings_path)

## Keep False to reuse an existing index; later cells rebuild it when True.
recreate_index = False

es_client = create_elasticsearch_client(host, port)
## Last expression of the cell: its return value is what the notebook displays.
search_elasticsearch_indecis(es_client)

ElasticsearchConnectionError: Could not connect to Elasticsearch

In [5]:
## Rebuild the index from scratch only when explicitly requested:
## remove the index named `index_name`, then create it again with `index_settings`.
if recreate_index:
    remove_elasticsearch_index(es_client, index_name)
    create_elasticsearch_index(es_client, index_name, index_settings)

# Sentence Transformer

In [6]:
## Sentence-embedding model used to vectorize documents and queries.
model = SentenceTransformer("all-mpnet-base-v2")

In [7]:
## Document field whose content is embedded; the vector is stored under
## "<field>_vector" (see the lookup below).
field_to_embed = "text"

## Embeddings are only computed when the index is being rebuilt.
if recreate_index:
    vectorized_documents = vectorize_sentences(
        model, documents, field=field_to_embed
    )

    print("New Fields:", vectorized_documents[0].keys())
    ## Built-in len() instead of calling the __len__ dunder directly.
    print("Embedding shape:", len(vectorized_documents[0][f"{field_to_embed}_vector"]))

# Index Vectorized Documents

In [8]:
## Index the vectorized documents only when the index was rebuilt above;
## `vectorized_documents` is defined under the same `recreate_index` guard.
if recreate_index:
    index_documents(es_client, index_name, vectorized_documents)

In [9]:
## Show the index mapping; the recorded output is a dict of field name to
## field type.
print("Index Mapping:")
print(get_index_mapping(es_client, index_name))

Index Mapping:
{'course': 'keyword', 'question': 'text', 'section': 'text', 'text': 'text', 'text_vector': 'dense_vector'}


# Query

In [10]:
## Embed the query with the same model used for the documents.
query_vector = model.encode(query)

In [11]:
## kNN search over the embedded field, filtered to a single section.
filter_dict = {"section": "General course-related questions"}
field = f"{field_to_embed}_vector"
k, num_results = 5, 1

## Keyword arguments collected in one place so the call below stays short.
knn_search_params = {
    "es_client": es_client,
    "index_name": index_name,
    "query_vector": query_vector,
    "filter_dict": filter_dict,
    "k": k,
    "field": field,
    "num_results": num_results,
}

## Last expression of the cell: the hits are displayed as the cell output.
knn_elastic_search(**knn_search_params)


[{'_index': 'course-questions',
  '_id': 'COkP8JAB_XI7s67Kbb22',
  '_score': 11.718761,
  '_source': {'text': 'Around ~10 hours per week. Timur Kamaliev did a detailed analysis of how much time students of the previous cohort needed to spend on different modules and projects. Full article',
   'section': 'General course-related questions',
   'question': 'How much time do I need for this course?',
   'course': 'machine-learning-zoomcamp'}}]

# Evaluation

## Ground Truth Generation

In [4]:
## Generate ground-truth questions for a small sample (first 5 documents).
## NOTE(review): id_documents presumably attaches an "id" to each document - confirm.
documents = id_documents(documents)
prompt_template_path = f"{PROJECT_DIR}/prompts/generate_ground_truth.txt"
test_docs = documents[:5]

generated_questions_df = generate_questions_using_openai(
    client,
    prompt_template_path,
    test_docs,
    model_name="gpt-4o",
)

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
## Display the generated questions (columns in the recorded output:
## question, course, document).
generated_questions_df

Unnamed: 0,question,course,document
0,What is the exact start date and time of the c...,data-engineering-zoomcamp,c02e79ef
1,How can I add the course schedule to my Google...,data-engineering-zoomcamp,c02e79ef
2,Where do I need to register before the course ...,data-engineering-zoomcamp,c02e79ef
3,Is there a communication channel for course an...,data-engineering-zoomcamp,c02e79ef
4,Should I join any other platforms or channels ...,data-engineering-zoomcamp,c02e79ef
5,questions,data-engineering-zoomcamp,1f6520ca
6,Am I eligible to join the course after it has ...,data-engineering-zoomcamp,7842b56a
7,Is it possible to submit homework assignments ...,data-engineering-zoomcamp,7842b56a
8,Are there any deadlines for final projects if ...,data-engineering-zoomcamp,7842b56a
9,Can I register for the course after the offici...,data-engineering-zoomcamp,7842b56a


## Evaluate Text

### Elasticsearch

In [4]:
## Set to True if the index has not been created yet.
recreate_index = True

## NOTE(review): rebinds `documents` to the result of id_documents; if an
## earlier section already did this in the same kernel it is applied again -
## TODO confirm that is harmless.
documents = id_documents(documents)

In [5]:
## Connection details and index configuration for the text evaluation.
host = "localhost"
port = 9200

index_name = "course-questions"
index_settings_path = os.path.join(PROJECT_DIR, "config/elasticsearch/course_qa_id_index_settings.json")
index_settings = load_index_settings(index_settings_path)

es_client = create_elasticsearch_client(host, port)
search_elasticsearch_indecis(es_client)

## Drop, recreate and repopulate the index only when requested.
if recreate_index:
    remove_elasticsearch_index(es_client, index_name)
    create_elasticsearch_index(es_client, index_name, index_settings)
    index_documents(es_client, index_name, documents)

## Sanity check: the fields in the index mapping must match the document keys.
expected_mapping = sorted(documents[0].keys())
actual_mapping = sorted(get_index_mapping(es_client, index_name).keys())

## Include both sides in the message so a mismatch is diagnosable from the
## notebook output alone.
assert expected_mapping == actual_mapping, (
    f"Index mapping mismatch: expected {expected_mapping}, got {actual_mapping}"
)

Connected to Elasticsearch
Successfully removed index course-questions.
Successfully created index course-questions.


  0%|          | 0/948 [00:00<?, ?it/s]

Successfully indexed 948/948 documents in index course-questions


In [6]:
## NOTE(review): this import rebinds `calculate_relevance`, replacing the
## utils.evaluate version imported in the first cell - confirm utils.temp is
## the intended implementation.
from utils.temp import calculate_relevance
from utils.elasticsearch import elastic_search
from utils.query import search

## Evaluation inputs: ground-truth questions and search tuning.
ground_truth_df_path = f'{PROJECT_DIR}/data/3/ground-truth-data.csv'
boost = {"question": 3.0}
num_results = 5

df_ground_truth = pd.read_csv(ground_truth_df_path)

## Keyword arguments forwarded alongside the search callable.
search_callable_params = {
    "es_client": es_client,
    "index_name": index_name,
    "boost": boost,
    "num_results": num_results,
}

## Relevance of Elasticsearch text search against the ground truth.
relevance_total = calculate_relevance(
    df_ground_truth=df_ground_truth,
    search_callable=elastic_search,
    search_callable_params=search_callable_params,
    search_context="elasticsearch",
    query_type="text",
)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [7]:
## Hit rate of the Elasticsearch text search.
print("Hit Rate:", hit_rate(relevance_total))

Hit Rate: 0.7395720769397017


In [8]:
## MRR of the Elasticsearch text search.
print("MRR:", mrr(relevance_total))

MRR: 0.6029788920106625


### Minsearch

In [10]:
## Build the in-memory minsearch index over the same documents.
text_fields = ["question", "text", "section"]
keyword_fields = ["course", "id"]
index = minsearch.Index(text_fields=text_fields, keyword_fields=keyword_fields)

## Last expression of the cell: the return value of fit() is displayed.
index.fit(documents)

<utils.minsearch.Index at 0x7d6e3e1b3110>

In [11]:
## Evaluation inputs: same ground truth and tuning as the Elasticsearch run.
ground_truth_df_path = f'{PROJECT_DIR}/data/3/ground-truth-data.csv'
boost = {"question": 3.0}
num_results = 5

df_ground_truth = pd.read_csv(ground_truth_df_path)

## Keyword arguments forwarded alongside the search callable.
search_callable_params = {
    "index": index,
    "boost": boost,
    "num_results": num_results,
}

## Relevance of minsearch text search against the ground truth.
relevance_total = calculate_relevance(
    df_ground_truth=df_ground_truth,
    search_callable=search,
    search_callable_params=search_callable_params,
    search_context="minsearch",
    query_type="text",
)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [12]:
## Retrieval metrics for the minsearch run.
print("Hit Rate:", hit_rate(relevance_total))
print("MRR:", mrr(relevance_total))

Hit Rate: 0.7668035444132267
MRR: 0.656202723146748


## Evaluate Vectors

In [4]:
## Set to True if the index has not been created yet.
recreate_index = False
documents = id_documents(documents)

## Connection details and index configuration for the vector evaluation.
host, port = "localhost", 9200
index_name = "course-questions"
index_settings_path = os.path.join(
    PROJECT_DIR,
    "config/elasticsearch/course_qa_id_vecs_index_settings.json",
)
index_settings = load_index_settings(index_settings_path)

es_client = create_elasticsearch_client(host, port)
print(search_elasticsearch_indecis(es_client))

## Drop and recreate the index only when requested; documents are indexed in a later cell.
if recreate_index:
    remove_elasticsearch_index(es_client, index_name)
    create_elasticsearch_index(es_client, index_name, index_settings)

Connected to Elasticsearch
['course-questions']


In [5]:
## Rebind `client` to an OpenAI-compatible endpoint on localhost:11434.
## NOTE(review): presumably a local Ollama server; 'ollama' looks like a
## placeholder key rather than a secret - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [6]:
## Embedding model served by the local endpoint, and the on-disk cache for its document vectors.
model_name = "locusai/multi-qa-minilm-l6-cos-v1"
vectorized_documents_path = os.path.join(
    PROJECT_DIR,
    "data/3/vectorized_documents.pkl",
)

In [7]:
## Reuse cached embeddings when the cache file exists; otherwise compute them
## once and write the cache.
## NOTE(review): pickle.load can execute arbitrary code from the file - only
## load a cache file you created yourself.
## NOTE(review): the cache is looked up by path only, so it is not refreshed
## when `model_name` or `documents` change - delete the file to recompute.
if not os.path.exists(vectorized_documents_path):
    vectorized_documents = embed_documents(client, documents, model_name)
    with open(vectorized_documents_path, 'wb') as cache_file:
        pickle.dump(vectorized_documents, cache_file)
else:
    with open(vectorized_documents_path, 'rb') as cache_file:
        vectorized_documents = pickle.load(cache_file)

In [8]:
## Populate the index only when it was rebuilt above.
if recreate_index:
    index_documents(es_client, index_name, vectorized_documents)

## Sanity check: the fields in the index mapping must match the keys of the
## vectorized documents.
expected_mapping = sorted(vectorized_documents[0].keys())
actual_mapping = sorted(get_index_mapping(es_client, index_name).keys())

## Include both sides in the message so a mismatch is diagnosable from the
## notebook output alone.
assert expected_mapping == actual_mapping, (
    f"Index mapping mismatch: expected {expected_mapping}, got {actual_mapping}"
)

In [9]:
## Embed the query with the same model as the documents. Reuse `model_name`
## (set in an earlier cell of this section) instead of repeating the literal,
## so query and document embeddings cannot drift apart.
query = 'When does the course begin?'
query_vector = get_embedding(
    client, query, model_name=model_name
)

In [10]:
## kNN search over the question embeddings, filtered to one course.
field = "question_vector"
filter_dict = {"course": "data-engineering-zoomcamp"}

params_dict = {
    "es_client": es_client,
    "index_name": index_name,
    "query_vector": query_vector,
    "filter_dict": filter_dict,
    "field": field,
    "k": 5,
    "num_results": 5,
}

In [11]:
## Run the kNN search with the parameters assembled above.
results = knn_elastic_search(**params_dict)

In [12]:
## Inspect the first hit.
results[0]

{'_index': 'course-questions',
 '_id': 'wCl39pAB_t9YwEPT8onI',
 '_score': 1.7546118,
 '_source': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 'id': 'c02e79ef'}

In [13]:
## NOTE(review): the utils.temp import rebinds `calculate_relevance`,
## replacing the utils.evaluate version imported in the first cell - confirm
## utils.temp is the intended implementation. `elastic_search` and `search`
## are imported here but not used in this cell.
from utils.elasticsearch import knn_elastic_search
from utils.temp import calculate_relevance
from utils.elasticsearch import elastic_search
from utils.query import search

## Client for the local OpenAI-compatible endpoint, and the embedding model to use.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
model_name = 'locusai/multi-qa-minilm-l6-cos-v1'

## Ground-truth questions to evaluate against.
ground_truth_df_path = f'{PROJECT_DIR}/data/3/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_df_path)

## Search tuning.
num_results, k = 5, 5
field = "question_vector"

## Keyword arguments forwarded alongside the search callable.
search_callable_params = {
    "es_client": es_client,
    "index_name": index_name,
    "k": k,
    "field": field,
    "num_results": num_results,
    "model_name": model_name,
    "client": client,
}

## Relevance of Elasticsearch vector (kNN) search against the ground truth.
relevance_total = calculate_relevance(
    df_ground_truth=df_ground_truth,
    search_callable=knn_elastic_search,
    search_callable_params=search_callable_params,
    search_context="elasticsearch",
    query_type="vector",
)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [14]:
## Retrieval metrics for the vector-search run.
print("Hit Rate:", hit_rate(relevance_total))
print("MRR:", mrr(relevance_total))

Hit Rate: 0.736330235573806
MRR: 0.6540667098912186
