In [1]:
import sys
import os
import numpy as np
import warnings
from tqdm.auto import tqdm
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from IPython.display import display


# Suppress all warnings
warnings.filterwarnings("ignore")

## replace with root project dir
PROJECT_DIR = "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant"
# Put the project root on the import path so the `utils.*` imports below
# resolve. The membership check keeps this cell idempotent: re-running it no
# longer appends a duplicate entry to sys.path each time.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.utils import (
    initialize_env_variables,
    flatten_list_of_lists,
    sample_from_list,
    read_json_file,
    save_json_file,
    extract_item_by_keys,
    save_to_pickle,
    load_pickle,
    get_json_files_in_dir,
    read_json_file,
    parse_list_response,
    add_key_value
)

initialize_env_variables()

from utils.multithread import map_progress
from utils.variables import INDEX_NAME, ES_CLIENT, OLLAMA_CLIENT
from utils.ollama import get_embedding

from utils.query import (
    elastic_search_text,
    elastic_search_knn,
    elastic_search_hybrid_rrf,
    elastic_search_hybrid_rrf_qr,
    build_prompt,
    llm
)
from utils.evaluate import (
    hit_rate, mrr, retrieve_relevance,
    retrieve_adjusted_relevance, adjusted_hit_rate,
    adjusted_mrr
)

from utils.query_rewrting import rewrite_query

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Connected to Elasticsearch


In [2]:
# Location of the pickled question set, resolved against the project root.
pickle_file_path = os.path.join(PROJECT_DIR, "data/generated_embeddings/vectorized_questions.pkl")

# Question records used throughout the notebook; later cells index into this
# and read each record's "question" field.
vectorized_questions = load_pickle(pickle_file_path)

**(BASELINE) elastic_search_hybrid_rrf:**
* Adjusted HR (hit rate): 0.799145
* Adjusted MRR (mean reciprocal rank): 0.680520

# Rewriting

## Zero-shot Query Rewriting

In [3]:
# Settings for the zero-shot rewriting cells that follow: the technique label,
# the LLM to call, and the prompt template file for this technique.
technique = "zero_shot"
model_choice = "openai/gpt-4o-mini"
prompt_template_path = os.path.join(PROJECT_DIR, "prompts/zero_shot_query_rewriting.txt")

### Example

In [4]:
# query = vectorized_questions[0]['question']
# query

'What concerns does Nicole Perlroth express about the presence of Jeffrey Epstein at MIT?'

In [6]:
# Single-query zero-shot rewriting example (disabled; the output shown below
# this cell is from an earlier run). To re-run it, first uncomment the previous
# cell so that `query` is defined, then uncomment the call below.
# query_rewriting_results = rewrite_query(
#     query=query,
#     prompt_template_path=prompt_template_path,
#     technique=technique,
#     model_choice=model_choice
# )
# query_rewriting_results

["What issues does Nicole Perlroth raise regarding Jeffrey Epstein's association with MIT?",
 'What worries does Nicole Perlroth mention about Jeffrey Epstein being at MIT?',
 "What apprehensions does Nicole Perlroth have about Jeffrey Epstein's presence at MIT?"]

### Compute

In [9]:
# Zero-shot rewriting for every record in `vectorized_questions` (disabled).
# The Evaluate section does not need this cell: it loads
# data/generated_rewriting/vectorized_questions_with_rewriting.pkl instead.
# Uncomment only to regenerate the rewrites; each record triggers one
# rewrite_query call against `model_choice`, fanned out with max_workers=6.
# NOTE(review): add_key_value presumably stores the rewrites on the record
# under the `technique` key ("zero_shot") — confirm in utils.utils.
# vectorized_questions_qr = map_progress(
#     f=lambda question_dict: add_key_value(
#         question_dict,
#         technique,
#         rewrite_query(
#             query=question_dict["question"],
#             prompt_template_path=prompt_template_path,
#             technique=technique,
#             model_choice=model_choice
#         )
#     ),
#     seq=vectorized_questions,
#     max_workers=6,
#     verbose=False,
# )

In [None]:
# Persist the rewritten questions (disabled). This writes the same file that
# the Evaluate section loads, so uncommenting it replaces the cached results
# with whatever `vectorized_questions_qr` currently holds.
# save_to_pickle(
#     vectorized_questions_qr,
#     os.path.join(
#         PROJECT_DIR,
#         "data/generated_rewriting/vectorized_questions_with_rewriting.pkl"
#     )
# )

## HyDE

In [16]:
# Settings for the HyDE rewriting cells that follow. These reuse, and therefore
# overwrite, the same three names set for the zero-shot technique earlier.
technique = "hyde"
model_choice = "openai/gpt-4o-mini"
prompt_template_path = os.path.join(PROJECT_DIR, "prompts/hyde_query_rewriting.txt")

### Example

In [17]:
# query = vectorized_questions[0]['question']
# query

'What concerns does Nicole Perlroth express about the presence of Jeffrey Epstein at MIT?'

In [18]:
# Single-query HyDE example (disabled; the output shown below this cell is from
# an earlier run). To re-run it, first uncomment the previous cell so that
# `query` is defined, then uncomment the call below.
# query_rewriting_results = rewrite_query(
#     query=query,
#     prompt_template_path=prompt_template_path,
#     technique=technique,
#     model_choice=model_choice
# )
# query_rewriting_results

["Nicole Perlroth has expressed significant concerns regarding Jeffrey Epstein's presence at MIT, particularly focusing on the ethical implications and potential influences of his past actions on the institution. She highlights the troubling nature of Epstein's history as a convicted sex offender and the associated risks of normalization that his involvement could pose to the academic environment. Perlroth argues that accepting donations or fostering relationships with individuals like Epstein can undermine the integrity of research institutions, compromise their values, and propagate a culture that tolerates or overlooks serious moral failings. This concern raises broader questions about the accountability of academic institutions in their funding sources and the importance of aligning their partnerships with ethical standards, ensuring that the pursuit of knowledge does not come at the cost of ignoring past transgressions."]

### Compute

In [20]:
# HyDE rewriting for every record (disabled). Unlike the zero-shot compute
# cell, this iterates over `vectorized_questions_qr`, not
# `vectorized_questions`, so `vectorized_questions_qr` must already exist
# (from the zero-shot compute cell or from loading the saved pickle) before
# this is uncommented and run.
# NOTE(review): add_key_value presumably adds the result under the `technique`
# key ("hyde") alongside the existing zero-shot rewrites — confirm in
# utils.utils.
# vectorized_questions_qr = map_progress(
#     f=lambda question_dict: add_key_value(
#         question_dict,
#         technique,
#         rewrite_query(
#             query=question_dict["question"],
#             prompt_template_path=prompt_template_path,
#             technique=technique,
#             model_choice=model_choice
#         )
#     ),
#     seq=vectorized_questions_qr,
#     max_workers=6,
#     verbose=False,
# )

  0%|          | 0/234 [00:00<?, ?it/s]

In [22]:
# Persist the rewritten questions (disabled). The target path is the same one
# used by the save cell in the zero-shot section and read by the Evaluate
# section, so running this overwrites that file with the current
# `vectorized_questions_qr`.
# save_to_pickle(
#     vectorized_questions_qr,
#     os.path.join(
#         PROJECT_DIR,
#         "data/generated_rewriting/vectorized_questions_with_rewriting.pkl"
#     )
# )

# Evaluate

methods = [
    
    "zero_shot",
    "hyde",
    "",
    
]

In [19]:
# Load the previously saved questions-with-rewrites so the evaluation below
# can run without re-executing the (commented-out) rewriting cells.
vectorized_questions_qr = load_pickle(
    os.path.join(PROJECT_DIR, "data/generated_rewriting/vectorized_questions_with_rewriting.pkl")
)

In [25]:
# Search configurations to evaluate. The two lists are parallel: entry k of
# `search_func_keys_list` holds the extra keyword arguments that are forwarded
# to retrieve_adjusted_relevance together with entry k of `search_func_list`.
# NOTE(review): the dict values ("question", "zero_shot", "hyde") look like
# field names to read from each question record — confirm in utils.evaluate.
search_func_list = [
    elastic_search_hybrid_rrf_qr,
    elastic_search_hybrid_rrf_qr,
]
search_func_keys_list = [
    {"query":"question", "query_rewriting_results":"zero_shot"},
    {"query":"question", "query_rewriting_results":"hyde"}
]
# Fail loudly if the parallel lists drift apart instead of silently skipping
# (or mis-pairing) a configuration.
assert len(search_func_list) == len(search_func_keys_list), (
    "search_func_list and search_func_keys_list must have the same length"
)

# One summary row per evaluated configuration.
performance = []

for search_func, search_func_keys in zip(search_func_list, search_func_keys_list):
    relevance = map_progress(
        # The current configuration is bound through lambda defaults so each
        # callable keeps its own search function and keys, rather than looking
        # up a loop variable at call time from a worker.
        f=lambda question_dict, _func=search_func, _keys=search_func_keys: (
            retrieve_adjusted_relevance(
                question_dict=question_dict,
                search_func=_func,
                **_keys
            )
        ),
        seq=vectorized_questions_qr,
        max_workers=6,
        verbose=False
    )

    performance.append(
        {
            "Search": search_func_keys["query_rewriting_results"],
            "Adjusted HR": adjusted_hit_rate(relevance),
            "Adjusted MRR": adjusted_mrr(relevance)
        }
    )
    # Re-display the cumulative table after each configuration so partial
    # results are visible while the remaining ones are still running.
    display(pd.DataFrame(performance))

  0%|          | 0/234 [00:00<?, ?it/s]

Unnamed: 0,Search,Adjusted HR,Adjusted MRR
0,zero_shot,0.777778,0.629274


  0%|          | 0/234 [00:00<?, ?it/s]

Unnamed: 0,Search,Adjusted HR,Adjusted MRR
0,zero_shot,0.777778,0.629274
1,hyde,0.794872,0.63693
