In [1]:
import concurrent
import os
import sys

import requests
from requests import Response
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pandas as pd
import datetime
import json

curr_datetime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

env = "local"
domain = "user_sh8998"
domain_config_version = "2023-10-20"
project_domain = "care"
src_path = __file__.split("evaluation_framework")[0]
documents_dir_location = os.path.join(src_path, "data", project_domain, "docstore")
groundtruth_location = os.path.join(src_path, "data", project_domain, "groundtruth_data", "QnA_subset.xlsx")
groundtruth_question_key = "bcss_question"
generated_answer_key = "generated_answer"
init_chunks_key = "init_chunks"
reranker_chunks_key = "reranker_chunks"
df_output_location = os.path.join(src_path, "data", project_domain, "experiment_results", f"{domain}-{domain_config_version}_{curr_datetime}_output.xlsx")
metrics_to_calculate = ["chunks", "rouge", "bert"]
metrics_output_location = os.path.join(src_path, "data", project_domain, "experiment_results", f"{domain}-{domain_config_version}_{curr_datetime}_metrics.csv")
config_path = os.path.join(src_path, "evaluation_framework", "exp_config_samples", "trial_config.json")
batch_size = 5
concurrency = 3

code_base_version = "v1"  # don't change unless we actually upgraded to something new or are doing isolated testing for new codebase
env = "web" if env == "prod" else env  # don't change this

ingest_url = rf"http://localhost:8802/{code_base_version}/domain-services/upload-documents?domain={domain}&config_version={domain_config_version}"
print(fr"Loading files at [{documents_dir_location}]")


chat_url = rf"http://localhost:8802/{code_base_version}/domain-services/chat"

similarity_search_url = rf"http://localhost:8802/{code_base_version}/domain-services/similarity-search"

# TODO: Update to use blob client once implemented
local_files = os.listdir(documents_dir_location)

def send_post_request_with_files(post_url: str, post_files: list[str]) -> tuple[Response, int]:
    files_post_list = []
    files_to_close = []
    for el in post_files:
        opened_file = open(fr"{documents_dir_location}/{el}", "rb")
        files_post_list.append(("files", (el, opened_file)))
        files_to_close.append(opened_file)

    res = requests.post(post_url, files=files_post_list)
    for entry in files_to_close:
        entry.close()
    return res, len(post_files)

def send_post_request_json_body(post_url: str, json_body: dict) -> Response:
    res = requests.post(post_url, json=json_body)
    return res




Loading files at [c:\Users\sh8998\DataScience_Experiment_Framework\data_science_experiments\care\docstore]


In [3]:

# Ingest documents
pbar_total = len(local_files)
with tqdm(total=pbar_total, desc=rf"Uploading {domain} documents..") as pbar:
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = []
        files = []
        for _file in local_files:
            files.append(_file)
            if len(files) % batch_size == 0:
                futures.append(executor.submit(send_post_request_with_files, ingest_url, files))
                files = []
        if len(files) > 0:
            futures.append(executor.submit(send_post_request_with_files, ingest_url, files))
        for future in concurrent.futures.as_completed(futures):
            response, files_size = future.result()
            if response.status_code != 200:
                pbar.write(fr"{response.content}")
            pbar.update(files_size)


Uploading user_sh8998 documents..: 100%|██████████| 6/6 [00:00<00:00, 86.19it/s]


In [4]:
## Note: the below is to get the initial chunks from a simple similarity search, without 
groundtruth_df = pd.read_excel(groundtruth_location)
questions_to_test = groundtruth_df[groundtruth_question_key]
llm_responses = []
initial_chunks = []
initial_sources = []
reranked_chunks = []
reranked_sources = []
domain = "user_sh8998"
domain_config_version = "2023-10-20"
question_col = "bcss_question"
generated_answer_col = "generated_answer"

Get Topk from VectorDB

In [5]:
print("Sending questions to similarity search endpoint...")
pbar_chat_len = len(questions_to_test)
init_chunks = 'init_chunks'
with tqdm(total=pbar_chat_len, desc=rf"Running {domain} queries against vector DB..") as pbar:
    for index, row in groundtruth_df.iterrows():
        # Send POST request
        body = {'domain': domain, 'config_version': domain_config_version, 'query': row[question_col]}
        sim_search_response = send_post_request_json_body(similarity_search_url, body)

        # TODO: refactor into helper method
        chunks_per_response = []
        sources_per_response = []
        for i, citation in enumerate(sim_search_response.json()['response']):
            chunks_per_response.append((citation[0]['page_content'], os.path.basename(citation[0]['metadata']["source"])))
        groundtruth_df.at[index, init_chunks] = str(chunks_per_response)
        if sim_search_response.status_code != 200:
            pbar.write(fr"{sim_search_response.content}")
        pbar.update()

Sending questions to similarity search endpoint...


  groundtruth_df.at[index, init_chunks] = str(chunks_per_response)
Running user_sh8998 queries against vector DB..: 100%|██████████| 10/10 [00:24<00:00,  2.44s/it]


In [6]:
groundtruth_df.head(2)

Unnamed: 0,id,Total Views,source,bcss_question,answer,init_chunks
0,10161,15747,Access Point Name (APN) and Internet Protocol....,What is an APN?,Access Point Names (APNs)\nAn APN is a gateway...,"[(""Jump to: Access Point Names (APNs) | Defini..."
1,10167,14643,Account Notes - BCSS.html,What should I include in account notes?,The reason for the call.\nRecommendations prov...,[('Jump to: Requirements | Guidelines | Clari...


In [7]:
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
from evaluation.evaluator import evaluator

curr_path = os.getcwd()
config_path = f"{curr_path}/trial_config.json"
domain_params = json.load(open(config_path))

  from .autonotebook import tqdm as notebook_tqdm


Scoring the chunks

In [8]:
# To disable reranker/initial chunks scoring, send the column name as ""
is_reranker_enabled = domain_params.get("reranker_class_name", None) is not None
is_reranker_enabled = domain_params.get("k_reranker", 1000) < domain_params.get("k_milvus", 0)

data_df, metrics_df = evaluator.calculate_metrics(dataframe=groundtruth_df.copy(), metrics=["chunks"], is_reranker_enabled=is_reranker_enabled, reranker_chunks = "", init_chunks="init_chunks")

[32m2023-10-22 18:26:24.130[0m | [1mINFO    [0m | [36mevaluation.evaluator[0m:[36mcalculate_metrics[0m:[36m63[0m - [1mInitial chunks scoring is calculated using init_chunks column![0m
[32m2023-10-22 18:26:24.130[0m | [1mINFO    [0m | [36mevaluation.evaluator[0m:[36mcalculate_metrics[0m:[36m64[0m - [1mChunks metrics are generated![0m


Generate Answers

In [9]:
def process_LLMResp(LLResponse):
        result = {}
        result["answer"] = LLResponse.response
        result["chunks"] = []
        for i, citation in enumerate(LLResponse.citations):
            result["chunks"].append((citation['page_content'], os.path.basename(citation["metadata"]["source"])))
        return result

chunk_col_name = "reranker_chunks"
pbar_chat_len = len(questions_to_test)
with tqdm(total=pbar_chat_len, desc=rf"Running {domain} queries against LLM chat..") as pbar:
    for index, row in groundtruth_df.iterrows():
        # Send POST request
        print(row[question_col])
        body = {'domain': domain, 'config_version': domain_config_version, 'query': row[question_col]}
        llm_response = send_post_request_json_body(chat_url, body)
        groundtruth_df.at[index, generated_answer_col] = llm_response.json()['response']
        chunks_per_response = []
        for citation in llm_response.json()['citations']:
             chunks_per_response.append((citation['page_content'], os.path.basename(citation['metadata']["source"])))
        groundtruth_df.at[index, chunk_col_name] = str(chunks_per_response)
        
        if llm_response.status_code != 200:
            pbar.write(fr"{llm_response.content}")
        pbar.update()

Running user_sh8998 queries against LLM chat..:   0%|          | 0/10 [00:00<?, ?it/s]

What is an APN? 


  groundtruth_df.at[index, generated_answer_col] = llm_response.json()['response']
  groundtruth_df.at[index, chunk_col_name] = str(chunks_per_response)
Running user_sh8998 queries against LLM chat..:  10%|█         | 1/10 [00:07<01:05,  7.27s/it]

What should I include in account notes? 


Running user_sh8998 queries against LLM chat..:  20%|██        | 2/10 [00:14<00:57,  7.20s/it]

How do I answer if a customer asks why their 3G device stopped working?


Running user_sh8998 queries against LLM chat..:  30%|███       | 3/10 [00:21<00:48,  6.96s/it]

How do I handle a call from an internal employee that is showing verified succesfully in clarify? 


Running user_sh8998 queries against LLM chat..:  40%|████      | 4/10 [00:29<00:44,  7.39s/it]

Why is the customer charged roaming charges while in the US and not Canada?


Running user_sh8998 queries against LLM chat..:  50%|█████     | 5/10 [00:36<00:36,  7.35s/it]

What if a customer does not know their APN? 


Running user_sh8998 queries against LLM chat..:  60%|██████    | 6/10 [00:42<00:27,  6.88s/it]

What if a customer asks when was the last time their account was accessed?


Running user_sh8998 queries against LLM chat..:  70%|███████   | 7/10 [00:47<00:19,  6.47s/it]

Will a customer be charged for a replacement device from previously having a 3G phone? 


Running user_sh8998 queries against LLM chat..:  80%|████████  | 8/10 [00:56<00:13,  6.97s/it]

How do I handle a call from an internal employee that does not display verified successfully in Clarify? 


Running user_sh8998 queries against LLM chat..:  90%|█████████ | 9/10 [01:03<00:07,  7.23s/it]

Why is the customer charged roaming charges while in the US and not Mexico?


Running user_sh8998 queries against LLM chat..: 100%|██████████| 10/10 [01:11<00:00,  7.13s/it]


In [10]:
groundtruth_df.head(2)

Unnamed: 0,id,Total Views,source,bcss_question,answer,init_chunks,generated_answer,reranker_chunks
0,10161,15747,Access Point Name (APN) and Internet Protocol....,What is an APN?,Access Point Names (APNs)\nAn APN is a gateway...,"[(""Jump to: Access Point Names (APNs) | Defini...",An APN is a gateway through which a device can...,"[(""Jump to: Access Point Names (APNs) | Defini..."
1,10167,14643,Account Notes - BCSS.html,What should I include in account notes?,The reason for the call.\nRecommendations prov...,[('Jump to: Requirements | Guidelines | Clari...,"For the best customer experience, always inclu...","[(""r the best customer experience, be clear an..."


Score Chunks + Generated Answers   

In [11]:
data_df, metrics_df = evaluator.calculate_metrics(dataframe=groundtruth_df.copy(), metrics=["chunks", "rouge", "bert"], is_reranker_enabled=is_reranker_enabled, reranker_chunks = "reranker_chunks", init_chunks="init_chunks")

[32m2023-10-22 18:27:52.140[0m | [1mINFO    [0m | [36mevaluation.evaluator[0m:[36mcalculate_metrics[0m:[36m56[0m - [1mReranker chunks scoring is calculated using reranker_chunks column![0m
[32m2023-10-22 18:28:17.146[0m | [1mINFO    [0m | [36mevaluation.evaluator[0m:[36mcalculate_metrics[0m:[36m63[0m - [1mInitial chunks scoring is calculated using init_chunks column![0m
[32m2023-10-22 18:28:17.147[0m | [1mINFO    [0m | [36mevaluation.evaluator[0m:[36mcalculate_metrics[0m:[36m64[0m - [1mChunks metrics are generated![0m
[32m2023-10-22 18:28:17.429[0m | [1mINFO    [0m | [36mevaluation.evaluator[0m:[36mcalculate_metrics[0m:[36m88[0m - [1mRouge is calculated![0m
Some weights of RobertaModel were not initialized from the model checkpoint at c:\Users\sh8998\DataScience_Experiment_Framework\data_science_experiments\care\../../local_models/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You sho

In [12]:
metrics_df.head(2)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bert_recall,bert_precision,bert_f1,bert_score,in_top_reranker_%,reranker_rouge1_median,reranker_rougeLsum_median,reranker_rougeL_recall_median,reranker_rougeL_precision_median,in_top_init_%,init_rouge1_median,init_rougeLsum_median,init_rougeL_recall_median,init_rougeL_precision_median
0,0.448517,0.385782,0.427267,0.43772,0.876203,0.934607,0.905043,0.975735,0.8,0.246801,0.240112,1.0,0.365105,0.8,0.274247,0.241546,1.0,0.382353


In [None]:
metrics_df.values