In [None]:
import glob
import os

import pandas as pd
pd.set_option('display.max_colwidth', 255)


from dpp_helpline_qa.model_validation.model_validation import cal_em_score, calculate_semantic_similarity
from dpp_helpline_qa.modelling.question_answer import load_model_flan, answer_question_flan
from dpp_helpline_qa.modelling.semantic_search import load_model_ss, context_ranking
from dpp_helpline_qa.preprocessing.preprocessing import process_docs

In [None]:
# Source PDFs to search, grouped by their topic folder under ../data.
# The mapping keeps the original ordering: all Inventory documents first,
# followed by the Materiality documents.
_pdf_names_by_topic = {
    "Inventory": [
        "FAQs",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
        "KAEG part 5",
        "KAEG part 6",
        "KAEG part 7",
    ],
    "Materiality": [
        "Audit FAQs",
        "FAQs 2",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
    ],
}

files = [
    os.path.join("..", "data", topic_dir, f"{pdf_name}.pdf")
    for topic_dir, pdf_names in _pdf_names_by_topic.items()
    for pdf_name in pdf_names
]

In [None]:
# Semantic-search setup: the length limit later handed to process_docs, the
# location of the model files, and the loaded model/tokenizer pair.
max_length = 400
model_semantic = '/dbfs/FileStore/tables/multi-qa-mpnet-base-cos-v1/'
semantic_search_bundle = load_model_ss(model_semantic)
model_ss, tokenizer_ss = semantic_search_bundle

In [None]:
# Load and pre-process the documents to prepare them for searching, timing
# the call. 'FlatL2' is forwarded unchanged to process_docs ('Cosine' was left
# by the original author as a commented-out alternative).
import time

st = time.time()
para_dfs = process_docs(files, model_ss, tokenizer_ss, max_length, 'FlatL2')  # 'Cosine'

# A notebook cell only displays its final expression. Previously the preview
# below sat in the middle of the cell and was silently discarded, so the
# elapsed time is printed instead and the preview is moved to the last line.
print(f"process_docs took {time.time() - st:.1f} s")
para_dfs[0].head()

In [None]:
# Collect the pre-processed files that sit one directory level below ../output
# and show them as the cell's output.
output_pattern = '../output/*/*.*'
op_files = glob.glob(output_pattern)
op_files

In [None]:
# Flag handed to load_model_flan and answer_question_flan in the cells below.
use_gpu = True

In [None]:
# Question-answering setup: load the FLAN checkpoint named below and unpack
# the returned pair into the model and its tokenizer.
model_checkpoint = "google/flan-t5-xl"
qa_bundle = load_model_flan(model_checkpoint, use_gpu)
model_qa, tokenizer_qa = qa_bundle



Test Reproducibility

In [None]:
# Reproducibility check: run the same retrieval + QA pipeline `num_trials`
# times over every question and keep each trial's answers so they can be
# compared afterwards.
answers = pd.read_excel('LLM_QA.xlsx')

num_questions = len(answers)
num_trials = 2

# all_final_ans[t][q] holds the answer produced for question q in trial t.
all_final_ans = []

# The trial and question loops previously shared the loop variable `i`; they
# now use distinct names. The unused reference answer and the unused top-5
# context join were dropped because this cell never read them.
for _trial in range(num_trials):
    final_ans = []
    for question, topic in zip(answers['Question'], answers['Primary Topic']):
        # Only the pre-processed CSVs of the question's primary topic are searched.
        topic_files = glob.glob('../output/' + topic + '/*.csv')
        context_df = context_ranking(question, topic_files, model_ss, tokenizer_ss, 'FlatL2')
        # The answer is generated from the first returned context only.
        ans_context = context_df['content'].values[0]
        output = answer_question_flan(model_qa, tokenizer_qa, ans_context, question, use_gpu)
        final_ans.append(output)
    all_final_ans.append(final_ans)


In [None]:
# Compare the answers each question received across all trials: a single
# distinct answer means the model behaved consistently for that question.
for question_idx in range(num_questions):
    distinct_answers = {trial_answers[question_idx] for trial_answers in all_final_ans}
    n_distinct = len(distinct_answers)
    if n_distinct <= 1:
        print(f"The model is consistent for question {question_idx}")
    else:
        print(f"{n_distinct} unique answers for question {question_idx}, the model is not consistent")



Test whether the QA model can accommodate input text longer than 512 tokens

In [None]:
# For k = 1 .. max_contexts: answer every question from the first k retrieved
# contexts joined with newlines, score both the answer and the joined context
# against the reference answer, and write one CSV per k (ques_score_<k>.csv).
max_contexts = 5

# Read the question sheet once; each pass below works on its own copy.
qa_sheet = pd.read_excel('LLM_QA.xlsx')

# Retrieval does not depend on k, so rank the contexts once per question and
# keep only the first `max_contexts` entries for slicing later.
ranked_contexts = []
for question, topic in zip(qa_sheet['Question'], qa_sheet['Primary Topic']):
    op_files = glob.glob('../output/' + topic + '/*.csv')
    context_df = context_ranking(question, op_files, model_ss, tokenizer_ss, 'FlatL2')
    ranked_contexts.append(list(context_df['content'].values[:max_contexts]))

# One list of per-question answer scores for every k, in order of k.
EM_score_ans_all = []
Sbert_score_ans_all = []
NLP_score_ans_all = []

for j in range(max_contexts):
    answers = qa_sheet.copy()
    context = []
    final_ans = []
    EM_score_ans = []
    Sbert_score_ans = []
    NLP_score_ans = []
    EM_score_context = []
    Sbert_score_context = []
    NLP_score_context = []
    for question, actual_ans, top_contexts in zip(
        answers['Question'], answers['Answer'], ranked_contexts
    ):
        # answer generated from the first j+1 contexts
        main_context = '\n'.join(top_contexts[:j + 1])
        context.append(main_context)
        # QA
        output = answer_question_flan(model_qa, tokenizer_qa, main_context, question, use_gpu)
        final_ans.append(output)
        # output scoring; entries 1 and 2 of the result are stored as the
        # "Sbert" and "NLP" scores (presumably matching those metrics --
        # confirm against calculate_semantic_similarity)
        EM_score_ans.append(cal_em_score(output, actual_ans))
        sim_score_ans = calculate_semantic_similarity(model_ss, tokenizer_ss, output, actual_ans)
        Sbert_score_ans.append(sim_score_ans[1])
        NLP_score_ans.append(sim_score_ans[2])
        # context scoring
        EM_score_context.append(cal_em_score(main_context, actual_ans))
        sim_score_cnxt = calculate_semantic_similarity(model_ss, tokenizer_ss, main_context, actual_ans)
        Sbert_score_context.append(sim_score_cnxt[1])
        NLP_score_context.append(sim_score_cnxt[2])

    answers['Extracted context'] = context
    answers['Final answer'] = final_ans
    answers['EM_Score_ans'] = EM_score_ans
    answers['Sbert_score_ans'] = Sbert_score_ans
    answers['NLP_score_ans'] = NLP_score_ans
    answers['EM_Score_context'] = EM_score_context
    answers['Sbert_score_context'] = Sbert_score_context
    answers['NLP_score_context'] = NLP_score_context
    # The former 'context_top5' column was dropped: it was assigned from
    # `context_5`, which is never defined, so the cell raised NameError before
    # any CSV was written. Its own comment said to remove it when a single
    # answer is generated from the combined contexts, which is the case here.
    answers.to_csv(f'ques_score_{j+1}.csv', index=False)

    EM_score_ans_all.append(EM_score_ans)
    Sbert_score_ans_all.append(Sbert_score_ans)
    NLP_score_ans_all.append(NLP_score_ans)

In [None]:
# Mean exact-match score of the answers for each entry of EM_score_ans_all,
# labelled from 1.
for context_count, em_scores in enumerate(EM_score_ans_all, start=1):
    mean_em = sum(em_scores) / len(em_scores)
    print(f"Context {context_count}: {mean_em}")

In [None]:
# Mean Sbert score of the answers for each entry of Sbert_score_ans_all,
# labelled from 1.
for context_count, sbert_scores in enumerate(Sbert_score_ans_all, start=1):
    mean_sbert = sum(sbert_scores) / len(sbert_scores)
    print(f"Context {context_count}: {mean_sbert}")

In [None]:
# Mean NLP score of the answers for each entry of NLP_score_ans_all,
# labelled from 1.
for context_count, nlp_scores in enumerate(NLP_score_ans_all, start=1):
    mean_nlp = sum(nlp_scores) / len(nlp_scores)
    print(f"Context {context_count}: {mean_nlp}")