In [None]:
import glob
import os

import pandas as pd
# let displayed DataFrames show up to 255 characters per column before truncating
pd.set_option('display.max_colwidth', 255)


from dpp_helpline_qa.model_validation.model_validation import cal_em_score, calculate_semantic_similarity
from dpp_helpline_qa.modelling.question_answer import load_model_flan, answer_question_flan
from dpp_helpline_qa.modelling.semantic_search import load_model_ss, context_ranking
from dpp_helpline_qa.preprocessing.preprocessing import process_docs

In [None]:
# list of files to search, grouped by their topic folder under ../data
_pdf_names_by_topic = {
    "Inventory": [
        "FAQs",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
        "KAEG part 5",
        "KAEG part 6",
        "KAEG part 7",
    ],
    "Materiality": [
        "Audit FAQs",
        "FAQs 2",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
    ],
    "Documentation": [
        "ISA_Audit Standard-(UK)-230",
        "KAEG-I [UK VERSION 2022]_ ISA (UK) 230 Audit documentation",
        "UK_AU_AudFAQ_AD",
    ],
}

# flatten to one path per PDF, keeping topic order and per-topic order
files = [
    os.path.join("..", "data", topic, pdf_name + ".pdf")
    for topic, pdf_names in _pdf_names_by_topic.items()
    for pdf_name in pdf_names
]

In [None]:
# semantic-search setup
# max_length is handed to process_docs when the documents are pre-processed
max_length = 400

# location of the semantic-search checkpoint; load its model and tokenizer
model_semantic = "/dbfs/FileStore/tables/multi-qa-mpnet-base-cos-v1/"
model_ss, tokenizer_ss = load_model_ss(model_semantic)

In [None]:
# load and pre-process the documents to prepare for searching
import time

st = time.time()
para_dfs = process_docs(files, model_ss, tokenizer_ss, max_length, 'FlatL2') #'Cosine'

# Jupyter only renders the LAST expression of a cell, so a bare `.head()` in
# the middle of the cell is silently discarded. Report the duration with
# print() and leave the preview as the final expression so both are shown.
print(f"process_docs finished in {time.time() - st:.1f}s")

# preview of the first processed document
para_dfs[0].head()

In [None]:
# collect the pre-processed files: anything with an extension that sits
# exactly one directory level below ../output
output_pattern = '../output/*/*.*'
op_files = glob.glob(output_pattern)
op_files

In [None]:
# flag forwarded to load_model_flan and answer_question_flan in the cells below;
# False presumably keeps the QA model on CPU -- TODO confirm against those helpers
use_gpu = False

In [None]:
# question-answering setup: checkpoint identifier passed to load_model_flan
model_checkpoint = "google/flan-t5-xxl"

# load_model_flan hands back the model together with its tokenizer
qa_model_and_tokenizer = load_model_flan(model_checkpoint, use_gpu)
model_qa, tokenizer_qa = qa_model_and_tokenizer

In [None]:
# automatic evaluation process
#
# Every prompt template is run over every question in LLM_QA.xlsx. For each
# question the generated answer and the retrieved context are both scored
# against the reference answer, and the per-prompt results are collected in
# `all_answers` (one DataFrame per prompt, in prompt order).
import time


def _build_prompts(question: str, main_context: str) -> list:
    """Return every prompt variant for one question/context pair.

    Args:
        question: Question text taken from the 'Question' column.
        main_context: Retrieved passages, already joined into one string.

    Returns:
        A list with one prompt string per template. The list position plus
        one is the number printed as "Prompt N" in the summary.

    Note:
        Entries 10, 15 and 17 (1-based) are exact duplicates of entries 3, 5
        and 8. They are kept so that prompt numbers stay comparable with
        earlier runs of this notebook.
    """
    return [
        f"Question: {question} Context: {main_context}",
        f"Context: {main_context} Question: {question}",
        f"Question: {question} Context: {main_context} Answer:",
        f"Context: {main_context}\n\nQuestion: {question}\n\nAnswer:",
        f"Read this and answer the question\n\n{main_context}\n\n{question}",
        f"Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{main_context}\n\n{question}",
        f"Answer the question based on this context:\n\n{main_context}\n\n{question}",
        f"Answer this question: {question} Here is some context: {main_context}",
        f"Here is some context: {main_context} Answer this question: {question}",
        f"Question: {question} Context: {main_context} Answer:",
        f"Context: {main_context} Question: {question} Answer:",
        f"{question}\n\n{main_context}",
        f"{main_context}\n\n{question}",
        f"Read this and answer the question\n\n{question}\n\n{main_context}",
        f"Read this and answer the question\n\n{main_context}\n\n{question}",
        f"Answer this question: {question} Here is some context: {main_context} Answer this question: {question}",
        f"Answer this question: {question} Here is some context: {main_context}",
        f"Answer the question based on the context below.\n\n{question}\n\n{main_context}",
        f"Answer the question based on the context below.\n\nQuestion: {question}\n\nContext: {main_context}",
        f"Answer the question based on the context below.\n\nQuestion: {question}\n\nContext: {main_context}\n\nAnswer:",
        f"Answer the question based on the context below.\n\nQuestion: {question}\n\nContext: {main_context}\n\nQuestion: {question}\n\nAnswer:",
        f"{main_context}\nAnswer this question: {question}",
        f"{main_context}\nAnswer this question: {question} Do not repeat sentences to satisfy the minimum output length.",
        f"{main_context}\nAnswer this question: {question} Do not repeat sentences to satisfy the minimum output length, use a series of full stops instead.",
    ]


st = time.time()

# Not used by the calls below (scoring uses model_ss / tokenizer_ss); kept
# defined in case other cells reference it.
model_val = '/dbfs/FileStore/tables/multi-qa-mpnet-base-cos-v1/'

# Score columns summarised after each prompt, in the order they are added.
score_columns = [
    'EM_Score_ans',
    'Sbert_score_ans',
    'NLP_score_ans',
    'EM_Score_context',
    'Sbert_score_context',
    'NLP_score_context',
]

# Read the question sheet once; each prompt works on its own copy.
questions_df = pd.read_excel('LLM_QA.xlsx')

# --- retrieval and context scoring ------------------------------------------
# Neither the retrieved context nor its scores depend on the prompt template,
# so they are computed once per question instead of once per prompt/question.
# NOTE(review): assumes context_ranking and the scoring helpers return the
# same result for the same inputs -- confirm.
context = []
EM_score_context = []
Sbert_score_context = []
NLP_score_context = []
for question, topic, actual_ans in zip(
    questions_df['Question'], questions_df['Primary Topic'], questions_df['Answer']
):
    # Only search the pre-processed files of the question's own topic. A
    # local name is used so the `op_files` listing from the earlier cell is
    # not overwritten.
    topic_files = glob.glob('../output/'+topic+'/*.csv')
    context_df = context_ranking(question, topic_files, model_ss, tokenizer_ss, 'FlatL2')
    # the answer is generated from the top 5 ranked contexts joined together
    main_context = '\n'.join(context_df['content'].values[0:5])
    context.append(main_context)
    # context scoring
    EM_score_context.append(cal_em_score(main_context, actual_ans))
    sim_score_cnxt = calculate_semantic_similarity(model_ss, tokenizer_ss, main_context, actual_ans) #model_val
    # entries 1 and 2 of the result are stored as the Sbert and NLP scores
    Sbert_score_context.append(sim_score_cnxt[1])
    NLP_score_context.append(sim_score_cnxt[2])

# --- question answering, one pass per prompt template -----------------------
# Derive the number of templates from the list itself rather than a literal.
n_prompts = len(_build_prompts('', ''))

all_answers = []

for i_prompt in range(n_prompts):

    answers = questions_df.copy()
    final_ans = []
    EM_score_ans = []
    Sbert_score_ans = []
    NLP_score_ans = []
    for question, main_context, actual_ans in zip(
        answers['Question'], context, answers['Answer']
    ):
        prompt = _build_prompts(question, main_context)[i_prompt]
        # QA
        output = answer_question_flan(model_qa, tokenizer_qa, prompt, use_gpu)
        final_ans.append(output)
        # output scoring
        EM_score_ans.append(cal_em_score(output, actual_ans))
        sim_score_ans = calculate_semantic_similarity(model_ss, tokenizer_ss, output, actual_ans) #model_val
        Sbert_score_ans.append(sim_score_ans[1])
        NLP_score_ans.append(sim_score_ans[2])

    answers['Extracted context'] = context
    answers['Final answer'] = final_ans
    answers['EM_Score_ans'] = EM_score_ans
    answers['Sbert_score_ans'] = Sbert_score_ans
    answers['NLP_score_ans'] = NLP_score_ans
    answers['EM_Score_context'] = EM_score_context
    answers['Sbert_score_context'] = Sbert_score_context
    answers['NLP_score_context'] = NLP_score_context

    # 'ques_score.csv' is rewritten on every pass (as before), so on its own
    # it only ever holds the latest prompt. Also keep one file per prompt so
    # no prompt's results are lost on disk.
    answers.to_csv('ques_score.csv', index=False)
    answers.to_csv(f'ques_score_prompt_{i_prompt+1}.csv', index=False)

    print(f"\n\nPrompt {i_prompt+1}")
    # Name the score columns explicitly instead of slicing by position, which
    # only works when the spreadsheet has exactly seven columns.
    for col in score_columns:
        print(f"{col}_mean: {round(answers[col].mean(), 2)}")

    all_answers.append(answers)

# total wall-clock seconds for the whole evaluation (shown as the cell output)
time.time() - st