In [None]:
import glob
import os

import pandas as pd
# Widen pandas' column display limit to 255 so long text cells (questions,
# extracted contexts, answers) are not cut short when a frame is displayed.
pd.options.display.max_colwidth = 255

from dpp_helpline_qa.model_validation.model_validation import cal_em_score, calculate_semantic_similarity
from dpp_helpline_qa.modelling.question_answer import load_model_flan, answer_question_flan
from dpp_helpline_qa.modelling.semantic_search import load_model_ss, context_ranking
from dpp_helpline_qa.preprocessing.preprocessing import process_docs

In [None]:
# Source PDFs to search, grouped by the topic folder they live in under ../data.
# The order (topic by topic, then document by document) is preserved in `files`.
doc_names_by_topic = {
    "Inventory": (
        "FAQs",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
        "KAEG part 5",
        "KAEG part 6",
        "KAEG part 7",
    ),
    "Materiality": (
        "Audit FAQs",
        "FAQs 2",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
    ),
}

# Expand the mapping into the flat list of relative PDF paths.
files = [
    os.path.join("..", "data", topic_dir, doc_name + ".pdf")
    for topic_dir, doc_names in doc_names_by_topic.items()
    for doc_name in doc_names
]

In [None]:
# Semantic-search checkpoints to benchmark, each mapped to the max_length value
# that is passed to process_docs for that checkpoint.
max_length_by_checkpoint = {
    "sentence-transformers/all-MiniLM-L6-v2": 256,
    "sentence-transformers/all-mpnet-base-v2": 384,
    "sentence-transformers/multi-qa-mpnet-base-dot-v1": 512,
    "sentence-transformers/all-distilroberta-v1": 128,
    "sentence-transformers/paraphrase-MiniLM-L6-v2": 128,
    "sentence-transformers/bert-base-nli-mean-tokens": 128,
    "sentence-transformers/all-MiniLM-L12-v2": 256,
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": 128,
    "sentence-transformers/all-MiniLM-L12-v1": 128,
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": 128,
    "sentence-transformers/paraphrase-mpnet-base-v2": 512,
    "sentence-transformers/distiluse-base-multilingual-cased-v2": 128,
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1": 512,
    "sentence-transformers/multi-qa-mpnet-base-cos-v1": 512,
}

# List of (checkpoint_name, max_length) tuples, in the order declared above.
ss_checkpoints = list(max_length_by_checkpoint.items())

# Names of checkpoints whose evaluation raised an error.
error_checkpoints = []

# Scored answers DataFrame of every checkpoint that was evaluated successfully.
answers_dfs = []

# Per-checkpoint mean scores; each list gets one entry per successful checkpoint.
EM_Score_ans_mean, Sbert_score_ans_mean, NLP_score_ans_mean = [], [], []
EM_Score_context_mean, Sbert_score_context_mean, NLP_score_context_mean = [], [], []

# Benchmark every semantic-search checkpoint on the questions in LLM_QA.xlsx.
# For each checkpoint: pre-process the documents, retrieve a context per
# question, generate an answer with the FLAN QA model, score both the answer and
# the retrieved context against the reference answer, write the per-question
# scores to ques_score_<checkpoint>.csv and record the mean scores.

# The question-answering model does not depend on the semantic-search
# checkpoint, so it is loaded once here instead of once per checkpoint.
model_qa, tokenizer_qa = load_model_flan()

for ss_checkpoint, max_length in ss_checkpoints:

    try:

        # load the model and tokenizer for semantic search
        model_ss, tokenizer_ss = load_model_ss(model_checkpoint=ss_checkpoint)

        # load and pre-process the documents to prepare for searching
        # NOTE(review): presumably this also (re)writes the CSVs under
        # ../output/<topic>/ that are globbed per question below -- confirm.
        para_dfs = process_docs(files, model_ss, tokenizer_ss, max_length)

        # automatic evaluation process
        # Re-read the workbook for every checkpoint: score columns are added to
        # this frame below, so each checkpoint needs its own fresh copy.
        answers = pd.read_excel('LLM_QA.xlsx')
        context = []
        final_ans = []
        EM_score_ans = []
        Sbert_score_ans = []
        NLP_score_ans = []
        EM_score_context = []
        Sbert_score_context = []
        NLP_score_context = []

        # Iterate over every row of the workbook rather than a hard-coded 8, so
        # the result lists always match the frame's length when they are
        # assigned as columns below.
        for question, topic, actual_ans in zip(
            answers['Question'], answers['Primary Topic'], answers['Answer']
        ):
            # only search the pre-processed files of the question's topic
            op_files = glob.glob('../output/'+topic+'/*.csv')
            context_df = context_ranking(question, op_files, model_ss, tokenizer_ss)
            # the first five rows returned by context_ranking form the context
            main_context = '\n'.join(context_df['content'].values[0:5])
            context.append(main_context)
            output = answer_question_flan(model_qa, tokenizer_qa, main_context, question)
            final_ans.append(output)
            # output scoring
            # NOTE(review): elements 1 and 2 of the similarity result are stored
            # as the SBERT and NLP scores -- confirm against the helper's
            # return order.
            EM_score_ans.append(cal_em_score(output, actual_ans))
            sim_score_ans = calculate_semantic_similarity(output, actual_ans)
            Sbert_score_ans.append(sim_score_ans[1])
            NLP_score_ans.append(sim_score_ans[2])
            # context scoring
            EM_score_context.append(cal_em_score(main_context, actual_ans))
            sim_score_cnxt = calculate_semantic_similarity(main_context, actual_ans)
            Sbert_score_context.append(sim_score_cnxt[1])
            NLP_score_context.append(sim_score_cnxt[2])

        answers['Extracted context'] = context
        answers['Final answer'] = final_ans
        answers['EM_Score_ans'] = EM_score_ans
        answers['Sbert_score_ans'] = Sbert_score_ans
        answers['NLP_score_ans'] = NLP_score_ans
        answers['EM_Score_context'] = EM_score_context
        answers['Sbert_score_context'] = Sbert_score_context
        answers['NLP_score_context'] = NLP_score_context
        answers.to_csv(f'ques_score_{ss_checkpoint.split("/")[1]}.csv', index=False)

        # Compute all six means before touching any accumulator, so a failure
        # here cannot leave the accumulator lists with different lengths.
        em_ans_mean = answers["EM_Score_ans"].mean()
        sbert_ans_mean = answers["Sbert_score_ans"].mean()
        nlp_ans_mean = answers["NLP_score_ans"].mean()
        em_context_mean = answers["EM_Score_context"].mean()
        sbert_context_mean = answers["Sbert_score_context"].mean()
        nlp_context_mean = answers["NLP_score_context"].mean()

        answers_dfs.append(answers)

        EM_Score_ans_mean.append(em_ans_mean)
        Sbert_score_ans_mean.append(sbert_ans_mean)
        NLP_score_ans_mean.append(nlp_ans_mean)
        EM_Score_context_mean.append(em_context_mean)
        Sbert_score_context_mean.append(sbert_context_mean)
        NLP_score_context_mean.append(nlp_context_mean)

    except Exception as exc:
        # Catch Exception rather than using a bare except, so a kernel
        # interrupt (KeyboardInterrupt) still stops the run, and report why the
        # checkpoint failed instead of discarding the error.
        print(f"Error: {ss_checkpoint}")
        print(f"  {type(exc).__name__}: {exc}")
        error_checkpoints.append(ss_checkpoint)

# Keep only the checkpoints whose evaluation succeeded. `ss_checkpoints` holds
# (checkpoint_name, max_length) tuples while `error_checkpoints` holds bare
# checkpoint names, so the membership test must use the name (first element).
# Comparing the whole tuple never matches, so after any failure this list would
# be longer than the score lists and the DataFrame constructor would raise.
ss_checkpoints_ = [c for c in ss_checkpoints if c[0] not in error_checkpoints]

# One row per successfully evaluated checkpoint; `model_checkpoint` holds the
# (checkpoint_name, max_length) tuple and the remaining columns the mean scores.
scores = pd.DataFrame({
    "model_checkpoint": ss_checkpoints_,
    "EM_Score_ans_mean": EM_Score_ans_mean,
    "Sbert_score_ans_mean": Sbert_score_ans_mean,
    "NLP_score_ans_mean": NLP_score_ans_mean,
    "EM_Score_context_mean": EM_Score_context_mean,
    "Sbert_score_context_mean": Sbert_score_context_mean,
    "NLP_score_context_mean": NLP_score_context_mean,
})

scores

In [None]:
# Rank the checkpoints by the mean SBERT score of their answers, best first.
scores.sort_values(by="Sbert_score_ans_mean", ascending=False)