In [None]:
import glob
import os

import pandas as pd
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader

from dpp_helpline_qa.model_validation.model_validation import cal_em_score, calculate_semantic_similarity

# Widen the column display limit to 255 characters so long question/answer text
# is not cut off when DataFrames are rendered.
pd.options.display.max_colwidth = 255

In [None]:
# Load the documents for each topic directory and report how many were read.
inventory_reader = SimpleDirectoryReader('data/inventory')
in_documents = inventory_reader.load_data()
print(len(in_documents))

materiality_reader = SimpleDirectoryReader('data/materiality')
mat_documents = materiality_reader.load_data()
print(len(mat_documents))

In [None]:
# Sanity check: show the text of the first loaded inventory document.
first_inventory_document = in_documents[0]
first_inventory_document.text

In [None]:
# Build one vector index per topic from the documents loaded above.
build_index = GPTSimpleVectorIndex.from_documents
inv_index = build_index(in_documents)
mat_index = build_index(mat_documents)

In [None]:
# Automatic evaluation of the llama_index approach: every question in
# LLM_QA.xlsx is sent to the index that matches its primary topic, and the
# generated answer is scored against the reference answer from the sheet.
import time

st = time.time()
answers = pd.read_excel('LLM_QA.xlsx')

# Route each question to the index built from that topic's documents.
topic_indexes = {'Inventory': inv_index, 'Materiality': mat_index}

# One entry per evaluated question; these lists are assigned back to `answers`
# as columns in the next cell, so they must end up with one value per row.
final_ans = []
EM_score_ans = []
Sbert_score_ans = []
NLP_score_ans = []

# Walk every row of the sheet instead of a hard-coded count so the result lists
# always line up with the DataFrame, whatever the number of questions.
for question, topic, actual_ans in zip(
    answers['Question'], answers['Primary Topic'], answers['Answer']
):
    # Fail loudly on an unknown topic. Without this check the previous row's
    # answer would be silently re-scored (or a NameError raised on the first row).
    if topic not in topic_indexes:
        raise ValueError(
            f"Unsupported topic {topic!r} for question {question!r}; "
            f"expected one of {sorted(topic_indexes)}"
        )
    ans = topic_indexes[topic].query(str(question))
    output = ans.response
    final_ans.append(output)

    # output scoring
    EM_score_ans.append(cal_em_score(output, actual_ans))
    sim_score_ans = calculate_semantic_similarity(output, actual_ans)
    # Elements 1 and 2 of the similarity result are stored as the SBERT and NLP
    # scores respectively -- NOTE(review): confirm against
    # calculate_semantic_similarity's return value.
    Sbert_score_ans.append(sim_score_ans[1])
    NLP_score_ans.append(sim_score_ans[2])

# Last expression of the cell: elapsed wall-clock seconds for the whole run.
time.time() - st

In [None]:
# Attach the generated answers and their scores to the question sheet.
answers['Final answer'] = final_ans
answers['EM_Score_ans'] = EM_score_ans
answers['Sbert_score_ans'] = Sbert_score_ans
answers['NLP_score_ans'] = NLP_score_ans

answers.to_csv('ques_score.csv', index=False)
# 'ques_score.csv' is written again by the "Context Generation + OpenAI"
# evaluation later in this notebook, which replaces these results on a full
# run. Keep a second copy under a name specific to this approach so the two
# evaluations can be compared afterwards.
answers.to_csv('ques_score_llama_index.csv', index=False)

In [None]:
# Spot-check a single inventory question. `index` is not defined anywhere in
# this notebook (only `inv_index` and `mat_index` are), so query the inventory
# index, which matches the topic of the question.
ans1 = inv_index.query('When is it impracticable to attend an inventory count?')
ans1.response

In [None]:
# Inspect the source nodes attached to the spot-check response.
ans1_source_nodes = ans1.source_nodes
ans1_source_nodes

## Context Generation + OpenAI for QA

In [None]:
### AS, FAQ + openai
import time

from dpp_helpline_qa.modelling.question_answer import load_model_qa, answer_question
from dpp_helpline_qa.modelling.semantic_search import load_model_ss, context_ranking
from dpp_helpline_qa.preprocessing.preprocessing import process_docs

# PDF documents to search, grouped by topic folder under data/.
pdf_names_by_topic = {
    "inventory": ["FAQs", "Audit Standard"],
    "materiality": ["Audit FAQs", "FAQs 2", "Audit Standard"],
}
files = [
    os.path.join("data", topic_dir, pdf_name + ".pdf")
    for topic_dir, pdf_names in pdf_names_by_topic.items()
    for pdf_name in pdf_names
]

# Semantic-search model and tokenizer; max_length is passed on to process_docs.
model_ss, tokenizer_ss = load_model_ss()
max_length = 512

# Pre-process the documents so they can be searched, timing the step.
# NOTE(review): the original trailing comment mentioned 'Cosine' and 'IVF' --
# presumably optional process_docs settings; confirm against its signature.
st = time.time()
para_dfs = process_docs(files, model_ss, tokenizer_ss, max_length)
# Not the last expression of the cell, so this preview is evaluated but not shown.
para_dfs[0].head()
elapsed_seconds = time.time() - st
elapsed_seconds


In [None]:
# Collect the pre-processed CSV files (one sub-folder level below ../output)
# that are available for searching, and display the list.
output_csv_pattern = '../output/*/*.csv'
op_files = glob.glob(output_csv_pattern)
op_files

In [None]:
# Automatic evaluation of the context-ranking + OpenAI completion approach.
# For every question in LLM_QA.xlsx: rank the pre-processed content for the
# question's topic, build a prompt from the leading ranked entries, ask the
# completion model, then score both the answer and the extracted context
# against the reference answer.
# NOTE(review): no OpenAI credentials are configured in this cell; they are
# assumed to be provided elsewhere (e.g. via the environment) -- confirm.
import time
import openai

COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 512,
    "model": "text-davinci-003",
}
# Number of leading rows of the ranked context joined into each prompt.
TOP_K_CONTEXT = 5

st = time.time()
header = 'Answer the below question based on the given context:\n'
answers = pd.read_excel('LLM_QA.xlsx')

# One entry per evaluated question; these lists are assigned back to `answers`
# as columns in the next cell, so they must end up with one value per row.
context = []
final_ans = []
EM_score_ans = []
Sbert_score_ans = []
NLP_score_ans = []
EM_score_context = []
Sbert_score_context = []
NLP_score_context = []

# Walk every row of the sheet instead of a hard-coded count so the result lists
# always line up with the DataFrame, whatever the number of questions.
for raw_question, topic, actual_ans in zip(
    answers['Question'], answers['Primary Topic'], answers['Answer']
):
    # Cast to str (as the llama_index evaluation does) so a non-string cell
    # cannot break the prompt concatenation below.
    question = str(raw_question)

    # Pre-processed CSVs for this question's topic. A loop-local name is used so
    # the notebook-level `op_files` listing is not overwritten.
    # NOTE(review): the folder name is taken verbatim from the 'Primary Topic'
    # value -- confirm it matches the folder names written by process_docs.
    topic_files = glob.glob('../output/' + topic + '/*.csv')
    context_df = context_ranking(question, topic_files, model_ss, tokenizer_ss)
    # Presumably the first rows are the best-ranked ones -- confirm against
    # context_ranking.
    main_context = '\n'.join(context_df['content'].values[0:TOP_K_CONTEXT])
    context.append(main_context)

    prompt = header + main_context + "\n\n Q: " + question + "\n A:"
    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )
    output = response["choices"][0]["text"]
    final_ans.append(output)

    # output scoring
    EM_score_ans.append(cal_em_score(output, actual_ans))
    sim_score_ans = calculate_semantic_similarity(output, actual_ans)
    # Elements 1 and 2 of the similarity result are stored as the SBERT and NLP
    # scores respectively -- NOTE(review): confirm against
    # calculate_semantic_similarity's return value.
    Sbert_score_ans.append(sim_score_ans[1])
    NLP_score_ans.append(sim_score_ans[2])

    # context scoring
    EM_score_context.append(cal_em_score(main_context, actual_ans))
    sim_score_cnxt = calculate_semantic_similarity(main_context, actual_ans)
    Sbert_score_context.append(sim_score_cnxt[1])
    NLP_score_context.append(sim_score_cnxt[2])

# Last expression of the cell: elapsed wall-clock seconds for the whole run.
time.time() - st

In [None]:
# Attach the extracted context, generated answers and both sets of scores to
# the question sheet (in this column order), then write the result to CSV.
result_columns = {
    'Extracted context': context,
    'Final answer': final_ans,
    'EM_Score_ans': EM_score_ans,
    'Sbert_score_ans': Sbert_score_ans,
    'NLP_score_ans': NLP_score_ans,
    'EM_Score_context': EM_score_context,
    'Sbert_score_context': Sbert_score_context,
    'NLP_score_context': NLP_score_context,
}
for column_name, column_values in result_columns.items():
    answers[column_name] = column_values
answers.to_csv('ques_score.csv', index=False)