In [None]:
import glob
import os

import pandas as pd
pd.set_option('display.max_colwidth', 255)


from dpp_helpline_qa.model_validation.model_validation import cal_em_score, calculate_semantic_similarity
from dpp_helpline_qa.modelling.question_answer import load_model_flan, answer_question_flan
from dpp_helpline_qa.modelling.semantic_search import load_model_ss, context_ranking
from dpp_helpline_qa.preprocessing.preprocessing import process_docs

In [None]:
# Source PDFs to search, grouped by their topic folder under ../data.
# Document names are given without the ".pdf" extension; the list keeps the
# topics, and the documents inside each topic, in the order written here.
_docs_by_topic = {
    "Inventory": [
        "FAQs",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
        "KAEG part 5",
        "KAEG part 6",
        "KAEG part 7",
    ],
    "Materiality": [
        "Audit FAQs",
        "FAQs 2",
        "Audit Standard",
        "KAEG part 1",
        "KAEG part 2",
        "KAEG part 3",
        "KAEG part 4",
    ],
    "Documentation": [
        "ISA_Audit Standard-(UK)-230",
        "KAEG-I [UK VERSION 2022]_ ISA (UK) 230 Audit documentation",
        "UK_AU_AudFAQ_AD",
    ],
}
files = [
    os.path.join("..", "data", topic_dir, doc_name + ".pdf")
    for topic_dir, doc_names in _docs_by_topic.items()
    for doc_name in doc_names
]

In [None]:
# load the model and tokenizer for semantic search
# NOTE(review): the checkpoint path is under /dbfs, so this cell presumably only
# works where that mount exists (e.g. Databricks) — confirm.
model_semantic = '/dbfs/FileStore/tables/multi-qa-mpnet-base-cos-v1/'
model_ss, tokenizer_ss = load_model_ss(model_semantic)
# passed to process_docs in the next cell as its max_length argument
max_length = 400

In [None]:
# load and pre-process the documents to prepare for searching
import time
st = time.time()
# 'FlatL2' is the index type; 'Cosine' is the alternative noted by the original author.
# The same index type has to be passed to context_ranking when searching.
para_dfs = process_docs(files, model_ss, tokenizer_ss, max_length, 'FlatL2') #'Cosine'
# Report the elapsed time explicitly so that the preview below can be the cell's
# last expression; only the last expression of a cell is displayed.
print(f"process_docs took {time.time() - st:.1f} s")
para_dfs[0].head()

In [None]:
# identify the pre-processed files for searching
# The pattern matches any file whose name contains a dot, one directory level
# below ../output (i.e. ../output/<dir>/<name>.<ext>).
# NOTE: op_files is reassigned per question inside the evaluation loops further down.
op_files = glob.glob('../output/*/*.*')
op_files

In [None]:
# load the model and tokenizer for question and answering

# model_qa, tokenizer_qa = load_model_qa()

In [None]:
# # saved model in dbfs
# model_qa.save_pretrained('/dbfs/FileStore/tables/gpt4all-j')
# tokenizer_qa.save_pretrained('/dbfs/FileStore/tables/gpt4all-j')

In [None]:
# # load the model and tokenizer for question and answering

# model_checkpoint = 'google/flan-t5-xxl' #"/dbfs/FileStore/tables/google_flan_model" #/dbfs/FileStore/tables/google_flan_model
# model_qa, tokenizer_qa = load_model_flan(model_checkpoint, False)

# from transformers import AutoModelForCausalLM, AutoTokenizer
# model_checkpoint = "nomic-ai/gpt4all-j"
# model_qa = AutoModelForCausalLM.from_pretrained(model_checkpoint, revision="v1.2-jazzy")
# tokenizer_qa = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
# Hand-written example: a fixed context and question, used to preview the prompt
# format that the evaluation loop below builds for every question.
# NOTE: `context` is a string here but is rebound to a list in the evaluation cells.
context = '''My test counts have identified differences between the physical number of items in stock and the number recorded in the accounts. Does this mean that I can't place any reliance on the stock count as a control? No. If operating effectively, the client's stock count will detect and correct any errors in the stock balance recorded in the financial statements. Therefore a deviation is only identified in the control where KPMG's count differs from the client's final count and therefore an error has gone unnoticed.
What else do we think about if our count results differ from inventory records? Management cannot use the results of our count to alter their inventory records as this would constitute us preparing management's financial statements. If differences are identified, management considers whether they need to investigate and ultimately quantify and record their own correction of any misstatement - which may or may not include management performing a complete physical count.
What if there are differences between the inventory count results and management's final adjusted inventory listing? If management's subsequent investigation has determined inventory records (i.e. the final adjusted inventory listing) are correct and an adjustment is not needed, then we obtain evidence that this is appropriate and that there is no misstatement. If management's investigation identifies that an adjustment is necessary to the inventory records, then this is an audit misstatement and a potential control deviation.
What if there are differences between the inventory count results and management's final adjusted inventory listing? If management's subsequent investigation has determined inventory records (i.e. the final adjusted inventory listing) are correct and an adjustment is not needed, then we obtain evidence that this is appropriate and that there is no control deviation. If management's investigation identifies that an adjustment is necessary to the inventory records, then this is an audit misstatement and a control deviation.
counts? If there is a difference between the count sheets and management's final adjusted inventory listing, we investigate the reason for such difference. If our subsequent investigation determines that the final adjusted inventory listing is incorrect, then this is a control deviation. When the adjustments from book to physical are trivial, it supports the assessment that cycle counts produce results substantially the same as those which would be obtained by a complete physical count. If more than trivial adjustments are made to adjust inventory quantities as a result of the cycle counts, this challenges the effectiveness of the related transaction level controls and undermines the assessment that the same results are obtained from the perpetual inventory system as those by annual physical count. In such cases, we reassess control risk, and determine whether it is necessary for management to perform a complete physical count at or near period end that we would then attend. If the entity has remediated the cycle count controls, we may re-test the cycle counts subsequent to remediation of identified deficiencies.
'''
# The question repeats the opening sentences of the context above.
question = '''My test counts have identified differences between the physical number of items in stock and the number recorded in the accounts. Does this mean that I can't place any reliance on the stock count as a control?'''
# Instruction + context + question, ending in "Answer: " for the model to continue.
prompt = f"Answer the question as truthfully as possible using and only using the provided context and if the answer is not contained within the context/text, say Irrelevant, Context: {context} \n Question: {question} \n Answer: "
# display the assembled prompt
prompt

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Question-answering model: Dolly v2 12B, with the 8-bit loading flag switched on.
baseModel = "databricks/dolly-v2-12b"
load_8bit = True

# Tokenizer and model both come from the same checkpoint name.
tokenizer_qa = AutoTokenizer.from_pretrained(baseModel)
model_qa = AutoModelForCausalLM.from_pretrained(
    baseModel,
    load_in_8bit=load_8bit,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Text-generation pipeline used by the evaluation loop below.
generator = pipeline(task='text-generation', model=model_qa, tokenizer=tokenizer_qa)


In [None]:
# automatic evaluation process
# For every row of LLM_QA.xlsx: rank the pre-processed paragraphs of the row's
# topic against the question, let the Dolly pipeline answer from the top 5
# paragraphs, then score both the generated answer and the retrieved context
# against the reference answer. The per-question results are collected in the
# lists below.
import time
st = time.time()
answers = pd.read_excel('LLM_QA.xlsx')
context = []
final_ans = []
EM_score_ans = []
Sbert_score_ans = []
NLP_score_ans = []
EM_score_context = []
Sbert_score_context = []
NLP_score_context = []
# only referenced by the trailing "#model_val" notes; the scoring calls use model_ss
model_val = '/dbfs/FileStore/tables/multi-qa-mpnet-base-cos-v1/'
no_ques = answers.shape[0]
for i in range(no_ques):
    question = answers['Question'][i]
    topic = answers['Primary Topic'][i]
    actual_ans = answers['Answer'][i]
    op_files = glob.glob('../output/'+topic+'/*.csv')
    # Same index type as the one given to process_docs when the documents were
    # pre-processed ('FlatL2').
    context_df = context_ranking(question, op_files, model_ss, tokenizer_ss, 'FlatL2')
    # answer generated from top 5 contexts
    main_context = '\n'.join(context_df['content'].values[0:5])
    context.append(main_context)
    # Dolly model
    prompt = f"Answer the question as truthfully as possible using and only using the provided context and if the answer is not contained within the context/text, say Irrelevant, Context: {main_context} \n Question: {question} \n Answer: "
    response = generator(prompt, do_sample=True,
                         max_new_tokens=200, temperature=0.1)
    generated = response[0]['generated_text']
    # The pipeline returns the prompt followed by the continuation. Cut the prompt
    # off by position: splitting on 'Answer:' would return a piece of the context
    # whenever the retrieved paragraphs themselves contain that marker.
    if generated.startswith(prompt):
        output = generated[len(prompt):]
    else:
        output = generated.split('Answer:', 1)[-1]
    final_ans.append(output)
    # output scoring
    EM_score_ans.append(cal_em_score(output, actual_ans))
    sim_score_ans = calculate_semantic_similarity(model_ss, tokenizer_ss, output, actual_ans) #model_val
    Sbert_score_ans.append(sim_score_ans[1])
    NLP_score_ans.append(sim_score_ans[2])
    # context scoring
    EM_score_context.append(cal_em_score(main_context, actual_ans))
    sim_score_cnxt = calculate_semantic_similarity(model_ss, tokenizer_ss, main_context, actual_ans) #model_val
    Sbert_score_context.append(sim_score_cnxt[1])
    NLP_score_context.append(sim_score_cnxt[2])

# elapsed seconds for the whole evaluation
time.time() - st

In [None]:
## langchain use
# Trial of the Dolly model through a LangChain LLMChain. Only the first question
# is run and its answer is printed; no scoring is performed in this cell.
import torch
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline


generate_text = pipeline(model="databricks/dolly-v2-12b", torch_dtype=torch.bfloat16,
                         trust_remote_code=True, device_map="auto", return_full_text=True)

# The prompt template, the pipeline wrapper and the chain do not depend on the
# question, so they are built once instead of once per loop iteration.
# template for an instruction with input
prompt_with_context = PromptTemplate(
    input_variables=["instruction", "context"],
    template="{instruction}\n\nInput:\n{context}")
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)
llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

# automatic evaluation process
import time
st = time.time()
answers = pd.read_excel('LLM_QA.xlsx')
# NOTE(review): these accumulators are reset here but only `context` is filled
# (with one entry), so the cell that writes ques_score.csv will fail on a length
# mismatch if it is run right after this one.
context = []
final_ans = []
EM_score_ans = []
Sbert_score_ans = []
NLP_score_ans = []
EM_score_context = []
Sbert_score_context = []
NLP_score_context = []
model_val = '/dbfs/FileStore/tables/multi-qa-mpnet-base-cos-v1/'
no_ques = answers.shape[0]
for i in range(1): #no_ques
    question = answers['Question'][i]
    topic = answers['Primary Topic'][i]
    actual_ans = answers['Answer'][i]
    op_files = glob.glob('../output/'+topic+'/*.csv')
    # Same index type as the one given to process_docs when the documents were
    # pre-processed ('FlatL2').
    context_df = context_ranking(question, op_files, model_ss, tokenizer_ss, 'FlatL2')
    # answer generated from top 5 contexts
    main_context = '\n'.join(context_df['content'].values[0:5])
    context.append(main_context)
    # QA: Dolly model with langchain
    print(llm_context_chain.predict(instruction=question, context=main_context).lstrip())

# elapsed seconds, including reading the spreadsheet
time.time() - st

In [None]:
# Attach the per-question results collected by the evaluation loop to the
# question sheet, one new column per result list, then write everything to CSV.
_result_columns = {
    'Extracted context': context,
    'Final answer': final_ans,
    'EM_Score_ans': EM_score_ans,
    'Sbert_score_ans': Sbert_score_ans,
    'NLP_score_ans': NLP_score_ans,
    'EM_Score_context': EM_score_context,
    'Sbert_score_context': Sbert_score_context,
    'NLP_score_context': NLP_score_context,
}
for _column_name, _column_values in _result_columns.items():
    answers[_column_name] = _column_values
# Optional extra column, left disabled:
#answers['context_top5'] = context_5 # remove in case only one answer is generated from all top 5 contexts
answers.to_csv('ques_score.csv', index=False)

## Generate 5 answers from the top 5 contexts and select the best

In [None]:
# automatic evaluation process
# Variant: generate one answer per top-ranked context (up to 5), score each
# candidate against the reference answer, and keep the best-scoring one.
# Only the first 8 questions are evaluated.
# NOTE(review): GPU, model and tokenizer are not defined in any cell above this
# one; presumably they were a seq2seq model/tokenizer pair and a device flag left
# in the kernel from an earlier version of the notebook — define them before running.
import time
st = time.time()
answers = pd.read_excel('LLM_QA.xlsx')
context = []
final_ans = []
EM_score_ans = []
Sbert_score_ans = []
NLP_score_ans = []
EM_score_context = []
Sbert_score_context = []
NLP_score_context = []
context_5 = []
for i in range(8):
    question = answers['Question'][i]
    topic = answers['Primary Topic'][i]
    actual_ans = answers['Answer'][i]

    op_files = glob.glob('../output/'+topic+'/*.csv')
    context_df = context_ranking(question, op_files, model_ss, tokenizer_ss, 'FlatL2')
    # At most 5 candidates; slicing also copes with fewer than 5 ranked contexts.
    candidates = context_df['content'].values[0:5]
    main_context = '\n'.join(candidates)
    context_5.append(main_context)
    con_score = []
    op_list = []
    for con in candidates:
        # The prompt is built from the candidate context itself on both devices
        # (the GPU path used to interpolate the score list instead).
        input_ids = tokenizer(f"{con}\n\n{question}", return_tensors="pt").input_ids
        if GPU:
            input_ids = input_ids.to("cuda")
        temp_op = tokenizer.decode(model.generate(input_ids, max_length=200, min_length=50)[0])
        temp_em_scr = cal_em_score(temp_op, actual_ans)
        # Same call shape as in the other evaluation cells: model and tokenizer first.
        temp_sim_scr = calculate_semantic_similarity(model_ss, tokenizer_ss, temp_op, actual_ans)
        # candidate score = mean of the exact-match score and the SBERT similarity
        con_score.append((temp_em_scr + temp_sim_scr[1]) / 2)
        op_list.append(temp_op)
    # index of the highest-scoring candidate (first one wins on ties)
    best_ind = max(range(len(con_score)), key=con_score.__getitem__)
    context.append(candidates[best_ind])
    output = op_list[best_ind]
    final_ans.append(output)
    # output scoring
    EM_score_ans.append(cal_em_score(output, actual_ans))
    sim_score_ans = calculate_semantic_similarity(model_ss, tokenizer_ss, output, actual_ans)
    Sbert_score_ans.append(sim_score_ans[1])
    NLP_score_ans.append(sim_score_ans[2])
    # context scoring
    EM_score_context.append(cal_em_score(main_context, actual_ans))
    sim_score_cnxt = calculate_semantic_similarity(model_ss, tokenizer_ss, main_context, actual_ans)
    Sbert_score_context.append(sim_score_cnxt[1])
    NLP_score_context.append(sim_score_cnxt[2])

# elapsed seconds for this variant
time.time() - st

### Generating one answer per top-5 context and keeping the best gives similar performance, but takes about 5 times as long [~60 min], because 5 answers are now generated for each question.

In [None]:
# Print the mean of every column from position 9 onwards, rounded to 2 decimals.
# NOTE(review): presumably position 9 is where the score columns start — verify
# against the column layout of the answers frame.
for col in answers.columns[9:]:
    col_mean = round(answers[col].mean(), 2)
    print(f"{col}_mean: {col_mean}")

## Longformer trial

In [None]:
# Trial: embed two short sentences with Longformer and compare them by cosine similarity.
from transformers import LongformerTokenizer, LongformerModel
from sklearn.metrics.pairwise import cosine_similarity

# Longformer checkpoint under trial; the commented-out name is the mpnet alternative.
m1 = "allenai/longformer-base-4096" #"navteca/multi-qa-mpnet-base-cos-v1"
# NOTE: this defines the notebook-level names `model` and `tokenizer`, which the
# "select best of 5" cell further up also reads.
model = LongformerModel.from_pretrained(m1)
tokenizer = LongformerTokenizer.from_pretrained(m1)
from dpp_helpline_qa.preprocessing.preprocessing import get_embeddings

# two short test sentences
text_list = ['this is good', 'this worse']

# embed both sentences with the project helper and convert the result to a NumPy array
emb = get_embeddings(text_list, model, tokenizer).cpu().detach().numpy()
# emb[[0]] / emb[[1]] select one row each while keeping a leading axis (assuming emb is 2-D)
cosine_similarity(emb[[0]], emb[[1]]) # observed 0.99, whereas with the mpnet model it comes out as 0.27, which makes more sense

The Longformer sentence embeddings don't make sense (two different sentences score ~0.99), so they can't be used.

In [None]:
import torch

# Tokenize both sentences into one padded batch of PyTorch tensors.
encoding = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")

# Global attention on the first token of every sequence, local attention on all
# other tokens. The mask must be a tensor shaped like input_ids. The previous
# `[1].extend(...)` expression evaluated to None (list.extend returns None), so
# no global attention mask was actually applied.
global_attention_mask = torch.zeros_like(encoding["input_ids"])
global_attention_mask[:, 0] = 1

encoding["global_attention_mask"] = global_attention_mask

# Only the sentence embeddings are needed here, not a question-answering head.

o = model(**encoding)

# use the hidden state of the first token as the sentence embedding
sentence_embedding = o.last_hidden_state[:,0]
n1 = sentence_embedding.cpu().detach().numpy()
# NOTE(review): the 0.99 recorded for this method was obtained while the global
# attention mask was None; re-run to confirm the value with the mask applied.
cosine_similarity(n1[[0]], n1[[1]])