In [None]:
import glob
import os

import pandas as pd
pd.set_option('display.max_colwidth', 255)
# project-local helpers: question answering, semantic search, pre-processing and validation
from dpp_helpline_qa.modelling.question_answer import load_model_qa, answer_question
from dpp_helpline_qa.modelling.semantic_search import load_model_ss, context_ranking
from dpp_helpline_qa.preprocessing.preprocessing import process_docs
from dpp_helpline_qa.model_validation.model_validation import cal_em_score, calculate_semantic_similarity

## Context Generation + GPT-J for QA

In [None]:
### AS, FAQ + openai


# list of files to search
files = [
    os.path.join("..", "data", "Inventory", "FAQs" + ".pdf"),
    os.path.join("..", "data", "Inventory", "Audit Standard" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 1" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 2" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 3" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 4" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 5" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 6" + ".pdf"),
    os.path.join("..", "data", "Inventory", "KAEG part 7" + ".pdf"),
    os.path.join("..", "data", "Materiality", "Audit FAQs" + ".pdf"),
    os.path.join("..", "data", "Materiality", "FAQs 2" + ".pdf"),
    os.path.join("..", "data", "Materiality", "Audit Standard" + ".pdf"),
    os.path.join("..", "data", "Materiality", "KAEG part 1" + ".pdf"),
    os.path.join("..", "data", "Materiality", "KAEG part 2" + ".pdf"),
    os.path.join("..", "data", "Materiality", "KAEG part 3" + ".pdf"),
    os.path.join("..", "data", "Materiality", "KAEG part 4" + ".pdf"),
]



# load and pre-process the documents to prepare for searching
import time
st = time.time()
para_dfs = process_docs(files, model_ss, tokenizer_ss, max_length) #'Cosine',  'IVF'
para_dfs[0].head()
time.time() -st


In [None]:
# load the model and tokenizer for semantic search
model_ss, tokenizer_ss = load_model_ss()
max_length = 512

In [None]:
# collect every pre-processed CSV (one sub-folder per topic) available for searching
csv_pattern = '../output/*/*.csv'
op_files = glob.glob(csv_pattern)
op_files

In [None]:
# automatic evaluation process
#
# For every row of LLM_QA.xlsx: rank the pre-processed paragraphs of the row's
# topic against the question, feed the top paragraphs plus the question to the
# causal LM, and score both the generated answer and the retrieved context
# against the reference answer.
import time
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, AutoTokenizer, AutoModel, AutoModelForCausalLM

# Generator model and tokenizer.
# Alternative tried previously: "EleutherAI/gpt-neo-2.7B" loaded through
# GPTNeoForCausalLM / GPT2Tokenizer.
model = AutoModelForCausalLM.from_pretrained('flyhero/gpt-j-6B')
tokenizer = AutoTokenizer.from_pretrained("gpt2")

st = time.time()
# NOTE: `header` is defined but is not prepended to the prompt below.
header = 'Answer the below question based on the given context:\n'
answers = pd.read_excel('LLM_QA.xlsx')

# One entry per evaluated question; consumed by the next cell, which attaches
# these lists to `answers` as new columns.
context = []
final_ans = []
EM_score_ans = []
Sbert_score_ans = []
NLP_score_ans = []
EM_score_context = []
Sbert_score_context = []
NLP_score_context = []

# Iterate over every row instead of a hard-coded range(8): the result lists
# must have exactly len(answers) entries to be assigned as columns afterwards,
# and a shorter sheet would otherwise raise a KeyError.
qa_rows = zip(answers['Question'], answers['Primary Topic'], answers['Answer'])
for question, topic, actual_ans in qa_rows:
    # restrict the search to the pre-processed files of this question's topic
    op_files = glob.glob('../output/'+topic+'/*.csv')
    context_df = context_ranking(question, op_files, model_ss, tokenizer_ss)
    # the first five ranked paragraphs form the context
    main_context = '\n'.join(context_df['content'].values[0:5])
    context.append(main_context)

    prompt = 'Context:' + main_context + "\n\n Question: " + question + "\n Answer:"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_new_tokens=200,
    )

    # Decode only the newly generated tokens. Splitting the full decoded text on
    # 'Answer:' returns the wrong span whenever the retrieved context or the
    # question itself contains that marker. Special tokens (e.g. end-of-text)
    # are skipped so they do not end up in the scored answer.
    prompt_len = input_ids.shape[1]
    generated_text = tokenizer.decode(gen_tokens[0][prompt_len:], skip_special_tokens=True)
    # cut the answer off if the model starts inventing a follow-up question
    output = generated_text.split('Question:')[0]
    final_ans.append(output)

    # output scoring
    # (elements [1] and [2] of the similarity result are stored as the
    #  "Sbert" and "NLP" scores respectively)
    EM_score_ans.append(cal_em_score(output, actual_ans))
    sim_score_ans = calculate_semantic_similarity(output, actual_ans)
    Sbert_score_ans.append(sim_score_ans[1])
    NLP_score_ans.append(sim_score_ans[2])

    # context scoring
    EM_score_context.append(cal_em_score(main_context, actual_ans))
    sim_score_cnxt = calculate_semantic_similarity(main_context, actual_ans)
    Sbert_score_context.append(sim_score_cnxt[1])
    NLP_score_context.append(sim_score_cnxt[2])

# elapsed wall-clock seconds for the whole evaluation
time.time() -st

In [None]:
# Attach the retrieved context, the generated answer and every score list to
# the question sheet (column order as listed), then persist it as CSV.
result_columns = {
    'Extracted context': context,
    'Final answer': final_ans,
    'EM_Score_ans': EM_score_ans,
    'Sbert_score_ans': Sbert_score_ans,
    'NLP_score_ans': NLP_score_ans,
    'EM_Score_context': EM_score_context,
    'Sbert_score_context': Sbert_score_context,
    'NLP_score_context': NLP_score_context,
}
for column_name, column_values in result_columns.items():
    answers[column_name] = column_values
answers.to_csv('ques_score.csv', index=False)

## Using GPUs

In [None]:
from transformers import pipeline
from cleantext import clean
import GPUtil

import pprint as pp
import os, gc

import logging 
import numpy as np
LOGGER = logging.getLogger()
def gpuname():
    """Return the model name of the first GPU reported by GPUtil.

    Returns:
        The ``name`` attribute of the first GPU, or the string ``"UNKNOWN"``
        when GPUtil fails while enumerating GPUs (a warning is logged).

    Raises:
        ValueError: if GPUtil succeeds but reports no GPUs.
    """
    try:
        gpus = GPUtil.getGPUs()
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate to the notebook.
        LOGGER.warning("Unable to detect GPU model. Is your GPU configured? Is Colab Runtime set to GPU?")
        return "UNKNOWN"
    if len(gpus) == 0:
        raise ValueError("No GPUs detected in the system")
    return gpus[0].name

def gpu_mem_total():
    """Return the total memory of the first GPU reported by GPUtil.

    Returns:
        The ``memoryTotal`` attribute of the first GPU, or ``np.nan`` when
        GPUtil fails while enumerating GPUs (a warning is logged).
        NOTE(review): callers divide this by 1024 to obtain GB, so the value is
        presumably in MB - confirm against the GPUtil documentation.

    Raises:
        ValueError: if GPUtil succeeds but reports no GPUs.
    """
    try:
        gpus = GPUtil.getGPUs()
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate to the notebook.
        LOGGER.warning("Unable to detect GPU model. Is your GPU configured? Is Colab Runtime set to GPU?")
        return np.nan
    if len(gpus) == 0:
        raise ValueError("No GPUs detected in the system")
    return gpus[0].memoryTotal
     

In [None]:

# Read total / used / free memory from the last line printed by `free -t -m`.
# The pipe is opened in a `with` block so it is closed (and the child process
# reaped) as soon as the output has been read.
# NOTE(review): with `-t` the last line is the "Total" row, which presumably
# includes swap as well as RAM; `-m` reports mebibytes - confirm with `man free`.
with os.popen('free -t -m') as free_output:
    totals_line = free_output.readlines()[-1]

# first field is the row label, the remaining three are total / used / free
cpu_total_memory, cpu_used_memory, cpu_free_memory = map(int, totals_line.split()[1:])

# total converted to GB; used / free are left in the unit reported by `free`
cpu_RAM_tot = round(cpu_total_memory / 1024, 2)
print(cpu_RAM_tot, cpu_used_memory, cpu_free_memory)
     

In [None]:
# Display the total memory of the first GPU as returned by gpu_mem_total()
# (that helper raises ValueError when no GPU is detected).
gpu_mem_total()

In [None]:
# Pick the largest text-generation model that the detected GPU memory and CPU
# RAM allow, then build a single `generator` pipeline on GPU 0 for it.
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# candidate checkpoints, largest to smallest
model_6B_pars = "flyhero/gpt-j-6B"
model_3B_pars = 'EleutherAI/gpt-neo-2.7B'
model_1B_pars = "EleutherAI/gpt-neo-1.3B"

# gpu_mem_total() is divided by 1024 to express it in GB, like cpu_RAM_tot
gpu_mem = round(gpu_mem_total() / 1024, 2)

# keyword arguments shared by every branch; a branch may add to them
pipeline_kwargs = {"device": 0}

if gpu_mem > 17 and cpu_RAM_tot > 36:
    print("using biggest 6B model. GPU - {} GB, CPU-RAM - {} GB".format(gpu_mem,
                                                                    cpu_RAM_tot))
    # Record the chosen checkpoint in `actual_model` in this branch too: a later
    # cell rebuilds the pipeline from `actual_model` and would otherwise fail
    # with a NameError when the 6B model is selected.
    actual_model = model_6B_pars
    # the 6B checkpoint is paired with an explicitly loaded "gpt2" tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    pipeline_kwargs["tokenizer"] = tokenizer
elif gpu_mem > 14 and cpu_RAM_tot > 16:
    print("using medium model. GPU - {} GB, CPU-RAM - {} GB".format(gpu_mem,
                                                                    cpu_RAM_tot))
    actual_model = model_3B_pars
else:
    actual_model = model_1B_pars
    print("using SMALLER model. GPU - {} GB, CPU-RAM - {} GB".format(gpu_mem,
                                                                    cpu_RAM_tot))
    print("using the smaller {} model".format(actual_model))

generator = pipeline('text-generation', model=actual_model, **pipeline_kwargs)

gc.collect()

In [None]:
# GPT
def clean_gpt_out(text, remove_breaks=True):
    """Normalise generated text with clean-text's ``clean``.

    Args:
        text: the raw text to clean.
        remove_breaks: when True line breaks are stripped completely instead of
            only being normalised.

    Returns:
        Whatever ``clean`` returns for ``text`` with the options below.
    """
    # what to fix, keep or strip
    switches = {
        "fix_unicode": True,            # fix various unicode errors
        "to_ascii": True,               # transliterate to closest ASCII representation
        "lower": False,                 # keep the original casing
        "no_line_breaks": remove_breaks,
        "no_urls": True,                # URLs are replaced
        "no_emails": True,              # email addresses are replaced
        "no_phone_numbers": True,       # phone numbers are replaced
        "no_numbers": False,            # numbers are kept
        "no_digits": False,             # digits are kept
        "no_currency_symbols": True,    # currency symbols are replaced
        "no_punct": False,              # punctuation is kept
    }
    # replacement token for each category (empty string = drop it)
    substitutes = {
        "replace_with_punct": "",
        "replace_with_url": "",
        "replace_with_email": "",
        "replace_with_phone_number": "",
        "replace_with_number": "",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "",
    }
    # lang="en"; 'de' would enable German special handling
    return clean(text, lang="en", **switches, **substitutes)

In [None]:
# Sample question used by the generation cells below.
ques = '''My test counts have identified differences between the physical number of items in stock and the number 
recorded in the accounts. Does this mean that I can't place any reliance on the stock count as a control?'''

# Hand-pasted sample context that is concatenated with `ques` into the prompt.
# NOTE(review): the text contains mis-decoded characters (e.g. "â€™", "â€“",
# "Í¾"), presumably from reading UTF-8 text with the wrong encoding. They are
# part of the string sent to the model, so they are left untouched here.
cc = '''My test counts have identified differences between the physical number of items in stock and the number recorded in the
accounts. Does this mean that I can't place any reliance on the stock count as a control?
No. If operating effectively, the client's stock count will detect and correct any errors in the stock balance recorded in the financial statements. Therefore a deviation is only identified in the control where KPMG's count differs from the client's final count and therefore an error has gone unnoticed.
What do I do if I identify an issue during an inventory count?
For each test count performed, there will be one of three outcomes:    
KPMG's and management's counts both agree to the inventory records: The inventory records are not misstated.
KPMG's and management's counts agree, but do not agree to the inventory records: In this case, management needs to update the inventory records. If the records are accurately updated, they are not misstated.
KPMG's and management's counts do not agree: See below.
We only get one opportunity to observe the inventory count and obtain the audit evidence we need. If we do not resolve issues on the day, they may be more difficult to resolve after the count.
If KPMG's and management's counts do not agree, we firstly recount and agree with management what the correct quantity should be. If we conclude that management had counted incorrectly, this may represent a control deviation in management's count control (if
we have not determined a tolerable difference)Í¾ this control deviation exists even if management updates the inventory records in agreement with our count result.
In all cases where a difference is identified between management's and KPMG's counts, it is important for the counter to discuss with the in-charge and manager whether further work is needed. This should be done BEFORE LEAVING SITE (or before finishing the virtual count) and ensures that senior members of the engagement team can consider the potential implications of any potential
deviations. This is true even if the individual count differences are not material, bearing in mind they may have a significant impact when we evaluate the control or the substantive sample (depending on the purpose of our count).
Additional information may be required around the nature and cause of the deficiency, or further test counts may be required.
Prior to leaving the count (or finishing the virtual count), it is important that we understand the process in place for any recounts and updates being made to the system. This may be later on the day of the count or potentially in the following few days after the count. The counter will need to ensure that they understand what follow up information they require, and request this prior to leaving (or finishing the virtual count). We also obtain, or request to be provided to us, copies of all count documentation (eg. count sheets) which demonstrate that all items due to be counted had been counted.
If your audit approach is a dual purpose approach where the substantive sample size has been set based on expected controls reliance you may need to reconsider RoMM and therefore sample sizes.
What do I need to bear in mind when I perform independent inventory counts (i.e. not counting at the same time as management's
counts)?
Remember that, even when we don't plan to test management's inventory count controls, ISA 501 requires us to attend management's inventory counting and perform certain specified procedures when inventory is material (ISA (UK) 501.4). This enables us to report back to management and those charged with governance if we have identified any concerns about the entity's inventory count procedures. Therefore, even when our substantive evidence will come from independent counts, we still need to attend at least one management count.   
Even where we don't plan to test the operating effectiveness of the entity's inventory count controls, counting at the same time as management may give us additional insight into management's count process, so can be preferable in that respect. We should, therefore, plan our substantive test counts alongside management's counts for at least a proportion of our count attendance, where possible and practical. If we have concerns with management's counts, then it may be appropriate to perform our own independent counts, and we should document our rationale for this approach.
Note that testing the entity's inventory count controls is often the most efficient and effective approach. When considering taking a fully substantive approach because the entity has not counted its inventory, or we anticipate their counts may be ineffective, first challenge management to remediate their processÍ¾ if management's process is deficient, our substantive count may identify misstatements that mean we need to ask management to investigate and correct the inventory records anyway, so it is better for them to avoid this by ensuring that their count process works before we move to our substantive testing.
Attendance at Physical Inventory Counting 
A1.  Management ordinarily establishes procedures under which inventory is physically  counted at least once a year to serve as a basis for the preparation of the financial  statements and, if applicable, to ascertain the reliability of the entityâ€™s perpetual  inventory system. 
A2.  Attendance at physical inventory counting involves: 
Inspecting the inventory to ascertain its existence and evaluate its condition, and  performing test counts; 
Observing compliance with managementâ€™s instructions and the performance of  procedures for recording and controlling the results of the physical inventory  count; and 
Obtaining audit evidence as to the reliability of managementâ€™s count procedures. 
These procedures may serve as test of controls or substantive procedures depending  on the auditorâ€™s risk assessment, planned approach and the specific procedures 
carried out. 
A3.  Matters relevant in planning attendance at physical inventory counting (or in designing  and performing audit procedures pursuant to paragraphs 4â€“8 of this ISA (UK)) include,  for example: 
The risks of material misstatement related to inventory. 
The nature of the internal control related to inventory. 
Whether adequate procedures are expected to be established and proper  instructions issued for physical inventory counting. 
The timing of physical inventory counting. 
Whether the entity maintains a perpetual inventory system. 
The locations at which inventory is held, including the materiality of the inventory  and the risks of material misstatement at different locations, in deciding at which  locations attendance is appropriate. ISA (UK) 600 (Revised June 2016)
 deals  with the involvement of other auditors and accordingly may be relevant if such  involvement is with regards to attendance of physical inventory counting at a  remote location. 
Whether the assistance of an auditorâ€™s expert is needed. ISA (UK) 620 (Revised  June 2016)
 deals with the use of an auditorâ€™s expert to assist the auditor to  obtain sufficient appropriate audit evidence. 
Evaluate Managementâ€™s Instructions and Procedures 
A4.  Matters relevant in evaluating managementâ€™s instructions and procedures for recording  and controlling the physical inventory counting include whether they address, for  example: 

'''

In [None]:
# Generate an answer for the sample question with the GPU pipeline, print the
# prompt and the cleaned response, and show the elapsed time.
import time
import pprint as pp

st = time.time()
# Colab form fields. Despite the "_chars" suffix both values are handed to the
# generator as TOKEN counts.
response_min_chars =  100#@param {type:"integer"}
response_max_chars =  300#@param {type:"integer"}

prompt1 = 'Context:' + cc + "\n\n Question: " + ques + "\n Answer:"
# The budget is passed as `max_new_tokens`, not `max_length`: `max_length`
# counts the prompt tokens as well, and this prompt alone is far longer than
# 300 tokens, which leaves no room for the answer.
# NOTE(review): `min_length` also counts the prompt, so with a prompt this long
# it is presumably already satisfied and has no effect - confirm.
response1 = generator(prompt1, do_sample=True, min_length=response_min_chars,
                      max_new_tokens=response_max_chars,
                      clean_up_tokenization_spaces=True,
                      return_full_text=True)
gc.collect()
print("Prompt: \n")
pp.pprint(prompt1)
print("\nResponse: \n")
# the pipeline returns a list of dicts; the text is under "generated_text"
out1_dict = response1[0]
pp.pprint(clean_gpt_out(out1_dict["generated_text"]), compact=True)
time.time()-st
     

In [None]:
# Ask PyTorch to release the cached GPU memory it is not currently using.
# NOTE(review): presumably meant to free room before the pipeline is rebuilt in
# the next cell - confirm.
import torch
torch.cuda.empty_cache()

In [None]:
# Rebuild the text-generation pipeline for `actual_model` without a `device`
# argument, sample an answer for the sample prompt and show the raw response
# together with the elapsed time.
import time
import pprint as pp

st = time.time()
response_min_chars =  100#@param {type:"integer"}
response_max_chars =  1000#@param {type:"integer"}

prompt1 = f"Context:{cc}\n\n Question: {ques}\n Answer:"

# no `device` is passed here, unlike the earlier pipeline built with device=0
generator = pipeline('text-generation', model=actual_model)

# sampling settings for this run
sampling_options = dict(do_sample=True, max_new_tokens=100, temperature=0.9)
response1 = generator(prompt1, **sampling_options)

gc.collect()
pp.pprint(response1)

time.time()-st
