In [3]:
import sys
import os
import numpy as np

SCRIPT_DIR = os.path.dirname(os.path.abspath("..."))
sys.path.append(os.path.dirname(SCRIPT_DIR))

from training.generate import (generate_response, load_model_tokenizer_for_generate, 
                               get_special_token_id, get_special_token_id)
from training.consts import END_KEY, PROMPT_FORMAT, RESPONSE_KEY_NL
from training.trainer import PROMPT_FORMAT, create_data_set_from_json_list
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm


# Load the test dataset (unseen during training)

In [4]:

# Read the saved dataset from disk, then keep only its "test" split.
dataset_splits = load_from_disk("/opt/home/bo_ling/dataset/doc_transcript_pii_data.hf")
test_data = dataset_splits["test"]
test_data

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 69978
})

# Load the checkpoint at step 19600 (12% of the total data in the first epoch)

In [18]:
import torch

# Directory holding the fine-tuned checkpoint (tokenizer and model weights).
local_output_dir = "/opt/home/doc_transcript_pii_checkpoint-19600"
#model, tokenizer = load_model_tokenizer_for_generate(local_output_dir)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(local_output_dir, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(local_output_dir, trust_remote_code=True)
model = model.to(device)

In [19]:
def generate_helpdesk_response(
    instruction: str,
    input_text: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    do_sample: bool = True,
    max_new_tokens: int = 256,
    top_p: float = 0.92,
    top_k: int = 0,
    **kwargs,
) -> "str | None":
    """Generate the model's response for a single instruction/input pair.

    The prompt is built from ``PROMPT_FORMAT`` with an empty ``output_text``, fed to
    ``model.generate``, and the tokens between the response key token and the end key
    token are decoded and returned.

    Args:
        instruction: Instruction text substituted into the prompt.
        input_text: Input text substituted into the prompt.
        model: Model whose ``generate`` method produces the tokens.
        tokenizer: Tokenizer used to encode the prompt and decode the response.
        do_sample: Forwarded to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        The decoded response with surrounding whitespace stripped, or ``None`` when the
        response key token cannot be found in the generated sequence.
    """
    # Imported here because the notebook never defines the `logger` the warning path needs.
    import logging

    prompt = PROMPT_FORMAT.format(instruction=instruction, input_text=input_text, output_text="")
    # Move the prompt to wherever the model lives instead of assuming a GPU is present.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    response_key_token_id = get_special_token_id(tokenizer, RESPONSE_KEY_NL)
    end_key_token_id = get_special_token_id(tokenizer, END_KEY)

    gen_tokens = model.generate(
        input_ids,
        pad_token_id=tokenizer.pad_token_id,
        # Ensure generation stops once it generates "### End"
        eos_token_id=end_key_token_id,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        **kwargs,
    )[0].cpu()

    # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
    # prompt, we should definitely find it.  We will return the tokens found after this token.
    response_positions = np.where(gen_tokens == response_key_token_id)[0]
    if len(response_positions) == 0:
        logging.getLogger(__name__).warning(
            f"Could not find response key {response_key_token_id} in: {gen_tokens}"
        )
        return None

    # Compare against None implicitly by returning early above: position 0 is a valid match.
    response_pos = int(response_positions[0])

    # Next find where "### End" is located.  The model has been trained to end its responses with this sequence
    # (or actually, the token ID it maps to, since it is a special token).  We may not find this token, as the
    # response could be truncated.  If we don't find it then just return everything to the end.  Note that
    # even though we set eos_token_id, we still see this token at the end.
    end_positions = np.where(gen_tokens == end_key_token_id)[0]
    # Only an end marker that comes after the response key can terminate the response.
    end_positions = end_positions[end_positions > response_pos]
    end_pos = int(end_positions[0]) if len(end_positions) > 0 else None

    return tokenizer.decode(gen_tokens[response_pos + 1 : end_pos]).strip()

# Print some sample outputs for a human to read

In [20]:
# Show the first few test examples with the generated and the expected answer.
separator = "=" * 100
count = 0
for count, d in enumerate(test_data, start=1):
    instruction = d["instruction"]
    input_text = d["input"]
    expected = d["output"]
    generated = generate_helpdesk_response(instruction, input_text, model, tokenizer)

    print(separator)
    print(separator)
    for heading, value in (
        ("INSTRUCTION:", instruction),
        ("\nINPUT:", input_text),
        ("\nGENERATED:", generated),
        ("\nEXPECTED:", expected),
    ):
        print(heading)
        print(value)

    # The check runs after printing, so 11 examples are shown before the loop stops.
    if count > 10:
        break

INSTRUCTION:
Extract first name from the following input:

INPUT:
NEW JERSEYMVC AUTO DRIVER LICENSE Ball Chief Administrator MECHA New Jersey Motor Vehicle Commission DL M0841 12083 10962 CLASS D DOB 10-25-1996 ISS 12-25-2020 EXP 10-25-2024 MATTHEWS ALP CHAD T 214 WEST 2ND STREET CLIFTON, NJ 07011-2462 END NONE RESTR 1 A SEY GENDER M HGT 5'-06" EYES BRN WX NJ WX202036000000002 DUP01 11.00

GENERATED:
Luis

EXPECTED:

Chad
INSTRUCTION:
Extract zip from the following input:

INPUT:
KANSAS Cat B DIRECTOR OF VEHICLES Dave in Har SECRETARY Work A. Bugh DRIVER'S LICENSE K02-46-6014 4d LIC. NO. 3 DOB 06/17/1990 BROWN 2 ARTHUR DONELL, JR 8 12126 E KILLENWOOD DR WICHITA, KS 67206-4124 9 CLASS C 15 SEX M 16 HGT 6'-01" 17 WGT 220 lb 18 EYES BRO 5 DD 90201528110 BA21020M2317FB USA KS NOT FOR FEDERAL ID 9a END NONE 12 REST B 06/17/1990 4a ISS 01/20/2021 4b EXP 06/17/2023

GENERATED:
67206

EXPECTED:

67206
INSTRUCTION:
Extract issue date from the following input:

INPUT:
California fuy USA DRIVER L

In [10]:
def normalize_str(s):
    """Normalize a string for a case- and whitespace-insensitive comparison.

    Newlines are removed, surrounding whitespace is stripped and the result is
    lower-cased.

    Args:
        s: The string to normalize, or ``None``.

    Returns:
        The normalized string; ``""`` when ``s`` is ``None`` (the generation helper in
        this notebook can return ``None``, which would otherwise raise AttributeError).
    """
    if s is None:
        return ""
    return s.replace("\n", "").strip().lower()
normalize_str("OSCAR") == normalize_str("\n\nOscar ")

True

# Compute accuracy per instruction

In [23]:
# Tally exact matches ("eq") and mismatches ("neq") per instruction text.
count = 0
statistics = {}
for d in test_data:
    instruction = d["instruction"]
    input_text = d["input"]
    generated = generate_helpdesk_response(instruction, input_text, model, tokenizer)
    expected = d["output"]

    stat = statistics.setdefault(instruction, {"eq": 0, "neq": 0})

    # generate_helpdesk_response returns None when it cannot locate the response;
    # score that as a mismatch instead of letting normalize_str fail on None.
    if generated is not None and normalize_str(generated) == normalize_str(expected):
        stat["eq"] += 1
    else:
        stat["neq"] += 1

    count += 1
    # The check runs after scoring, so 501 examples are evaluated before the loop stops.
    if count > 500:
        break

In [31]:
# Report the per-instruction accuracy and store it next to the raw counts.
print("******************** Performance for doc transcript pii *****************\n")
for task, tally in statistics.items():
    total = tally["eq"] + tally["neq"]
    tally["accuracy"] = tally["eq"] / total
    print(f"======= The accuracy for task `{task}`: {tally['accuracy']}  ========")

******************** Performance for doc transcript pii *****************



## Sample results when loading the earlier checkpoint 12000 (performance is worse)

In [17]:
#statistics #for check point 12000

{'Extract first name from the following input:': {'eq': 2, 'neq': 4},
 'Extract zip from the following input:': {'eq': 8, 'neq': 1},
 'Extract issue date from the following input:': {'eq': 7, 'neq': 0},
 'Extract license class from the following input:': {'eq': 8, 'neq': 0},
 'Extract last name from the following input:': {'eq': 1, 'neq': 4},
 'Extract drivers license number from the following input:': {'eq': 4,
  'neq': 1},
 'Extract middle name from the following input:': {'eq': 7, 'neq': 2},
 'Extract address from the following input:': {'eq': 7, 'neq': 1},
 'Extract driving licence issue state from the following input:': {'eq': 11,
  'neq': 0},
 'Extract city from the following input:': {'eq': 3, 'neq': 1},
 'Extract expiration date from the following input:': {'eq': 13, 'neq': 0},
 'Is the driving license valid for identification? :': {'eq': 3, 'neq': 0},
 'Extract gender from the following input:': {'eq': 7, 'neq': 1},
 'Extract dob from the following input:': {'eq': 5, 'neq': 0}