In [3]:
import sys
import os
import numpy as np

# abspath("...") resolves the literal name "..." against the current working
# directory, so taking its dirname yields the working directory itself.
SCRIPT_DIR = os.path.dirname(os.path.abspath("..."))
# Append the parent of that directory to the import search path.
# NOTE(review): presumably so project-local modules one level up can be
# imported — confirm; no such import is visible in these cells.
sys.path.append(os.path.dirname(SCRIPT_DIR))

from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    T5ForConditionalGeneration
)

# Load golden dataset that is unseen in training

In [4]:

# Read the saved dataset from local disk; the bare name on the last line makes
# the notebook display its summary (splits, features, row counts).
test_data = load_from_disk("/opt/home/bo_ling/dataset/eats_receipt_gcp_test.hf")
test_data

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 4
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 419
    })
})

In [5]:
# Peek at the first record of the 'test' split.
test_data['test'][0]

{'input_text': "What are the receipt_number, total_price, and total_tax in 'COUPONS ONLY Albertsons Store 4279 Dir MICHAEL STOCKTON Main:(817) 447-9106 Rx: (817) 447-8060 833 N E Alsbury Blvd Burleson TX 76028 CAT CHOW NATURALS Regular Price 14.49 You Save 0.50- 2 QTY OREO THINS Regular Price You Save GROCERY GROC NONEDIBLE PUFFS 3X PLUS FAM Regular Price You Save TAX **** BALANCE REFRIG/FROZEN CHOBANI YGRT PMGRN Regular Price You Save GEN MERCHANDISE SIG PETROLEUM JELY SCRUB BUBB AEROSOL SPONGE DADDY PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 Visa 10.98 5.49- 7.99 0.50- YOUR SAVINGS 1.59 0.09- Store Savings Total Total Savings Value CHANGE TOTAL NUMBER OF ITEMS SOLD = 02/26/23 08:10 4279 53 4 13.99 T YOUR CASHIER TODAY WAS SELF Credit Purchase 02/26/23 08:10 CARD # ************000 REF: 081033408580 AUTH: 00367287 47.31 5.49 S 7.49 T 1.50 S 2.79 X 5.99 T 6.99 T 3.07 47.31 47.31 0.00 8 8853 6.58 6.58 13% 00427905300042302260810 Thank you for shopping Albert

# Load checkpoint at step: 19600 (12% of total data in first epoch) 

In [6]:
# Directory holding the fine-tuned checkpoint (tokenizer files and model weights).
local_output_dir="/opt/home/bo_ling/dolly_training/eats_receipt_gcp_t5"
import torch
tokenizer = AutoTokenizer.from_pretrained(local_output_dir)
# Use the GPU with index 2 when CUDA is available, otherwise fall back to CPU.
device = "cuda:2" if torch.cuda.is_available() else "cpu"
# `trust_remote_code` is only honoured by the Auto* classes; passing it to the
# concrete T5 class has no effect and only produces a warning, so it is omitted.
model = T5ForConditionalGeneration.from_pretrained(local_output_dir).to(device)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [7]:
def _trim_special_tokens(text: str, tokens) -> str:
    """Remove whole special-token strings and whitespace from both ends of `text`."""
    # Empty / None markers are ignored (an empty marker would never shrink the text).
    markers = [tok for tok in tokens if tok]
    text = text.strip()
    trimmed = True
    # Repeat until stable so repeated markers ("<pad><pad> ... </s></s>") are all removed.
    while trimmed:
        trimmed = False
        for tok in markers:
            if text.startswith(tok):
                text = text[len(tok):].strip()
                trimmed = True
            if text.endswith(tok):
                text = text[: -len(tok)].strip()
                trimmed = True
    return text


def generate_transcipt_response(
    input_text: str,
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizer",
    PAD_TOKEN = "<pad>",
    EOS_TOKEN = "</s>",
    do_sample: bool = True,
    max_new_tokens: int = 768,
    top_p: float = 0.85,
    top_k: int = 0,
    **kwargs,
) -> str:
    """Generate the model's answer for one prompt and return it as clean text.

    Args:
        input_text: prompt to tokenize and feed to the model.
        model: model whose `generate` method produces the output tokens.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        PAD_TOKEN: pad marker removed from both ends of the decoded text.
        EOS_TOKEN: end-of-sequence marker; its first encoded id is passed to
            `generate` as `eos_token_id`, and the marker itself is removed
            from both ends of the decoded text.
        do_sample, max_new_tokens, top_p, top_k: forwarded to `model.generate`.
        **kwargs: any extra keyword arguments, forwarded to `model.generate`.

    Returns:
        The decoded first generated sequence with leading/trailing pad and
        EOS markers and surrounding whitespace removed.
    """
    # Move the inputs to the device the model lives on instead of relying on
    # a notebook-global `device` variable being defined.
    # NOTE(review): the prompt is not truncated; confirm over-length inputs
    # are acceptable for this model.
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    # The first id of the encoded EOS string is used as the stop token id.
    end_key_token_id = tokenizer.encode(EOS_TOKEN)[0]

    gen_tokens = model.generate(
        input_ids,
        pad_token_id=tokenizer.pad_token_id,
        # Stop generating as soon as the EOS token is produced.
        eos_token_id=end_key_token_id,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        **kwargs,
    )[0].cpu()

    # str.strip(chars) treats its argument as a set of characters, which would
    # also eat real trailing characters such as a final "s"; trim the whole
    # marker strings instead.
    decoded = tokenizer.decode(gen_tokens)

    return _trim_special_tokens(decoded, (PAD_TOKEN, EOS_TOKEN))

# Print some sample outputs for a human to read

In [9]:
# Show the prompt, the model's answer and the reference answer side by side
# for the leading rows of the test split (the loop stops once count exceeds 50).
count = 0
for count, d in enumerate(test_data['test'], start=1):
    input_text = d["input_text"]
    generated = generate_transcipt_response(input_text, model, tokenizer)
    expected = d['output_text']
    banner = "=" * 100
    print(banner, banner, sep="\n")
    for heading, content in (
        ("\nINPUT:", input_text),
        ("\nGENERATED:", generated),
        ("\nEXPECTED:", expected),
    ):
        print(heading)
        print(content)
    if count > 50:
        break


INPUT:
What are the receipt_number, total_price, and total_tax in 'COUPONS ONLY Albertsons Store 4279 Dir MICHAEL STOCKTON Main:(817) 447-9106 Rx: (817) 447-8060 833 N E Alsbury Blvd Burleson TX 76028 CAT CHOW NATURALS Regular Price 14.49 You Save 0.50- 2 QTY OREO THINS Regular Price You Save GROCERY GROC NONEDIBLE PUFFS 3X PLUS FAM Regular Price You Save TAX **** BALANCE REFRIG/FROZEN CHOBANI YGRT PMGRN Regular Price You Save GEN MERCHANDISE SIG PETROLEUM JELY SCRUB BUBB AEROSOL SPONGE DADDY PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 Visa 10.98 5.49- 7.99 0.50- YOUR SAVINGS 1.59 0.09- Store Savings Total Total Savings Value CHANGE TOTAL NUMBER OF ITEMS SOLD = 02/26/23 08:10 4279 53 4 13.99 T YOUR CASHIER TODAY WAS SELF Credit Purchase 02/26/23 08:10 CARD # ************000 REF: 081033408580 AUTH: 00367287 47.31 5.49 S 7.49 T 1.50 S 2.79 X 5.99 T 6.99 T 3.07 47.31 47.31 0.00 8 8853 6.58 6.58 13% 00427905300042302260810 Thank you for shopping Albertsons! Fo

Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors



INPUT:
What are the receipt_number, total_price, and total_tax in 'MR Albertsons Store 3129 Dir Ron Sousa Main: (661) 827-0394 3500 Panama Lane BAKERSFIELD CA 93313 GEN MERCHANDISE VICKS VAPORUB NSL Regular Price Sale Savings VAPOPATCH ADULT Regular Price 10.99 1.00- MISCELLANEOUS RCYCBLE BAG CHARGE TAX **** BALANCE 11.99 Sale Savings RICOL CGH DROP NTL 2.99 T Regular Price Sale Savings PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 2.00- YOUR SAVINGS 3.29 0.30- Store Savings Total Total Savings Value Visa CHANGE TOTAL NUMBER OF ITEMS SOLD = 02/26/23 19:39 3129 52 168 Credit Purchase 02/26/23 19:39 CARD # ************000 REF: 463945436270 AUTH: 00393584 YOUR CASHIER TODAY WAS SELF 9.99 X 9.99 X 24.97 0.10 1.90 24.97 24.97 0.00 3 8852 3.30 3.30 13% 00312905201682302261939 Thank you for shopping Albertsons! For ALBERTSONS FOR U questions call 877-276-9637 or Albertsons.com/foru'?

GENERATED:
receipt_number: 00312905201682302261939; total_price: 24.97; total_tax


INPUT:
What are the receipt_number, total_price, and total_tax in '10.00 SAFEWAY S Store 3501 Dir Nick Ericson Main: (253) 838-9941 2109 S.W. 336th St. FFEDERAL WAY WA MR REFRIG/FROZEN LUCERNE WHOLE MILK DELI GAT OPN NAT FLATBREAD MISCELLANEOUS RCYCBLE BAG CHARGE TAX **** BALANCE Join the Safeway Club today. Membership is Free and Instant YOUR CASHIER TODAY WAS SELF 4.49 S 4.29 S Debit Purchase 02/27/23 15:09 CARD # ************000 PRIMARY TOTAL TRANSACTION AMOUNT: 8.87 CASH BACK AMOUNT: 0.00 REF: 690955113840 AUTH: 00115269 Debit CHANGE TOTAL NUMBER OF ITEMS SOLD = 02/27/23 15:09 3501 54 99 12U HTIW 32 TROVOA 0.08 T 0.01 8.87 8.87 0.00 2 8854 0035010540099230227150940 Thank you for shopping Safeway! For SAFEWAY FOR U questions call 877-276-9637 or Safeway.com/foru D'?

GENERATED:
receipt_number: 0035010540099230227150940; total_price: 8.87; total_tax: 0.01

EXPECTED:
receipt_number: ; total_price: 8.87; total_tax: 0.01

INPUT:
What are the receipt_number, total_price, and total_tax i


INPUT:
What are the receipt_number, total_price, and total_tax in 'Albertsons Store 4131 Dir Rickie Farley Main:(323) 295-1919 Rx: (323) 295-3330 3901 Crenshaw Blvd LOS ANGELES CA 90008 GEN MERCHANDISE REUSABLE BAG 2 QTY PULL-UPS L Regular Price Sale Savings TAX **** BALANCE PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 Visa Credit Purchase 02/26/23 18:40 CARD # ************000 REF: 464053406700 AUTH: 00029064 27.98 YOUR SAVINGS 2.00- Store Savings Total Total Savings Value YOUR CASHIER TODAY WAS Vicente 15-18 kg 26.08 CHANGE 0.00 TOTAL NUMBER OF ITEMS SOLD = 3 02/26/23 18:40 4131 3 283 7516 0.10 25.98 0.00 26.08 26.08 2.00 2.00 7% 00413100302832302261840 Thank you for shopping Albertsons! For ALBERTSONS FOR U questions call 877-276-9637 or Albertsons.com/foru'?

GENERATED:
receipt_number: 00413100302832302261840; total_price: 26.08; total_tax: 0.0

EXPECTED:
receipt_number: ; total_price: 26.08; total_tax: 0.0

INPUT:
What are the receipt_number, total_pric


INPUT:
What are the receipt_number, total_price, and total_tax in 'bet 192 & bean stan bvia noti que woarr SAFEWAYS. Store 253 Dir Jonathan Boggess Main:(480) 982-6120 Rx: (480) 288-2728 3185 W. Apache Trail APACHE JUNCTION AZ 85220 GROCERY noi OST SAMT DEW CODE RED T e cvri Regular Price TE-7.9908:4 Member Savings 1.00- NBC OREO COOKIES Regular Price ATE 6.49 Member Savings 1.50- 223M12 REFRIG/FROZEN TAVA UT LUC LUC COTTAGE CHEESE 1 QTY MATTEL HOT WHEELS (AS) 20 MACCHUDP MEAT 1 QTY GEN MERCHANDISE BAR-S FRANKS TAX **** BALANCE 13220 PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 6.99 B 4.99 B Credit Purchase 02/27/23 19:09 CARD # ****** **000 REF: 890928445820 AUTH: 004066324ANOXIA BIEDT DS 3.49 B YOUR SAVINGS Member Savings 2.40% SALES TAX 6.70% SALES TAX TOTAL TAX TOTAL NUMBER OF ITEMS SOLD = 02/27/23 19:09 253 54 268 Total 80- Total Savings Value 1.25 T 1.00 B 0.51 18.23 22ALD OTUA Visa DOA'S 18.23 CHANGE 18.23 YOUR CASHIER TODAY WAS SELF 0.00 0.43 0.08 0.51 5 885


INPUT:
What are the receipt_number, total_price, and total_tax in 'E FYTYYTE (2 HONE APPS 3 A DISC: USB * iPhone 81 Media B  x Shuffle crazy Zac Gi crazy 1/1 PM 00:0 AUTO Chap Stick Albertsons Store 590 Dir Chad Onchi Main:(503) 591-7557 Rx: (503) 591-0997 MR 7500 E Main St Hillsboro OR 97123 GRANOLA BAR TAX **** BALANCE ON/ OFF REFRIG/FROZEN 2 QTY LUC WHOLE LUC YOGURT STRWBRY Regular Price You Save PAYMENT AMOUNT GROCERY MISCELLANEOUS RCYCBLE BAG CHARGE 0.10 0.00 14.36 On Credit Purchase 02/28/23 07:33 CARD # ************000 bom100bni of of REF: 953328421740 AUTH: 00854941 SYNC ----- AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 Store Savings Total YOUR CASHT MOUNT 374 Visa CHANGE TOTAL NUMBER OF ITEMS SOLD = 02/28/23 07:33 590 51 3  YOUR CASHIER TODAY WAS SELF YOUR SAVINGS 2.79 0.30- * < >$ 14.36 CLAW DROGE VORO 723807 02/27/23 15 TOTAL NUME ||||| 00059005100032302280733 Thank you for shopping Albertsons! For ALBERTSONS FOR U questions call 877-276-9637 or Albertsons.com


INPUT:
What are the receipt_number, total_price, and total_tax in '53 SAFEWAY S Store 1513 Dir Shayne Adams Main: (410) 319-8591 Rx: (410) 319-8620 4401 Harford Road BALTIMORE MD 21214 GROCERY DOMINO SGR GRNLTD DEER PARK NATURAL. Regular Price 6.99 Member Savings 2.49- REFRIG/FROZEN INT DEL CREAMER Regular Price 5.19 Member Savings 1.69- TAX **** BALANCE PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 Visa Credit Purchase 02/28/23 08:59 CARD # ************000 REF: 365937419420 AUTH: 00633721 CHANGE TOTAL NUMBER OF ITEMS SOLD = 02/28/23 08:59 1513 53 13 YOUR CASHIER TODAY WAS SELF YOUR SAVINGS 13.56 Member Savings Total Total Savings Value 5.29 S 4.50 B 3.50 S 0.27 13.56 13.56 0.00 3 8853 4.18 4.18 24% HOW WAS YOUR SHOPPING EXPERIENCE? WE VALUE YOUR FEEDBACK! SCAN THE QR CODE BELOW FOR A TWO MINUTE SURVEY 00151305300132302280859 Thank you for shopping Safeway! For SAFEWAY FOR U questions call 877-276-9637 or Safeway.com/foru'?

GENERATED:
receipt_number: 001513


INPUT:
What are the receipt_number, total_price, and total_tax in '+ 1.75 E ON A MAC CS1-18  BEH YEBEK12002 2003 DE 300-3-1-410311020 VTBEBIZON2 2000 DIE EOB VDAEBI12IMG Cy 132A110801 AAlbertsons Store 4009 Dir Margaret Benedict Main: (702) 658-2030 Rx: (702) 658-6669 8410 Farm Road 118 Las Vegas NV 89131 SW 180 Far Va GROCERY 101 4 QTY SIG WTR PU REFRIG/FROZEN HA BLUE DIAMOND ALMD Regular Price Sale Savings TAX **** BALANCE AMADIR PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 68001 Credit Purchase 02/26/23 19:15 byens CARD # ************000m bns obulitis yage moy REF: 991530438060 AUTH: 00217062 MOD.2A03VWOROWOHZ Store Savings Total Visit RAD JEM POST 4.49 0.50- YOUR SAVINGS worl2 T238 ZAGV CAT Zaz TOTAL NUMBER OF ITEMS SOLD = 02/26/23 19:15 4009 96 32AHJAUS 211UTIW 211 1049 190-1012 YOUR CASHIER TODAY WAS SELF www.w 19iden3 91012 of qqs TUTA Visas AS9V2A3.09 35 23.95 DANCE YOU CHANGE 19.96 S 23.95 vanom 1028 160 baills-blo3 0.00 23.95 dovleast framegane cos869 


INPUT:
What are the receipt_number, total_price, and total_tax in 'ENVOYDAEBLIZI Albertsons H2AW 170 MERFRORJAH Store 981 Dir Casey Dietz AMTRCBJA 1.Jemolaup 190 naque C Main: (480) 899-7102 Rx:(480) 899-7713 3145 S Alma School Road Chandler AZ 85248 85248 T GEN MERCHANDISE AVEENO BODY WASH CORA TMPNS DUO PK TAX **** BALANCE PAYMENT AMOUNT AL VISA CREDIT AID A0000000031010 TVR 8000008000 TSI 6800 Credit Purchase 02/26/23 12:18 CARD # ************000 REF: 381854491340 AUTH: 00491471 Visa AHO 10.99 T 12.99 X Moson CHANGE 1.5% SALES TAX 6.3% SALES TAX TOTAL TAX TOTAL NUMBER OF ITEMS SOLD = 02/26/23 12:18 981 54 76 YOUR CASHIER TODAY WAS SELF 1.87 25.85 MAK 25.85 25.850NON 000 a 0.00 0.36 1.51 1.87RA 8854 2 00098105400762302261218 Thank you for shopping Albertsons! For ALBERTSONS FOR U questions call 877-276-9637 or Albertsons.com/foru seluayingsini-warnia 100 10 S'?

GENERATED:
receipt_number: 00098105400762302261218; total_price: 25.85; total_tax: 1.87

EXPECTED:
receipt_number: ; total

In [12]:
def str_to_json(js, do_print_exception=False):
    """Parse a "key: value; key: value" answer string into a dict.

    The whole string is lower-cased and split on ";" into fields; each field
    is split on its first ":" into a key and a value, both stripped of
    surrounding whitespace. Values of "total_price" and "total_tax" are
    converted to float; every other value is kept as a string. Fields that
    cannot be parsed (no ":", or a non-numeric price/tax) are left out.

    Args:
        js: answer string to parse.
        do_print_exception: when True, print `js` once for every field that
            could not be parsed.

    Returns:
        dict mapping each successfully parsed field name to its value.
    """
    res = {}
    for ss in js.lower().split(";"):
        try:
            # Split on the first ":" only, so a value that itself contains ":"
            # is kept instead of the whole field being discarded.
            k, v = ss.split(":", 1)
            if k.strip() in ["total_price", "total_tax"]:
                res[k.strip()] = float(v.strip())
            else:
                res[k.strip()] = v.strip()
        except ValueError:
            # Raised by the unpacking (no ":") or by float() (non-numeric value);
            # parsing stays best-effort, but unrelated errors are no longer hidden.
            if do_print_exception:
                print(js)
    return res
str_to_json("receipt_number: 00400505302182302271921; total_price: 10.98; total_tax: 0.0")

{'receipt_number': '00400505302182302271921',
 'total_price': 10.98,
 'total_tax': 0.0}

# Compute accuracy based on instruction on golden dataset

In [13]:
# Per-field tally of test rows where the parsed model answer equals the parsed
# reference answer.
statistics = dict.fromkeys(("receipt_number", "total_price", "total_tax"), 0)
count = 0
for d in test_data['test']:
    input_text = d["input_text"]
    raw_answer = generate_transcipt_response(input_text, model, tokenizer)
    # True: print the raw answer whenever one of its fields cannot be parsed.
    generated = str_to_json(raw_answer, True)
    expected = str_to_json(d['output_text'])
    for k in expected:
        v = expected[k]
        if k not in generated:
            continue
        if v == generated[k]:
            statistics[k] += 1

In [15]:
# Number of evaluated rows is the size of the 'test' split. len(test_data) is
# the number of splits in the DatasetDict, not the number of rows, so it must
# not be used as the denominator shown in the message.
num_test_rows = len(test_data['test'])
for k, v in statistics.items():
    print(f"=========Correctly extracted {v} {k} among {num_test_rows}, accuracy is {v/num_test_rows} ")

