In [1]:
import sys
import os
import numpy as np

# Extend sys.path so modules outside the working directory can be imported.
# NOTE(review): "..." is a literal three-dot name, not a parent-directory
# reference, so abspath("...") is <cwd>/... ; SCRIPT_DIR is therefore the
# working directory and its parent is what gets appended — confirm that is the
# intended directory (presumably "..." is a placeholder for the notebook path).
SCRIPT_DIR = os.path.dirname(os.path.abspath("..."))
sys.path.append(os.path.dirname(SCRIPT_DIR))

from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    T5ForConditionalGeneration
)

# Load the test dataset (unseen during training)

In [2]:

# Read the saved dataset from local disk; the absolute path is specific to the
# machine this notebook was run on.
test_data = load_from_disk("/opt/home/bo_ling/dataset/eats_receipt_gcp_v3_test.hf")
# Bare last expression so the notebook renders the dataset summary.
test_data

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 350
    })
})

In [3]:
# Show the first record of the 'test' split to inspect the input/output text format.
test_data['test'][0]

{'input_text': "Extract 23-digit receipt_number, total_price, and total_tax from 'Albertsons\nStore 199 Dir BRIAN CONLEY\n1219 Broadway Avenue\nBOISE ID 83706\n(208)336-5278\nGROCERY\nLACROIX 8PK\n4.59 B\n2 QTY SIG WATER\n5.98 B\nRegular Price\n7.58\nYou Save\n1.60-\nBARILLA PASTA\n1.99 B\nRegular Price\n2.49\nYou Save\n0.50-\nREFRIG/FROZEN\n5.29 B\nCREAM CHEESE\nTAX\n1.07\nTAX EXEMPTION\n1.07-\n**** BALANCE\n17.85\nCredit Purchase 02/26/23 14:26\nCARD # ************000\nREF: 882607465070 AUTH: 00736188\nPAYMENT AMOUNT\n17.85\nAL VISA CREDIT\nAID A0000000031010\nTVR 8000008000\nTSI 6800\n17.85\nVisa\n0.00\nCHANGE\nTOTAL NUMBER OF ITEMS SOLD = 5\nTAX EXEMPT ID: ***\n***000\n02/26/23 14:26 199 1 102 8729\nYOUR CASHIER TODAY WAS JACK\nYOUR SAVINGS\n2.10\nStore Savings\n2.10\nTotal\n11%\nTotal Savings Value\n00019900101022302261426\nThank you for shopping Albertsons!\nFor ALBERTSONS FOR U questions call\n877-276-9637 or Albertsons.com/foru'?",
 'output_text': 'receipt_number: 0001990010102

# Load the trained T5-3B model

In [4]:
# Directory holding the fine-tuned checkpoint together with its tokenizer files.
local_output_dir="/opt/home/bo_ling/dolly_training/eats_receipt_gcp_t5_v3"
import torch
tokenizer = AutoTokenizer.from_pretrained(local_output_dir)
# Prefer the third GPU (the original choice) when the host actually has one;
# otherwise fall back to the first GPU, or to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda:2" if torch.cuda.device_count() > 2 else "cuda:0"
else:
    device = "cpu"
# `trust_remote_code` only applies to the Auto* classes; on a concrete model
# class it is ignored with a warning, so it is not passed here.
model = T5ForConditionalGeneration.from_pretrained(local_output_dir).to(device)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [5]:
def generate_transcipt_response(
    input_text: str,
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizer",
    PAD_TOKEN = "<pad>",
    EOS_TOKEN = "</s>",
    do_sample: bool = True,
    max_new_tokens: int = 768,
    top_p: float = 0.85,
    top_k: int = 0,
    **kwargs,
) -> str:
    """Generate the model's answer for one prompt and return it as clean text.

    Args:
        input_text: Prompt to feed to the model.
        model: Model whose ``generate`` method produces the output tokens.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        PAD_TOKEN: Padding marker removed from both ends of the decoded text.
        EOS_TOKEN: End-of-sequence marker; its first token id stops generation
            and the marker is removed from both ends of the decoded text.
        do_sample: Forwarded to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        **kwargs: Any further keyword arguments for ``model.generate``.

    Returns:
        The decoded generation with surrounding pad/EOS markers and whitespace
        removed.
    """
    # Place the prompt on the same device as the model instead of relying on a
    # notebook-level `device` variable.
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    # First id of the encoded EOS marker is used as the stop token.
    end_key_token_id = tokenizer.encode(EOS_TOKEN)[0]

    gen_tokens = model.generate(
        input_ids,
        pad_token_id=tokenizer.pad_token_id,
        # Ensure generation stops once the EOS token is produced.
        eos_token_id=end_key_token_id,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        **kwargs,
    )[0].cpu()

    decoded = tokenizer.decode(gen_tokens).strip()

    # Remove the markers as whole tokens. `str.strip(marker)` would treat the
    # marker as a set of characters and eat real trailing letters such as "s"
    # or "d" from the answer.
    markers = [marker for marker in (PAD_TOKEN, EOS_TOKEN) if marker]
    changed = True
    while changed:
        changed = False
        for marker in markers:
            if decoded.startswith(marker):
                decoded = decoded[len(marker):].lstrip()
                changed = True
            if decoded.endswith(marker):
                decoded = decoded[:-len(marker)].rstrip()
                changed = True

    return decoded

# Print some sample outputs for a human to read

In [6]:
# Show prompt, model output and reference for the first 51 test records so a
# person can eyeball the extraction quality.
count = 0
banner = "="*100
for count, record in enumerate(test_data['test'], start=1):
    prompt = record["input_text"]
    prediction = generate_transcipt_response(prompt, model, tokenizer)
    reference = record['output_text']
    # One print call, newline-separated, emits the same lines as before.
    print(
        banner,
        banner,
        "\nINPUT:",
        prompt,
        "\nGENERATED:",
        prediction,
        "\nEXPECTED:",
        reference,
        sep="\n",
    )
    # Stop once more than 50 records have been shown.
    if count > 50:
        break


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'Albertsons
Store 199 Dir BRIAN CONLEY
1219 Broadway Avenue
BOISE ID 83706
(208)336-5278
GROCERY
LACROIX 8PK
4.59 B
2 QTY SIG WATER
5.98 B
Regular Price
7.58
You Save
1.60-
BARILLA PASTA
1.99 B
Regular Price
2.49
You Save
0.50-
REFRIG/FROZEN
5.29 B
CREAM CHEESE
TAX
1.07
TAX EXEMPTION
1.07-
**** BALANCE
17.85
Credit Purchase 02/26/23 14:26
CARD # ************000
REF: 882607465070 AUTH: 00736188
PAYMENT AMOUNT
17.85
AL VISA CREDIT
AID A0000000031010
TVR 8000008000
TSI 6800
17.85
Visa
0.00
CHANGE
TOTAL NUMBER OF ITEMS SOLD = 5
TAX EXEMPT ID: ***
***000
02/26/23 14:26 199 1 102 8729
YOUR CASHIER TODAY WAS JACK
YOUR SAVINGS
2.10
Store Savings
2.10
Total
11%
Total Savings Value
00019900101022302261426
Thank you for shopping Albertsons!
For ALBERTSONS FOR U questions call
877-276-9637 or Albertsons.com/foru'?

GENERATED:
receipt_number: 00019900101022302261426; total_price: 17.85; total_tax: 1.07

EXPECTED:
receipt_numbe

Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors



INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'Albertsons
Store 359 Dir Kevin Thill
Main: (818) 894-6415
9022 Balboa Blvd
NORTHRIDGE CA 91325
GROCERY
SUNCHIPS SALSA
3.99 S
Regular Price
4.19
Sale Savings
0.20-
CHEETOS CRUNCHY
3.99 S
Regular Price
6.59
Sale Savings
2.60-
SMARTFOOD POPCORN
3.99 S
Regular Price
6.59-
Sale Savings
2.60-
1 QTY
REESES MILK CHOC
1.67 S
Regular Price
2.50
Sale Savings
0.83-0
NBC OREO MLK CKY
4.99 S
Regular Price
5.79
Sale Savings
0.80-
GEN MERCHANDISE
2 QTY REUSABLE B
0.20
PRODUCE
RASPBERRIES RED
3.99 S
STRAWBERRIES 1LB
3.99 S
Regular Price
4.99
Sale Savings
1.00-
BLUEBERRIES 6OZ
3.50 S
Regular Price
3.99
Sale Savings
0.49-
TAX
0.00
TRX EXEMPTION
0.00
**** BALANCE
30.31
Credit Purchase 02/24/23 22:25
CARD # ************000
REF: 732555465990 AUTH: 00607330
PAYMENT AMOUNT
30.31
AL VISA CREDIT
AID A0000000031010
TVR 8000008000
TSI 6800
Visa
30.31
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
10
TAX EXEMPT ID: Not Available
02/24/23 22:25 359


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'SAFEWAY S
EU
Store 1627 Dir Alex Argetsinger
Main:(503) 772-4440 Rx: (503) 772-4445
3940 S.E. Powell
PORTLAND OR 97202
GROCERY
CHEEZ IT CRCKR
7.99 S
ROLD GOLD TWISTS
3.79 S
Regular Price
3.99
Member Savings
0.20-
2 QTY CHEETOS CR
11.38 S
Regular Price
13.18
Member Savings
1.80-2
HSHY ASTD CHOC PAR. 12.99 S
Regular Price
15.49
Member Savings
2.50-
OREO COOKIES DBL
6.29 S
Regular Price
6.79
Member Savings 0.50-
TAKIS FUEGO FIESTA. 6.49 S
Regular Price
6.99
Member Savings 0.50-
MISCELLANEOUS
MR
2 QTY RCYCBLE BA
0.20
M04003 ACT
0.2025
SUR
0.00
**** BALANCE
49.13
Credit Purchase 02/27/23 18:12
CARD # *******
**000
REF: 221241402500 AUTH: 00647810
Ami
PAYMENT AMOUNT
49.13
AL VISA CREDIT
AID A0000000031010
TVR 8000008000
TSI 6800
HO T
Visa
49.13
910 10 5
10m 110 06
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
7
02/27/23 18:12 1627 51 131
8851
YOUR CASHIER TODAY WAS SELF
YOUR SAVINGS
Member Savings
5.50
Total
5.50
Total Savi


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from '110-
SAFEWAY.
Store 1439 Dir Team 1439
Main:(408) 481-3300 Rx (408) 481-3302
785 East El Camino Real
SUNNYVALE CA 94807
GROCERY
BC FUDGE BRWNI MIX
3.49 S
BC CHOC CHIP COOKI
4.49 S
AJUOY J.
MOD
DELI JOYAS
BH MEAT CHS TRAY
15.99 S
MISCELLANEOUS
MR
RCYCBLE BAG CHARGE
0.10
TAX
0.00
**** BALANCE
24.07
Credit Purchase 02/27/23 21:36
CARD # ************000
MEMO
REF: 993648415370 AUTH: 00177704
PAYMENT AMOUNT
24.07
UST
AL VISA CREDIT
AID A0000000031010 us
TVR 8000008000
TSI 6800
Visa
24.07
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
3
02/27/23 21:36 1439 53 178
8853
YOUR CASHIER TODAY WAS SELF
00143905301782302272136
Thank you for shopping Safeway!
For SAFEWAY FOR U questions call
877-276-9637 or Safeway.com/foru moz
53'?

GENERATED:
receipt_number: 00143905301782302272136; total_price: 24.07; total_tax: 0.0

EXPECTED:
receipt_number: 00143905301782302272136; total_price: 24.07; total_tax: 0.0

INPUT:
Extract 23-digit recei


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'SAFEWAY S
Store 2606 Dir Dave Papalias
Main: (415) 633-1001 Rx: (415) 633-1020
298 King St.
SAN FRANCISCO CA 94107
GEN MERCHANDISE
SWIFFER SWEEPER
6.99 T
SWIFFER WET HVY AC
12.99 T
GILLETTE
3.49 T
G-U-M FLSSRS EZTHRAC
3.49 T
2.33
TAX
29.29
**** BALANCE
Credit Purchase 02/27/23 19:42
CARD # ************000
REF 824208461040 AUTH: 00150581
29.29
PAYMENT AMOUNT
AL
AID A0000000031010
TVR 0000000000
TSI 0000
Visa
29.29
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
4
02/27/23 19:42 2606 96 204
8896
Join the Safeway Club today.
Membership is Free and Instant
YOUR CASHIER TODAY WAS SELF
00260609602042302271942
Thank you for shopping Safeway!
For SAFEWAY FOR U questions call
877-276-9637 or Safeway.com/foru'?

GENERATED:
receipt_number: 00260609602042302271942; total_price: 29.29; total_tax: 2.33

EXPECTED:
receipt_number: 00260609602042302271942; total_price: 29.29; total_tax: 2.33

INPUT:
Extract 23-digit receipt_number, tota


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'AIMEE
SAFEWAY S
Store 1620 Dir Janice Hill
Main: (301) 670-1626 Rx: (301) 670-1631
20211 Goshen Road
GAITHERSBURG MD 20879
GROCERY
TOP RAMEN BEEF
0.75 S
Regular Price
0.79
Member Savings
0.04-
NISSAN TOP RAMEN 3.50 S
Regular Price
3.79
Member Savings
0.29-
NISSIN NOODLE BOWL.
1.50 S
Regular Price
1.69
Member Savings 0.19-
IBC ROOT BEER
5.69 B
MIKES MIG SOUP VGT
2.99 S
REFRIG/FROZEN
TURKEY HILL VAN
7.19 S
BUBBIES ICE CREAM
6.49 B
MISCELLANEOUS
MR 2 QTY DSPSBL BAG
0.10
TAX
0.73
**** BALANCE
28.94
Credit Purchase 02/28/23 08:58
CARD # ************000
REF: 645836434040 AUTH: 00022247
PAYMENT AMOUNT
28.94
AL VISA CREDIT
AID A0000000031010
TVR 8000008000
TSI 5800
28.94
Visa
0.00
CHANGE
0.73
6% SALES TAX
TOTAL NUMBER OF ITEMS SOLD =
7
8822
02/28/23 08:58 1620 176
4
YOUR CASHIER TODAY WAS SELF
YOUR SAVINGS
0.52
Member Savings
0.52
Total
HOW WAS YOUR SHOPPING EXPERIENCE?
WE VALUE YOUR FEEDBACK!
SCAN THE QR CODE BELOW FOR 


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'Store 1709 Dir Domingo Segcovio
Main: (650) 854-3056
325 Sharon Park Drive
MENLO PARK CA 94025
GROCERY
PEP FRM RASBRY MIL.
4.99 S
Regular Price
5.49
Member Savings
0.50-
PEP FARM COOKIE
4.99 S
Regular Price
5.49
Member Savings
0.50-
2EP FARM MILANO
4.99 S
Regular Price
5.49
Member Savings
0.50-
BISCOFF COOKIES
3.50 S
Regular Price
4.49
Member Savings
0.99-
CLUB CRACKERS ORIG. 4.99 S
Regular Price
5.49
Member Savings
0.50-
2 QTY SP STRAWBE
3.98 S
NBC CRACKERS
3.99 S
Regular Price
5.79
Member Savings
1.80-
2 QTY PADE ZERO
3.00 S
CRV SFTDK SNGL NTX
0.20 S
Regular Price
3.98
Member Savings
0.98-
2 QTY FER ROCHER
10.98 S
Regular Price
13.98
Member Savings
3.00-
MISCELLANEOUS
MR
SAFEWAYS.
RCYCBLE BAG CHARGE
0.25
TAX
0.00
**** BALANCE
45.86
Credit Purchase 02/27/23 20:24
CARD # ************000
REF: 272434404700 AUTH: 00967612
PAYMENT AMOUNT
45.86
AL
AID A0000000031010
TVR 0000000000
TSI 0000
Visa
45.86
CHANGE
0.00
TOTAL


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'GURA
Store 3129 Dir Ron Sousa
Main: (661) 827-0394
3500 Panama Lane
BAKERSFIELD CA 93313
snack is
print
s or Com
GROCERY
52-4477
GMILLS CINN TST
6.49
S
DORITOS TORTILLA
5.59 S
olay.com
to 4:30pm CT
Regular Price
6.59
Sale Savings
1.00-
KELLOGGS APPLE
4.00 S
Regular Price
6.99
Sale Savings
2.99-
NBC CHIPS AHOY CHU 4.99 S
NBC RITZ FRESH
4.99 S
Regular Price
5.99
Sale Savings
1.00-
REFRIG/FROZEN
2 QTY SIG LUCERN
9.98 S
KNUDSEN COTTAGE
4.49 S
Regular Price
5.99
Sale Savings
1.50-
BAKED GOODS
2 QTY ENTENM BRW
8.00 S
Regular Price
11.58
Sale Savings
3.58-
Flavor
MISCELLANEOUS
MR
Albertsons
5 QTY RCYCBLE BA
0.50
TAX
0.00
**** BALANCE
49.03
Credit Purchase 02/24/23 21:02
CARD # ************000
REF: 750213413990 AUTH: 00548648
PAYMENT AMOUNT
49.03
AL VISA CREDIT
AID A0000000031010
TVR 8000008000
TSI 6800
Visa
49.03
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
10
02/24/23 21:02 3129 51 131
8851
2
YOUR CASHIER TODAY WAS SELF
YO


INPUT:
Extract 23-digit receipt_number, total_price, and total_tax from 'Albertsons
Store 4187 Dir Raymond Maddox
Main: (972) 285-0226 Rx: (972) 285-1352
19
1500 S Beltline Rd
MESQUITE TX 75149
3759 f*
GROCERY
SMARTFOOD POPCORN
4.99 S
OCN SPRY CNRBRY
3.49 B
Regular Price
3.79
You Save
0.30-
CRAN-GRAPE
3.49 B
Regular Price
3.79
You Save
0.30-
GATORADE G ZERO
7.29 B
TAX
1.18
**** BALANCE
20.44
Credit Purchase 02/26/23 14:01
CARD # ************000
REF: 280113447460 AUTH: 00131450
PAYMENT AMOUNT
20.44
AL VISA CREDIT
AID A0000000031010
TVR 8000008000
TSI 6800
Visa
20.44
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
4
02/26/23 14:01 4187 2 63 6330
YOUR CASHIER TODAY WAS Deanna
YOUR SAVINGS
Store Savings
0.60
Total
0.60
00418700200632302261401
Thank you for shopping Albertsons!
For ALBERTSONS FOR U questions call
877-276-9637 or Albertsons.com/foru'?

GENERATED:
receipt_number: 00418700200632302261401; total_price: 20.44; total_tax: 1.18

EXPECTED:
receipt_number: 00418700200632302261401; total_p

In [7]:
def str_to_json(js, do_print_exception=False):
    """Parse a ``key: value; key: value`` string into a dict.

    The text is lower-cased, split on ``;`` into segments, and each segment is
    split on its first ``:`` into a key and a value (both whitespace-trimmed).
    Values of ``total_price`` and ``total_tax`` are converted to ``float``;
    every other value stays a string.

    Args:
        js: The text to parse.
        do_print_exception: When true, print the whole input each time one of
            its segments cannot be parsed.

    Returns:
        Dict of the segments that parsed successfully; malformed segments are
        skipped.
    """
    res = {}
    for ss in js.lower().split(";"):
        # A trailing or doubled ";" yields a blank segment, which is not an error.
        if not ss.strip():
            continue
        try:
            # Split on the first colon only so values may themselves contain ":".
            k, v = ss.split(":", 1)
            k, v = k.strip(), v.strip()
            res[k] = float(v) if k in ("total_price", "total_tax") else v
        except ValueError:
            # Missing ":" or a non-numeric amount; other exceptions propagate.
            if do_print_exception:
                print(js)
    return res
str_to_json("receipt_number: 00400505302182302271921; total_price: 10.98; total_tax: 0.0")

{'receipt_number': '00400505302182302271921',
 'total_price': 10.98,
 'total_tax': 0.0}

# Compute accuracy on the left-out test dataset

In [8]:
# Per-field count of test records whose extracted value equals the reference.
statistics = {"receipt_number": 0, "total_price": 0, "total_tax": 0}
# generate_transcipt_response samples by default, so fix the RNG seed to make
# the reported accuracy reproducible from run to run.
torch.manual_seed(0)
for d in test_data['test']:
    input_text = d["input_text"]
    # True -> print the raw generation whenever one of its segments fails to parse.
    generated = str_to_json(generate_transcipt_response(input_text, model, tokenizer), True)
    expected = str_to_json(d['output_text'])
    for k, v in expected.items():
        # Only score the tracked fields; an unexpected key in the reference
        # must not raise KeyError and abort the whole evaluation.
        if k in statistics and k in generated and v == generated[k]:
            statistics[k] += 1

In [9]:
# Report, for each field, the number of exact matches and the resulting accuracy.
num_examples = len(test_data['test'])
for field, num_correct in statistics.items():
    accuracy = num_correct/num_examples
    print(f"=========Correctly extracted {num_correct} {field} among {num_examples}, accuracy is {accuracy} ")



In [11]:
# Debug aid: regenerate the answer for one specific receipt and print it next
# to the raw record. The accuracy counters in `statistics` are deliberately
# left untouched so the earlier results stay available.
for d in test_data['test']:
    expected = str_to_json(d['output_text'])
    # .get() so a reference without a receipt_number is skipped, not a KeyError.
    if expected.get("receipt_number") == "00175817601672302272019":
        print(generate_transcipt_response(d["input_text"], model, tokenizer), d)
    

receipt_number: 00175817601672302272019; total_price: 11.47; total_tax: 0.0 {'input_text': "Extract 23-digit receipt_number, total_price, and total_tax from 'an\nSAFEWAYS\nStore 1758 Dir Dee Frances\nMain (410) 256-3021 Rx (410) 256-6423\n9645 Belair Road\nPerry Hall MD 21128\nare\nREFRIG/FROZEN\n2.99 S\nGOLD PEAK\nRegular Price\n3.19\nMember Savings\n0.20-\nBAKED GOODS\n3.99 S\nBREAD\nPRODUCE\n2n\nSIG FARMS HER SL\n1.99 S\n1 QTY\nMED HASS AVOCADOS\n2,50 S\nROOS\n0.00\nTAX\n**** BALANCE\n11.47\nCredit Purchase 02/27/23 20:19\nLOB VDAEBLIZIN cvr\nCARD # ************000\nREF: 431911468260 AUTH: 00783373\nPAYMENT AMOUNT\n11.47\nAL VISA CREDIT\nAID A0000000031010\nTVR 800000800048\n9\nTSI 6800\nVisa 130 ase 11.47\nCHANGE\n0.00\nTOTAL NUMBER OF ITEMS SOLD =\n4\n02/27/23 20:19 1758 176 167\n8822\nYOUR CASHIER TODAY WAS SELF\nYOUR SAVINGS\nMember Savings\n0.20\nTotal\n0.20\nHOW WAS YOUR SHOPPING EXPERIENCE?\nWE VALUE YOUR FEEDBACK!\nSCAN THE QR CODE BELOW FOR A TWO MINUTE SURVEY\na bore\nGATY