In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

import itertools
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_colwidth = 10000

In [2]:
# read provided dataset
df = pd.read_csv("receipt_data.csv")
df.head(1)

Unnamed: 0,comments,currency,lastUpdated,purchaseTime,status,uploadedTime,totalConfidence,lastOpenAt,lastVerifiedAt,taxTotal,total,deletedAt,retailerName,retailerCountry,retailerVatNumber,rawData,receiptImage
0,,GBP,2019-06-15 17:00:57,2019-06-14 18:20:00,REVIEWED,2019-06-14 22:11:40,0.7,,2019-06-15 17:00:57,,,,Selfridges,GB,,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Selfridges"",""validatedEstablishment"":false,""date"":""2019-06-14 18:20:00"",""total"":30.000,""url"":""www.selfridges.com"",""phoneNumber"":""8569 4004156"",""paymentMethod"":""VISA"",""address"":""400 Oxford Street London W1A 1AB"",""validatedTotal"":false,""subTotal"":30.000,""validatedSubTotal"":true,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.99,""totalConfidence"":0.7,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""FULTON BROLLYMA41568569 400415685692"",""unit"":"""",""price"":0.000,""descClean"":""You 193 were served by : Ines FULTON BROLLYMA 400415685692"",""lineTotal"":18.000,""productCode"":""41568569""},{""qty"":0,""desc"":""JJ DOT HYDR - TRA 82401412"",""unit"":"""",""price"":0.000,""descClean"":""JJ DOT HYDR - TRA"",""lineTotal"":12.000,""productCode"":""82401412""}],""summaryItems"":[{""qty"":2,""desc"":""Total 2 Items"",""unit"":"""",""price"":0.000,""descClean"":""Total Items"",""lineTotal"":30.000,""productCode"":""""},{""qty"":0,""desc"":""Visa"",""unit"":"""",""price"":0.000,""descClean"":""Visa"",""lineTotal"":30.000,""productCode"":""""},{""qty"":14,""desc"":""009999 Date 14 / 06 / 19"",""unit"":"""",""price"":0.000,""descClean"":""Date / 06 / 19"",""lineTotal"":18.200,""productCode"":""009999""}]}}",https://s3-eu-west-1.amazonaws.com/wevat-receipts/5955cc32-894b-4dbd-8c1f-f54fbd95d8e8


In [3]:
# filter retailerName to be used as label and rawData as features
df = df.filter(['retailerName', 'rawData'])
df.head(5)

Unnamed: 0,retailerName,rawData
0,Selfridges,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Selfridges"",""validatedEstablishment"":false,""date"":""2019-06-14 18:20:00"",""total"":30.000,""url"":""www.selfridges.com"",""phoneNumber"":""8569 4004156"",""paymentMethod"":""VISA"",""address"":""400 Oxford Street London W1A 1AB"",""validatedTotal"":false,""subTotal"":30.000,""validatedSubTotal"":true,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.99,""totalConfidence"":0.7,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""FULTON BROLLYMA41568569 400415685692"",""unit"":"""",""price"":0.000,""descClean"":""You 193 were served by : Ines FULTON BROLLYMA 400415685692"",""lineTotal"":18.000,""productCode"":""41568569""},{""qty"":0,""desc"":""JJ DOT HYDR - TRA 82401412"",""unit"":"""",""price"":0.000,""descClean"":""JJ DOT HYDR - TRA"",""lineTotal"":12.000,""productCode"":""82401412""}],""summaryItems"":[{""qty"":2,""desc"":""Total 2 Items"",""unit"":"""",""price"":0.000,""descClean"":""Total Items"",""lineTotal"":30.000,""productCode"":""""},{""qty"":0,""desc"":""Visa"",""unit"":"""",""price"":0.000,""descClean"":""Visa"",""lineTotal"":30.000,""productCode"":""""},{""qty"":14,""desc"":""009999 Date 14 / 06 / 19"",""unit"":"""",""price"":0.000,""descClean"":""Date / 06 / 19"",""lineTotal"":18.200,""productCode"":""009999""}]}}"
1,Harrods,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Harrods"",""validatedEstablishment"":false,""date"":""2019-06-19 13:24:00"",""total"":86.000,""url"":""harrods.com"",""phoneNumber"":""020 7730 1234"",""paymentMethod"":"""",""address"":""LONDON S1 ot Road"",""validatedTotal"":false,""subTotal"":86.000,""validatedSubTotal"":true,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.6,""totalConfidence"":0.0,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""Rewards 0005708615 Saving FRAGRANCE"",""unit"":"""",""price"":0.000,""descClean"":""Rewards Saving FRAGRANCE f8.60"",""lineTotal"":86.000,""productCode"":""0005708615""}],""summaryItems"":[{""qty"":0,""desc"":""Sub Total"",""unit"":"""",""price"":0.000,""descClean"":""Sub Total"",""lineTotal"":86.000,""productCode"":""""},{""qty"":1,""desc"":""TOTAL ( 1 item )"",""unit"":"""",""price"":0.000,""descClean"":""TOTAL ( item )"",""lineTotal"":77.400,""productCode"":""""},{""qty"":0,""desc"":""Chinese Mobile Tendered"",""unit"":"""",""price"":0.000,""descClean"":""Chinese Mobile Tendered"",""lineTotal"":77.400,""productCode"":""""},{""qty"":0,""desc"":""Rewards Points balance"",""unit"":"""",""price"":0.000,""descClean"":""Rewards Points balance"",""lineTotal"":194.800,""productCode"":""""}]}}"
2,Boots,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Boots"",""validatedEstablishment"":false,""date"":""2019-06-18 18:24:00"",""total"":20.460,""url"":"""",""phoneNumber"":""(0115) 9410199"",""paymentMethod"":"""",""address"":"""",""validatedTotal"":true,""subTotal"":0.000,""validatedSubTotal"":false,""cash"":""50.500"",""change"":""30.040"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.0,""totalConfidence"":0.99,""cashConfidence"":0.99,""changeConfidence"":0.99,""roundingConfidence"":0.0,""otherData"":[""f"",""Boots UK Limited"",""NOTTINGHAM - 6"",""18/06/2019"",""(0115) 9410199"",""18:24"",""Served by: LAURIEN GOOD"",""1404"",""619 7646 0006 144 *6330************686"",""""],""lineItems"":[{""qty"":28,""desc"":""Nno Nno Int Int Wht Wht Stp Strong 28 2 5.00 NOW"",""unit"":"""",""price"":0.000,""descClean"":""Nno Nno Int Int Wht Wht Stp Strong 2 5.00 NOW STUDENT DISCOUNT"",""lineTotal"":18.750,""productCode"":""""}],""summaryItems"":[{""qty"":0,""desc"":""TOTAL TO PAY"",""unit"":"""",""price"":0.000,""descClean"":""TOTAL TO PAY"",""lineTotal"":20.460,""productCode"":""""},{""qty"":0,""desc"":""CASH"",""unit"":"""",""price"":0.000,""descClean"":""CASH"",""lineTotal"":50.500,""productCode"":""""},{""qty"":0,""desc"":""CHANGE"",""unit"":"""",""price"":0.000,""descClean"":""CHANGE"",""lineTotal"":30.040,""productCode"":""""}]}}"
3,Selfridges,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Selfridges"",""validatedEstablishment"":false,""date"":""2019-06-01 20:48:00"",""total"":370.000,""url"":""www.selfridges.com"",""phoneNumber"":""+44 (0) 207 160 62"",""paymentMethod"":""ALIPAY"",""address"":""400 Oxford Street London W1A 1AB"",""validatedTotal"":false,""subTotal"":0.000,""validatedSubTotal"":false,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.0,""totalConfidence"":0.0,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[""SELFRIDGES&Cº"",""400 Oxford Street London W1A 1AB"",""Freephone 0800 123 400"",""( From overseas +44 (0) 207 160 6222)"",""V.A.T NO. GB 705 3259 52"",""www.selfridges.com"",""You were served by: Mikel"",""Chanel RTW & Accs"",""1 @ f370.00"",""CAR"",""Notified terms and conditions will apply"",""For details visit selfridges.com/london"",""****************************** ************"",""We have a passion for what we do and"",""want you to have an amazing experience"",""every time you come to Selfridges."",""*****************************************"",""Loteborhoomorpbtitles 298 espela"",""Shop online at www.selfridges.com"",""You are welcome to exchange or refund"",""your purchase by Sat 29 Jun 2019"",""Please note Sale items"",""can only be exchanged"",""-"",""-"",""Term 0250 Opr 9999979506 Trans 3919"",""Store 009999 Date 01/06/19 20:48"",""00999902503919190601204713"",""mellogu"",""erot new uovi"",""""],""lineItems"":[{""qty"":1,""desc"":""1 @ f370.00"",""unit"":"""",""price"":0.000,""descClean"":""f370.00"",""lineTotal"":370.000,""productCode"":""""},{""qty"":1,""desc"":""Alipay READVICE"",""unit"":"""",""price"":0.000,""descClean"":""- - Total Item READVICE f370.00 Alipay READVICE"",""lineTotal"":370.000,""productCode"":""""}],""summaryItems"":[]}}"
4,John Lewis,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""John Lewis"",""validatedEstablishment"":false,""date"":""2019-06-16 14:02:00"",""total"":26.940,""url"":""www.johnlewis.com/customer"",""phoneNumber"":""0114 276 8511"",""paymentMethod"":"""",""address"":"", SS1 2HB"",""validatedTotal"":false,""subTotal"":26.940,""validatedSubTotal"":true,""cash"":""3.060"",""change"":""0.000"",""tax"":4.490,""discount"":0.000,""subTotalConfidence"":0.0,""totalConfidence"":0.6,""cashConfidence"":0.6,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""S 73220442 Construction Toy"",""unit"":"""",""price"":0.000,""descClean"":""S Construction Toy"",""lineTotal"":8.970,""productCode"":""73220442""},{""qty"":2,""desc"":""S 73220446 Construction Toy"",""unit"":"""",""price"":0.000,""descClean"":""S Construction Toy No.items"",""lineTotal"":17.970,""productCode"":""73220446""}],""summaryItems"":[{""qty"":0,""desc"":""Total"",""unit"":"""",""price"":0.000,""descClean"":""Total"",""lineTotal"":26.940,""productCode"":""""},{""qty"":0,""desc"":""Cash"",""unit"":"""",""price"":0.000,""descClean"":""Cash"",""lineTotal"":30.000,""productCode"":""""},{""qty"":0,""desc"":""Cash"",""unit"":"""",""price"":0.000,""descClean"":""Cash"",""lineTotal"":3.060,""productCode"":""""},{""qty"":5,""desc"":""S 20.0% 26.94 4.49"",""unit"":"""",""price"":4.490,""descClean"":""S 20.0% 26.94"",""lineTotal"":22.450,""productCode"":""""}]}}"


In [4]:

# we can see that some Boot tags are duplicated
df['retailerName'].value_counts()

Selfridges           1961
Boots                1601
Harrods              1188
Holland & Barrett     582
John Lewis            458
TK Maxx               433
Debenhams             394
Boots UK Limited      383
Name: retailerName, dtype: int64

In [5]:
# clean some data with bad labels
df.retailerName = df.retailerName.str.replace('Boots.*', 'Boots', regex=True)

# Boot retailer tags must be now deduplicated
# The counts should be closer for the model be more balanced
df['retailerName'].value_counts()

Boots                1984
Selfridges           1961
Harrods              1188
Holland & Barrett     582
John Lewis            458
TK Maxx               433
Debenhams             394
Name: retailerName, dtype: int64

In [6]:
ignore = df[df.rawData.apply(lambda x: not isinstance(x, str))]
ignore.head(5)

Unnamed: 0,retailerName,rawData
685,Boots,
1326,Boots,
1537,John Lewis,
1872,Holland & Barrett,
3050,Boots,


In [7]:
# drop rows with empty or not string rawData 
dfc = df[df.rawData.apply(lambda x: isinstance(x, str))]
dfc.head(5)

Unnamed: 0,retailerName,rawData
0,Selfridges,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Selfridges"",""validatedEstablishment"":false,""date"":""2019-06-14 18:20:00"",""total"":30.000,""url"":""www.selfridges.com"",""phoneNumber"":""8569 4004156"",""paymentMethod"":""VISA"",""address"":""400 Oxford Street London W1A 1AB"",""validatedTotal"":false,""subTotal"":30.000,""validatedSubTotal"":true,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.99,""totalConfidence"":0.7,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""FULTON BROLLYMA41568569 400415685692"",""unit"":"""",""price"":0.000,""descClean"":""You 193 were served by : Ines FULTON BROLLYMA 400415685692"",""lineTotal"":18.000,""productCode"":""41568569""},{""qty"":0,""desc"":""JJ DOT HYDR - TRA 82401412"",""unit"":"""",""price"":0.000,""descClean"":""JJ DOT HYDR - TRA"",""lineTotal"":12.000,""productCode"":""82401412""}],""summaryItems"":[{""qty"":2,""desc"":""Total 2 Items"",""unit"":"""",""price"":0.000,""descClean"":""Total Items"",""lineTotal"":30.000,""productCode"":""""},{""qty"":0,""desc"":""Visa"",""unit"":"""",""price"":0.000,""descClean"":""Visa"",""lineTotal"":30.000,""productCode"":""""},{""qty"":14,""desc"":""009999 Date 14 / 06 / 19"",""unit"":"""",""price"":0.000,""descClean"":""Date / 06 / 19"",""lineTotal"":18.200,""productCode"":""009999""}]}}"
1,Harrods,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Harrods"",""validatedEstablishment"":false,""date"":""2019-06-19 13:24:00"",""total"":86.000,""url"":""harrods.com"",""phoneNumber"":""020 7730 1234"",""paymentMethod"":"""",""address"":""LONDON S1 ot Road"",""validatedTotal"":false,""subTotal"":86.000,""validatedSubTotal"":true,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.6,""totalConfidence"":0.0,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""Rewards 0005708615 Saving FRAGRANCE"",""unit"":"""",""price"":0.000,""descClean"":""Rewards Saving FRAGRANCE f8.60"",""lineTotal"":86.000,""productCode"":""0005708615""}],""summaryItems"":[{""qty"":0,""desc"":""Sub Total"",""unit"":"""",""price"":0.000,""descClean"":""Sub Total"",""lineTotal"":86.000,""productCode"":""""},{""qty"":1,""desc"":""TOTAL ( 1 item )"",""unit"":"""",""price"":0.000,""descClean"":""TOTAL ( item )"",""lineTotal"":77.400,""productCode"":""""},{""qty"":0,""desc"":""Chinese Mobile Tendered"",""unit"":"""",""price"":0.000,""descClean"":""Chinese Mobile Tendered"",""lineTotal"":77.400,""productCode"":""""},{""qty"":0,""desc"":""Rewards Points balance"",""unit"":"""",""price"":0.000,""descClean"":""Rewards Points balance"",""lineTotal"":194.800,""productCode"":""""}]}}"
2,Boots,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Boots"",""validatedEstablishment"":false,""date"":""2019-06-18 18:24:00"",""total"":20.460,""url"":"""",""phoneNumber"":""(0115) 9410199"",""paymentMethod"":"""",""address"":"""",""validatedTotal"":true,""subTotal"":0.000,""validatedSubTotal"":false,""cash"":""50.500"",""change"":""30.040"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.0,""totalConfidence"":0.99,""cashConfidence"":0.99,""changeConfidence"":0.99,""roundingConfidence"":0.0,""otherData"":[""f"",""Boots UK Limited"",""NOTTINGHAM - 6"",""18/06/2019"",""(0115) 9410199"",""18:24"",""Served by: LAURIEN GOOD"",""1404"",""619 7646 0006 144 *6330************686"",""""],""lineItems"":[{""qty"":28,""desc"":""Nno Nno Int Int Wht Wht Stp Strong 28 2 5.00 NOW"",""unit"":"""",""price"":0.000,""descClean"":""Nno Nno Int Int Wht Wht Stp Strong 2 5.00 NOW STUDENT DISCOUNT"",""lineTotal"":18.750,""productCode"":""""}],""summaryItems"":[{""qty"":0,""desc"":""TOTAL TO PAY"",""unit"":"""",""price"":0.000,""descClean"":""TOTAL TO PAY"",""lineTotal"":20.460,""productCode"":""""},{""qty"":0,""desc"":""CASH"",""unit"":"""",""price"":0.000,""descClean"":""CASH"",""lineTotal"":50.500,""productCode"":""""},{""qty"":0,""desc"":""CHANGE"",""unit"":"""",""price"":0.000,""descClean"":""CHANGE"",""lineTotal"":30.040,""productCode"":""""}]}}"
3,Selfridges,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""Selfridges"",""validatedEstablishment"":false,""date"":""2019-06-01 20:48:00"",""total"":370.000,""url"":""www.selfridges.com"",""phoneNumber"":""+44 (0) 207 160 62"",""paymentMethod"":""ALIPAY"",""address"":""400 Oxford Street London W1A 1AB"",""validatedTotal"":false,""subTotal"":0.000,""validatedSubTotal"":false,""cash"":""0.000"",""change"":""0.000"",""tax"":0.000,""discount"":0.000,""subTotalConfidence"":0.0,""totalConfidence"":0.0,""cashConfidence"":0.0,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[""SELFRIDGES&Cº"",""400 Oxford Street London W1A 1AB"",""Freephone 0800 123 400"",""( From overseas +44 (0) 207 160 6222)"",""V.A.T NO. GB 705 3259 52"",""www.selfridges.com"",""You were served by: Mikel"",""Chanel RTW & Accs"",""1 @ f370.00"",""CAR"",""Notified terms and conditions will apply"",""For details visit selfridges.com/london"",""****************************** ************"",""We have a passion for what we do and"",""want you to have an amazing experience"",""every time you come to Selfridges."",""*****************************************"",""Loteborhoomorpbtitles 298 espela"",""Shop online at www.selfridges.com"",""You are welcome to exchange or refund"",""your purchase by Sat 29 Jun 2019"",""Please note Sale items"",""can only be exchanged"",""-"",""-"",""Term 0250 Opr 9999979506 Trans 3919"",""Store 009999 Date 01/06/19 20:48"",""00999902503919190601204713"",""mellogu"",""erot new uovi"",""""],""lineItems"":[{""qty"":1,""desc"":""1 @ f370.00"",""unit"":"""",""price"":0.000,""descClean"":""f370.00"",""lineTotal"":370.000,""productCode"":""""},{""qty"":1,""desc"":""Alipay READVICE"",""unit"":"""",""price"":0.000,""descClean"":""- - Total Item READVICE f370.00 Alipay READVICE"",""lineTotal"":370.000,""productCode"":""""}],""summaryItems"":[]}}"
4,John Lewis,"{""message"":""SUCCESS: Result available"",""status"":""done"",""status_code"":3,""token"":null,""success"":true,""code"":202,""duplicate"":false,""duplicateToken"":null,""result"":{""establishment"":""John Lewis"",""validatedEstablishment"":false,""date"":""2019-06-16 14:02:00"",""total"":26.940,""url"":""www.johnlewis.com/customer"",""phoneNumber"":""0114 276 8511"",""paymentMethod"":"""",""address"":"", SS1 2HB"",""validatedTotal"":false,""subTotal"":26.940,""validatedSubTotal"":true,""cash"":""3.060"",""change"":""0.000"",""tax"":4.490,""discount"":0.000,""subTotalConfidence"":0.0,""totalConfidence"":0.6,""cashConfidence"":0.6,""changeConfidence"":0.0,""roundingConfidence"":0.0,""otherData"":[],""lineItems"":[{""qty"":0,""desc"":""S 73220442 Construction Toy"",""unit"":"""",""price"":0.000,""descClean"":""S Construction Toy"",""lineTotal"":8.970,""productCode"":""73220442""},{""qty"":2,""desc"":""S 73220446 Construction Toy"",""unit"":"""",""price"":0.000,""descClean"":""S Construction Toy No.items"",""lineTotal"":17.970,""productCode"":""73220446""}],""summaryItems"":[{""qty"":0,""desc"":""Total"",""unit"":"""",""price"":0.000,""descClean"":""Total"",""lineTotal"":26.940,""productCode"":""""},{""qty"":0,""desc"":""Cash"",""unit"":"""",""price"":0.000,""descClean"":""Cash"",""lineTotal"":30.000,""productCode"":""""},{""qty"":0,""desc"":""Cash"",""unit"":"""",""price"":0.000,""descClean"":""Cash"",""lineTotal"":3.060,""productCode"":""""},{""qty"":5,""desc"":""S 20.0% 26.94 4.49"",""unit"":"""",""price"":4.490,""descClean"":""S 20.0% 26.94"",""lineTotal"":22.450,""productCode"":""""}]}}"


In [8]:
rawData = dfc.rawData
parsedRawData = rawData.apply(json.loads).apply(pd.Series)
parsedRawData.head(2)

Unnamed: 0,message,status,status_code,token,success,code,duplicate,duplicateToken,result
0,SUCCESS: Result available,done,3,,True,202,False,,"{'establishment': 'Selfridges', 'validatedEstablishment': False, 'date': '2019-06-14 18:20:00', 'total': 30.0, 'url': 'www.selfridges.com', 'phoneNumber': '8569 4004156', 'paymentMethod': 'VISA', 'address': '400 Oxford Street London W1A 1AB', 'validatedTotal': False, 'subTotal': 30.0, 'validatedSubTotal': True, 'cash': '0.000', 'change': '0.000', 'tax': 0.0, 'discount': 0.0, 'subTotalConfidence': 0.99, 'totalConfidence': 0.7, 'cashConfidence': 0.0, 'changeConfidence': 0.0, 'roundingConfidence': 0.0, 'otherData': [], 'lineItems': [{'qty': 0, 'desc': 'FULTON BROLLYMA41568569 400415685692', 'unit': '', 'price': 0.0, 'descClean': 'You 193 were served by : Ines FULTON BROLLYMA 400415685692', 'lineTotal': 18.0, 'productCode': '41568569'}, {'qty': 0, 'desc': 'JJ DOT HYDR - TRA 82401412', 'unit': '', 'price': 0.0, 'descClean': 'JJ DOT HYDR - TRA', 'lineTotal': 12.0, 'productCode': '82401412'}], 'summaryItems': [{'qty': 2, 'desc': 'Total 2 Items', 'unit': '', 'price': 0.0, 'descClean': 'Total Items', 'lineTotal': 30.0, 'productCode': ''}, {'qty': 0, 'desc': 'Visa', 'unit': '', 'price': 0.0, 'descClean': 'Visa', 'lineTotal': 30.0, 'productCode': ''}, {'qty': 14, 'desc': '009999 Date 14 / 06 / 19', 'unit': '', 'price': 0.0, 'descClean': 'Date / 06 / 19', 'lineTotal': 18.2, 'productCode': '009999'}]}"
1,SUCCESS: Result available,done,3,,True,202,False,,"{'establishment': 'Harrods', 'validatedEstablishment': False, 'date': '2019-06-19 13:24:00', 'total': 86.0, 'url': 'harrods.com', 'phoneNumber': '020 7730 1234', 'paymentMethod': '', 'address': 'LONDON S1 ot Road', 'validatedTotal': False, 'subTotal': 86.0, 'validatedSubTotal': True, 'cash': '0.000', 'change': '0.000', 'tax': 0.0, 'discount': 0.0, 'subTotalConfidence': 0.6, 'totalConfidence': 0.0, 'cashConfidence': 0.0, 'changeConfidence': 0.0, 'roundingConfidence': 0.0, 'otherData': [], 'lineItems': [{'qty': 0, 'desc': 'Rewards 0005708615 Saving FRAGRANCE', 'unit': '', 'price': 0.0, 'descClean': 'Rewards Saving FRAGRANCE f8.60', 'lineTotal': 86.0, 'productCode': '0005708615'}], 'summaryItems': [{'qty': 0, 'desc': 'Sub Total', 'unit': '', 'price': 0.0, 'descClean': 'Sub Total', 'lineTotal': 86.0, 'productCode': ''}, {'qty': 1, 'desc': 'TOTAL ( 1 item )', 'unit': '', 'price': 0.0, 'descClean': 'TOTAL ( item )', 'lineTotal': 77.4, 'productCode': ''}, {'qty': 0, 'desc': 'Chinese Mobile Tendered', 'unit': '', 'price': 0.0, 'descClean': 'Chinese Mobile Tendered', 'lineTotal': 77.4, 'productCode': ''}, {'qty': 0, 'desc': 'Rewards Points balance', 'unit': '', 'price': 0.0, 'descClean': 'Rewards Points balance', 'lineTotal': 194.8, 'productCode': ''}]}"


In [9]:
joined = df.join(parsedRawData)
data = joined.filter(['retailerName', 'result'])

In [10]:
# parse json result field and add columns to dataframe
parsedResult = data.result.apply(pd.Series)
joined = df.join(parsedResult)

# filter so that we get only features and labels to train the model
data = joined.filter(['establishment', 'retailerName'])
data.head(5)

Unnamed: 0,establishment,retailerName
0,Selfridges,Selfridges
1,Harrods,Harrods
2,Boots,Boots
3,Selfridges,Selfridges
4,John Lewis,John Lewis


In [11]:
ignore = data[data.establishment.apply(lambda x: not isinstance(x, str))]
ignore.head(2)

Unnamed: 0,establishment,retailerName
685,,Boots
1326,,Boots


In [12]:
ignore = data[data.retailerName.apply(lambda x: not isinstance(x, str))]
ignore.head(2)

Unnamed: 0,establishment,retailerName


In [13]:
data = data[data.establishment.apply(lambda x: isinstance(x, str))]
# data = data[data.retailerName.apply(lambda x: isinstance(x, str))]
data.head(10)

Unnamed: 0,establishment,retailerName
0,Selfridges,Selfridges
1,Harrods,Harrods
2,Boots,Boots
3,Selfridges,Selfridges
4,John Lewis,John Lewis
5,Selfridges,Selfridges
6,HOLLAND & BARRETT,Holland & Barrett
7,Selfridges,Selfridges
8,Selfridges,Selfridges
9,John Lewis,John Lewis


In [14]:
# get train and test dataset
train_size = int(len(data) * .8)
train_ocr = data['establishment'][:train_size]
train_tags = data['retailerName'][:train_size]
test_ocr = data['establishment'][train_size:]
test_tags = data['retailerName'][train_size:]

test_tags.size, train_size, len(data)

(1397, 5585, 6982)

In [15]:
train_ocr.head(10)

0           Selfridges
1              Harrods
2                Boots
3           Selfridges
4           John Lewis
5           Selfridges
6    HOLLAND & BARRETT
7           Selfridges
8           Selfridges
9           John Lewis
Name: establishment, dtype: object

In [16]:
# use bag of words model
max_words = 1000
tokenize = Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_ocr)

In [17]:
x_train = tokenize.texts_to_matrix(train_ocr)
x_test = tokenize.texts_to_matrix(test_ocr)

# x_train[0]

In [18]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

y_test

array([1, 6, 5, ..., 2, 5, 5])

In [19]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)
y_test_cat

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [20]:
# Inspect the dimensions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train_cat shape:', y_train_cat.shape)
print('y_test_cat shape:', y_test_cat.shape)

x_train shape: (5585, 1000)
x_test shape: (1397, 1000)
y_train_cat shape: (5585, 7)
y_test_cat shape: (1397, 7)


In [21]:
batch_size = 32
epochs = 4

In [22]:
# Build the model
model = keras.Sequential(
    [
        layers.Dense(2, input_shape=(max_words,),activation="relu", name="layer1"),
        layers.Dense(3, activation="relu", name="layer2"),
        layers.Dense(4, name="layer3"),
    ]
)
x = tf.ones((0, 1000))
y = model(x)

# model = keras.Sequential()
# model.add(layers.Dense(512, input_shape=(max_words,)))
# model.add(Activation('relu'))
# model.add(Dropout(0.5))
# model.add(Dense(num_classes))
# model.add(Activation('softmax'))


In [23]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [24]:
# train the model
history = model.fit(x_train, y_train_cat,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/4


ValueError: in user code:

    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step  **
        loss = self.compiled_loss(
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__
        losses = self.call(y_true, y_pred)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 7) and (None, 4) are incompatible


In [None]:
# evaluate the accuracy
score = model.evaluate(x_test, y_test_cat, batch_size=batch_size, verbose=1)
print('Test accuracy: {0:.2f}%'.format(score[1] * 100))

In [25]:
text_labels = encoder.classes_ 
for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_ocr.iloc[i][:50], "...")
    print('correct:' + test_tags.iloc[i])
    print("predicted: " + predicted_label + "\n")

Debelum ...
correct:Debenhams
predicted: Boots

TK Maxx ...
correct:TK Maxx
predicted: Boots

Selfridges ...
correct:Selfridges
predicted: Boots

JOHN LEWIS ...
correct:John Lewis
predicted: Boots

Boots ...
correct:Boots
predicted: Holland & Barrett

Boots ...
correct:Boots
predicted: Holland & Barrett

Harrods ...
correct:Harrods
predicted: Holland & Barrett

Selfridges ...
correct:Selfridges
predicted: Boots

Debenhams ...
correct:Debenhams
predicted: Harrods

HOLLAND & BARRETT ...
correct:Holland & Barrett
predicted: Holland & Barrett



In [26]:
encoder.classes_


array(['Boots', 'Debenhams', 'Harrods', 'Holland & Barrett', 'John Lewis',
       'Selfridges', 'TK Maxx'], dtype=object)

In [27]:
encoder.classes_
matrix = tokenize.texts_to_matrix(['TK UK'])
prediction = model.predict(np.array([matrix[0]]))
predicted_label = text_labels[np.argmax(prediction)]
print("predict:{0}\nindex:{1} \nlabel:{2}".format(prediction, np.argmax(prediction), predicted_label))

predict:[[-0.01592306 -0.05342743  0.03455738  0.06057582]]
index:3 
label:Holland & Barrett


In [28]:
y_softmax = model.predict(x_test)

y_test_1d = []
y_pred_1d = []

for i in range(len(y_test_cat)):
    probs = y_test_cat[i]
    index_arr = np.nonzero(probs)
    one_hot_index = index_arr[0].item(0)
    y_test_1d.append(one_hot_index)

for i in range(0, len(y_softmax)):
    probs = y_softmax[i]
    predicted_index = np.argmax(probs)

In [29]:
# This utility function is from the sklearn docs: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Correct label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [30]:
cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
plt.figure(figsize=(24,20))
plot_confusion_matrix(cnf_matrix, classes=text_labels, title="Confusion matrix")
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [1397, 0]