In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

import itertools
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Let pandas print very long cell values (up to 10000 characters) without truncation.
pd.set_option('display.max_colwidth', 10000)

In [None]:
# read provided dataset (the path is relative to the notebook's working directory)
df = pd.read_csv("../receipt_data.csv")
# note: .size is the total number of cells (rows x columns), not the row count
df.size

In [None]:
# look at a single row to see which columns the CSV provides
df.head(1)

In [None]:
# Keep only the two columns used downstream:
# retailerName is the label, rawData carries the features.
df = df.filter(items=['retailerName', 'rawData'])
df.head(5)

In [None]:
# class distribution per retailer; the Boots retailer appears under several
# label variants, which are merged in the cleanup step that follows
df['retailerName'].value_counts()

In [None]:
# Normalize noisy labels: everything from "Boots" to the end of the label
# collapses to the plain "Boots" tag.
boots_pattern = 'Boots.*'
df['retailerName'] = df['retailerName'].str.replace(boots_pattern, 'Boots', regex=True)

# The Boots variants should now be merged into a single class.
# Re-check the counts: the closer they are, the more balanced the model.
df['retailerName'].value_counts()

In [None]:
# Inspect the rows whose rawData is not a string; these cannot be JSON-decoded.
rawdata_is_text = df['rawData'].apply(lambda value: isinstance(value, str))
ignore = df[~rawdata_is_text]
ignore.head(5)

In [None]:
# Keep only the rows whose rawData is an actual string
# (drops empty / non-string entries).
has_text = df['rawData'].apply(lambda value: isinstance(value, str))
dfc = df.loc[has_text]
dfc.head(5)

In [None]:
# Decode every rawData JSON string, then expand the decoded values into
# DataFrame columns (one column per top-level key, assuming JSON objects).
rawData = dfc['rawData']
decoded = rawData.apply(json.loads)
parsedRawData = decoded.apply(pd.Series)
parsedRawData.head(2)

In [None]:
# attach the parsed JSON columns to the original frame, aligned on the row index
joined = df.join(parsedRawData)
# keep the label plus the 'result' column, which is expanded further below
data = joined.filter(['retailerName', 'result'])

In [None]:
# Expand the 'result' values into their own columns and attach them to df.
parsedResult = data['result'].apply(pd.Series)
joined = df.join(parsedResult)

# Retain just the model input (establishment) and the label (retailerName).
training_columns = ['establishment', 'retailerName']
data = joined.filter(items=training_columns)
data.head(5)

In [None]:
# Rows without usable establishment text (the value is not a string).
establishment_is_text = data['establishment'].apply(lambda value: isinstance(value, str))
ignore = data[~establishment_is_text]
ignore.head(2)

In [None]:
# Rows without a usable label (retailerName is not a string).
label_is_text = data['retailerName'].apply(lambda value: isinstance(value, str))
ignore = data[~label_is_text]
ignore.head(2)

In [None]:
# Keep only rows where BOTH the feature text and the label are real strings.
# A non-string retailerName (e.g. NaN) would otherwise reach the label encoder
# and the "correct:" string concatenation in the prediction preview and fail there.
has_text = data['establishment'].apply(lambda value: isinstance(value, str))
has_label = data['retailerName'].apply(lambda value: isinstance(value, str))
data = data[has_text & has_label]
data.head(10)

In [None]:
# get train and test dataset
# Shuffle before slicing: a plain head/tail split inherits the row order of the
# CSV, so the trailing 20% could hold retailers that never occur in the training
# part (the label encoder would then reject them). The fixed seed keeps the
# split identical across kernel restarts.
SPLIT_SEED = 42
shuffled = data.sample(frac=1, random_state=SPLIT_SEED)

train_size = int(len(shuffled) * .8)
train_ocr = shuffled['establishment'][:train_size]
train_tags = shuffled['retailerName'][:train_size]
test_ocr = shuffled['establishment'][train_size:]
test_tags = shuffled['retailerName'][train_size:]

# (test rows, train rows, total rows)
test_tags.size, train_size, len(data)

In [None]:
# preview the raw training texts before they are tokenized
train_ocr.head(10)

In [None]:
# use bag of words model
# vocabulary cap; the same value is used as the model's input width further down
max_words = 1000
# word-level tokenizer (char_level=False)
tokenize = Tokenizer(num_words=max_words, char_level=False)
# build the vocabulary from the training texts only
tokenize.fit_on_texts(train_ocr)

In [None]:
# Vectorize both splits with the fitted tokenizer
# (one row per text, one column per vocabulary slot).
x_train, x_test = (
    tokenize.texts_to_matrix(texts) for texts in (train_ocr, test_ocr)
)

In [None]:
# Map retailer names to integer class ids with sklearn's LabelEncoder.
# The mapping is learned from the training labels only.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)

y_test

In [None]:
# One-hot encode the integer labels; the class count is the highest training id + 1.
num_classes = y_train.max() + 1
y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes) for labels in (y_train, y_test)
)
y_test_cat

In [None]:
# Print the dimensions of the training and test arrays (handy when debugging
# shape mismatches between features and labels).
for caption, array in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train_cat shape:', y_train_cat),
    ('y_test_cat shape:', y_test_cat),
):
    print(caption, array.shape)

In [None]:
# training hyperparameters: mini-batch size and number of passes over the data
batch_size, epochs = 32, 4

In [None]:
# Bag-of-words classifier:
# max_words inputs -> 512 ReLU units -> 50% dropout -> softmax over num_classes.
model_layers = [
    InputLayer(input_shape=(max_words,)),
    Dense(512, activation='relu', name='hidden_layer'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax', name='output'),
]
model = tf.keras.Sequential(model_layers)

In [None]:
# NOTE(review): the previous comment said "keep it small when transfer learning",
# but this notebook builds its model from freshly initialized Dense layers.
# Confirm that 1e-5 is not too small for the configured number of epochs.
LR = 1e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model; 10% of the training rows are held out for validation.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(x_train, y_train_cat, **fit_options)

In [None]:
# Evaluate on the held-out test set; score is [loss, accuracy].
score = model.evaluate(x_test, y_test_cat, batch_size=batch_size, verbose=1)
test_accuracy = score[1]
print('Test accuracy: {0:.2f}%'.format(test_accuracy * 100))

In [None]:
# Show a few test samples next to their true and predicted retailer.
text_labels = encoder.classes_

# Never ask for more rows than the test set holds, and run a single batched
# predict call instead of one model.predict per sample.
preview_count = min(10, len(x_test))
preview_probs = model.predict(x_test[:preview_count])

for i, probs in enumerate(preview_probs):
    predicted_label = text_labels[np.argmax(probs)]
    print(test_ocr.iloc[i][:50], "...")
    # str() keeps the concatenation safe if a label is not a plain string
    print('correct:' + str(test_tags.iloc[i]))
    print("predicted: " + str(predicted_label) + "\n")

In [None]:
# class names in encoder order: position i is the label for class id i
encoder.classes_


In [None]:
# Classify one hand-written sample with the fitted tokenizer and trained model.
# texts_to_matrix already returns a one-row matrix, so it is fed to predict as is.
matrix = tokenize.texts_to_matrix(['HTK UK'])
prediction = model.predict(matrix)
predicted_index = np.argmax(prediction)
predicted_label = text_labels[predicted_index]
print("predict:{0}\nindex:{1} \nlabel:{2}".format(prediction, predicted_index, predicted_label))

In [None]:
# Class probabilities for every test sample (one row per sample).
y_softmax = model.predict(x_test)

# Collapse the one-hot rows and the probability rows to class ids with a
# vectorized argmax instead of per-row Python loops. For a one-hot row the
# argmax is the position of its single non-zero entry.
# .tolist() keeps both results plain Python lists, as before.
y_test_1d = np.argmax(y_test_cat, axis=1).tolist()
y_pred_1d = np.argmax(y_softmax, axis=1).tolist()

In [None]:
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a row-normalized confusion matrix on the current matplotlib figure.

    Args:
        cm: 2-D array of raw counts; rows are drawn as the "Correct label"
            axis and columns as the "Predicted label" axis.
        classes: tick labels, one per row/column of ``cm``.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.

    Every row is divided by its own total, so each cell shows the fraction of
    that row's samples. Rows whose total is zero are left as all zeros instead
    of turning into NaN through a division by zero.

    Returns:
        None. The plot is drawn through the pyplot state machine; the caller
        is responsible for creating the figure and calling ``plt.show()``.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # only divide where the row has samples; untouched cells keep the 0.0 from `out`
    cm = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    # cells darker than half of the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Correct label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [None]:
# sanity check: true and predicted label lists must have the same length
len(y_test_1d), len(y_pred_1d)

In [None]:
# Pin the label set to every encoder class so the matrix always has one
# row/column per entry of encoder.classes_. Without `labels`, a class that is
# absent from both the test labels and the predictions is dropped, and the
# class-name ticks on the plot no longer line up with the matrix.
cnf_matrix = confusion_matrix(
    y_test_1d,
    y_pred_1d,
    labels=np.arange(len(encoder.classes_)),
)

In [None]:
# large canvas so the per-class tick labels and per-cell annotations stay readable
plt.figure(figsize=(24,20))
plot_confusion_matrix(cnf_matrix, classes=text_labels, title="Confusion matrix")
plt.show()