In [1]:
import numpy as np
import pandas as pd
import string
import re

In [2]:
with open('Invoice.txt',mode='r',encoding='utf8',errors='ignore') as f:
    text = f.read()

In [3]:
data = list(map(lambda x:x.split('\t'),text.split('\n')))

In [4]:
df = pd.DataFrame(data[1:],columns=data[0])

In [5]:
df.head(10)

Unnamed: 0,id,text,tag
0,N001.jpg,tan,B-ORG
1,N001.jpg,woon,I-ORG
2,N001.jpg,yann,ORG
3,N001.jpg,BOOK,O
4,N001.jpg,TA-K,O
5,N001.jpg,(TAMAN,O
6,N001.jpg,DAYA),O
7,N001.jpg,SDN,O
8,N001.jpg,BHD,O
9,N001.jpg,NO.5?,O


### Cleaning Text
- Remove white space
- Remove Unwanted special characters

In [6]:
whitespace = string.whitespace
punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
tableWhitespace = str.maketrans('','',whitespace)
tablePunctuation = str.maketrans('','',punctuation)
def cleanText(txt):
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)


In [7]:
df['text'] = df['text'].apply(cleanText)

In [8]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataClean.dropna(inplace=True)


In [9]:
dataClean.head(10)

Unnamed: 0,id,text,tag
0,N001.jpg,tan,B-ORG
1,N001.jpg,woon,I-ORG
2,N001.jpg,yann,ORG
3,N001.jpg,book,O
4,N001.jpg,ta-k,O
5,N001.jpg,taman,O
6,N001.jpg,daya,O
7,N001.jpg,sdn,O
8,N001.jpg,bhd,O
9,N001.jpg,no.5,O


### Convert Data into Spacy Format

In [10]:
group = dataClean.groupby(by='id')

In [11]:
cards = group.groups.keys()

In [13]:
allInvoiceData = []
for card in cards:
    invoiceData = []
    grouparray = group.get_group(card)[['text','tag']].values
    content = ''
    annotations = {'entities':[]}
    start = 0
    end = 0
    for text, label in grouparray:
        text = str(text)
        stringLength = len(text) + 1

        start = end
        end = start + stringLength

        if label != 'O':
            annot = (start,end-1,label)
            annotations['entities'].append(annot)

        content = content + text + ' '
        
        
    invoiceData = (content,annotations)
    allInvoiceData.append(invoiceData)

In [14]:
allInvoiceData

[('tan woon yann book ta-k taman daya sdn bhd no.5 55,57 59 jalan sagu 18 taman daya 81100 johor bahru, johor. a document no tdo1167104 date 25/12/2018 081339 pm cashier manis member cash bill code/desc price amouht quy rm rm 9,56e12 kf modelling clay kiddy fish 9.00 6 9.00 total 9.00 rour ding adjustment 0.00 roundd total rm 9.60 cash change goods sold are not returnar exchangeable thank you. please come agaty ',
  {'entities': [(0, 3, 'B-ORG'),
    (4, 8, 'I-ORG'),
    (9, 13, 'ORG'),
    (122, 132, 'B-CARDINAL'),
    (138, 148, 'B-DATE'),
    (267, 271, 'B-MONEY'),
    (278, 282, 'B-MONEY')]}),
 ('tan woon yann indah gift home beco 27 jalan dedap 13 tanan johor jaya, 81100 johor bahru, johor. tel07-3507405 fax07-3558160 receipt 19/10/2018 204959 01 cashier cn location/sp 5 /0531 mb 4026588 room no of 50100025279 gty price amt/rh st-privilege card/gd indah 89888 1 10.00 10.00 gf-table lamp/stitch i 62483 1 55.90 55.90 10.00 htotal qty 2 total aht. rounding adi. thank you please come 

In [15]:
invoice_data_df = pd.DataFrame(allInvoiceData,columns=['text','labels'])
invoice_data_df['isNull'] = invoice_data_df['labels'].apply(lambda x: 'Null' if len(x['entities']) ==0 
                                                      else 'Clean')

#### Null entries need to drop

In [16]:
invoice_data_df.query('isNull == "Null"')

Unnamed: 0,text,labels,isNull


#### Consilder only clean data

In [17]:
invoice_data_df.dropna(inplace=True)
clean_data = invoice_data_df.query('isNull == "Clean"')[['text','labels']]

In [18]:
allInvoiceData = list(map(lambda x: tuple(x), clean_data.values.tolist()))

## Split the Data into Training and Testing Set

In [20]:
import random

In [21]:
random.shuffle(allInvoiceData)

In [22]:
len(allInvoiceData)

31

In [23]:
TrainData = allInvoiceData[:240]
TestData = allInvoiceData[240:]

In [24]:
import pickle

In [26]:
pickle.dump(TrainData,open('./data/TrainData.pickle',mode='wb'))
pickle.dump(TestData,open('./data/TestData.pickle',mode='wb'))