## Apply best model on test (submit) data

## Import

In [59]:
import pytesseract
import cv2
import os
import ntpath
import pandas as pd
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from functions.add_to_file import add_to_file 

## Read and transform train data

In [None]:
# Unzip train dataset into ../data/train/train_dataset.
# The archive is named without its extension; the recorded output of this cell
# shows unzip resolving it to the matching '.zip' file.
!unzip '../data/train/0325updated.task2train(626p)-20220330T115700Z-001' -d '../data/train/train_dataset'

Archive:  ../data/train/0325updated.task2train(626p)-20220330T115700Z-001.zip
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008164998.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008114321.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008164996.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008099085.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51009453801.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008099083.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008123447.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008030565.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008042778.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008099087.txt  
  inflating: ../data/train/train

In [None]:
train_dataset_path = '../data/train/train_dataset/0325updated.task2train(626p)'

# One entry per annotation file; the lists stay aligned by position and become
# the columns of train_corpus below.
id_list = []
company_list = []
date_list = []
address_list = []
total_list = []

for filename in os.listdir(train_dataset_path):
    # Annotation files only: test the extension rather than any '.txt' substring
    if not filename.endswith('.txt'):
        continue

    # 'NAME.txt' and 'NAME(1).txt' both map to the id 'NAME'
    receipt_id = filename.replace('.txt', '').replace('(1)', '')

    # The context manager closes the handle even if the JSON is malformed;
    # the explicit encoding keeps decoding independent of the OS locale.
    with open(os.path.join(train_dataset_path, filename), encoding='utf-8') as f:
        data = json.load(f)

    id_list.append(receipt_id)
    # .get() yields None for a field that is absent from the annotation
    company_list.append(data.get('company'))
    date_list.append(data.get('date'))
    address_list.append(data.get('address'))
    total_list.append(data.get('total'))

# Build the frame once, with the same column order as before
train_corpus = pd.DataFrame({
    'id': id_list,
    'company': company_list,
    'date': date_list,
    'address': address_list,
    'total': total_list,
})

In [None]:
train_corpus.head()

Unnamed: 0,id,company,date,address,total
0,X51006913007,AEON CO. (M) BHD,27/05/2018,"3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...",99.9
1,X51005705722,ASO ELECTRICAL TRADING SDN BHD,27/09/2017,"NO 31G, JALAN SEPADU C 25/C, SECTION 25, TAMAN...",248.05
2,X51005763940,ELITETRAX MARKETING SDN BHD,11.02.18,"LOT 1F-01&02,1ST FLR,PARADIGM MALL, NO. 1 JALA...",60.0
3,X51008099071,LEONG HENG SHELL SERVICE STATION,20/06/18,"LOT 26151 BANDAR SG LONG, 11 1/4 MILES CHERAS,...",50.0
4,X51005757349,GOLDEN KEY MAKER,24-MAR-2018,"NO 5, JALAN KENARI 2, BANDAR PUCHONG JAYA, 471...",21.0


In [None]:
# Run OCR on every training scan and attach the text to the matching id row(s)
for filename in os.listdir(train_dataset_path):
    # Receipt scans only: test the extension rather than any '.jpg' substring
    if not filename.endswith('.jpg'):
        continue

    # 'NAME.jpg' and 'NAME(1).jpg' both map to the id 'NAME'
    receipt_id = filename.replace('.jpg', '').replace('(1)', '')

    file_path = os.path.join(train_dataset_path, filename)
    if not os.path.isfile(file_path):
        continue

    # Read image; cv2.imread returns None (it does not raise) when the file
    # cannot be decoded, so skip it instead of handing None to pytesseract.
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print('Skipping unreadable image: ' + file_path)
        continue

    # Get text from image, flattened to one line; the second replace collapses
    # double spaces in a single pass only.
    text = pytesseract.image_to_string(image, lang='eng').replace('\n', ' ').replace('  ', ' ')
    train_corpus.loc[train_corpus['id'] == receipt_id, "text"] = text

Corrupt JPEG data: bad Huffman code


In [None]:
train_corpus.head()

Unnamed: 0,id,company,date,address,total,text
0,X51006913007,AEON CO. (M) BHD,27/05/2018,"3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...",99.9,"AOS. 0 AEON CO. (M) BHD (126926-H) SRD FLR, AE..."
1,X51005705722,ASO ELECTRICAL TRADING SDN BHD,27/09/2017,"NO 31G, JALAN SEPADU C 25/C, SECTION 25, TAMAN...",248.05,ASO ELECTRICAL TRADING SDN BHD 1000131-K NO 31...
2,X51005763940,ELITETRAX MARKETING SDN BHD,11.02.18,"LOT 1F-01&02,1ST FLR,PARADIGM MALL, NO. 1 JALA...",60.0,a ‘Harvey Norman Harvey Norman M'sia Parad...
3,X51008099071,LEONG HENG SHELL SERVICE STATION,20/06/18,"LOT 26151 BANDAR SG LONG, 11 1/4 MILES CHERAS,...",50.0,LEONG HENG SHELL SERVICE STATION Company No ...
4,X51005757349,GOLDEN KEY MAKER,24-MAR-2018,"NO 5, JALAN KENARI 2, BANDAR PUCHONG JAYA, 471...",21.0,21803053 =: ‘ GOLDEN KEY MAKER (000760274-K) N...


In [None]:
# Persist the labelled + OCR'd training corpus; the Preprocessing section
# reloads it from this CSV. index=False leaves the row index out of the file.
train_corpus.to_csv('../data/train/train_dataset.csv', index=False)

## Read test (submit) data

In [None]:
# Unzip input dataset (the test/submit receipt images) into
# ../data/input/SROIE_test_images_task_3
!unzip '../data/input/SROIE_test_images_task_3.zip' -d '../data/input/SROIE_test_images_task_3'

In [3]:
test_dataset_path = '../data/input/SROIE_test_images_task_3'

# Parallel lists: one id and one OCR text per readable image
id_list = []
text_list = []

for filename in os.listdir(test_dataset_path):
    file_path = os.path.join(test_dataset_path, filename)
    if not os.path.isfile(file_path):
        continue

    # Read image; cv2.imread returns None (it does not raise) for files it
    # cannot decode, e.g. a stray non-image file in the folder. Such files are
    # reported and skipped, so they get no row in test_corpus.
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print('Skipping unreadable image: ' + file_path)
        continue

    # Get text from image
    text = pytesseract.image_to_string(image, lang='eng')
    # Preprocess: flatten to one line; double spaces are collapsed in one pass
    text = text.replace('\n', ' ').replace('  ', ' ')

    # Store the id ('NAME.jpg' and 'NAME(1).jpg' both map to 'NAME') and text
    id_list.append(filename.replace('.jpg', '').replace('(1)', ''))
    text_list.append(text)

test_corpus = pd.DataFrame({'id': id_list, 'text': text_list})

In [4]:
test_corpus.head()

Unnamed: 0,id,text
0,X51007846290,NOEONF UNIHAKKA INTERNATIONAL SDN BHD 07 Jun 2...
1,X51005447859,PASARAYA BORONG PINTAR SDN BHD BR No.: (1245...
2,X51005746203,/ SUPER SEVEN CASH & th. SDN BHD wale 590 150-...
3,X51006329183,’ tore : \o SEMBOYAN TEGAS SDN BHD we” No.5 & ...
4,X51005442388,"A03 (26 CONTENTO (JNO761170-4) 15, JALAN PERHA..."


In [7]:
# Persist the OCR'd test corpus; the Preprocessing section reloads it from
# this CSV. index=False leaves the row index out of the file.
test_corpus.to_csv('../data/test/test_dataset.csv', index=False)

## Preprocessing

In [17]:
# Define path to file
path = '../data/train/train_dataset.csv'

# Read file.
# dtype=str turns off type inference so every label (company, date, address,
# total) keeps the exact text stored in the CSV; a column that happened to be
# all-numeric would otherwise be parsed into numbers. Empty cells still load
# as NaN.
train_corpus = pd.read_csv(path, encoding='utf8', sep=',', dtype=str)

In [18]:
train_corpus.head()

Unnamed: 0,id,company,date,address,total,text
0,X51006913007,AEON CO. (M) BHD,27/05/2018,"3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...",99.9,"AOS. 0 AEON CO. (M) BHD (126926-H) SRD FLR, AE..."
1,X51005705722,ASO ELECTRICAL TRADING SDN BHD,27/09/2017,"NO 31G, JALAN SEPADU C 25/C, SECTION 25, TAMAN...",248.05,ASO ELECTRICAL TRADING SDN BHD 1000131-K NO 31...
2,X51005763940,ELITETRAX MARKETING SDN BHD,11.02.18,"LOT 1F-01&02,1ST FLR,PARADIGM MALL, NO. 1 JALA...",60.0,a ‘Harvey Norman Harvey Norman M'sia Parad...
3,X51008099071,LEONG HENG SHELL SERVICE STATION,20/06/18,"LOT 26151 BANDAR SG LONG, 11 1/4 MILES CHERAS,...",50.0,LEONG HENG SHELL SERVICE STATION Company No ...
4,X51005757349,GOLDEN KEY MAKER,24-MAR-2018,"NO 5, JALAN KENARI 2, BANDAR PUCHONG JAYA, 471...",21.0,21803053 =: ‘ GOLDEN KEY MAKER (000760274-K) N...


In [8]:
# Define path to file
path = '../data/test/test_dataset.csv'

# Read file.
# dtype=str turns off type inference: 'id' is later concatenated with '.jpg'
# and 'text' is treated as a string, so neither may be parsed as a number.
# Empty cells still load as NaN.
test_corpus = pd.read_csv(path, encoding='utf8', sep=',', dtype=str)


In [10]:
test_corpus.head()

Unnamed: 0,id,text
0,X51007846290,NOEONF UNIHAKKA INTERNATIONAL SDN BHD 07 Jun 2...
1,X51005447859,PASARAYA BORONG PINTAR SDN BHD BR No.: (1245...
2,X51005746203,/ SUPER SEVEN CASH & th. SDN BHD wale 590 150-...
3,X51006329183,’ tore : \o SEMBOYAN TEGAS SDN BHD we” No.5 & ...
4,X51005442388,"A03 (26 CONTENTO (JNO761170-4) 15, JALAN PERHA..."


Replace missing values (None/NaN) with empty strings

In [20]:
# Replace every missing value in both frames with an empty string
train_corpus = train_corpus.fillna("")
test_corpus = test_corpus.fillna("")

Convert the text column to upper-case strings

In [21]:
# Cast the OCR text to str and upper-case it, for train and test alike
for corpus in (train_corpus, test_corpus):
    corpus['text'] = corpus['text'].astype(str).str.upper()

In [23]:
test_corpus['text'].head()

0    NOEONF UNIHAKKA INTERNATIONAL SDN BHD 07 JUN 2...
1      PASARAYA BORONG PINTAR SDN BHD BR NO.: (1245...
2    / SUPER SEVEN CASH & TH. SDN BHD WALE 590 150-...
3    ’ TORE : \O SEMBOYAN TEGAS SDN BHD WE” NO.5 & ...
4    A03 (26 CONTENTO (JNO761170-4) 15, JALAN PERHA...
Name: text, dtype: object

## TF-IDF

Convert data into a matrix of TF-IDF features

In [39]:
# Stack the train texts followed by the test texts into one series
texts = pd.concat([train_corpus['text'], test_corpus['text']])

# Row counts, in order: train, test, combined
for series in (train_corpus['text'], test_corpus['text'], texts):
    print(series.shape[0])

876
347
1223


In [42]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_df=0.95,
    min_df=4,
)
# The vectorizer is fitted on the train and test texts together
tfidf = vectorizer.fit_transform(texts)

# `texts` holds the train rows first, so split the matrix at the train row count
n_train_rows = len(train_corpus['text'])
tfidf_train, tfidf_test = tfidf[:n_train_rows], tfidf[n_train_rows:]

print(tfidf_train.shape)
print(tfidf_test.shape)

(876, 2617)
(347, 2617)


In [43]:
# Feature matrices shared by all four models
x_train, x_test = tfidf_train, tfidf_test

# One target series per field to predict
y_company_train, y_date_train, y_address_train, y_total_train = (
    train_corpus[field] for field in ('company', 'date', 'address', 'total')
)

## Classification

### Functions

In [44]:
def classificate(classifier, x_train, y_train, x_test):
    """Fit `classifier` on the training data and predict labels for `x_test`.

    Parameters:
        classifier: object exposing `fit(x, y)` and `predict(x)`; it is
            fitted in place.
        x_train: training features passed to `fit`.
        y_train: training labels passed to `fit`.
        x_test: features passed to `predict`.

    Returns:
        Whatever `classifier.predict(x_test)` returns.
    """
    classifier.fit(x_train, y_train)
    return classifier.predict(x_test)

### Company

In [45]:
# Company model: MLP with one hidden layer of 100 units
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    alpha=0.001,
    random_state=0,
)

# Fit on the train companies and predict one company per test receipt
y_company_pred = classificate(
    classifier=mlp_clf,
    x_train=x_train,
    y_train=y_company_train,
    x_test=x_test,
)



In [47]:
# Attach the predicted companies as the 'company' column
test_corpus = test_corpus.assign(company=y_company_pred)

In [53]:
y_company_pred

array(['UNIHAKKA INTERNATIONAL SDN BHD', 'PASARAYA BORONG PINTAR SDN BHD',
       'SUPER SEVEN CASH & CARRY SDN BHD', 'GOLDEN KEY MAKER',
       'MR. D.I.Y. (M) SDN BHD',
       'AIK HUAT HARDWARE ENTERPRISE (SETIA ALAM) SDN BHD',
       'YONG CEN ENTERPRISE', "KING'S CONFECTIONERY S/B",
       'SANYU STATIONERY SHOP',
       'AIK HUAT HARDWARE ENTERPRISE (SETIA ALAM) SDN BHD',
       'SUPER SEVEN CASH & CARRY SDN BHD', 'SYARIKAT PERNIAGAAN GIN KEE',
       'YHM AEON TEBRAU CITY', 'GARDENIA BAKERIES (KL) SDN BHD',
       'ABC HO TRADING', 'C W KHOO HARDWARE SDN BHD',
       'MR. D.I.Y. (M) SDN BHD', 'AEON CO. (M) BHD',
       'GH DISTRIBUTOR & MARKETING SDN BHD',
       'BECON STATIONER BECON ENTERPRISE SDN BHD',
       'UNIHAKKA INTERNATIONAL SDN BHD',
       'TRIPLE SIX POINT ENTERPRISE 666',
       'GARDENIA BAKERIES (KL) SDN BHD', 'KEDAI PAPAN YEW CHUAN',
       'OLIVE9 PHARMACY SDN BHD', '99 SPEED MART S/B',
       'ASO ELECTRICAL TRADING SDN BHD', 'AEON CO. (M) BHD',
       'MR. 

### Date

In [48]:
# Date model: SVM with a polynomial kernel
# NOTE(review): the recorded y_date_pred output is dominated by a single
# label ('24-03-18') - confirm this model is behaving as intended.
svm_clf = SVC(
    kernel='poly',
    C=1000,
    random_state=0,
)

# Fit on the train dates and predict one date per test receipt
y_date_pred = classificate(
    classifier=svm_clf,
    x_train=x_train,
    y_train=y_date_train,
    x_test=x_test,
)

In [49]:
# Attach the predicted dates as the 'date' column
test_corpus = test_corpus.assign(date=y_date_pred)

In [54]:
y_date_pred

array(['20 JUN 2018', '20180304', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '16 MAY 2018', '24-03-18', '24-03-18', '24-03-18', '31/03/2017',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-MAR-2018', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '09 MAY 2018', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '21-03-2018', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24-03-18', '24-03-18',
       '24-03-18', '24-03-18', '24-03-18', '24

### Address

In [51]:
# Address model: MLP with one hidden layer of 300 units
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(300,),
    alpha=0.001,
    random_state=0,
)

# Fit on the train addresses and predict one address per test receipt
y_address_pred = classificate(
    classifier=mlp_clf,
    x_train=x_train,
    y_train=y_address_train,
    x_test=x_test,
)



In [52]:
# Attach the predicted addresses as the 'address' column
test_corpus = test_corpus.assign(address=y_address_pred)

In [55]:
y_address_pred

array(['12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAMPOI,81200 JOHOR BAHRU, JOHOR',
       'NO 19-G& 19-1& 19-2 JALAN TASIK UTAMA 4, MEDAN NIAGA TASIK DAMAI',
       'NO. 1 JALAN EURO 1 OFF JALAN BATU TIGA SUNGAI BULOH SEKSYEN U3 SHAH ALAM, 40150',
       'NO. 53, JALAN BESAR, 45600 BATANG BERJUNTAI SELANGOR DARUL EHSAN',
       'NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA ALAM, SEKSYEN U13, 40170 SHAH ALAM,',
       'UNIT L1-044B, VIVACITY MEGAMALL, JALAN WAN ALWI, 93350 KUCHING, SARAWAK.',
       '9, JALAN SUBANG JASA 3, 40150 SHAN ALAM, SELANGOR.',
       'LOT NO. G23, GIANT KELANA JAYA LOT PT244, JLN PERBANDARAN SS6/4 PUSAT BANDAR KELANA JAYA, PETALING',
       'NO. 31G&33G, JALAN SETIA INDAH X ,U13/X 40170 SETIA ALAM',
       'NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA ALAM, SEKSYEN U13, 40170 SHAH ALAM,',
       'NO. 1 JALAN EURO 1 OFF JALAN BATU TIGA SUNGAI BULOH SEKSYEN U3 SHAH ALAM, 40150',
       'NO 290, JALAN AIR PANAS, SETAPAK, 53200, KUALA LUMPUR.',
       'S117, SECO

### Total

In [56]:
# Total model: MLP with one hidden layer of 500 units
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(500,),
    alpha=0.0001,
    random_state=0,
)

# Fit on the train totals and predict one total per test receipt
y_total_pred = classificate(
    classifier=mlp_clf,
    x_train=x_train,
    y_train=y_total_train,
    x_test=x_test,
)



In [57]:
# Attach the predicted totals as the 'total' column
test_corpus = test_corpus.assign(total=y_total_pred)

In [58]:
y_total_pred

array(['RM8.20', '3.20', 'RM367.10', '6.00', '28.00', '6.00', '63.35',
       '7.50', '33.50', '14.90', 'RM367.10', '21.20', '13.10', '41.87',
       '31.00', '48.00', '7.00', '5.90', '25.00', '247.55', '$8.20',
       '7.60', '27.20', '87.45', '12.15', '177.20', '99.00', '6.00',
       '7838.80', '11.60', '404.39', '73.55', '7.00', '21.00', '46.55',
       '100.00', '26.10', '6.00', '7.00', '18.90', '4.90', '4.90', '6.00',
       '14.79', '20.05', '165.00', '7.00', '5.00', '7838.80', '27.55',
       '25.45', '7838.80', '9.00', '27.30', '68.63', '51.88', '$6.60',
       '21.00', '10.40', '7.00', '20.05', '9.90', '7.00', '6.70', '3.90',
       '39.90', '7838.80', '6.36', '9.90', '74.20', '31.00', '5.80',
       '$8.20', '6.70', '7838.80', '5.00', '7.40', '2.10', '42.40',
       'RM7.70', '6.00', '12.00', '63.35', '3.20', 'RM14.20', '13.80',
       '5.00', '9.60', '6.00', '5.00', '21.00', '6.00', 'RM14.20',
       'RM 3.30', '27.30', '100.00', '36.00', 'RM5.00', '4.80', '178.08',
       

## Add data to .txt file

In [61]:
test_corpus.head()

Unnamed: 0,id,text,company,date,address,total
0,X51007846290,NOEONF UNIHAKKA INTERNATIONAL SDN BHD 07 JUN 2...,UNIHAKKA INTERNATIONAL SDN BHD,20 JUN 2018,"12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAM...",RM8.20
1,X51005447859,PASARAYA BORONG PINTAR SDN BHD BR NO.: (1245...,PASARAYA BORONG PINTAR SDN BHD,20180304,"NO 19-G& 19-1& 19-2 JALAN TASIK UTAMA 4, MEDAN...",3.20
2,X51005746203,/ SUPER SEVEN CASH & TH. SDN BHD WALE 590 150-...,SUPER SEVEN CASH & CARRY SDN BHD,24-03-18,NO. 1 JALAN EURO 1 OFF JALAN BATU TIGA SUNGAI ...,RM367.10
3,X51006329183,’ TORE : \O SEMBOYAN TEGAS SDN BHD WE” NO.5 & ...,GOLDEN KEY MAKER,24-03-18,"NO. 53, JALAN BESAR, 45600 BATANG BERJUNTAI SE...",6.00
4,X51005442388,"A03 (26 CONTENTO (JNO761170-4) 15, JALAN PERHA...",MR. D.I.Y. (M) SDN BHD,24-03-18,"NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...",28.00


In [68]:
# Directory for the submission results; it is created by the os.mkdir cell and
# passed to add_to_file for every test receipt.
submit_path = '../data/output/model_nlp_dataset_submit'

In [69]:
# Create the directory for results.
# makedirs with exist_ok=True also creates missing parent directories and does
# not raise when the directory already exists, so the cell can be re-run.
os.makedirs(submit_path, exist_ok=True)

In [70]:
# Hand the four predicted fields of every test receipt to add_to_file,
# using '<id>.jpg' as the file name argument.
for _, row in test_corpus.iterrows():
    data = {
        'company': row['company'],
        'address': row['address'],
        'total': row['total'],
        'date': row['date'],
    }
    add_to_file(data, submit_path, row['id'] + '.jpg')

Calculated!{"recall": 0.335014409221902, "precision": 0.335014409221902, "hmean": 0.335014409221902}%