## Import

In [107]:
import pytesseract
import cv2
import os
# import ntpath
# from functions.add_to_file import add_to_file 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Read and transform train data

In [6]:
# Extract the training archive into ../data/train/train_dataset.
# The archive name is given without its ".zip" suffix; unzip resolves it on its own
# (the output below reports the ".zip" file).
!unzip '../data/train/0325updated.task2train(626p)-20220330T115700Z-001' -d '../data/train/train_dataset'

Archive:  ../data/train/0325updated.task2train(626p)-20220330T115700Z-001.zip
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008164998.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008114321.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008164996.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008099085.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51009453801.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008099083.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008123447.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008030565.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008042778.txt  
  inflating: ../data/train/train_dataset/0325updated.task2train(626p)/X51008099087.txt  
  inflating: ../data/train/train

In [39]:
# Folder holding the extracted dataset: JSON annotations stored as .txt files,
# next to the receipt images (.jpg) that a later cell runs OCR on.
train_dataset_path = '../data/train/train_dataset/0325updated.task2train(626p)'

id_list = []
text_list = []
company_list = []
date_list = []
address_list = []
total_list = []

for filename in os.listdir(train_dataset_path):
    # Keep annotation files only. endswith() is stricter than a substring test,
    # which would also accept names that merely contain ".txt".
    if not filename.endswith('.txt'):
        continue

    # Receipt id = file name without the extension and without the "(1)" suffix
    # that some file names carry.
    receipt_id = filename.replace('.txt', '').replace('(1)', '')
    id_list.append(receipt_id)

    # The context manager closes the handle; the previous bare open() left one
    # file descriptor open per annotation file.
    with open(os.path.join(train_dataset_path, filename), encoding='utf-8') as f:
        data = json.load(f)

    # dict.get() yields None when a field is absent from the annotation
    company_list.append(data.get('company'))
    date_list.append(data.get('date'))
    address_list.append(data.get('address'))
    total_list.append(data.get('total'))

# Build the frame in one step, with the same columns in the same order as before
train_corpus = pd.DataFrame({
    'id': id_list,
    'company': company_list,
    'date': date_list,
    'address': address_list,
    'total': total_list,
})

In [40]:
train_corpus.head()

Unnamed: 0,id,company,date,address,total
0,X51006913007,AEON CO. (M) BHD,27/05/2018,"3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...",99.9
1,X51005705722,ASO ELECTRICAL TRADING SDN BHD,27/09/2017,"NO 31G, JALAN SEPADU C 25/C, SECTION 25, TAMAN...",248.05
2,X51005763940,ELITETRAX MARKETING SDN BHD,11.02.18,"LOT 1F-01&02,1ST FLR,PARADIGM MALL, NO. 1 JALA...",60.0
3,X51008099071,LEONG HENG SHELL SERVICE STATION,20/06/18,"LOT 26151 BANDAR SG LONG, 11 1/4 MILES CHERAS,...",50.0
4,X51005757349,GOLDEN KEY MAKER,24-MAR-2018,"NO 5, JALAN KENARI 2, BANDAR PUCHONG JAYA, 471...",21.0


In [54]:
for filename in os.listdir(train_dataset_path):
    # Keep receipt images only (endswith() instead of a substring test)
    if not filename.endswith('.jpg'):
        continue

    # Same id normalisation as for the annotation files, so the rows line up
    receipt_id = filename.replace('.jpg', '').replace('(1)', '')

    file_path = os.path.join(train_dataset_path, filename)
    if not os.path.isfile(file_path):
        continue

    # Read image
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals an unreadable file by returning None instead of raising;
        # skip it rather than hand None to pytesseract. The row keeps no OCR text.
        print(f"Skipping unreadable image: {file_path}")
        continue

    # Get text from image, flattened to a single line.
    # NOTE: the double-space replacement is a single pass, so runs of three or
    # more spaces are shortened but not fully collapsed.
    text = pytesseract.image_to_string(image, lang='eng').replace('\n', ' ').replace('  ', ' ')
    train_corpus.loc[train_corpus['id'] == receipt_id, "text"] = text

Corrupt JPEG data: bad Huffman code


In [55]:
train_corpus.head()

Unnamed: 0,id,company,date,address,total,text
0,X51006913007,AEON CO. (M) BHD,27/05/2018,"3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...",99.9,"AOS. 0 AEON CO. (M) BHD (126926-H) SRD FLR, AE..."
1,X51005705722,ASO ELECTRICAL TRADING SDN BHD,27/09/2017,"NO 31G, JALAN SEPADU C 25/C, SECTION 25, TAMAN...",248.05,ASO ELECTRICAL TRADING SDN BHD 1000131-K NO 31...
2,X51005763940,ELITETRAX MARKETING SDN BHD,11.02.18,"LOT 1F-01&02,1ST FLR,PARADIGM MALL, NO. 1 JALA...",60.0,a ‘Harvey Norman Harvey Norman M'sia Parad...
3,X51008099071,LEONG HENG SHELL SERVICE STATION,20/06/18,"LOT 26151 BANDAR SG LONG, 11 1/4 MILES CHERAS,...",50.0,LEONG HENG SHELL SERVICE STATION Company No ...
4,X51005757349,GOLDEN KEY MAKER,24-MAR-2018,"NO 5, JALAN KENARI 2, BANDAR PUCHONG JAYA, 471...",21.0,21803053 =: ‘ GOLDEN KEY MAKER (000760274-K) N...


In [58]:
# Save the labelled corpus (annotations + OCR text) to CSV without the index column;
# the preprocessing section reloads it from this same path.
train_corpus.to_csv('../data/train/train_dataset.csv', index=False)

## Preprocessing

In [132]:
# CSV produced by the "Read and transform train data" section
path = '../data/train/train_dataset.csv'

# Load it back into a DataFrame
train_corpus = pd.read_csv(
    path,
    sep=',',
    encoding='utf8',
)


Replace missing values (NaN) with empty strings

In [134]:
# Missing cells become empty strings; rebinding the name keeps the cell idempotent on re-run
train_corpus = train_corpus.fillna("")

Convert the text column to upper-case strings

In [136]:
# Cast every cell to str, then upper-case it.
# NOTE(review): TfidfVectorizer lowercases its input by default, so this step presumably
# does not change the TF-IDF features and mainly matches the upper-case labels — confirm.
train_corpus['text'] = train_corpus['text'].astype(str).str.upper()

In [137]:
train_corpus['text']

0      AOS. 0 AEON CO. (M) BHD (126926-H) SRD FLR, AE...
1      ASO ELECTRICAL TRADING SDN BHD 1000131-K NO 31...
2        A ‘HARVEY NORMAN   HARVEY NORMAN M'SIA PARAD...
3        LEONG HENG SHELL SERVICE STATION COMPANY NO ...
4      21803053 =: ‘ GOLDEN KEY MAKER (000760274-K) N...
                             ...                        
871    GARDENIA BAKERIES (KL) SDN HEED (139386; X) LO...
872      MORGANFIELDS* % HOME OF SUICKY BONES * TIMEL...
873    4 . ° ° LIVES PUARHACY SDN BHD 1030214 SST/TAR...
874           GARDENIA BAKERIES (KL) SDN BELD (139386...
875    188 GERBANG ALAF RESTAURANTS SDN BHD (65351-M)...
Name: text, Length: 876, dtype: object

## TF-IDF

Convert data into a matrix of TF-IDF features

In [138]:
# TF-IDF settings:
#   sublinear_tf=True     -> use 1 + log(tf) instead of the raw term frequency
#   stop_words='english'  -> drop scikit-learn's built-in English stop words
#   max_df=0.95           -> ignore terms found in more than 95% of the documents
#   min_df=4              -> ignore terms found in fewer than 4 documents
vectorizer = TfidfVectorizer(
    min_df=4,
    max_df=0.95,
    stop_words='english',
    sublinear_tf=True,
)

# Learn the vocabulary and build the document-term matrix in one step
tfidf = vectorizer.fit_transform(train_corpus['text'])

print(tfidf.shape)

(876, 2003)


## Split dataset into train/test

In [139]:
# Every target is predicted from the same TF-IDF matrix; only the label column changes.
x = tfidf


def _split_for(column):
    """Return (labels, unique labels, x_train, x_test, y_train, y_test) for one target column.

    Uses a 60/40 split with a fixed seed, so each target gets the same row partition.
    """
    y = train_corpus[column]
    labels = train_corpus[column].unique()
    return (y, labels, *train_test_split(x, y, test_size=0.4, random_state=0))


(y_company, labels_company,
 x_company_train, x_company_test, y_company_train, y_company_test) = _split_for('company')

(y_date, labels_date,
 x_date_train, x_date_test, y_date_train, y_date_test) = _split_for('date')

(y_address, labels_address,
 x_address_train, x_address_test, y_address_train, y_address_test) = _split_for('address')

(y_total, labels_total,
 x_total_train, x_total_test, y_total_train, y_total_test) = _split_for('total')

## Classification

### Functions

In [109]:
def test_hyperparameters(classifier, X_train, y_train):
    parameters = {}
    classification = None

    if classifier == 'Support Vector Machine':
        parameters = {'kernel': ['rbf', 'poly', 'sigmoid', 'linear'], 'C': [1, 10, 100, 1000]}
        classification = SVC(random_state=0)
    elif classifier == 'Multilayer Perceptron':
        parameters = {'hidden_layer_sizes': [(100,), (300,), (500,)], 'alpha': [1, 0.1, 0.01, 0.001, 0.0001]}
        classification = MLPClassifier(random_state=0)

    score = 'f1'

    clf = RandomizedSearchCV(classification, parameters, scoring="%s_macro" % score, cv=5)
    clf.fit(X_train, y_train)
    
    output = f"Best hyperparameters for {classifier}: {clf.best_params_}\n"
    print(output)

In [103]:
def createClassifier(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier``, print its evaluation on the test data and return the predictions.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict``.
    x_train, y_train
        Data the estimator is fitted on.
    x_test, y_test
        Data used for the printed classification report and confusion matrix.

    Returns
    -------
    The labels predicted for ``x_test``.
    """
    # Train on the training data
    classifier.fit(x_train, y_train)

    # Predict the test samples
    predictions = classifier.predict(x_test)

    # Per-class precision / recall / F1; undefined ratios are reported as 1
    print('Classification report: ')
    report = classification_report(y_test, predictions, zero_division=True)
    print(report)

    # True labels against predicted labels
    print('Confusion matrix: ')
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    return predictions

### Test hyperparameters

#### Company

In [113]:
test_hyperparameters('Support Vector Machine', x_company_train, y_company_train)
# Result: Best hyperparameters for Support Vector Machine: {'kernel': 'linear', 'C': 10}



Best hyperparameters for Support Vector Machine: {'kernel': 'linear', 'C': 10}



In [119]:
test_hyperparameters('Multilayer Perceptron', x_company_train, y_company_train)
# Result: Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (100,), 'alpha': 0.001}



Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (100,), 'alpha': 0.001}





#### Date

In [114]:
test_hyperparameters('Support Vector Machine', x_date_train, y_date_train)
# Result: Best hyperparameters for Support Vector Machine: {'kernel': 'poly', 'C': 1000}



Best hyperparameters for Support Vector Machine: {'kernel': 'poly', 'C': 1000}



In [117]:
test_hyperparameters('Multilayer Perceptron', x_date_train, y_date_train)
# Result: Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (300,), 'alpha': 0.001}



Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (300,), 'alpha': 0.001}





#### Address

In [140]:
test_hyperparameters('Support Vector Machine', x_address_train, y_address_train)
# Result: Best hyperparameters for Support Vector Machine: {'kernel': 'linear', 'C': 10}



Best hyperparameters for Support Vector Machine: {'kernel': 'linear', 'C': 10}



In [141]:
test_hyperparameters('Multilayer Perceptron', x_address_train, y_address_train)
# Result: Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (300,), 'alpha': 0.001}



Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (300,), 'alpha': 0.001}





#### Total

In [116]:
test_hyperparameters('Support Vector Machine', x_total_train, y_total_train)
# Result: Best hyperparameters for Support Vector Machine: {'kernel': 'poly', 'C': 1000}



Best hyperparameters for Support Vector Machine: {'kernel': 'poly', 'C': 1000}



In [118]:
test_hyperparameters('Multilayer Perceptron', x_total_train, y_total_train)
# Result: Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (500,), 'alpha': 0.0001}



Best hyperparameters for Multilayer Perceptron: {'hidden_layer_sizes': (500,), 'alpha': 0.0001}





### Company
- SVM: 68% accuracy (test split)
- MLP: 70% accuracy (test split)

#### SVM

In [143]:
# Support Vector Machine for the company target, using the hyperparameters
# reported by the randomized search above (kernel='linear', C=10)
svm_clf = SVC(kernel='linear', C=10, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(svm_clf, x_company_train, y_company_train, x_company_test, y_company_test)

# Result: 68% accuracy

Classification report: 
                                                   precision    recall  f1-score   support

                                99 SPEED MART S/B       1.00      0.77      0.87        13
                                  ADVANCO COMPANY       1.00      0.67      0.80         3
                                 AEON CO. (M) BHD       0.50      0.75      0.60         4
                                AEON CO. (M) BHD.       1.00      0.33      0.50         3
                             AEON CO. (M) SDN BHD       1.00      0.00      0.00         1
AIK HUAT HARDWARE ENTERPRISE (SETIA ALAM) SDN BHD       1.00      0.80      0.89         5
                           AMANO MALAYSIA SDN BHD       1.00      1.00      1.00         2
            AMPANG 210 FIRST CITY PARKING SDN BHD       1.00      0.00      0.00         1
               ANEKA INTERTRADE MARKETING SDN BHD       1.00      0.00      0.00         1
                    ANZEL ADVERTISING (M) SDN BHD       1.00     

#### Multilayer Perceptron

In [144]:
# Multilayer Perceptron for the company target, using the hyperparameters
# reported by the randomized search above (hidden_layer_sizes=(100,), alpha=0.001)
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.001, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(mlp_clf, x_company_train, y_company_train, x_company_test, y_company_test)

# Result: 70% accuracy

Classification report: 
                                                   precision    recall  f1-score   support

                                99 SPEED MART S/B       0.92      0.92      0.92        13
                                  ADVANCO COMPANY       1.00      0.67      0.80         3
                                 AEON CO. (M) BHD       0.50      0.75      0.60         4
                                AEON CO. (M) BHD.       1.00      0.33      0.50         3
                             AEON CO. (M) SDN BHD       1.00      0.00      0.00         1
AIK HUAT HARDWARE ENTERPRISE (SETIA ALAM) SDN BHD       0.50      0.80      0.62         5
                           AMANO MALAYSIA SDN BHD       1.00      1.00      1.00         2
            AMPANG 210 FIRST CITY PARKING SDN BHD       1.00      0.00      0.00         1
               ANEKA INTERTRADE MARKETING SDN BHD       1.00      0.00      0.00         1
                    ANZEL ADVERTISING (M) SDN BHD       1.00     



#### LSTM

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and cannot run as written:
# Sequential, LSTM and Dense are not imported in the import cell, and X_train, y_train,
# X_test and y_test are not defined anywhere in the visible notebook.
# Create model
look_back = 1

# Create and fit the LSTM network.
# NOTE(review): a single output unit trained with mean squared error looks like a
# regression setup, while the company target is a multi-class string label — confirm intent.
model = Sequential()
model.add(LSTM(100, input_shape=(X_train.shape[1], look_back), activation='relu'))
model.add(Dense(1, activation='relu'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Train for 100 epochs in batches of 64, validating on the test split after each epoch
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64)


### Date

- SVM: 28% accuracy (test split)
- MLP: 28% accuracy (test split)

#### SVM

In [145]:
# Support Vector Machine for the date target, using the hyperparameters
# reported by the randomized search above (kernel='poly', C=1000)
svm_clf = SVC(kernel='poly', C=1000, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(svm_clf, x_date_train, y_date_train, x_date_test, y_date_test)

# Result: 28% accuracy

Classification report: 
              precision    recall  f1-score   support

  01-06-2018       1.00      1.00      1.00         1
    01/03/18       0.67      1.00      0.80         2
  01/06/2018       1.00      0.00      0.00         1
  01/10/2017       1.00      0.00      0.00         1
  01/11/2017       1.00      0.50      0.67         2
 02 APR 2018       1.00      0.00      0.00         1
  02-04-2018       1.00      1.00      1.00         1
    02-05-18       1.00      0.00      0.00         1
  02/01/2018       0.40      1.00      0.57         2
    02/02/17       1.00      0.00      0.00         1
  02/02/2018       1.00      0.00      0.00         1
    02/03/18       1.00      0.00      0.00         2
  02/12/2017       1.00      1.00      1.00         2
 03 APR 2018       1.00      0.00      0.00         1
    03-02-16       1.00      0.00      0.00         1
    03-05-18       1.00      1.00      1.00         1
  03/01/2018       1.00      1.00      1.00         1
  0

#### Multilayer Perceptron

In [146]:
# Multilayer Perceptron for the date target, using the hyperparameters
# reported by the randomized search above (hidden_layer_sizes=(300,), alpha=0.001)
mlp_clf = MLPClassifier(hidden_layer_sizes=(300,), alpha=0.001, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(mlp_clf, x_date_train, y_date_train, x_date_test, y_date_test)

# Result: 28% accuracy

Classification report: 
              precision    recall  f1-score   support

  01-06-2018       1.00      1.00      1.00         1
    01-11-17       0.00      1.00      0.00         0
    01/03/18       0.29      1.00      0.44         2
  01/06/2018       1.00      0.00      0.00         1
  01/10/2017       1.00      0.00      0.00         1
  01/11/2017       1.00      0.50      0.67         2
 02 APR 2018       1.00      0.00      0.00         1
  02-04-2018       1.00      1.00      1.00         1
    02-05-18       1.00      0.00      0.00         1
  02/01/2018       0.29      1.00      0.44         2
    02/02/17       1.00      0.00      0.00         1
  02/02/2018       0.00      0.00      0.00         1
    02/03/18       1.00      0.00      0.00         2
  02/03/2018       0.00      1.00      0.00         0
  02/12/2017       1.00      1.00      1.00         2
 02/JAN/2017       0.00      1.00      0.00         0
 03 APR 2018       1.00      0.00      0.00         1
   



### Address
- SVM: 59% accuracy (test split)
- MLP: 60% accuracy (test split)

#### SVM

In [147]:
# Support Vector Machine for the address target, using the hyperparameters
# reported by the randomized search above (kernel='linear', C=10)
svm_clf = SVC(kernel='linear', C=10, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(svm_clf, x_address_train, y_address_train, x_address_test, y_address_test)

# Result: 59% accuracy

Classification report: 
                                                                                                                                         precision    recall  f1-score   support

                                                                                                                                              1.00      1.00      1.00         1
                                                                                       10-18,JLN KEBUDAYAAN 16,TMN U 81300 JOHOR BAHRU.       1.00      0.00      0.00         1
                                                     106-107, LEVEL 1 THE MINES MALL SERDANG SRI KEMBANGAN, 43300 SELANGOR DARUL EHSAN.       1.00      0.00      0.00         1
                                                                                                   117, JALAN RADIN BAGUS, SRI PETALING       1.00      0.00      0.00         1
                                                              12 JALAN PENGACARA U1/48 TEM

#### Multilayer Perceptron

In [148]:
# Multilayer Perceptron for the address target, using the hyperparameters
# reported by the randomized search above (hidden_layer_sizes=(300,), alpha=0.001)
mlp_clf = MLPClassifier(hidden_layer_sizes=(300,), alpha=0.001, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(mlp_clf, x_address_train, y_address_train, x_address_test, y_address_test)

# Result: 60% accuracy

Classification report: 
                                                                                                                                         precision    recall  f1-score   support

                                                                                                                                              1.00      1.00      1.00         1
                                                                                       10-18,JLN KEBUDAYAAN 16,TMN U 81300 JOHOR BAHRU.       1.00      0.00      0.00         1
                                                     106-107, LEVEL 1 THE MINES MALL SERDANG SRI KEMBANGAN, 43300 SELANGOR DARUL EHSAN.       1.00      0.00      0.00         1
                                                                                                   117, JALAN RADIN BAGUS, SRI PETALING       1.00      0.00      0.00         1
                                                              12 JALAN PENGACARA U1/48 TEM



### Total
- SVM: 31% accuracy (test split)
- MLP: 33% accuracy (test split)

#### SVM

In [149]:
# Support Vector Machine for the total target, using the hyperparameters
# reported by the randomized search above (kernel='poly', C=1000)
svm_clf = SVC(kernel='poly', C=1000, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(svm_clf, x_total_train, y_total_train, x_total_test, y_total_test)

# Result: 31% accuracy

Classification report: 
              precision    recall  f1-score   support

                   1.00      0.00      0.00         1
      $10.30       1.00      0.00      0.00         1
      $11.40       1.00      0.00      0.00         1
       $6.60       1.00      0.00      0.00         1
       $7.10       1.00      0.00      0.00         2
       $7.60       1.00      0.00      0.00         2
       $8.20       0.38      1.00      0.55         3
       $8.70       0.40      1.00      0.57         2
       -1.73       1.00      0.00      0.00         2
    1,007.50       1.00      1.00      1.00         1
        1.38       1.00      1.00      1.00         1
        1.75       1.00      0.00      0.00         1
       10.00       1.00      0.00      0.00         1
       10.20       1.00      0.00      0.00         1
       10.30       1.00      0.00      0.00         1
       10.40       1.00      0.00      0.00         1
       10.45       1.00      0.00      0.00         1
   

#### Multilayer Perceptron

In [150]:
# Multilayer Perceptron for the total target, using the hyperparameters
# reported by the randomized search above (hidden_layer_sizes=(500,), alpha=0.0001)
mlp_clf = MLPClassifier(hidden_layer_sizes=(500,), alpha=0.0001, random_state=0)

# Fit on the train split, print the report and confusion matrix, keep the test predictions
y_pred = createClassifier(mlp_clf, x_total_train, y_total_train, x_total_test, y_total_test)

# Result: 33% accuracy

Classification report: 
              precision    recall  f1-score   support

                   1.00      0.00      0.00         1
      $10.30       1.00      0.00      0.00         1
      $11.40       1.00      0.00      0.00         1
       $6.60       1.00      0.00      0.00         1
       $7.00       0.00      1.00      0.00         0
       $7.10       1.00      0.00      0.00         2
       $7.60       1.00      0.00      0.00         2
       $8.20       0.38      1.00      0.55         3
       $8.70       0.20      1.00      0.33         2
       $8.90       0.00      1.00      0.00         0
       -1.73       1.00      0.00      0.00         2
    1,007.50       0.33      1.00      0.50         1
        1.38       1.00      1.00      1.00         1
        1.75       1.00      0.00      0.00         1
       10.00       1.00      0.00      0.00         1
       10.20       1.00      0.00      0.00         1
       10.30       1.00      0.00      0.00         1
   

