# Extract .zip files

## Import

In [10]:
import pytesseract
import cv2
import os
import pandas as pd
import re
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

from functions.image_preprocessing import image_preprocessing 

## Read data and preprocess image

In [3]:
# Unzip train dataset
!unzip '../data/input/0325updated.task2train(626p)-20220330T115700Z-001' -d '../data/input/'

Archive:  ../data/input/0325updated.task2train(626p)-20220330T115700Z-001.zip
  inflating: ../data/input/0325updated.task2train(626p)/X51008164998.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008114321.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008164996.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008099085.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51009453801.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008099083.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008123447.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008030565.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008042778.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008099087.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008128065.txt  
  inflating: ../data/input/0325updated.task2train(626p)/X51008128062.txt  
  inflating: ../data/i

In [4]:
train_dataset_path = '../data/input/0325updated.task2train(626p)'

data_dict = {'id':[], 'company':[], 'date':[], 'address':[], 'total':[]}

# Read correct entities and store it in the dataframe
for filename in os.listdir(train_dataset_path):
    # Only text files (.txt)
    if '.txt' in filename:
        id = re.search("X\d*", filename).group()
        file = open(train_dataset_path+'/'+filename)
        data = json.load(file)
        # Add infos to data_dict
        data_dict['id'].append(id)
        data_dict['company'].append(data.get('company'))
        data_dict['date'].append(data.get('date'))
        data_dict['address'].append(data.get('address'))
        data_dict['total'].append(data.get('total'))

df = pd.DataFrame(data_dict)

# Read image text and store it the dataframe
for filename in os.listdir(train_dataset_path):
    # Only image files (.jpg)
    if '.jpg' in filename:
        id = re.search("X\d*", filename).group()
        file_path = os.path.join(train_dataset_path, filename)
        if os.path.isfile(file_path):
            # Read image
            image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            pp_image = image_preprocessing(image)
            # Get text from image
            text = pytesseract.image_to_string(pp_image, lang='eng').replace('\n', ' ').replace('  ',' ')
            df.loc[df['id'] == id, "text"] = text

Corrupt JPEG data: bad Huffman code


In [5]:
df

Unnamed: 0,id,company,date,address,total,text
0,X51006913007,AEON CO. (M) BHD,27/05/2018,"3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...",99.90,"AST. 0 AEON CO. (M) BHD (126926-H) SRD FLR, AE..."
1,X51005705722,ASO ELECTRICAL TRADING SDN BHD,27/09/2017,"NO 31G, JALAN SEPADU C 25/C, SECTION 25, TAMAN...",248.05,ASO ELECTRICAL TRADING SDN BHD 1000131-K NO 31...
2,X51005763940,ELITETRAX MARKETING SDN BHD,11.02.18,"LOT 1F-01&02,1ST FLR,PARADIGM MALL, NO. 1 JALA...",60.00,ee f ¢ Harvey Norman Harvey Norman M'sia P...
3,X51008099071,LEONG HENG SHELL SERVICE STATION,20/06/18,"LOT 26151 BANDAR SG LONG, 11 1/4 MILES CHERAS,...",50.00,|.EONG HENG SHELL SERVICE STATION Company N...
4,X51005757349,GOLDEN KEY MAKER,24-MAR-2018,"NO 5, JALAN KENARI 2, BANDAR PUCHONG JAYA, 471...",21.00,31802053 = 4 . GOLDEN KEY MAKER (000760274-K) ...
...,...,...,...,...,...,...
871,X51006557179,GARDENIA BAKERIES (KL) SDN BHD,22/10/2017,"LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...",34.21,GARDENIA BAKERIES (KE) SDN BILD (139386 X) Lot...
872,X51005442361,TIMELESS KITCHENETTE SDN BHD,2018-03-23,"LOT 50, FLOOR T2,SKY AVENUE GENTING HIGHLANDS ...",593.10,"Morganflelds* % Home, of Sucky Boaea Timeless ..."
873,X51006329399,OLIVE9 PHARMACY SDN BHD,31/03/2017,"69 JALAN NOVA U5/N, SEKSYEN U5 SUBANG BE",12.15,"a, . OLIVES PHARMACY SDN BHD 1080Z10F SST/TAR ..."
874,X51006557213,GARDENIA BAKERIES (KL) SDN BHD,01/10/2017,"LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...",31.32,GARDENIA BAKERIES (KI) SDN BED (139386 X) Lo...


## Split data into train-val-test

In [15]:
ds = Dataset.from_pandas(df)

ds_train_test = ds.train_test_split(test_size=0.2, seed=42)  # train and test
ds_train_val = ds_train_test['train'].train_test_split(test_size=0.125, seed=42)  # train and validation

dataset = DatasetDict({'train': ds_train_val['train'], 
                       'val': ds_train_val['test'], 
                       'test': ds_train_test['test']})

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'company', 'date', 'address', 'total', 'text'],
        num_rows: 612
    })
    val: Dataset({
        features: ['id', 'company', 'date', 'address', 'total', 'text'],
        num_rows: 88
    })
    test: Dataset({
        features: ['id', 'company', 'date', 'address', 'total', 'text'],
        num_rows: 176
    })
})

## Save in different files

In [17]:
dataset['train'].to_csv('../data/input/train_dataset.csv', index=False)
dataset['val'].to_csv('../data/input/val_dataset.csv', index=False)
dataset['test'].to_csv('../data/input/test_dataset.csv', index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.32ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 141.61ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 106.85ba/s]


132870