# Experimento com uma nota fiscal

Este arquivo tem o objetivo de testar um modelo para extrair informações de uma Nota Fiscal e armazenar em um arquivo '.txt'.

## Importação

In [14]:
import pytesseract
import cv2
import re
import json

from functions.image_preprocessing import image_preprocessing

## Leitura de imagens

In [2]:
image_original = cv2.imread('../../data/input/receipt.jpg', cv2.IMREAD_GRAYSCALE)

## Pré-processamento

In [3]:
image_preprocessed = image_preprocessing(image_original)

## Extração de dados

In [4]:
text = pytesseract.image_to_string(image_preprocessed, lang='eng').upper()
print(text)

 

HO 6088

OLD TOWN WHITE COFFE
EXQUISITE GARDEN SBN BHD
(JM 809642-A)
LOT NO R134,GIANT HYPERMARKET PLENTONG
NOS, JALAN MASAI LAMA )
81750 MASAI. JOHOR BAHRU
TEL : +607-3536960
OUTLET: JT] GIANT PLENTONG

    

5
ANSE FCR —— BE IGS 9 WT > 7 — 24G
: DRE OE YA GE 4 PLA X AES |
PEAS % AB I JAYE E YESPAE

SGN

  
    

  
 

A

SST 1D:000750002176

 

2 COUNTERI 4004 ANICA

FIN EMT NE AH SENI. PIAN FETE SITE A OAHE BY MD MEM MH EINE MTS HEY TE TN DH DAE ERENT TINE MEH EMT ATA LETPET + TOT NN WEND MME OE WIRE EMT INKS TWEET EW REM DEME, ETN

CHECK :21063 COVER: |
| 03 JUN 18 03:07:32 PH

TABLE:63 / 1

| --O7-. DINE IN -----
1  SN2 SOUP PAN MEE 10.90 $
1 STEG | 12.97 §
I SNB SP NOD SPECIAL S
| WOT (C) WO S
} VAS FRENCH FRIES BKT 4.15 §
TOTAL: £8.02
ROUNDING: 0.02-
TOTAL: 28.00
SUBTOTAL : 25.47
10% SRV CHG: 2.55
GST INCLUSIVE 0.00 *
TOTAL: 28.02
ROUNDING: 0.02-
JOTAL: 28.00
PAID: 100.00
CASH 100.00
CHANGE DUE: 72.00

~---A003 CLOSED 03 JUN 18 03:34:54 PK--—

THANK YQU
PLEASE COME AGAIN
GST 

In [5]:
company = ""
address = ""
total = ""
date = ""

for row in text.split('\n'):
  if ('BHD' in row or 'RESTAURANT' in row or 'HARDWARE' in row or 'ENTERPRISE' in row or 'RESTORAN' in row or 'S/B' in row or 'PHARMACY' in row) and not company:
    company = row

  numerical_row = re.search('[\d][,|.]', row)
  if ('TOTAL' in row or 'RM' in row or 'DUE' in row or 'BUE' in row) and numerical_row and not total:
    total = row

  time_row = re.search('[\d][\d][\/|-][\w][\w][\w]*[\/|-][\d]*', row)
  if time_row:
      date = row

  time_row = re.search('[D][A|U][T][E]', row)
  if time_row and not date:
      date = row

  time_row = re.search('[\d][\d]\:[\d][\d]', row)
  if time_row and not date:
      date = row

if not company:
  company = text.strip().split('\n')[0]

end_address_string = 'TEL' if 'TEL' in text else 'TAX'
if end_address_string in text:
  if ')' in text:
    address = text[text.index(')')+len(')'):text.index(end_address_string)]
  if address == '' and company:
    address = text[text.index(company)+len(company):text.index(end_address_string)]
else:
  address = ''

if total:
  regex = re.search('[RM]*[ ]*[\\w]*([\\.|\\,]\\d\\d)', total)
  total = "" if regex is None else regex[0]
else:
  total = ""

print('company:', company)
print('address:', address)
print("total:", total)
print("date:", date)

company: EXQUISITE GARDEN SBN BHD
address: 
LOT NO R134,GIANT HYPERMARKET PLENTONG
NOS, JALAN MASAI LAMA )
81750 MASAI. JOHOR BAHRU

total: 
date: | 03 JUN 18 03:07:32 PH


## Transformação de dados

In [6]:
def define_data_values(key, value):
  data_value = value.strip()
    
  if value:      
    if key == 'date':
      data_value = re.sub('[\d.]*[:][\d]*', '', data_value)
      data_value = re.sub('[Dd][Aa][Tt][Ee][:]*', '', data_value)
      if '/' in data_value or '-' in data_value:
        data_value = re.findall("[\d]*[\/-][\d]*[\/-]*[\d]*", data_value)[0]
    data_value = data_value.replace('PM', '').replace('AM', '').replace('\n',' ').strip()
    if key == 'total':
      data_value = data_value.replace(',', '.').replace(' ', '')

  return data_value

In [7]:
data = {}

data['company'] = define_data_values('company', company)
data['date'] = define_data_values('date', date)
data['address'] = define_data_values('address', address)
data['total'] = define_data_values('total', total)

output = json.dumps(data)
print(output)

{"company": "EXQUISITE GARDEN SBN BHD", "date": "| 03 JUN 18  PH", "address": "LOT NO R134,GIANT HYPERMARKET PLENTONG NOS, JALAN MASAI LA ) 81750 MASAI. JOHOR BAHRU", "total": ""}


## Armazenamento de dados

In [12]:
output_file = open('../../data/output/'+'receipt_extracted_regex.txt',"w+")
output_file.write(output)
output_file.close()