## Import necessary libraries (json, pickle, requests).

In [1]:
import json
import pickle
import requests

## Define the API URL for the OCR service.

In [2]:
url = "https://ocr.asprise.com/api/v1/receipt"

## Set the image file path

In [3]:
image = "/kaggle/input/okay-tools-pvt-ltd/Okay Tools Pvt. Ltd-1.jpg"

## Make an API request to the OCR service and save the response in a JSON file.

In [4]:
res = requests.post(url,
                   data = {
                       'api_key': 'TEST',
                       'recognizer': 'auto',
                       'ref_no': 'oct_python_123'
                   },
                   files = {
                       'file': open(image,'rb')
                   })


with open("response22.json","w") as f:
    json.dump(json.loads(res.text), f)

## Load the JSON data from the response file and print the keys of the top-level dictionary.

In [5]:
with open("response22.json","r") as f:
    data = json.load(f)

print(data.keys())

dict_keys(['ocr_type', 'request_id', 'ref_no', 'file_name', 'request_received_on', 'success', 'image_width', 'image_height', 'image_rotation', 'recognition_completed_on', 'receipts'])


## Load the JSON data again and print the keys of the dictionary inside the 'receipts' list.

In [6]:
with open("response22.json","r") as f:
    data = json.load(f)

print(data['receipts'][0].keys())

dict_keys(['merchant_name', 'merchant_address', 'merchant_phone', 'merchant_website', 'merchant_tax_reg_no', 'merchant_company_reg_no', 'region', 'mall', 'country', 'receipt_no', 'date', 'time', 'items', 'currency', 'total', 'subtotal', 'tax', 'service_charge', 'tip', 'payment_method', 'payment_details', 'credit_card_type', 'credit_card_number', 'ocr_text', 'ocr_confidence', 'width', 'height', 'avg_char_width', 'avg_line_height', 'conf_amount', 'source_locations'])


## Begin processing the invoice data by iterating over each invoice.
## Extract and print basic invoice information like invoice number, invoice date, GST number, vendor name, etc.
## Continue processing the invoice data by extracting vendor address, delivery address, buyer name, buyer address, item details, total invoice amount, and total tax amount.

In [7]:
# Assuming data contains the loaded JSON data
invoices = data['receipts']  # Assuming that 'receipts' contains a list of invoices

for invoice in invoices:
    invoice_number = invoice['receipt_no']
    invoice_date = invoice['date']
    gst_number = invoice['merchant_tax_reg_no']
    vendor_name = invoice['merchant_name']

    # Extracting the merchant address assuming it contains vendor's address
    vendor_address = invoice['merchant_address']

    # Extracting the delivery address (Assumption: present under 'merchant_address')
    delivery_address = invoice['merchant_address']  # Update this based on actual data

    # Extracting the buyer name (Assumption: present under 'receipts' dictionary)
    buyer_name = invoice.get('buyer_name', '')  # Default to empty string if not present

    # Extracting the buyer address (Assumption: present under 'receipts' dictionary)
    buyer_address = invoice.get('buyer_address', '')  # Default to empty string if not present

    item_details = invoice['items']
    total_invoice_amount = invoice['total']
    total_tax_amount = invoice['tax']

    # Assuming PO number is not directly present in the JSON
    # You need to implement a method to extract it based on your data structure

    # Print or process the extracted information
    print("Invoice Number:", invoice_number)
    print("Invoice Date:", invoice_date)
    print("GST Number:", gst_number)
    print("Vendor Name:", vendor_name)
    print("Vendor Address:", vendor_address)
    print("Delivery Address:", delivery_address)
    print("Buyer Name:", buyer_name)
    print("Buyer Address:", buyer_address)
    print("Item Details:", item_details)
    print("Total Invoice Amount:", total_invoice_amount)
    print("Total Tax Amount:", total_tax_amount)
    # Print or process the extracted PO number
    # ...
    print("\n")

Invoice Number: OT/964
Invoice Date: 2019-08-19
GST Number: 27AAACO2635C1ZV
Vendor Name: toole pvt. ltd.
Vendor Address: REGD. OFFICE & WORKS: E-16 & E-28, M.I.D.C. Industrial Area, Ambad, Nashik-422 010. Telefax:(0253) 2384829,2307370. OKAY
Delivery Address: REGD. OFFICE & WORKS: E-16 & E-28, M.I.D.C. Industrial Area, Ambad, Nashik-422 010. Telefax:(0253) 2384829,2307370. OKAY
Buyer Name: 
Buyer Address: 
Item Details: [{'amount': 166428.0, 'category': None, 'description': '35316134-01SHAFT (ECS)8538207.00', 'flags': '', 'qty': 804.0, 'remarks': None, 'tags': None, 'unitPrice': None}, {'amount': 29200.0, 'category': None, 'description': '353-16137-01LEVER (ECS)8538200.00', 'flags': '', 'qty': 146.0, 'remarks': None, 'tags': None, 'unitPrice': None}, {'amount': 82800.0, 'category': None, 'description': '37304160-01CONNECTING ROD-373041608538300.00276:', 'flags': '', 'qty': 0, 'remarks': None, 'tags': None, 'unitPrice': None}, {'amount': 84000.0, 'category': None, 'description': '353160

## Extract the PO number from the OCR text using string manipulation based on the "Po No:" line.
Print the extracted PO number or "PO Number: Not Found" if no PO number is available

In [8]:
po_number = None
for line in invoice['ocr_text'].split('\n'):
    if "Po No:" in line:
        po_number = line.split("Po No:")[1].strip()
        break

# Print or process the extracted PO number
if po_number:
    print("PO Number:", po_number)
else:
    print("PO Number: Not Found")

print("\n")

PO Number: CN1809651              Date: 16/04/2019




## Summary of the above code
The provided code demonstrates the process of extracting structured information from unstructured text using Optical Character Recognition (OCR) on receipt images. The code interacts with an OCR API to analyze receipt images, extracting relevant details such as invoice numbers, dates, GST numbers, vendor information, item details, and total amounts. It processes the extracted data and prints it in a formatted manner.

The code begins by importing required libraries and sending an image to the OCR API for analysis. It loads the resulting JSON response containing OCR results, including merchant and receipt information. The extracted data includes merchant details, receipt numbers, dates, item descriptions, amounts, and tax details.

The code iterates through the extracted receipts, extracting and printing relevant information like invoice number, date, GST number, vendor details, delivery and buyer addresses, item details, total invoice amount, and total tax amount. Additionally, the code extracts the Purchase Order (PO) number if present in the OCR text and prints it.

The code demonstrates the challenges of extracting structured data from unstructured text and highlights the importance of approaches beyond rule-based methods, like OCR and machine learning, to efficiently handle diverse document formats and complexities.
