In [None]:
import os
import subprocess
import cv2
import matplotlib.pyplot as plt
from http import client
from urllib import request, parse, error
import base64
import pandas as pd
import requests
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from PIL import Image
from io import BytesIO
import glob

#os.chdir()

## 1 / Parse PDF and convert to image

Most PDFs that you work with will have multiple pages. You will want to split these PDFs into their individual pages and convert each page to an image. Luckily, an image can't have multiple pages, so most tools will do both steps at the same time!
I am using XpdfReader's command-line utility `pdftopng`.

In [None]:
# Render every page of the PDF to a 300-dpi PNG named Demo_Acord-<page>.png.
# The command is passed as an argument list: without shell=True a single
# command string is treated as the program name on POSIX and cannot be found.
subprocess.call(["pdftopng", "-q", "-r", "300", "Demo_Acord.pdf", "Demo_Acord"])

# Reminder : Show the Image form that we are parsing!

## 2 / Image Pre-processing

##### This is super important but not relevant to our sample form
For image classification, you may want to explore 
* resizing
* normalizing values
* dimensionality reduction.
* etc.

For OCR, the goal of preprocessing is to make text more clear. You may want to:
* Scale to a larger size
* Increase contrast
* Deskew (remove rotation)
* etc.

## 3 / OCR Engine

I am using Microsoft Azure's OCR engine.

In [None]:
def Micro_Vision(key=None, image_path=None,
                 ocr_url="https://eastus.api.cognitive.microsoft.com/vision/v2.0/ocr",
                 timeout=30):
    """Upload an image file to the Azure Computer Vision OCR endpoint.

    Parameters
    ----------
    key : str, optional
        Subscription key sent in the ``Ocp-Apim-Subscription-Key`` header.
        When omitted, the ``AZURE_VISION_KEY`` environment variable is used
        so the key does not have to be written into the notebook.
    image_path : str
        Path of the image file whose bytes are uploaded.
    ocr_url : str, optional
        Full URL of the OCR endpoint; defaults to the eastus v2.0 endpoint.
    timeout : float, optional
        Seconds to wait for the service before giving up.

    Returns
    -------
    The parsed JSON body of the response (``response.json()``).

    Raises
    ------
    ValueError
        If no subscription key or no image path is available.
    requests.HTTPError
        If the service answers with an error status.
    """
    subscription_key = key or os.environ.get("AZURE_VISION_KEY")
    if not subscription_key:
        raise ValueError(
            "No subscription key: pass key=... or set the AZURE_VISION_KEY "
            "environment variable."
        )
    if not image_path:
        raise ValueError("image_path is required.")

    with open(image_path, 'rb') as f:
        image = f.read()

    headers = {'Ocp-Apim-Subscription-Key': subscription_key,
               'Content-Type': 'application/octet-stream'}

    params = {'language': 'en',
              'detectOrientation': 'true'}

    response = requests.post(ocr_url, headers=headers, params=params,
                             data=image, timeout=timeout)
    # Fail here on an error status (bad key, quota, ...) rather than handing
    # an error payload to code that expects OCR results.
    response.raise_for_status()

    return response.json()

In [None]:
# Run OCR on one page image (presumably the first page written by pdftopng).
# NOTE(review): no `key` argument is passed here; Micro_Vision needs a
# subscription key to succeed.
image_path = "Demo_Acord-000001.png"
response = Micro_Vision(image_path = image_path)

In [None]:
# Display the raw OCR response returned by Micro_Vision.
response

## 4 / Identify Bounding Boxes

In [None]:
def get_words(ocr_results):
    """Flatten an OCR response into one list of word entries.

    Walks ``ocr_results["regions"]``, then each region's ``"lines"``, then each
    line's ``"words"``, and returns the word entries in that order.
    """
    return [
        word
        for region in ocr_results["regions"]
        for line in region["lines"]
        for word in line["words"]
    ]

In [None]:
# Flatten the page-level OCR response into a single list of word entries
# and display it.
word_infos = get_words(ocr_results = response)
word_infos

In [None]:
# Overlay every recognised word and its bounding box on the faded page image.
fig, ax = plt.subplots(figsize=(30, 30))
image = Image.open(image_path)
ax.imshow(image, alpha=0.5)
for word in word_infos:
    # "boundingBox" is a comma-separated string; the first two numbers are
    # used as the box origin and the next two as its width and height.
    box = list(map(int, word["boundingBox"].split(",")))
    corner = (box[0], box[1])
    ax.add_patch(Rectangle(corner, box[2], box[3], fill=False, linewidth=2, color='y'))
    ax.text(corner[0], corner[1], word["text"], fontsize=20, weight="bold", va="top")
ax.axis("off")

### Looks pretty good for the most part, but how do we ensure that we're going to get all of the values we are looking for?

## Zonal (or Template) OCR

First, we need to build a template. To do this we need to understand that an image is like a map in the sense that the location (or index) of pixels is meaningful. So we need to find the x and y as well as width and height of text that we'd like to extract.

In [None]:
# Load the template of field bounding boxes and display it.
# NOTE(review): later cells look rows up by a 'Field_Name' column and unpack
# the remaining columns as x, y, w, h — confirm the CSV has that layout.
bounds = pd.read_csv("bounding_boxes.csv")
bounds

#### View cropped image

In [None]:
# Pick one field from the template and unpack its box.
# Takes the first row whose Field_Name matches; every column after the first
# is unpacked, so the CSV must have exactly four of them (assumed to be
# x, y, width, height in that order — TODO confirm).
field_name = "Lim_Med"
x, y, w, h = bounds[bounds['Field_Name'] == field_name].iloc[0,1:]

In [None]:
# Preview the selected field by slicing its box out of the page image.
plt.figure(figsize=(10, 10))
img = cv2.imread(image_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read image: " + str(image_path))
crop_image = img[y:y+h, x:x+w]
# OpenCV loads pixels in BGR order while matplotlib expects RGB; convert for
# display so colours are not swapped.
plt.imshow(cv2.cvtColor(crop_image, cv2.COLOR_BGR2RGB))

#### Parse each box and write to disk.

In [None]:
def parse_form(image_path, bounds, out_dir="form_parts"):
    """Crop every templated field out of an image and save each crop as a PNG.

    Parameters
    ----------
    image_path : str
        Path of the page image to cut up.
    bounds : str
        Path of a CSV file whose first column is the field name (used as the
        output file name) and whose next four columns are x, y, w, h of the
        field's box in pixels.
    out_dir : str, optional
        Directory that receives ``<field name>.png`` files; created if missing.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read.

    Prints 'Successfully parsed image!' when every crop was written; otherwise
    prints the names of the fields that could not be written.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError("Could not read image: " + str(image_path))

    boxes = pd.read_csv(bounds)

    # cv2.imwrite does not create missing directories; it just reports failure.
    os.makedirs(out_dir, exist_ok=True)

    failed = []
    for row in boxes.itertuples(index=False):
        name = str(row[0])
        # int() so float-typed CSV columns can still be used as slice bounds.
        x, y, w, h = (int(value) for value in row[1:5])
        crop_image = img[y:y+h, x:x+w]
        target = os.path.join(out_dir, name + ".png")
        # An empty crop (box outside the image) cannot be encoded; record it
        # as a failure rather than letting imwrite raise.
        if crop_image.size == 0 or not cv2.imwrite(target, crop_image):
            failed.append(name)

    # Judge success by the writes made in this call, not by counting files in
    # the folder, which may contain leftovers from earlier runs.
    if failed:
        print('Could not write crops for: ' + ', '.join(failed))
    else:
        print('Successfully parsed image!')

In [None]:
# Crop every field listed in bounding_boxes.csv out of the page image and
# write the crops as PNG files under form_parts/.
parse_form(image_path = image_path, bounds = "bounding_boxes.csv")

In [None]:
# Collect the cropped field images. glob returns matches in arbitrary order,
# so sort them to keep `forms` (and anything built from it) reproducible.
forms = sorted(glob.glob('form_parts/*.png'))
forms

#### Run OCR on each bounding box.

In [None]:
def part_ocr(field_name = None):
    """Run OCR on the cropped image that belongs to one form field.

    Looks through the module-level ``forms`` list for the file whose name
    (without directory and extension) equals ``field_name``; if there is no
    exact match, the first file whose name contains ``field_name`` is used.

    Parameters
    ----------
    field_name : str
        Name of the field to OCR, e.g. ``'Agency'``.

    Returns
    -------
    The OCR response returned by ``Micro_Vision`` for the matching image.

    Raises
    ------
    ValueError
        If ``field_name`` is empty or no file in ``forms`` matches it.
    """
    if not field_name:
        raise ValueError("field_name is required, e.g. part_ocr(field_name='Agency')")

    stems = [os.path.splitext(os.path.basename(path))[0] for path in forms]

    # Prefer an exact file-name match; fall back to a substring match.
    matches = [path for path, stem in zip(forms, stems) if stem == field_name]
    if not matches:
        matches = [path for path, stem in zip(forms, stems) if field_name in stem]
    if not matches:
        # Fail here instead of silently running OCR on an unrelated crop.
        raise ValueError("No cropped image found for field: " + str(field_name))

    return Micro_Vision(image_path = matches[0])

In [None]:
# OCR the cropped 'Agency' field and display the raw response.
part_json = part_ocr(field_name = 'Agency')
part_json

In [None]:
# Word entries recognised in that single field crop.
get_words(part_json)

In [None]:
# OCR each cropped field image in turn; responses are kept in the same order
# as `forms`.
results = []
for part_path in forms:
    results.append(Micro_Vision(image_path=part_path))

In [None]:
# Raw OCR responses, one per cropped field image, in the same order as `forms`.
results

In [None]:
# One flattened word list per OCR response, in the same order as `results`.
result_words = list(map(get_words, results))

In [None]:
# Word lists per field crop, aligned with `results`.
result_words