# Azure Form Recognizer

Azure Form Recognizer is a cognitive service that uses machine learning technology to identify and extract key-value pairs and table data from form documents. It then outputs structured data that includes the relationships in the original file.

    

### Overview
*Safety Incident Reports Dataset*: Raw unstructured data is fed into the pipeline in the form of electronically generated PDFs. These reports describe injuries that occurred at several locations belonging to a company. Each report includes the nature, description, date, and source of the injury, as well as the name of the establishment where it happened.


### Notebook Organization 
+ Fetch the injury report PDF files from a container under an Azure storage account.

+ Convert the PDF files to JSON by querying the custom-trained Azure Form Recognizer model through the REST API.

+ Preprocess the JSON files to extract only relevant information.

+ Push the JSON files to a container under an Azure storage account.

## Importing Relevant Libraries

In [1]:
# Please install this specific version of azure storage blob compatible with this notebook.
!pip install azure-storage-blob==2.1.0

# Import the required libraries
import json
import time
import requests
import os
from azure.storage.blob import BlockBlobService
import pprint
from os import listdir
from os.path import isfile, join
import shutil
import pickle
import GlobalVariables as gv



## Create Local Folders

In [2]:
# Create the local working directories used by the rest of the notebook.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.isdir() test.
# *input_forms* contains all the pdf files to be converted to json
os.makedirs(os.path.join(os.getcwd(), "input_forms"), exist_ok=True)
# *output_json* will contain all the converted json files
os.makedirs(os.path.join(os.getcwd(), "output_json"), exist_ok=True)

## Downloading the PDF forms from a container in Azure storage

- Downloads all PDF forms from a container named *incidentreport* to a local folder *input_forms*

In [3]:
%%time
# Download every PDF form from the container *incidentreport* into the local
# folder *input_forms*, skipping blobs already recorded in *merged_log*.
# Set up configs for blob storage
STORAGE_ACCOUNT_NAME = gv.STORAGE_ACCOUNT_NAME
STORAGE_ACCOUNT_ACCESS_KEY = gv.ACCOUNT_KEY
STORAGE_CONTAINER_NAME = "incidentreport"

# Instantiating a blob service object
blob_service = BlockBlobService(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_ACCESS_KEY)

# Load the log of already downloaded blobs once, before the loop: the set
# kept in memory always matches what is written back below, so re-reading
# the file for every blob is redundant.
# NOTE: pickle is only safe here because *merged_log* is written by this
# notebook itself; never point it at a file from an untrusted source.
try:
    with open('merged_log', 'rb') as f:
        merged_files = pickle.load(f)
except FileNotFoundError:
    # First run: nothing has been downloaded yet.
    merged_files = set()

blobs = blob_service.list_blobs(STORAGE_CONTAINER_NAME)
# Downloading pdf files from the container *incidentreport* and storing them locally to *input_forms* folder
for blob in blobs:
    # If the blob is already recorded in the log then continue to the next one
    if blob.name in merged_files:
        continue
    download_file_path = os.path.join(os.getcwd(), "input_forms", blob.name)
    blob_service.get_blob_to_path(STORAGE_CONTAINER_NAME, blob.name, download_file_path)
    merged_files.add(blob.name)
    # Persist the log after every download so that an interrupted run can
    # resume without fetching the same blob again.
    with open('merged_log', 'wb') as f:
        pickle.dump(merged_files, f)

In [4]:
# Names of the forms in *input_forms* that are to be converted to JSON
# (regular files only; sub-directories are ignored)
input_forms_dir = os.getcwd()+"/input_forms"
files = []
for form_name in listdir(input_forms_dir):
    if isfile(join(input_forms_dir, form_name)):
        files.append(form_name)

## Querying the custom trained form recognizer model (PDF -> JSON)

- Converts PDF -> JSON by querying the trained custom model.
- Preprocesses the JSON file and extracts only the relevant information.

In [5]:
%%time
# Query the custom trained form-recognizer model for every PDF in
# *input_forms* and store one CLEAN JSON file per form in *output_json*.
#
# For each form that is not yet listed in *json_log*:
#   1. POST the PDF bytes to the model's analyze endpoint (expects HTTP 202).
#   2. Poll the URL from the "operation-location" response header until the
#      reported status is "succeeded" or "failed", or the tries run out.
#   3. Flatten the recognized fields, append the form url, write the JSON
#      file and record the form in *json_log*.
#
# Failures raise RuntimeError so the cell stops at the failing form with a
# clear message; `quit()` is not used because what it does depends on the
# environment the code runs in.


def _flatten_fields(fields):
    """Return a flat ``{label: value}`` dict for the recognized fields.

    Spaces in the field labels are replaced with underscores. A field that
    the service returned as null is kept with the value ``None``; every
    other field contributes its ``valueString`` entry.
    """
    flat = {}
    for label, field in fields.items():
        flat[label.replace(" ", "_")] = None if field is None else field['valueString']
    return flat


# Endpoint parameters for querying the custom trained form-recognizer model
endpoint = gv.endpoint
# Change if api key is expired
apim_key = gv.credential
# This model is the one trained on 5 forms
model_id = gv.model_id
post_url = endpoint + "/formrecognizer/v2.0-preview/custom/models/%s/analyze" % model_id
files = [f for f in listdir(os.getcwd()+"/input_forms") if isfile(join(os.getcwd()+"/input_forms", f))]
params = {"includeTextDetails": True}
headers = {'Content-Type': 'application/pdf', 'Ocp-Apim-Subscription-Key': apim_key}

local_path = os.path.join(os.getcwd(), "input_forms//")
output_path = os.path.join(os.getcwd(), "output_json//")

# Load the log of already converted forms once; the in-memory set is written
# back after every successful conversion, so it never goes stale.
# NOTE: pickle is only safe here because *json_log* is written by this
# notebook itself; never point it at a file from an untrusted source.
try:
    with open('json_log', 'rb') as log_file:
        json_files = pickle.load(log_file)
except FileNotFoundError:
    json_files = set()

# Polling limits: at most n_tries status requests per form, with the pause
# between them doubling from 5 seconds up to max_wait_sec.
n_tries = 15
max_wait_sec = 60

for file in files:
    # Skip forms that were converted by an earlier run
    if file in json_files:
        continue

    with open(local_path+file, "rb") as f:
        data_bytes = f.read()

    # Submit the form for analysis. Only the network call is guarded, so
    # the errors raised below are not swallowed by a broad handler.
    try:
        resp = requests.post(url=post_url, data=data_bytes, headers=headers, params=params)
    except requests.RequestException as e:
        raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e
    print('resp', resp)
    if resp.status_code != 202:
        # resp.text rather than resp.json(): an error body need not be JSON
        raise RuntimeError("POST analyze failed:\n%s" % resp.text)
    print("POST analyze succeeded:\n%s" % resp.headers)
    get_url = resp.headers["operation-location"]

    # Output files are named after the form without its extension
    stem = os.path.splitext(file)[0]

    wait_sec = 5
    for n_try in range(n_tries):
        try:
            resp = requests.get(url=get_url, headers={"Ocp-Apim-Subscription-Key": apim_key})
            resp_json = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON
            raise RuntimeError("GET analyze results failed:\n%s" % str(e)) from e
        if resp.status_code != 200:
            raise RuntimeError("GET analyze results failed:\n%s" % json.dumps(resp_json))

        status = resp_json["status"]
        if status == "succeeded":
            print("Analysis succeeded:\n%s" % stem)
            new_dict = _flatten_fields(resp_json['analyzeResult']['documentResults'][0]['fields'])
            # Appending form url to json
            new_dict['form_url'] = 'https://stcognitivesearch0001.blob.core.windows.net/formupload/' + file
            # Write once, as UTF-8 and without ASCII escaping, so that text
            # from Spanish forms is stored as readable characters.
            with open(output_path+stem+".json", 'w', encoding='utf-8') as outfile:
                json.dump(new_dict, outfile, ensure_ascii=False)
            # Once JSON is saved log it otherwise don't log it.
            json_files.add(file)
            with open('json_log', 'wb') as f:
                pickle.dump(json_files, f)
            break
        if status == "failed":
            raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))

        # Analysis still running. Wait and retry.
        time.sleep(wait_sec)
        wait_sec = min(2*wait_sec, max_wait_sec)
    else:
        # Every try was used up without a final status. The form is not
        # logged, so the next run of this cell submits it again.
        print("Analysis did not finish after %d tries:\n%s" % (n_tries, file))

## Upload the JSON files to a container

- Upload JSON files from local folder *output_json* to the container *formrecogoutput*

In [6]:
# Names of the converted JSON files in *output_json*
# (regular files only; sub-directories are ignored)
output_json_dir = os.getcwd()+"/output_json"
files = []
for json_name in listdir(output_json_dir):
    if isfile(join(output_json_dir, json_name)):
        files.append(json_name)

In [7]:
%%time
# Connect to the container for uploading the JSON files
# Set up configs for blob storage. The credentials are read from
# GlobalVariables, as in the download cell, instead of blank placeholders
# that would have to be filled in by hand before every run.
# NOTE(review): this assumes *formrecogoutput* lives in the same storage
# account as *incidentreport* -- confirm, and point these two values at the
# other account if it does not.
STORAGE_ACCOUNT_NAME = gv.STORAGE_ACCOUNT_NAME
STORAGE_ACCOUNT_ACCESS_KEY = gv.ACCOUNT_KEY
# Upload the JSON files in this container
STORAGE_CONTAINER_NAME = "formrecogoutput"
# Instantiating a blob service object
blob_service = BlockBlobService(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_ACCESS_KEY)

CPU times: user 384 µs, sys: 0 ns, total: 384 µs
Wall time: 390 µs


In [8]:
%%time
# Upload JSON files from local folder *output_json* to the container *formrecogoutput*
# Each file is stored as a blob named after the file itself.
local_path = os.path.join(os.getcwd(), "output_json")
for json_name in os.listdir(local_path):
    json_path = os.path.join(local_path, json_name)
    # Only regular files can be uploaded; skip any sub-directory, in line
    # with the other listings in this notebook.
    if not os.path.isfile(json_path):
        continue
    # The loop variable is deliberately not called `files`, so the list of
    # converted JSON files built in the previous cell is not overwritten.
    blob_service.create_blob_from_path(STORAGE_CONTAINER_NAME, json_name, json_path)