## This notebook covers how you can leverage Form Parser to generate annotations for CDW.
The code takes the Form Parser `form_fields` and converts them to entities to generate annotations for CDW.

In [1]:
# Install necessary Python libraries and restart your kernel after.
# !python -m pip install -r ../requirements.txt

In [33]:
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

import time
import re
import os
import pandas as pd
import simplejson as json
# import Levenshtein
import datetime

## Set your processor variables 

In [25]:
# Project and location are combined with PROCESSOR_ID to build the processor
# resource name; LOCATION also selects the regional API endpoint.
PROJECT_ID = "610517933627"
LOCATION = "us"  # Format is 'us' or 'eu'

PROCESSOR_ID = "51d19a915ff7b336"  # Create processor in Cloud Console
# Bucket and prefix that are listed to find the input PDF files.
GCS_INPUT_BUCKET = 'hsbedi-docai-bucket'
GCS_INPUT_PREFIX = 'w2/input'
# The batch output is written to (and later read back from)
# "<GCS_OUTPUT_URI>/<GCS_OUTPUT_URI_PREFIX>/".
GCS_OUTPUT_URI = 'gs://hsbedi-docai-bucket'
GCS_OUTPUT_URI_PREFIX = 'w2/output'
# Annotated JSON files are uploaded to this bucket; the prefix is concatenated
# directly with the file name, so it must keep its trailing slash.
GCS_OUTPUT_ANNOTATION_BUCKET = 'hsbedi-docai-bucket'
GCS_OUTPUT_ANNOTATION_URI_PREFIX = 'w2/annotated_samples/'

# Timeout for waiting on a batch operation (presumably seconds — TODO confirm).
# NOTE(review): no active code in this notebook reads this value.
TIMEOUT = 300

The following code calls the batch processing API and stores the responses in the output GCS location.

In [31]:
def process_document_from_input_file(batch_size=50):
    """Submit every PDF under the input prefix to the processor in batches.

    Lists the blobs of ``GCS_INPUT_BUCKET`` under ``GCS_INPUT_PREFIX``, groups
    the PDF files into batches of at most ``batch_size`` documents and sends
    one ``batch_process_documents`` request per batch. The service writes the
    results under ``GCS_OUTPUT_URI/GCS_OUTPUT_URI_PREFIX/``.

    The function only submits the requests; it does not wait for the returned
    long-running operations to complete.

    Args:
        batch_size: Maximum number of documents sent in one batch request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    destination_uri = f"{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/"
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"

    # Instantiates the clients
    client_options = {"api_endpoint": "{}-documentai.googleapis.com".format(LOCATION)}
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)
    storage_client = storage.Client()
    bucket = storage_client.bucket(GCS_INPUT_BUCKET)

    # The output location is the same for every request, so build it once.
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config={"gcs_uri": destination_uri}
    )

    def _submit_batch(batch_documents):
        """Send one batch request for the given list of GCS documents."""
        input_config = documentai.BatchDocumentsInputConfig(
            gcs_documents=documentai.GcsDocuments(documents=batch_documents)
        )
        request = documentai.types.document_processor_service.BatchProcessRequest(
            name=name,
            input_documents=input_config,
            document_output_config=output_config,
        )
        print(input_config)
        return client.batch_process_documents(request)

    print("Input Files:")
    documents = []
    submitted_documents = 0
    api_counter = 0

    for blob in bucket.list_blobs(prefix=GCS_INPUT_PREFIX):
        # Only real PDF objects are processed; folder placeholders and other
        # file types must not count towards the batch size.
        if not blob.name.lower().endswith(".pdf"):
            continue

        source = "gs://{bucket}/{name}".format(bucket=GCS_INPUT_BUCKET, name=blob.name)
        print(source)
        documents.append({"gcs_uri": source, "mime_type": "application/pdf"})

        if len(documents) >= batch_size:
            # After every four requests, pause before sending the next one
            # (presumably to stay within the batch request quota — TODO confirm).
            if api_counter >= 4:
                api_counter = 0
                time.sleep(360)

            _submit_batch(documents)
            print("process called")
            api_counter = api_counter + 1

            submitted_documents += len(documents)
            print(submitted_documents)
            documents = []

    # Send whatever is left over after the last full batch.
    if documents:
        _submit_batch(documents)
        print("process called out")

### Step 1: Call Form Parser to batch process the input documents

In [None]:
# Submits the batch requests; the function does not wait for the operations to complete.
process_document_from_input_file()

In [32]:
def format_field_name(name,demiliter='_'):
    """Map a Form Parser field label to its CDE entity type.

    The label is normalized by removing commas and apostrophes, upper-casing
    it and joining its whitespace-separated words with ``demiliter``. The
    normalized label is then looked up in the table of known W-2 fields.

    Args:
        name: Raw field label text; may contain line breaks and repeated
            whitespace. ``None`` or an empty string yields ``None``.
        demiliter: Separator placed between the words of the normalized label.
            The lookup keys use ``'_'``, so other values never match.

    Returns:
        The entity type name, or ``None`` when the label is not a known field.
    """
    CDE_field_dict = {'A_EMPLOYEES_SOCIAL_SECURITY_NUMBER': 'EMPL_SSN', 'B_EMPLOYER_IDENTIFICATION_NUMBER': 'EMPLR_ID_NUMBER', 'C_EMPLOYERS_NAME_ADDRESS_AND_ZIP_CODE': 'EMPLR_NAME_ADDRESS','D_CONTROL_NUMBER':'CONTROL_NUMBER',
                        '1_WAGES_TIPS_OTHER_COMPENSATION': 'WAGES_TIPS_OTHER_COMP', '2_FEDERAL_INCOME_TAX_WITHHELD':'FEDERAL_INCOME_TAX_WH', '3_SOCIAL_SECURITY_WAGES':'SS_WAGES', '4_SOCIAL_SECURITY_TAX_WITHHELD':'SS_TAX_WH' }
    if not name:
        return None

    # Drop punctuation first so that the gaps it leaves behind are collapsed too.
    cleaned = name.replace(',', '').replace("'", '')
    # split() without arguments trims the ends and treats any run of
    # whitespace (spaces, newlines, tabs) as a single separator.
    key = demiliter.join(cleaned.upper().split())

    return CDE_field_dict.get(key)
 
def create_entity(form_field_name,form_field_value,form_textSegments,form_boundingPoly):
    """Build an entity dictionary for one form field.

    Args:
        form_field_name: Raw label text, translated with ``format_field_name``.
        form_field_value: Text of the field value; stored as the mention text
            and as the text anchor content.
        form_textSegments: Iterable of objects exposing ``start_index`` and
            ``end_index``.
        form_boundingPoly: Object exposing ``normalized_vertices``, each with
            ``x`` and ``y``.

    Returns:
        A dict with ``mentionText``, ``type``, ``pageAnchor`` and
        ``textAnchor`` keys, or ``None`` when the label has no entity type.
    """
    entity_type = format_field_name(form_field_name)
    if not entity_type:
        return None

    vertices = [
        {"x": point.x, "y": point.y}
        for point in form_boundingPoly.normalized_vertices
    ]
    segments = [
        {"endIndex": part.end_index, "startIndex": part.start_index}
        for part in form_textSegments
    ]

    return {
        'mentionText': form_field_value,
        'type': entity_type,
        'pageAnchor': {
            "pageRefs": [{"boundingPoly": {"normalizedVertices": vertices}}]
        },
        'textAnchor': {"content": form_field_value, "textSegments": segments},
    }
   
def entity_from_formfield(form_field):
    """Convert a single form field into an entity dictionary.

    Reads the label text from ``field_name`` and the text, text segments and
    bounding polygon from ``field_value``, then delegates to ``create_entity``.

    Returns:
        Whatever ``create_entity`` returns: the entity dict, or ``None`` when
        the label is not mapped to an entity type.
    """
    value_part = form_field.field_value
    value_anchor = value_part.text_anchor
    return create_entity(
        form_field.field_name.text_anchor.content,
        value_anchor.content,
        value_anchor.text_segments,
        value_part.bounding_poly,
    )

def generate_entities_from_form_fields(document):
    """Collect entities for every mapped form field of a document.

    Walks all pages of ``document`` and converts each form field with
    ``entity_from_formfield``; fields without a mapped entity type are skipped.

    Args:
        document: Object exposing ``pages``, each with ``form_fields``.

    Returns:
        A list of entity dicts, or ``None`` when no field produced an entity.
    """
    entities = []
    for page_index, page in enumerate(document.pages):
        for form_field in page.form_fields:
            entity = entity_from_formfield(form_field)
            if not entity:
                continue
            # A page reference without a "page" value points at the first
            # page, so entities found on later pages need the index set.
            if page_index:
                for page_ref in entity['pageAnchor']['pageRefs']:
                    page_ref['page'] = page_index
            entities.append(entity)

    # Report only the count: the entity values are extracted document text
    # and should not be dumped into the notebook output.
    print(f"Generated {len(entities)} entities")
    return entities or None

def parse_sample_files_in_gcsbucket_mod():
    """Add entities to the batch output JSON files and upload the results.

    Lists the blobs under ``GCS_OUTPUT_URI/GCS_OUTPUT_URI_PREFIX/``. Every
    blob whose file name looks like ``<name>-<shard>.json`` is downloaded,
    its form fields are converted into entities (replacing the ``entities``
    key of the JSON) and the result is uploaded through ``create_json``.
    Blobs with any other name are skipped.

    Raises:
        ValueError: If the destination URI is not of the form
            ``gs://<bucket>/<prefix>``.
    """
    destination_uri = f"{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/"
    # Split the destination URI into the bucket name and the object prefix.
    uri_match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if uri_match is None:
        raise ValueError(f"Unexpected output location: {destination_uri}")
    output_bucket, prefix = uri_match.groups()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)

    # "<name>-<shard>.json", where the shard number may have several digits.
    shard_pattern = re.compile(r"(.+)-(\d+)\.json$")

    fetched = 0
    for blob in bucket.list_blobs(prefix=prefix):
        shard_match = shard_pattern.match(blob.name.split("/")[-1])
        if shard_match is None:
            # Not an output shard (folder placeholder, other file type, ...).
            continue

        output_file_name = shard_match.group(1)
        shard_index = int(shard_match.group(2))
        # The first shard keeps the plain name; later shards get a suffix so
        # that they do not overwrite each other in the annotation folder.
        if shard_index:
            output_file_name = f"{output_file_name}-{shard_index}"
        print(output_file_name)

        blob_as_bytes = blob.download_as_bytes()
        print("downloaded")

        document = documentai.types.Document.from_json(blob_as_bytes)
        document_json = json.loads(blob_as_bytes)
        fetched += 1
        print(f"Fetched file {fetched}")

        document_json["entities"] = generate_entities_from_form_fields(document)
        create_json(document_json, output_file_name)
            

def create_json(json_object, filename):
    """Upload a JSON-serializable object to the annotation bucket.

    The object is written to ``GCS_OUTPUT_ANNOTATION_BUCKET`` under the name
    ``GCS_OUTPUT_ANNOTATION_URI_PREFIX + filename + '.json'``.

    Args:
        json_object: Object that can be serialized with ``json.dumps``.
        filename: Target file name without the ``.json`` extension.

    Returns:
        A dict of the form ``{'response': '<filename> upload complete'}``.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(GCS_OUTPUT_ANNOTATION_BUCKET)
    # create a blob
    blob = bucket.blob(GCS_OUTPUT_ANNOTATION_URI_PREFIX+filename+'.json')
    # upload the blob
    blob.upload_from_string(
        data=json.dumps(json_object),
        content_type='application/json'
        )
    result = filename + ' upload complete'
    return {'response' : result}


### Step 2: Read the output JSON from Form Parser to generate the document proto for CDW.

In [None]:
# Adds entities to each Form Parser output JSON and uploads the annotated copies.
parse_sample_files_in_gcsbucket_mod()