In [4]:
#!pip install --upgrade google-cloud-storage

In [18]:
# https://cloud.google.com/vision/docs/pdf

In [3]:
import json
import re
from google.cloud import storage
# Imports the Google Cloud client library
from google.cloud import vision
from google.oauth2 import service_account

In [5]:
key_path = "keys/electric-clone-238012-47cdb0d3319b.json"  
credentials = service_account.Credentials.from_service_account_file(
    key_path,
)

In [6]:
client = vision.ImageAnnotatorClient(credentials=credentials)

feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

In [7]:
#https://developers.google.com/resources/api-libraries/documentation/vision/v1/csharp/latest/classGoogle_1_1Apis_1_1Vision_1_1v1_1_1Data_1_1InputConfig.html

In [8]:
help(vision.InputConfig())

Help on InputConfig in module google.cloud.vision_v1.types.image_annotator object:

class InputConfig(proto.message.Message)
 |  InputConfig(mapping=None, *, ignore_unknown_fields=False, **kwargs)
 |  
 |  The desired input location and metadata.
 |  
 |  Attributes:
 |      gcs_source (google.cloud.vision_v1.types.GcsSource):
 |          The Google Cloud Storage location to read the
 |          input from.
 |      content (bytes):
 |          File content, represented as a stream of bytes. Note: As
 |          with all ``bytes`` fields, protobuffers use a pure binary
 |          representation, whereas JSON representations use base64.
 |  
 |          Currently, this field only works for BatchAnnotateFiles
 |          requests. It does not work for AsyncBatchAnnotateFiles
 |          requests.
 |      mime_type (str):
 |          The type of the file. Currently only
 |          "application/pdf", "image/tiff" and "image/gif"
 |          are supported. Wildcards are not supported.
 |  

In [9]:
mime_type = "application/pdf"
gcs_source_uri = "gs://pdf_extraction_iwm/input/pdf_test.pdf"
batch_size = 2

In [10]:
#input_config = vision.InputConfig(content=content_pdf, mime_type= mime_type)

In [11]:
gcs_source = vision.GcsSource(uri=gcs_source_uri)
input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

In [12]:
help(vision.OutputConfig())

Help on OutputConfig in module google.cloud.vision_v1.types.image_annotator object:

class OutputConfig(proto.message.Message)
 |  OutputConfig(mapping=None, *, ignore_unknown_fields=False, **kwargs)
 |  
 |  The desired output location and metadata.
 |  
 |  Attributes:
 |      gcs_destination (google.cloud.vision_v1.types.GcsDestination):
 |          The Google Cloud Storage location to write
 |          the output(s) to.
 |      batch_size (int):
 |          The max number of response protos to put into each output
 |          JSON file on Google Cloud Storage. The valid range is [1,
 |          100]. If not specified, the default value is 20.
 |  
 |          For example, for one pdf file with 100 pages, 100 response
 |          protos will be generated. If ``batch_size`` = 20, then 5
 |          json files each containing 20 response protos will be
 |          written under the prefix ``gcs_destination``.\ ``uri``.
 |  
 |          Currently, batch_size only applies to GcsDestinat

In [19]:
gcs_destination_uri = "gs://pdf_extraction_iwm/output/pdf_test.json"
gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size
    )

In [25]:
async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config, output_config=output_config
    )

operation = client.async_batch_annotate_files(requests=[async_request])

print("Waiting for the operation to finish.")
operation.result(timeout=420)

Waiting for the operation to finish.


responses {
  output_config {
    gcs_destination {
      uri: "gs://pdf_extraction_iwm/output/pdf_test.json"
    }
    batch_size: 2
  }
}

In [21]:
# Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
storage_client = storage.Client(credentials=credentials)

match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
bucket_name = match.group(1)
prefix = match.group(2)

bucket = storage_client.get_bucket(bucket_name)

In [26]:
# List objects with the given prefix, filtering out folders.
blob_list = [
    blob
    for blob in list(bucket.list_blobs(prefix=prefix))
    if not blob.name.endswith("/")
]
print("Output files:")
for blob in blob_list:
    print(blob.name)

Output files:
output/pdf_test.jsonoutput-1-to-2.json


In [23]:
# Process the first output file from GCS.
# Since we specified batch_size=2, the first response contains
# the first two pages of the input file.
output = blob_list[0]

json_string = output.download_as_bytes().decode("utf-8")
response = json.loads(json_string)

# The actual response for the first page of the input file.
first_page_response = response["responses"][0]
annotation = first_page_response["fullTextAnnotation"]

# Here we print the full text from the first page.
# The response contains more information:
# annotation/pages/blocks/paragraphs/words/symbols
# including confidence scores and bounding boxes
print("Full text:\n")
print(annotation["text"])

Full text:

THE PII PROBLEM:
PRIVACY AND A NEW CONCEPT OF
PERSONALLY IDENTIFIABLE
INFORMATION
PAUL M. SCHWARTZ† & DANIEL J. SOLOVE
Personally identifiable information (PII) is one of the most central concepts in
information privacy regulation. The scope of privacy laws typically turns on
whether PII is involved. The basic assumption behind the applicable laws is that if
PII is not involved, then there can be no privacy harm. At the same time, there is no
uniform definition of PII in information privacy law. Moreover, computer science
has shown that in many circumstances non-PII can be linked to individuals, and
that de-identified data can be re-identified. PII and non-PII are thus not immutable
categories, and there is a risk that information deemed non-PII at one time can be
transformed into PII at a later juncture. Due to the malleable nature of what consti-
tutes PII, some commentators have even suggested that PII be abandoned as the
mechanism by which to define the boundaries of pr

In [27]:
type(response)

dict

In [29]:
response.keys()

dict_keys(['inputConfig', 'responses'])

In [31]:
type(response['responses'])

list

In [34]:
response['responses'][0].keys()

dict_keys(['fullTextAnnotation', 'context'])

In [36]:
response['responses'][1]['context']

{'uri': 'gs://pdf_extraction_iwm/input/pdf_test.pdf', 'pageNumber': 2}

In [37]:
response

{'inputConfig': {'gcsSource': {'uri': 'gs://pdf_extraction_iwm/input/pdf_test.pdf'},
  'mimeType': 'application/pdf'},
 'responses': [{'fullTextAnnotation': {'pages': [{'property': {'detectedLanguages': [{'languageCode': 'en',
         'confidence': 0.91798544},
        {'languageCode': 'af', 'confidence': 0.032324255},
        {'languageCode': 'sk', 'confidence': 0.016187862}]},
      'width': 612,
      'height': 792,
      'blocks': [{'boundingBox': {'normalizedVertices': [{'x': 0.21895425,
           'y': 0.060606062},
          {'x': 0.78104573, 'y': 0.060606062},
          {'x': 0.78104573, 'y': 0.12626262},
          {'x': 0.21895425, 'y': 0.12626262}]},
        'paragraphs': [{'boundingBox': {'normalizedVertices': [{'x': 0.3529412,
             'y': 0.060606062},
            {'x': 0.64705884, 'y': 0.060606062},
            {'x': 0.64705884, 'y': 0.077020206},
            {'x': 0.3529412, 'y': 0.077020206}]},
          'words': [{'property': {'detectedLanguages': [{'languageCode