# AI Generation of PDF Indexes

## PDF Parsing

In [101]:
!pip install -q google-cloud-aiplatform google-cloud-documentai google-cloud-storage

In [42]:
import functools
import re
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import FailedPrecondition
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai  # type: ignore
from google.cloud import storage

# Configuration for the Document AI batch job. Edit these values for your own
# project before running the cells below.
project_id = "indexing-pdfs"
location = "us" # Format is "us" or "eu"
processor_id = "67c13d8a290241eb" # Create processor before running sample
gcs_output_uri = "gs://indexing-pdfs/output/" # Must end with a trailing slash `/`. Format: gs://bucket/directory/subdirectory/
processor_version_id = "pretrained-ocr-v2.0-2023-06-02" # Optional; when empty the processor's resource name is used instead. Example: pretrained-ocr-v1.0-2020-09-23

# Specify either `gcs_input_uri` together with `input_mime_type`, or `gcs_input_prefix`.
# A non-empty `gcs_input_uri` takes precedence: the prefix is only used when the URI is empty.
gcs_input_uri = "gs://indexing-pdfs/The Project Gutenberg eBook of Dracula, by Bram Stoker.pdf" # Format: gs://bucket/directory/file.pdf
input_mime_type = "application/pdf"
gcs_input_prefix = "gs://indexing-pdfs/" # Format: gs://bucket/directory/
# Passed to `operation.result(timeout=...)` when waiting for the batch job;
# presumably seconds - confirm against the google-api-core documentation.
timeout = 1200

In [43]:
# Submit a Document AI batch-processing request for the configured input,
# wait for the long-running operation, and verify that it succeeded.

# You must set the `api_endpoint` if you use a location other than "us".
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

client = documentai.DocumentProcessorServiceClient(client_options=opts)

# A non-empty `gcs_input_uri` wins; `gcs_input_prefix` is only used otherwise.
if gcs_input_uri:
    # Specify specific GCS URIs to process individual documents
    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_input_uri, mime_type=input_mime_type
    )
    # Load GCS Input URI into a List of document files
    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
    input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
else:
    # Specify a GCS URI Prefix to process an entire directory
    gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
    input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

# Cloud Storage URI for the Output Directory
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
    gcs_uri=gcs_output_uri,
)

# Where to write results
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

# Target a specific processor version when one is configured, otherwise the
# processor itself.
if processor_version_id:
    # The full resource name of the processor version, e.g.:
    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )
else:
    # The full resource name of the processor, e.g.:
    # projects/{project_id}/locations/{location}/processors/{processor_id}
    name = client.processor_path(project_id, location, processor_id)

request = documentai.BatchProcessRequest(
    name=name,
    input_documents=input_config,
    document_output_config=output_config,
)

# BatchProcess returns a Long Running Operation (LRO)
operation = client.batch_process_documents(request)

# Continually polls the operation until it is complete.
# This could take some time for larger files
# Format: projects/{project_id}/locations/{location}/operations/{operation_id}
try:
    print(f"Waiting for operation {operation.operation.name} to complete...")
    operation.result(timeout=timeout)
# Catch exception when operation doesn't finish before timeout
except (RetryError, InternalServerError) as e:
    # The error is only printed here; execution falls through to the state
    # check below, which raises if the job did not end in SUCCEEDED.
    print(e.message)

# NOTE: Can also use callbacks for asynchronous processing
#
# def my_callback(future):
#   result = future.result()
#
# operation.add_done_callback(my_callback)

# Once the operation is complete,
# get output document information from operation metadata
metadata = documentai.BatchProcessMetadata(operation.metadata)

# Stop here unless the batch job reports success.
if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
    raise ValueError(f"Batch Process Failed: {metadata.state_message}")

# Download every JSON shard written by the batch job and parse each one into a
# Document AI `Document`, collecting them in `documents`.
storage_client = storage.Client()

# Compiled once, outside the loop: splits "gs://BUCKET/PREFIX..." into the
# bucket name and the object prefix.
gcs_destination_pattern = re.compile(r"gs://(.*?)/(.*)")

documents = []
print("Output files:")
# One process per Input Document
for process in metadata.individual_process_statuses:
    # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
    # The Cloud Storage API requires the bucket name and URI prefix separately
    matches = gcs_destination_pattern.match(process.output_gcs_destination)
    if not matches:
        print(
            "Could not parse output GCS destination:",
            process.output_gcs_destination,
        )
        continue

    output_bucket, output_prefix = matches.groups()

    # Get List of Document Objects from the Output Bucket
    output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

    # Document AI may output multiple JSON files per source file.
    # NOTE: the shards are fetched in name order ("-0", "-1", "-10", "-11", ...),
    # not numeric order, so `documents` is not in reading order; order by
    # page number afterwards if sequence matters.
    for blob in output_blobs:
        # Document AI should only output JSON files to GCS
        if blob.content_type != "application/json":
            print(
                f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
            )
            continue

        # Download JSON File as bytes object and convert to Document Object
        print(f"Fetching {blob.name}")
        document = documentai.Document.from_json(
            blob.download_as_bytes(), ignore_unknown_fields=True
        )
        documents.append(document)

Waiting for operation projects/942189414340/locations/us/operations/5643725656911370544 to complete...
Output files:
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-0.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-1.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-10.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-11.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-12.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-13.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project Gutenberg eBook of Dracula by Bram Stoker-14.json
Fetching output/5643725656911370544/0/b401b9b85850ce41-The Project

## Processing of Parsed Results

In [44]:
import re

# Default boilerplate for the Project Gutenberg "Dracula" PDF: lines containing
# any of these substrings are dropped.
_BOILERPLATE_SUBSTRINGS = (
    "The Project Gutenberg eBook of Dracula, by Bram Stoker",
    "https://www.gutenberg.org/cache/epub/345/pg345-images.html",
)

# Default boilerplate patterns, matched against the start of each stripped
# line: an "N of M" page counter and an "M/D/Y, H:MM AM/PM" timestamp.
_BOILERPLATE_PATTERNS = (
    re.compile(r'\d+ of \d+'),
    re.compile(r'\d+/\d+/\d+, \d+:\d+ [AP]M'),
)


def filter_content(
    text,
    *,
    boilerplate_substrings=_BOILERPLATE_SUBSTRINGS,
    boilerplate_patterns=_BOILERPLATE_PATTERNS,
):
    """Remove boilerplate lines (running headers/footers) from page text.

    A line is dropped when it contains any of ``boilerplate_substrings`` or
    when, after stripping surrounding whitespace, it starts with a match for
    any of ``boilerplate_patterns``. All other lines are kept unchanged and in
    their original order.

    Args:
        text: Page text with lines separated by ``"\\n"``.
        boilerplate_substrings: Iterable of literal strings; a line containing
            any of them is removed. Defaults to the Dracula PDF's title and
            source URL.
        boilerplate_patterns: Iterable of regular expressions (strings or
            compiled patterns) applied with ``match`` to the stripped line.
            Defaults to the page-counter and timestamp patterns.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    # Materialise both iterables once so generators can be passed safely;
    # re.compile returns already-compiled patterns unchanged.
    substrings = tuple(boilerplate_substrings)
    patterns = [re.compile(pattern) for pattern in boilerplate_patterns]

    def is_boilerplate(line):
        # Cheap substring test first; the regexes only run when it fails.
        if any(substring in line for substring in substrings):
            return True
        stripped = line.strip()
        return any(pattern.match(stripped) for pattern in patterns)

    return '\n'.join(
        line for line in text.split('\n') if not is_boilerplate(line)
    )

In [119]:
# Build one record per page: its page number and its boilerplate-free text.
page_texts = []

for document in documents:
    for page in document.pages:
        # The page's text anchor lists the segment(s) of `document.text` that
        # belong to it. Join every segment instead of reading only the first:
        # a page with no segments yields "" rather than raising IndexError,
        # and a page with several segments keeps all of its text.
        page_text = "".join(
            document.text[segment.start_index:segment.end_index]
            for segment in page.layout.text_anchor.text_segments
        )

        filtered_page_text = filter_content(page_text.strip())

        page_texts.append({
            "page_number": page.page_number,
            "page_content": filtered_page_text
        })

print(f"Processed {len(page_texts)} pages.")

Processed 324 pages.


In [142]:
# Copy the page records and order the copy by ascending page number;
# `page_texts` itself is left untouched.
sorted_page_texts = list(page_texts)
sorted_page_texts.sort(key=lambda record: record['page_number'])

## LLM-Based Identification of Keywords

In [146]:
import base64
import time
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models

@functools.lru_cache(maxsize=1)
def _get_model():
    """Initialise the Vertex AI SDK once and return the shared Gemini model.

    Cached so that repeated `generate` calls reuse one initialised model
    instead of re-running `vertexai.init` and rebuilding it for every page.
    """
    # NOTE(review): the project is spelled out here as well as in the
    # notebook's `project_id` variable - keep the two in sync.
    vertexai.init(project="indexing-pdfs", location="us-central1")
    return GenerativeModel(
        "gemini-1.5-flash-001",
    )


def generate(text):
    """Ask the Gemini model which index keywords the given text should yield.

    Args:
        text: The page text to embed in the prompt.

    Returns:
        The concatenated text of all streamed response chunks. The prompt asks
        for a single line of values separated by "|" characters.

    Raises:
        Any exception raised by the SDK while requesting or reading the
        streamed response propagates unchanged to the caller.
    """
    prompt = f"""
I am building an index for a book. What keywords or key terms, from the text below, should be represented in the index? Please respond with only a list of values in a single line, separated by "|" characters.

{text}
"""
    # `generation_config` and `safety_settings` are module-level names looked
    # up at call time.
    responses = _get_model().generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )
    # Join the streamed chunks in one pass rather than repeated `+=`.
    return "".join(response.text for response in responses)

# Generation parameters passed to `generate_content` by `generate`.
generation_config = dict(
    max_output_tokens=2038,
    temperature=1,
    top_p=0.95,
)

# Map each listed harm category to the BLOCK_NONE threshold; passed to
# `generate_content` by `generate`.
safety_settings = {
    harm_category: generative_models.HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
        generative_models.HarmCategory.HARM_CATEGORY_UNSPECIFIED,
    )
}

# Ask the model for keywords page by page, storing the raw answer under the
# "keywords" key of each page record. Pages whose request fails are left
# without that key so they can be identified afterwards.
for sorted_page_text in sorted_page_texts:
    try:
        sorted_page_text["keywords"] = generate(sorted_page_text["page_content"])
    except (ValueError, FailedPrecondition) as error:
        # Potentially use some backup models here
        # Report the skipped page instead of dropping it silently.
        print(
            f"Skipping page {sorted_page_text['page_number']}: "
            f"{type(error).__name__}"
        )
        continue
    # Pause between successful requests.
    time.sleep(0.5)

In [147]:
# Text of every page record that has no "keywords" entry.
unindexed_pages = [
    page_record["page_content"]
    for page_record in sorted_page_texts
    if "keywords" not in page_record
]

In [148]:
# Number of pages left without keywords (generation raised and the page was skipped).
len(unindexed_pages)

11

## Compiling Index Items

In [152]:
# Build the index: lower-cased keyword -> list of page numbers on which it
# appears, in the order the pages are visited and without duplicates.
compiled_index_items = {}
for sorted_page_text in sorted_page_texts:
    # Pages whose keyword generation failed have no "keywords" entry.
    if "keywords" not in sorted_page_text:
        continue

    page_number = sorted_page_text["page_number"]
    for raw_term in sorted_page_text["keywords"].split("|"):
        term = raw_term.lower().strip()
        # A trailing "|" or an empty model answer would otherwise create a
        # blank index entry.
        if not term:
            continue

        term_pages = compiled_index_items.setdefault(term, [])
        # The same term can be returned twice for one page; list it once.
        if page_number not in term_pages:
            term_pages.append(page_number)

In [153]:
# Print one line per index term: the term, then its comma-separated pages.
for term, page_numbers in compiled_index_items.items():
    print(term, ", ".join(map(str, page_numbers)))

dracula 1, 2, 6, 7, 29, 30, 47, 106, 174, 197, 200, 220, 238, 265, 305, 307, 308, 312, 314, 316
bram stoker 1, 2
project gutenberg 1, 317, 319, 321, 322, 324
ebook 1
license 1, 320
united states 1, 318, 319
english 1, 22
new york 2
grosset & dunlap 2, 315, 316
publishers 2
copyright 2, 318, 320, 321, 324
1897 2
united states of america 2
country life press 2
garden city, n.y. 2
dedication 3
friend 3, 44, 52, 113, 136, 139, 140, 142, 153, 173, 177, 193, 244
hommy-beg 3
jonathan harker's journal 4, 270, 277, 296
mina murray's journal 4, 77, 79
lucy westenra's diary 4
dr. seward's diary 4, 109, 267
the dailygraph 4, 187
van helsing 4, 6, 97, 98, 99, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 115, 121, 124, 125, 126, 127, 128, 129, 130, 134, 135, 136, 137, 138, 140, 142, 143, 146, 147, 151, 157, 158, 159, 165, 167, 168, 169, 170, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 183, 184, 204, 213, 222, 224, 226, 227, 231, 232, 233, 234, 235, 238, 242, 243, 244, 246, 247, 249, 