Copyright 2024 Google LLC <br>
| | |
|-|-|
|Author(s) | [Holt Skinner](https://github.com/holtskinner), [Drew Gillson](https://github.com/drewgillson) |

In [3]:
%pip install --upgrade --quiet google-cloud-aiplatform

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Restart the kernel so the package upgraded by the %pip cell above is the
# version that gets imported by the rest of the notebook.
import IPython
# Handle to the running IPython application that owns this kernel.
app = IPython.Application.instance()
# Passing True asks for a restart rather than a plain shutdown.
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


In [2]:
import vertexai

# Google Cloud project and region used for the Vertex AI requests below.
PROJECT_ID = "llm-studies"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialise the Vertex AI SDK with that project and region.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
)

In [3]:
import json

from IPython.display import Markdown, display_pdf
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Part,
)

In [6]:
# Generation settings reused by the extraction cells: temperature 0.0 and a
# JSON response MIME type, so the reply text can be passed to json.loads.
generation_config = GenerationConfig(
    temperature=0.0,
    response_mime_type="application/json",
)

# Gemini 1.5 Flash model handle. Only the dangerous-content category gets an
# explicit block threshold (BLOCK_ONLY_HIGH).
model = GenerativeModel(
    "gemini-1.5-flash",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    },
)

In [7]:
PDF_MIME_TYPE = "application/pdf"


def print_multimodal_prompt(contents: list) -> None:
    """
    Given contents that would be sent to Gemini,
    output the full multimodal prompt for ease of readability.

    Args:
        contents: Prompt items in the order they are sent. Items that are not
            ``Part`` instances are printed as-is. A ``Part`` with inline data
            is displayed as a PDF (the inline bytes are assumed to be PDF
            data). A ``Part`` with a file reference is printed as an
            ``https://storage.googleapis.com/`` URL built from its ``gs://`` URI.
    """
    for content in contents:
        if not isinstance(content, Part):
            print(content)
        elif content.inline_data:
            # inline_data.data is the raw document bytes. display_pdf only treats
            # its argument as PDF data when raw=True; without it IPython looks for
            # a PDF representation of the bytes object and displays nothing.
            display_pdf(content.inline_data.data, raw=True)
        elif content.file_data:
            # Only the scheme is swapped and spaces are escaped; other characters
            # in the object name are passed through unchanged.
            gcs_url = (
                "https://storage.googleapis.com/"
                + content.file_data.file_uri.replace("gs://", "").replace(" ", "%20")
            )
            print(f"PDF URL: {gcs_url}")


# Send Google Cloud Storage Document to Vertex AI
def process_document(
    prompt: str,
    file_uri: str,
    mime_type: str = PDF_MIME_TYPE,
    generation_config: GenerationConfig | None = None,
    print_prompt: bool = False,
    print_raw_response: bool = False,
) -> str:
    """Send a document referenced by URI, followed by a text prompt, to Gemini.

    The request is made with the notebook-level ``model`` object, so the cell
    that creates ``model`` must have been run first.

    Args:
        prompt: Instructions sent after the document.
        file_uri: URI of the document (the notebook passes ``gs://`` URIs).
        mime_type: MIME type declared for the document; defaults to PDF.
        generation_config: Optional generation settings forwarded unchanged to
            ``model.generate_content``.
        print_prompt: When True, print the multimodal prompt.
        print_raw_response: When True, print the full response object.

    Returns:
        The ``text`` attribute of the model response.
    """
    # Reference the file by URI; its bytes are not downloaded here.
    file_part = Part.from_uri(
        uri=file_uri,
        mime_type=mime_type,
    )

    # Document first, instructions second.
    contents = [file_part, prompt]

    # Echo the prompt before the request so it is still visible when the
    # request itself raises.
    if print_prompt:
        print("-------Prompt--------")
        print_multimodal_prompt(contents)

    # Send to Gemini
    response = model.generate_content(contents, generation_config=generation_config)

    if print_raw_response:
        print("\n-------Raw Response--------")
        print(response)

    # NOTE(review): accessing .text presumably fails when the response carries
    # no text candidate (e.g. a blocked reply) - confirm and handle if needed.
    return response.text

In [36]:
# Extraction prompt for the blog posts: asks for the authors, the theme and the
# philosophical area, laid out as a JSON-like template the model should fill in.
# NOTE(review): the last rule (fall back to "Luis Quissak") appears to conflict
# with "values must only include text found in the document" and with the null
# rule just above it - confirm which one is meant to win.
blogger_extraction_prompt = """You are a document entity extraction specialist. Given a document, your task is to extract the text value of the following entities:
{
	"authors": [
		{
			"author": "",
		}
	],
	"theme": "",
	"philosophical area": "",
}

- The JSON schema must be followed during the extraction.
- The values must only include text found in the document
- Do not normalize any entity value.
- If an entity is not found in the document, set the entity value to null.
- An author is any person name or philosopher found in the document.
- If an author is not found in the document, set the entity value to Luis Quissak.

"""

In [34]:
# Download a PDF from Google Cloud Storage
! gsutil cp "gs://blog-files-2024/all/pdf2/2015-12-26_Platão_à_guisa_de_introdução.pdf" ./post1.pdf

Copying gs://blog-files-2024/all/pdf2/2015-12-26_Platão_à_guisa_de_introdução.pdf...
/ [0 files][    0.0 B/ 99.4 KiB]                                                
-
- [1 files][ 99.4 KiB/ 99.4 KiB]                                                

Operation completed over 1 objects/99.4 KiB.                                     


In [21]:
# Read the PDF downloaded in the previous cell and wrap its bytes as an inline
# Part, reusing the notebook-wide PDF MIME type constant.
with open("post1.pdf", "rb") as f:
    file_part = Part.from_data(data=f.read(), mime_type=PDF_MIME_TYPE)

# Document first, extraction instructions second.
contents = [file_part, blogger_extraction_prompt]

# Send to Gemini with the JSON GenerationConfig
response = model.generate_content(contents, generation_config=generation_config)

In [37]:
print("-------Prompt--------")
print_multimodal_prompt(contents)

print("\n-------Raw Response--------")
print(response.text)

-------Prompt--------
You are a document entity extraction specialist. Given a document, your task is to extract the text value of the following entities:
{
	"authors": [
		{
			"author": "",
		}
	],
	"theme": "",
	"philosophical area": "",
}

- The JSON schema must be followed during the extraction.
- The values must only include text found in the document
- Do not normalize any entity value.
- If an entity is not found in the document, set the entity value to null.


-------Raw Response--------
{"authors": [{"author": "Platão"}], "theme": "Teoria do Conhecimento", "philosophical area": null}


This response can then be parsed as JSON into a Python dictionary for use in other applications.

In [17]:
print("\n-------Parsed Entities--------")
json_object = json.loads(response.text)
print(json_object)


-------Parsed Entities--------
{'authors': [{'author': 'Platão'}], 'theme': 'Teoria do Conhecimento', 'philosophical area': None}


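You can see that Gemini extracted the author and the theme from the document; `philosophical area` came back as `null`, which is the value the prompt asks for when an entity is not found.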

In [40]:
# Run the extraction prompt against a PDF referenced directly by its gs:// URI.
# Alternative inputs kept for reference (swap one in for the URI below):
#   gs://blog-files-2024/all/pdf2/2013-11-08_Liberdade.pdf
#   gs://blog-files-2024/all/pdf2/2014-03-12_Tzvetan,_Pondé,_Safatle.pdf
#   gs://blog-files-2024/all/pdf2/2015-12-26_Platão_à_guisa_de_introdução.pdf
response_text = process_document(
    blogger_extraction_prompt,
    "gs://blog-files-2024/all/pdf2/2024-01-26_As_abstrações_do_Bispo_Berkeley.pdf",
    generation_config=generation_config,
    print_prompt=True,
)

-------Prompt--------
PDF URL: https://storage.googleapis.com/blog-files-2024/all/pdf2/2024-01-26_As_abstrações_do_Bispo_Berkeley.pdf
You are a document entity extraction specialist. Given a document, your task is to extract the text value of the following entities:
{
	"authors": [
		{
			"author": "",
		}
	],
	"theme": "",
	"philosophical area": "",
}

- The JSON schema must be followed during the extraction.
- The values must only include text found in the document
- Do not normalize any entity value.
- If an entity is not found in the document, set the entity value to null.
- An author is any person name or philosopher found in the document.
- If an author is not found in the document, set the entity value to Luis Quissak.




In [41]:
print("\n-------Parsed Entities--------")
json_object = json.loads(response_text)
print(json_object)


-------Parsed Entities--------
{'authors': [{'author': 'Bispo Berkeley'}, {'author': 'Locke'}, {'author': 'Boyle'}, {'author': 'Hacking'}, {'author': 'Hobbes'}, {'author': 'Platão'}, {'author': 'Wittgenstein'}, {'author': 'Descartes'}], 'theme': 'As abstrações do Bispo Berkeley', 'philosophical area': 'Filosofia corpuscular'}


## Document Classification

Document classification is the process for identifying the type of document. For example, invoice, W-2, receipt, etc.

In this example, you will classify one of the blog posts (a PDF stored in Cloud Storage) into one of the subject categories listed in the prompt.

In [24]:
classification_prompt = """You are a document classification assistant. Given a document, your task is to find which category the document belongs to from the list of document categories provided below.

 Antropologia
 Ciência
 Crônica
 Educação
 Epistemologia
 Ética
 Liberdade
 Linguagem
 Marxismo
 Mente
 Ontologia Social
 Poesia
 Política
 Psicanálise
 Tecnologia

Which category does the above document belong to? Answer with one of the predefined document categories only.
"""

In [25]:
response_text = process_document(
    classification_prompt,
    "gs://blog-files-2024/all/pdf2/2015-12-26_Platão_à_guisa_de_introdução.pdf",
    print_prompt=True,
)

-------Prompt--------
PDF URL: https://storage.googleapis.com/blog-files-2024/all/pdf2/2015-12-26_Platão_à_guisa_de_introdução.pdf
You are a document classification assistant. Given a document, your task is to find which category the document belongs to from the list of document categories provided below.

 Antropologia
 Ciência
 Crônica
 Educação
 Epistemologia
 Ética
 Liberdade
 Linguagem
 Marxismo
 Mente
 Ontologia Social
 Poesia
 Política
 Psicanálise
 Tecnologia

Which category does the above document belong to? Answer with one of the predefined document categories only.



In [26]:
print("\n-------Document Classification--------")
print(response_text)


-------Document Classification--------
Epistemologia 



You can see that Gemini successfully categorized the document.

### Chaining Classification and Extraction

These techniques can also be chained together to extract any number of document types. For example, if you have multiple types of documents to process, you can send each document to Gemini with a classification prompt, then based on that output, you can write logic to decide which extraction prompt to use.

In [None]:
# Shared extraction instructions; the "{}" placeholder receives the entity
# template of one document type via str.format.
generic_document_prompt = """You are a document entity extraction specialist. Given a document, your task is to extract the text value of the following entities:

{}

- The JSON schema must be followed during the extraction.
- The values must only include text found in the document
- Do not normalize any entity value.
- If an entity is not found in the document, set the entity value to null.
"""

# Invoice template. It has to be defined in this cell because the mapping
# below refers to it and no other cell in the notebook creates it.
invoice_extraction_prompt = generic_document_prompt.format(
    """
{
    "invoice_id": "",
    "invoice_date": "",
    "due_date": "",
    "currency": "",
    "supplier_name": "",
    "supplier_address": "",
    "receiver_name": "",
    "receiver_address": "",
    "line_items": [
        {
            "description": "",
            "quantity": "",
            "unit_price": "",
            "amount": "",
        }
    ],
    "net_amount": "",
    "total_tax_amount": "",
    "total_amount": "",
}
"""
)

w2_extraction_prompt = generic_document_prompt.format(
    """
{
    "ControlNumber": "",
    "EIN": "",
    "EmployeeAddress_City": "",
    "EmployeeAddress_State": "",
    "EmployeeAddress_StreetAddressOrPostalBox": "",
    "EmployeeAddress_Zip": "",
    "EmployeeName_FirstName": "",
    "EmployeeName_LastName": "",
    "EmployerAddress_City": "",
    "EmployerAddress_State": "",
    "EmployerAddress_StreetAddressOrPostalBox": "",
    "EmployerAddress_Zip": "",
    "EmployerName": "",
    "EmployerStateIdNumber_Line1": "",
    "FederalIncomeTaxWithheld": "",
    "FormYear": "",
    "MedicareTaxWithheld": "",
    "MedicareWagesAndTips": "",
    "SocialSecurityTaxWithheld": "",
    "SocialSecurityWages": "",
    "StateIncomeTax_Line1": "",
    "StateWagesTipsEtc_Line1": "",
    "State_Line1": "",
    "WagesTipsOtherCompensation": "",
    "a_Code": "",
    "a_Value": "",
}
"""
)

drivers_license_prompt = generic_document_prompt.format(
    """
{
    "Address": "",
    "Date Of Birth": "",
    "Document Id": "",
    "Expiration Date": "",
    "Family Name": "",
    "Given Names": "",
    "Issue Date": "",
}
"""
)

# Map classification types to extraction prompts
classification_to_prompt = {
    "invoice": invoice_extraction_prompt,
    "w2": w2_extraction_prompt,
    "driver_license": drivers_license_prompt,
}

In [None]:
gcs_uris = [
    "gs://cloud-samples-data/documentai/SampleDocuments/US_DRIVER_LICENSE_PROCESSOR/dl3.pdf",
    "gs://cloud-samples-data/documentai/SampleDocuments/INVOICE_PROCESSOR/google_invoice.pdf",
    "gs://cloud-samples-data/documentai/SampleDocuments/FORM_W2_PROCESSOR/2020FormW-2.pdf",
]

# The notebook's classification_prompt lists blog subject categories, so its
# answers can never match the keys of classification_to_prompt. Build a
# document-type prompt from those keys so the lookup below can succeed.
document_type_prompt = (
    "You are a document classification assistant. Given a document, your task is to find "
    "which category the document belongs to from the list of document categories provided below.\n\n"
    + "\n".join(f" {doc_type}" for doc_type in classification_to_prompt)
    + "\n\nWhich category does the above document belong to? "
    "Answer with one of the predefined document categories only.\n"
)

for gcs_uri in gcs_uris:
    print(f"\nFile: {gcs_uri}\n")

    # Send to Gemini with the document-type classification prompt
    doc_classification = process_document(document_type_prompt, gcs_uri).strip()

    print(f"Document Classification: {doc_classification}")

    # Get Extraction prompt based on Classification
    extraction_prompt = classification_to_prompt.get(doc_classification)

    if not extraction_prompt:
        print(f"Document does not belong to a specified class {doc_classification}")
        continue

    # Send to Gemini with Extraction Prompt (JSON response requested)
    extraction_response_text = process_document(
        extraction_prompt,
        gcs_uri,
        generation_config=generation_config,
        print_prompt=True,
    ).strip()

    print("\n-------Extracted Entities--------")
    json_object = json.loads(extraction_response_text)
    print(json_object)

## Document Question Answering

Gemini can be used to answer questions about a document.

This example answers a question about the Transformer model paper "Attention is all you need".

In [None]:
qa_prompt = """What is attention in the context of transformer models? Give me the answer first, followed by an explanation."""

In [None]:
# Send Q&A Prompt to Gemini
response_text = process_document(
    qa_prompt,
    "gs://cloud-samples-data/generative-ai/pdf/1706.03762v7.pdf",
)

print(f"Answer: {response_text}")

## Document Summarization

Gemini can also be used to summarize or paraphrase a document's contents. Your prompt can specify how detailed the summary should be or specific formatting, such as bullet points or paragraphs.

In [44]:
summarization_prompt = """You are a very professional document summarization specialist and a philosopher. Given a document, your task is to provide a detailed summary of the content of the document.

If it includes images, provide descriptions of the images.
If it includes tables, extract all elements of the tables.
If it includes graphs, explain the findings in the graphs.
Do not include any numbers that are not mentioned in the document.
Answer in portuguese.
"""

In [45]:
# Send Summarization Prompt to Gemini
response_text = process_document(
    summarization_prompt,
    "gs://blog-files-2024/all/pdf2/2015-12-26_Platão_à_guisa_de_introdução.pdf",
)

print(f"Summarization: {response_text}")

Summarization: O documento apresenta uma análise da filosofia de Platão, dividida em três tópicos: teoria do conhecimento, escatologia e antropologia. 

* **Teoria do conhecimento:** Platão critica a definição de conceitos como "coragem" e "beleza" a partir de exemplos concretos, pois argumenta que tais conceitos não são apreensíveis a partir do mundo sensível. Para ele, o verdadeiro conhecimento reside na compreensão das formas (ou ideias) que estão por trás das coisas e que são universais e imutáveis. Para acessar esse mundo das formas, o indivíduo deve se valer da razão e não das emoções. O Bem, que é o sol que ilumina tudo, é a principal forma a ser buscada.

* **Escatologia:** Platão acredita que a alma existe antes do corpo e que ela tem acesso às formas. O corpo polui a alma e a impede de voltar à sua pureza original. A busca pela virtude e a vida frugal permitem que a alma se liberte do corpo e se aproxime de Deus.

* **Antropologia:** Para Platão, a alma é composta por três pa

## Table parsing from documents

Gemini can parse contents of a table and return it in a structured format, such as HTML or markdown.

In [None]:
table_extraction_prompt = """What is the html code of the table in this document?"""

In [None]:
# Send Table Extraction Prompt to Gemini
response_text = process_document(
    table_extraction_prompt,
    "gs://cloud-samples-data/generative-ai/pdf/salary_table.pdf",
)
display(Markdown(response_text))

## Document Translation

Gemini can translate documents between languages. This example translates meeting notes from English into French and Spanish.

In [None]:
translation_prompt = """Translate the first paragraph into French and Spanish. Label each paragraph with the target language."""

In [None]:
# Send Translation Prompt to Gemini
response_text = process_document(
    translation_prompt,
    "gs://cloud-samples-data/generative-ai/pdf/fdic_board_meeting.pdf",
)

print(response_text)

## Document Comparison

Gemini can compare and contrast the contents of multiple documents. This example finds the changes in the IRS Form 1040 between 2013 and 2023.

Note: when working with multiple documents, the order can matter and should be specified in your prompt.

In [None]:
comparison_prompt = """The first document is from 2013, the second one from 2023. How did the standard deduction evolve?"""

In [None]:
# Send Comparison Prompt to Gemini
# Reference both forms by URI. The comparison prompt talks about "the first"
# and "the second" document, so the order of these two parts matters.
file_part1 = Part.from_uri(
    uri="gs://cloud-samples-data/generative-ai/pdf/form_1040_2013.pdf",
    mime_type=PDF_MIME_TYPE,
)

file_part2 = Part.from_uri(
    uri="gs://cloud-samples-data/generative-ai/pdf/form_1040_2023.pdf",
    mime_type=PDF_MIME_TYPE,
)

# 2013 form, then 2023 form, then the question.
contents = [file_part1, file_part2, comparison_prompt]

# Send the comparison request to Gemini (no GenerationConfig passed here).
response = model.generate_content(contents)

print("-------Prompt--------")
print_multimodal_prompt(contents)

print("-------Output--------")
print(response.text)