In [None]:
# conda activate VertexAI

In [3]:
# import kagglehub

# Download latest version
# path = kagglehub.dataset_download("jenswalter/receipts")

# print("Path to dataset files:", path)

In [3]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

In [4]:
from typing import List

### Creating a Processor and Using the Created Processor (OCR) to Parse a PDF File

In [6]:
# NOTE: Sample configuration -- replace these values with your own project's settings before running.
project_id = "glossy-premise-439318-d8"
location = "us"  # Format is "us" or "eu"
file_path = "/home/mgfos207/Desktop/PetProjects/DocumentAI/jenswalter/receipts/versions/15/2024/ca/20240909_Hilton.pdf"
processor_display_name = "INIT_PROCESSOR_MF" # Must be unique per project, e.g.: "My Processor"


def quickstart(
    project_id: str = "glossy-premise-439318-d8",
    location: str = "us",
    file_path: str = "/home/mgfos207/Desktop/PetProjects/DocumentAI/jenswalter/receipts/versions/15/2024/ca/20240909_Hilton.pdf",
    processor_display_name: str = "INIT_PROCESSOR_MF",
    processor_id: str = "d1b1d9be60cc199",
    mime_type: str = "application/pdf",
):
    """Send one local file through an existing Document AI processor and print its text.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; it is also used to build the API endpoint.
        file_path: Local path of the file to process.
        processor_display_name: Display name for a newly created processor.
            Currently unused because processor creation is commented out below;
            kept so existing positional callers keep working.
        processor_id: ID of the already-created processor to use.
        mime_type: MIME type of the file at ``file_path``.

    Returns:
        The ``document`` attribute of the process response. Its text is also printed.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Full resource name of the existing processor:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}`
    name = client.processor_path(project_id, location, processor_id)

    # Processor creation is intentionally disabled; re-enable it to create a new
    # processor instead of looking one up by ID.
    # parent = client.common_location_path(project_id, location)
    # processor = client.create_processor(
    #     parent=parent,
    #     processor=documentai.Processor(
    #         type_="OCR_PROCESSOR",  # Refer to https://cloud.google.com/document-ai/docs/create-processor for how to get available processor types
    #         display_name=processor_display_name,
    #     ),
    # )

    # Look the processor up so the request below uses its full resource name.
    processor = client.get_processor(name=name)
    print(f"Processor Name: {processor.name}")

    # Read the file into memory
    with open(file_path, "rb") as source_file:
        file_content = source_file.read()

    # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)
    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Read the text recognition output from the processor
    print("The document contains the following text:")
    print(document.text)

    return document

In [3]:
project_id = "glossy-premise-439318-d8"
location = "us"  # Format is "us" or "eu"
file_path = "/home/mgfos207/Desktop/PetProjects/DocumentAI/jenswalter/receipts/versions/15/2024/ca/20240909_Hilton.pdf"
processor_display_name = "INIT_PROCESSOR_MF" # Must be unique per project, e.g.: "My Processor"
# quickstart(project_id, location, file_path, processor_display_name)

### Batch Process Documents from GCS (Google Cloud Storage) and Store Data in a Separate GCS Bucket

In [5]:
import re
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai  # type: ignore
from google.cloud import storage


"""
project_id = "glossy-premise-439318-d8"
location = "us"  # Format is "us" or "eu"
file_path = "/home/mgfos207/Desktop/PetProjects/DocumentAI/jenswalter/receipts/versions/15/2024/ca/20240909_Hilton.pdf"
processor_display_name = "INIT_PROCESSOR_MF" # Must be unique per project, e.g.: "My Processor"
"""
# NOTE: Batch-processing configuration -- replace these values with your own project's settings before running.
project_id = "glossy-premise-439318-d8"
location = "us" # Format is "us" or "eu"
processor_id = "d1b1d9be60cc199" # Create processor before running sample
gcs_output_uri = "gs://document_ai_expense_mf_output_new/" # Must end with a trailing slash `/`. Format: gs://bucket/directory/subdirectory/
# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Optional. Example: pretrained-ocr-v1.0-2020-09-23
processor_version_id = None

# TODO(developer): You must specify either `gcs_input_uri` and `mime_type` or `gcs_input_prefix`
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
gcs_input_uri = None
input_mime_type = "application/pdf"
gcs_input_prefix = "gs://document_ai_input_receipts" # Format: gs://bucket/directory/
# field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
field_mask = None

def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    gcs_input_uri: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    gcs_input_prefix: Optional[str] = None,
    field_mask: Optional[str] = None,
    timeout: int = 400,
) -> List[dict]:
    """Batch-process documents stored in GCS and load the JSON results back.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; it is also used to build the API endpoint.
        processor_id: ID of the processor to run.
        gcs_output_uri: GCS URI the batch results are written to.
        processor_version_id: Optional processor version; when falsy the
            processor itself is addressed.
        gcs_input_uri: URI of a single input document. Takes precedence over
            ``gcs_input_prefix`` and requires ``input_mime_type``.
        input_mime_type: MIME type of the document at ``gcs_input_uri``.
        gcs_input_prefix: GCS URI prefix whose documents are all processed.
        field_mask: Optional field mask passed to the GCS output config.
        timeout: Seconds to wait for the long-running operation.

    Returns:
        One dict per fetched output JSON blob:
        ``{"doc": <documentai.Document>, "name": <blob name>}``.

    Raises:
        ValueError: If no input source is given, if ``gcs_input_uri`` is given
            without ``input_mime_type``, or if the operation metadata does not
            report the SUCCEEDED state.
    """
    # Fail fast on an unusable input spec before any client or request is built.
    if not gcs_input_uri and not gcs_input_prefix:
        raise ValueError(
            "Specify either `gcs_input_uri` (with `input_mime_type`) or `gcs_input_prefix`."
        )
    if gcs_input_uri and not input_mime_type:
        raise ValueError("`input_mime_type` is required when `gcs_input_uri` is set.")

    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if gcs_input_uri:
        # Specify specific GCS URIs to process individual documents
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        # Load GCS Input URI into a List of document files
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
    else:
        # Specify a GCS URI Prefix to process an entire directory
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Cloud Storage URI for the Output Directory
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Block until the operation finishes or `timeout` seconds elapse.
    # Operation name format: projects/{project_id}/locations/{location}/operations/{operation_id}
    try:
        print(f"Waiting for operation {operation.operation.name} to complete...")
        operation.result(timeout=timeout)
    except (RetryError, InternalServerError) as e:
        # Only report here; the state check below raises if the operation
        # did not reach SUCCEEDED.
        print(e.message)

    # NOTE: Can also use callbacks for asynchronous processing
    #
    # def my_callback(future):
    #   result = future.result()
    #
    # operation.add_done_callback(my_callback)

    # After the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()
    documents_list = []
    print("Output files:")
    # One process per Input Document
    for process in list(metadata.individual_process_statuses):
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS
            if blob.content_type != "application/json":
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object.
            # For a full list of Document object attributes, see:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )
            documents_list.append({"doc": document, "name": blob.name})

    return documents_list

# Run the batch job with the configuration values defined earlier in this cell
# and keep the parsed output documents. Keyword arguments make the mapping of
# each setting to its parameter explicit.
list_of_docs = batch_process_documents(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    gcs_output_uri=gcs_output_uri,
    processor_version_id=processor_version_id,
    gcs_input_uri=gcs_input_uri,
    input_mime_type=input_mime_type,
    gcs_input_prefix=gcs_input_prefix,
    field_mask=field_mask,
    timeout=400,
)

Waiting for operation projects/93824089870/locations/us/operations/302035705050588268 to complete...
Output files:
Fetching 302035705050588268/0/applebees_8267120140431-0.json
Fetching 302035705050588268/1/beerhaus_20181208_006-0.json
Fetching 302035705050588268/2/beerhaus_20181208_007-0.json
Fetching 302035705050588268/3/shakeshack_20181208_004-0.json
Fetching 302035705050588268/4/sliders-454353423425-0.json
Fetching 302035705050588268/5/sw_20181208_005-0.json
Fetching 302035705050588268/6/sw_20181208_3630-0.json


### Document Processing

This section extracts the insights returned by the Document AI API and pulls out the relevant data so it can be used in BigQuery for analysis. Additionally, we can use LLMs to summarize those insights, not just on a per-document basis but also across batches of documents.

In [6]:
print(len(list_of_docs))

7


In [7]:
# list_of_docs[0]

Looking at this document it appears that entities that are associated with a confidence less than 50% tend to have noisy results. So what we'll do is filter only the entities that have a 50% confidence or better when identifying valid entities.

In [8]:
# Build one record per processed document, keeping only the entities whose
# confidence is at least 0.5, together with the document text and blob name.
docs_list = [
    {
        "entities": [entity for entity in item['doc'].entities if entity.confidence >= .5],
        "text": item['doc'].text,
        "name": item['name'],
    }
    for item in list_of_docs
]

In [9]:
# docs_list[1]

In [10]:
import pandas as pd
import os

In [11]:
df = pd.DataFrame(docs_list)

In [12]:
df.head()

Unnamed: 0,entities,text,name
0,[text_anchor {\n text_segments {\n start_i...,MARICELA C\nTB#21\nDATE: 12-01-18 TIME: 07:44 ...,4050496363705400313/0/applebees_8267120140431-...
1,[text_anchor {\n text_segments {\n start_i...,11/30/2018\nCheck: 303004\nServer: Allison T.\...,4050496363705400313/1/beerhaus_20181208_006-0....
2,[text_anchor {\n text_segments {\n start_i...,11/30/2018\nP\nBeerHaus\nCheck: 985278\nServer...,4050496363705400313/2/beerhaus_20181208_007-0....
3,[text_anchor {\n text_segments {\n start_i...,SHAKE SHACK\n3790 Las Vegas Blvd South\nHost: ...,4050496363705400313/6/shakeshack_20181208_004-...
4,[text_anchor {\n text_segments {\n start_i...,Sliders Diner\n1202 Sutter Street\nSan Francis...,4050496363705400313/3/sliders-454353423425-0.json


In [13]:
# for index, row in df.iterrows():
#     file_name = f"./processed/{}"

In [14]:
import vertexai
from vertexai.language_models import TextEmbeddingModel

In [15]:
PROJECT_ID = "glossy-premise-439318-d8"
REGION = "us-central1"
MODEL_ID = "text-embedding-004"

In [16]:
vertexai.init(project=PROJECT_ID, location=REGION)

In [17]:
model = TextEmbeddingModel.from_pretrained(MODEL_ID)

In [22]:
def apply_embeddings(text: str):
    """Request embeddings for a single string from the notebook-level ``model``.

    The text is sent as a one-item list, and the result of
    ``model.get_embeddings`` is returned unchanged.
    """
    embeddings = model.get_embeddings([text])
    return embeddings


# One embedding request per row of the `text` column.
df['embedded_text'] = df['text'].apply(apply_embeddings)



In [23]:
df.head()

Unnamed: 0,entities,text,name,embedded_text
0,[text_anchor {\n text_segments {\n start_i...,MARICELA C\nTB#21\nDATE: 12-01-18 TIME: 07:44 ...,10983299351657151161/0/applebees_8267120140431...,"[TextEmbedding(values=[0.040389422327280045, 0..."
1,[text_anchor {\n text_segments {\n start_i...,11/30/2018\nCheck: 303004\nServer: Allison T.\...,10983299351657151161/1/beerhaus_20181208_006-0...,"[TextEmbedding(values=[0.08371221274137497, 0...."
2,[text_anchor {\n text_segments {\n start_i...,11/30/2018\nP\nBeerHaus\nCheck: 985278\nServer...,10983299351657151161/2/beerhaus_20181208_007-0...,"[TextEmbedding(values=[0.04445113241672516, 0...."
3,[text_anchor {\n text_segments {\n start_i...,SHAKE SHACK\n3790 Las Vegas Blvd South\nHost: ...,10983299351657151161/3/shakeshack_20181208_004...,"[TextEmbedding(values=[0.026838034391403198, -..."
4,[text_anchor {\n text_segments {\n start_i...,Sliders Diner\n1202 Sutter Street\nSan Francis...,10983299351657151161/4/sliders-454353423425-0....,"[TextEmbedding(values=[0.03435202315449715, 0...."


In [28]:
# df.iloc[0]['embedded_text']

In [29]:
# Unwrap each cell (a list holding one embedding object) into that object's
# `.values` vector. Cells that were already unwrapped have a first element
# without a `.values` attribute and are left untouched, so re-running this cell
# no longer raises AttributeError.
df['embedded_text'] = df['embedded_text'].apply(
    lambda x: x[0].values if hasattr(x[0], "values") else x
)

In [30]:
df.head()

Unnamed: 0,entities,text,name,embedded_text
0,[text_anchor {\n text_segments {\n start_i...,MARICELA C\nTB#21\nDATE: 12-01-18 TIME: 07:44 ...,10983299351657151161/0/applebees_8267120140431...,"[0.040389422327280045, 0.021436145529150963, -..."
1,[text_anchor {\n text_segments {\n start_i...,11/30/2018\nCheck: 303004\nServer: Allison T.\...,10983299351657151161/1/beerhaus_20181208_006-0...,"[0.08371221274137497, 0.007932025007903576, 0...."
2,[text_anchor {\n text_segments {\n start_i...,11/30/2018\nP\nBeerHaus\nCheck: 985278\nServer...,10983299351657151161/2/beerhaus_20181208_007-0...,"[0.04445113241672516, 0.019029725342988968, 0...."
3,[text_anchor {\n text_segments {\n start_i...,SHAKE SHACK\n3790 Las Vegas Blvd South\nHost: ...,10983299351657151161/3/shakeshack_20181208_004...,"[0.026838034391403198, -0.004848463460803032, ..."
4,[text_anchor {\n text_segments {\n start_i...,Sliders Diner\n1202 Sutter Street\nSan Francis...,10983299351657151161/4/sliders-454353423425-0....,"[0.03435202315449715, 0.03089391253888607, -0...."


In [31]:
# Write each receipt's embedding to its own parquet file, named after the
# source JSON blob (path prefix and ".json" suffix stripped).
# to_parquet does not create missing directories, so make sure the output tree
# exists first. This also creates the `processed/` subfolder that the later
# export cells write into.
os.makedirs("doc_ai_insights/processed", exist_ok=True)
for idx, row in df.iterrows():
    name = row['name'].split('/')[-1].replace('.json', '')
    row[['embedded_text']].to_frame().to_parquet(f"doc_ai_insights/{name}.parquet")

In [32]:
df.dtypes

entities         object
text             object
name             object
embedded_text    object
dtype: object

In [88]:
# df['text'].to_frame().to_parquet("test.parquet")

In [17]:
entities_df = df[['entities', 'name']]

In [18]:
# entities_df.iloc[0]['entities'][10].properties

In [20]:
# Flatten every entity and its nested properties into one record per row.
# Top-level entities get `parent_entity_id=None`; each property records the id
# of the entity it belongs to.
entities_list = []
for _, row in entities_df.iterrows():
    for entity in row['entities']:
        entities_list.append({"file_name": row['name'], "entity_name": entity.type_, "text": entity.mention_text, 'parent_entity_id': None, "entity_id": entity.id})
        # Iterate the properties directly. The previous gate on the number of
        # text-anchor segments was an unrelated count and dropped the
        # properties of any parent whose text is a single segment. An entity
        # with no properties simply contributes no extra rows.
        for sub_entity in entity.properties:
            entities_list.append({"file_name": row['name'], "entity_name": sub_entity.type_, "text": sub_entity.mention_text, 'parent_entity_id': entity.id, "entity_id": sub_entity.id})
# entities_list

In [21]:
entities_df = pd.DataFrame(entities_list)

In [22]:
entities_df.head()

Unnamed: 0,file_name,entity_name,text,parent_entity_id,entity_id
0,4050496363705400313/0/applebees_8267120140431-...,total_tax_amount,9.56,,0
1,4050496363705400313/0/applebees_8267120140431-...,net_amount,115.67,,1
2,4050496363705400313/0/applebees_8267120140431-...,supplier_address,"3340 S. Maryland Parkway\nLas Vegas, Nevada, 8...",,2
3,4050496363705400313/0/applebees_8267120140431-...,total_amount,125.23,,3
4,4050496363705400313/0/applebees_8267120140431-...,supplier_name,APPLEBEE'S\nNEIGHBORHOOD GRILL & BAR,,4


In [23]:
entities_df['file_name'] = entities_df['file_name'].apply(lambda x: x.split("/")[-1].replace(".json", ""))

In [87]:
entities_df.to_parquet("doc_ai_insights/processed/recepits_entities.parquet")

In [24]:
# Take an explicit copy: later cells add and drop columns on `raw_df`, and
# doing that on a slice of `df` triggers pandas' SettingWithCopyWarning.
raw_df = df[['name', 'text']].copy()

In [25]:
raw_df['file_name'] = raw_df['name'].apply(lambda x: x.split("/")[-1].replace(".json", ""))

In [26]:
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so this cell can be re-run without a KeyError.
raw_df = raw_df.drop(columns=['name'], errors='ignore')

In [27]:
raw_df.to_parquet("doc_ai_insights/processed/receipts_raw_text.parquet")

In [89]:
# Take an explicit copy: later cells add and drop columns on `df_vector`, and
# doing that on a slice of `df` triggers pandas' SettingWithCopyWarning.
df_vector = df[['name', 'text', 'embedded_text']].copy()

In [90]:
df_vector['file_name'] = df_vector['name'].apply(lambda x: x.split("/")[-1].replace(".json", ""))

In [93]:
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so this cell can be re-run without a KeyError.
df_vector = df_vector.drop(columns=['name'], errors='ignore')

In [94]:
df_vector.head()

Unnamed: 0,text,embedded_text,file_name
0,MARICELA C\nTB#21\nDATE: 12-01-18 TIME: 07:44 ...,"[0.040389422327280045, 0.021436145529150963, -...",applebees_8267120140431-0
1,11/30/2018\nCheck: 303004\nServer: Allison T.\...,"[0.08371221274137497, 0.007932025007903576, 0....",beerhaus_20181208_006-0
2,11/30/2018\nP\nBeerHaus\nCheck: 985278\nServer...,"[0.04445113241672516, 0.019029725342988968, 0....",beerhaus_20181208_007-0
3,SHAKE SHACK\n3790 Las Vegas Blvd South\nHost: ...,"[0.026838034391403198, -0.004848463460803032, ...",shakeshack_20181208_004-0
4,Sliders Diner\n1202 Sutter Street\nSan Francis...,"[0.03435202315449715, 0.03089391253888607, -0....",sliders-454353423425-0


In [96]:
df_vector.to_parquet("doc_ai_insights/processed/receipts_w_vectors.parquet")

In [28]:
storage_client = storage.Client()

In [29]:
bucket = storage_client.get_bucket('document_ai_expense_mf_output_processed')

blob = bucket.blob('receipts_raw_text.parquet')
blob.upload_from_filename('doc_ai_insights/processed/receipts_raw_text.parquet')

In [100]:
bucket = storage_client.get_bucket('document_ai_expense_mf_output_processed')

blob = bucket.blob('receipts_w_vectors.parquet')
blob.upload_from_filename('doc_ai_insights/processed/receipts_w_vectors.parquet')

In [101]:
blob = bucket.blob('recepits_entities.parquet')
blob.upload_from_filename('doc_ai_insights/processed/recepits_entities.parquet')

In [7]:
from google.cloud import bigquery

In [8]:
bq_client = bigquery.Client()

In [9]:
job_config = bigquery.QueryJobConfig()

In [6]:
# Remote embedding model backed by the `text-embedding-004` endpoint.
sql_model = """
CREATE OR REPLACE MODEL `glossy-premise-439318-d8.vector_poc.embedding_model`
  REMOTE WITH CONNECTION `projects/glossy-premise-439318-d8/locations/us/connections/external_connection_poc`
  OPTIONS (ENDPOINT = 'text-embedding-004');
"""
# External table over the raw-text parquet file uploaded to GCS.
sql_create_ext_table = """
CREATE OR REPLACE EXTERNAL TABLE `glossy-premise-439318-d8.vector_poc.receipts_raw_text`
WITH CONNECTION `projects/glossy-premise-439318-d8/locations/us/connections/external_connection_poc`
OPTIONS(
    format="PARQUET",
    uris = [
        'gs://document_ai_expense_mf_output_processed/receipts_raw_text.parquet'
    ]
)

"""
# Embed every receipt's text; rows with a non-empty status string are filtered out.
sql_embed = """
CREATE OR REPLACE TABLE `glossy-premise-439318-d8.vector_poc.embeddings` AS
SELECT * FROM ML.GENERATE_EMBEDDING(
  MODEL `vector_poc.embedding_model`,
  (
    SELECT *, text AS content
    FROM `glossy-premise-439318-d8.vector_poc.receipts_raw_text`
  )
)
WHERE LENGTH(ml_generate_embedding_status) = 0;
"""

# Vector index over the receipt embeddings.
sql_vec_index = """
CREATE OR REPLACE VECTOR INDEX `receipts_vec` ON
`glossy-premise-439318-d8.vector_poc.embeddings`(ml_generate_embedding_result)
OPTIONS(
  distance_type="COSINE",
  index_type="IVF"
)
"""

# Semantic search over the receipt embeddings.
# The query side uses ML.GENERATE_EMBEDDING so that it produces the
# `ml_generate_embedding_result` column selected here and stored by `sql_embed`.
# ML.GENERATE_TEXT_EMBEDDING names its output `text_embedding` instead.
sql_vec_query = """
SELECT query.query, base.file_name
FROM VECTOR_SEARCH(
   TABLE `glossy-premise-439318-d8.vector_poc.embeddings`, 'ml_generate_embedding_result',
   (SELECT ml_generate_embedding_result, content AS query
     FROM ML.GENERATE_EMBEDDING(
         MODEL `vector_poc.embedding_model`,
         (SELECT 'applebees' AS content))
   ), top_k => 5)

"""

# Larger demo table: embeddings of patent abstracts from the public patents dataset.
sql_vec_big_table = """
CREATE TABLE `glossy-premise-439318-d8.vector_poc.big_embeddings` AS
   SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(
     MODEL `vector_poc.embedding_model`,
     (SELECT *, abstract AS content
       FROM `patents-public-data.google_patents_research.publications`
       WHERE LENGTH(abstract) > 0 AND LENGTH(title) > 0 AND country = 'Singapore'))
   WHERE ARRAY_LENGTH(text_embedding) > 0;

"""

# Index the `text_embedding` column, which is the column `sql_query_big_table`
# searches. An index on any other column cannot serve that search.
sql_vec_big_index = """
CREATE VECTOR INDEX patents_index ON vector_poc.big_embeddings(text_embedding)
OPTIONS(distance_type='COSINE', index_type='IVF', ivf_options='{"num_lists": 1000}');
"""

# Semantic search over the patent embeddings.
sql_query_big_table = """
SELECT query.query, base.publication_number, base.title, base.abstract
FROM VECTOR_SEARCH(
  TABLE `vector_poc.big_embeddings`, 'text_embedding',
  (
  SELECT ml_generate_embedding_result, content AS query
  FROM ML.GENERATE_EMBEDDING(
  MODEL `vector_poc.embedding_model`,
  (SELECT 'improving password security' AS content))
  ),
  top_k => 5, options => '{"fraction_lists_to_search": 0.01}')
"""

In [46]:
bq_client.query_and_wait(sql_model)

<google.cloud.bigquery.table.RowIterator at 0x7fd3d64615b0>

In [43]:
bq_client.query_and_wait(sql_create_ext_table, job_config=job_config)

<google.cloud.bigquery.table.RowIterator at 0x7fd3d634fa00>

In [44]:
bq_client.query_and_wait(sql_embed, job_config=job_config)

<google.cloud.bigquery.table.RowIterator at 0x7fd3d6461af0>

In [61]:
bq_client.query_and_wait(sql_vec_big_table, job_config=job_config)

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fd3e2b33400>

In [70]:
bq_client.query_and_wait(sql_vec_big_index, job_config=job_config)

<google.cloud.bigquery.table.RowIterator at 0x7fd3c9bf21c0>

In [17]:
df = bq_client.query(sql_query_big_table).to_dataframe()



In [19]:
df.shape

(5, 4)

In [20]:
df.head()

Unnamed: 0,query,publication_number,title,abstract
0,improving password security,SG-128634-A1,Active new password entry dialog with compact ...,An active new password entry dialog provides a...
1,improving password security,SG-10201610585W-A,Passsword management system and process,PASSSWORD MANAGEMENT SYSTEM AND PROCESS There ...
2,improving password security,SG-148888-A1,Improved system and method for random entry of...,IMPROVED SYSTEM AND METHOD FOR RANDOM ENTRY OF...
3,improving password security,SG-194267-A1,Method and system for protecting a password du...,A system for providing security for a personal...
4,improving password security,SG-120868-A1,Data storage device security method and apparatus,Methods for improving security in data storage...


In [62]:
# bq_client.query_and_wait(sql_vec_index, job_config=job_config) #Note need at least 5000 records to create index

In [63]:
# bq_client.query_and_wait(sql_vec_query, job_config=job_config)