In [1]:
%pip install --upgrade google-cloud-aiplatform==1.35.0 google-cloud-documentai==2.20.1 backoff==2.2.1 --user

Note: you may need to restart the kernel to use updated packages.


In [6]:
import sys

if "google.colab" in sys.modules:
    # Automatically restart kernel after installs so that your environment can access the new packages
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
else:
    # Otherwise, attempt to discover local credentials as described on https://cloud.google.com/docs/authentication/application-default-credentials
    pass

In [7]:
from __future__ import annotations
import backoff
from tenacity import retry, stop_after_attempt, wait_random_exponential
from google.api_core.exceptions import ResourceExhausted
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import AlreadyExists
from google.cloud import documentai
import numpy as np
import glob
import os
from typing import Dict, List
import pandas as pd
from logging import error
import re
import textwrap
from typing import Tuple, List
import vertexai
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel
#import fitz 
import json
import time
import numpy as np

2024-02-27 10:17:50.149181: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
#Once the project is created in the console, extract the parameters here
PROJECT_ID = !gcloud config get project
PROJECT_ID = PROJECT_ID.n
LOCATION = "europe-west2"
LOCATION_DEPLOY = "europe-west2" #Location to deploy GCP resources

#!gcloud services enable documentai.googleapis.com storage.googleapis.com aiplatform.googleapis.com

In [11]:
# Edit these variables before running the code.
project_id = PROJECT_ID

# See https://cloud.google.com/document-ai/docs/regions for all options.
location = LOCATION

# Must be unique per project, e.g.: "My Processor"
processor_display_name = "my_processo"

# You must set the `api_endpoint` if you use a location other than "us".
client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")


#1. Create the processor: you can not create multiple processors with the same display name
def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the location
    # e.g.: projects/project_id/locations/location
    parent = client.common_location_path(project_id, location)

    # Create a processor
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR" #we are using the pre-trained OCR processor
        ),
    )


try:
    processor = create_processor(project_id, location, processor_display_name)
    print(f"Created Processor {processor.name}")
except AlreadyExists as e:
    print(
        f"Processor already exits, change the processor name and rerun this code. {e.message}"
    )

    

#2. Define process document function which takes the processor name and file path of the document and extracts the text from the document.  
def process_document(
    processor_name: str,
    file_path: str,
) -> documentai.Document:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )

    # Configure the process request
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

    result = client.process_document(request=request)

    return result.document

Created Processor projects/59574701027/locations/europe-west2/processors/2ad548530a0bdf96


In [12]:
import fitz
def split_and_save_pdf(input_pdf_path: str):
    # Create a folder to store the split PDFs
    output_folder = os.path.join(os.path.dirname(input_pdf_path), 'pdf_chunks')
    os.makedirs(output_folder, exist_ok=True)
    
    pdf_paths = []

    # Open the input PDF
    with fitz.open(input_pdf_path) as pdf_document:
        num_pages = pdf_document.page_count

        # Calculate the number of files needed
        #num_files = (num_pages + max_pages_per_file - 1) // max_pages_per_file

        # Split the PDF into multiple files
        for i in range(num_pages):
            start_page, end_page = i, i+1 

            pdf_writer = fitz.open()
            pdf_writer.insert_pdf(pdf_document, from_page=start_page, to_page=end_page - 1)

            output_pdf_path = os.path.join(output_folder, f'pdf_{i + 1}.pdf')
            pdf_writer.save(output_pdf_path)

            #print(f'Saved: {output_pdf_path}')
            pdf_paths.append(output_pdf_path)
            
    waiting_time = len(pdf_paths)*2.52 
    print(f'Estimated waiting time: {waiting_time} seconds')
    return pdf_paths


In [13]:
# Set the desired parameters
input_pdf_path = "data/10_BMFashion_aa_2021-07-30.pdf" # Replace with your actual input PDF path
#max_pages_per_file = 15 # Set the desired maximum number of pages per file
processor_name = processor.name # Assign the created processor name


# 2. Iterate through the pdf chunks and extract and join their text
def pdf_get_text_per_page(pdf_paths):
    processor_name = processor.name 
    texts = []
    page_numbers = list(range(1,len(pdf_paths)+1))
    
    for pdf_path in pdf_paths:
        try: 
            document = process_document(processor_name, file_path = pdf_path)
            texts.append(document.text)
        except: 
            print('Document AI API resources exhausted. Waiting for 60 secs')
            time.sleep(60)
            document= process_document(processor_name, file_path = pdf_path)
            texts.append(document.text)
            
    
    df = pd.DataFrame({'Text':texts, 'PageNumber': page_numbers})

    return df


def split_text_except_near_number(text):
    pattern = r'(?<!\d)\n(?!\d)'
    splitted_text = re.split(pattern,text)
    return ' '.join(splitted_text)


In [14]:
#3 create the embeddings for each pdf chunk
# Call the GCP models
generation_model = TextGenerationModel.from_pretrained("text-bison@002")
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")


# This decorator is used to handle exceptions and apply exponential backoff in case of ResourceExhausted errors.
# It means the function will be retried with increasing time intervals in case of this specific exception.
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5))
def text_generation_model_with_backoff(**kwargs):
    return generation_model.predict(**kwargs).text


def get_embedding(text):
    get_embedding.counter += 1
    try:
        if get_embedding.counter % 100 == 0:
            time.sleep(3)
        return embedding_model.get_embeddings([text])[0].values #Send request to embedding model
    except:
        print('waiting for 60 secs')
        time.sleep(60)
        return embedding_model.get_embeddings([text])[0].values #Send request to embedding model
    
    

#1. perform rag retrieval with the corresponding 
def get_context_from_question(
    question: str, vector_store: pd.DataFrame, sort_index_value: int = 3
) -> Tuple[str, pd.DataFrame]:
    query_vector = np.array(get_embedding(question))
    vector_store["dot_product"] = vector_store["embedding"].apply(
        lambda row: np.dot(row, query_vector)
    )
    # Similarity matching by dot product 
    top_matched = vector_store.sort_values(by="dot_product", ascending=False)[
        :sort_index_value
    ].index
    
    top_matched_df = vector_store.loc[top_matched, ["PageNumber", "Text"]]
    context = "\n".join(top_matched_df["Text"].values)
    
    return context, top_matched_df

def pdf_qa(embedded_df):
    results =[]
    
    questions = ["Give me the company name",
        "Give me the scope 1 emisions",
                 "Give me the scope 2 emissions",
                 "Give me the scope 3 emissions",
                 "Give me the total energy consumed in tCO2"]
    for question in questions: 
        # get the custom relevant chunks from all the chunks in vector store.
        context, top_matched_df = get_context_from_question(
        question,
        vector_store=embedded_df,
        sort_index_value=4,  # Top N results to pick from embedding vector search
    )
        prompt = f""" Answer the question as precise as possible using the provided context. \n\n
            Context: \n {context}?\n
            Question: \n {question} \n
            Answer:
  
  """
        answer = text_generation_model_with_backoff(prompt=prompt)
        results.append(print(question+"\n\n"+"PaLM Predicted:"+answer+"\n\n"))
    return results, top_matched_df.iloc[0,0]

In [15]:
def pdf_rag_model(input_pdf_path):
    #1. create a dataframe where each row is the text for each page of the pdf
    text_per_page_df = pdf_get_text_per_page(split_and_save_pdf(input_pdf_path))
    
    #2. clean each page text of unecessary \n
    text_per_page_df['Text'] = text_per_page_df['Text'].apply(split_text_except_near_number)
    
    #3. calculate embeddings for each page text
    get_embedding.counter = 0
    text_per_page_df["embedding"] = text_per_page_df["Text"].apply(lambda x: get_embedding(x))
    
    #4. Q&A results and page info
    results, info_page_number = pdf_qa(text_per_page_df)
    return results, info_page_number
    

In [16]:
#pdf_rag_model('data/13_Four_CS_aa_2021-08-31.pdf')
pdf_rag_model('data/25_GREP1_aa_2021-09-30.pdf')

Estimated waiting time: 88.2 seconds
Give me the company name

PaLM Predicted: GREP1 LIMITED


Give me the scope 1 emisions

PaLM Predicted: The scope 1 emissions for GREP1 Limited for the year ending 30 September 2021 are:

- Boiler ignition: 226 tonnes of CO2e
- Diesel consumption (direct): 909 tonnes of CO2e


Give me the scope 2 emissions

PaLM Predicted: The scope 2 emissions for GREP1 Limited for the year ending 30 September 2021 were 387 tonnes of CO2e.


Give me the scope 3 emissions

PaLM Predicted: The scope 3 emissions for GREP1 Limited for the year ending 30 September 2021 are as follows:

- Delivery of input material: 10,886 MWh
- Embodied carbon of fuel carbon: 1,547 MWh

Total scope 3 emissions: 12,433 MWh


Give me the total energy consumed in tCO2

PaLM Predicted: The total energy consumed in tCO2 is 13,082.




([None, None, None, None, None], 9)

In [23]:
pdf_rag_model('data/26_StonegateFarmers_aa_2021-10-02.pdf')

Estimated waiting time: 110.88 seconds


([['Give me the scope 1 emisions',
   '\n\n',
   'PaLM Predicted:',
   ' Scope 1 emissions for 2021 were 3,077.00 metric tonnes.',
   '\n\n'],
  ['Give me the scope 2 emissions',
   '\n\n',
   'PaLM Predicted:',
   ' The scope 2 emissions for 2021 were 58 metric tonnes.',
   '\n\n'],
  ['Give me the scope 3 emissions',
   '\n\n',
   'PaLM Predicted:',
   ' The scope 3 emissions for 2021 were 58.00 metric tonnes, while in 2020 they were 66.00 metric tonnes.',
   '\n\n'],
  ['Give me the total energy consumed in tCO2',
   '\n\n',
   'PaLM Predicted:',
   ' The total gross emissions for 2021 were 3,193.00 metric tonnes of CO2 equivalent.',
   '\n\n']],
 10)

In [21]:
pdf_rag_model('data/24_Whitemeadow_aa_2020-12-31.pdf')

Estimated waiting time: 110.88 seconds


([['Give me the scope 1 emisions',
   '\n\n',
   'PaLM Predicted:',
   ' The scope 1 emissions for 2020 were 143.36 metric tonnes.',
   '\n\n'],
  ['Give me the scope 2 emissions',
   '\n\n',
   'PaLM Predicted:',
   ' The scope 2 emissions for 2020 were 295.25 metric tonnes.',
   '\n\n'],
  ['Give me the scope 3 emissions',
   '\n\n',
   'PaLM Predicted:',
   ' The scope 3 emissions for 2020 were 37.38 metric tonnes.',
   '\n\n'],
  ['Give me the total energy consumed in tCO2',
   '\n\n',
   'PaLM Predicted:',
   ' The provided context does not contain information about the total energy consumed in tCO2.',
   '\n\n']],
 9)

In [24]:
results, page = pdf_rag_model('data/25_GREP1_aa_2021-09-30.pdf')
print(results, page)

Estimated waiting time: 88.2 seconds
[['Give me the scope 1 emisions', '\n\n', 'PaLM Predicted:', ' The scope 1 emissions for GREP1 Limited for the year ending 30 September 2021 are:\n\n- Boiler ignition: 226 tonnes of CO2e\n- Diesel consumption (direct): 909 tonnes of CO2e', '\n\n'], ['Give me the scope 2 emissions', '\n\n', 'PaLM Predicted:', ' The scope 2 emissions for GREP1 Limited for the year ending 30 September 2021 were 387 tonnes of CO2e.', '\n\n'], ['Give me the scope 3 emissions', '\n\n', 'PaLM Predicted:', ' The scope 3 emissions for GREP1 Limited for the year ending 30 September 2021 are as follows:\n\n- Delivery of input material: 10,886 MWh\n- Embodied carbon of fuel carbon: 1,547 MWh\n\nTotal scope 3 emissions: 12,433 MWh', '\n\n'], ['Give me the total energy consumed in tCO2', '\n\n', 'PaLM Predicted:', ' The total energy consumed in tCO2 is 13,082.', '\n\n']] 9


In [28]:
question = 'noelle'
print(question +'\n''zeng' )

noelle
zeng


In [38]:
import os

folder_path = 'data'

results_dict={}

star_index = 1
end_index = 3

for filename in os.listdir(folder_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(folder_path,filename)
        output = pdf_rag_model(pdf_path)
        results_dict[filename] = output
            
print(results_dict)            
            


Estimated waiting time: 105.84 seconds
Give me the company name

PaLM Predicted: SMITH'S (GLOUCESTER) LIMITED


Give me the scope 1 emisions

PaLM Predicted: The total scope 1 emissions for the company are 15,150,502.87 kWh.


Give me the scope 2 emissions

PaLM Predicted: The total annual Scope 2 emissions for the company are 15,150,502.87 kWh.


Give me the scope 3 emissions

PaLM Predicted: The total Scope 3 emissions for the year ended 30 September 2021 were 1241.28 TCO2e.


Give me the total energy consumed in tCO2

PaLM Predicted: The total energy consumed in tCO2e is 15,150,50287.


Estimated waiting time: 78.12 seconds
Give me the company name

PaLM Predicted: Underwood Meat Company Limited


Give me the scope 1 emisions

PaLM Predicted: Scope 1 direct emissions:
   - Gas combustion: 40.00 metric tonnes
   - Fuel consumed for owned transport: 655.00 metric tonnes
   Total Scope 1 emissions: 695.00 metric tonnes


Give me the scope 2 emissions

PaLM Predicted: The scope 2 indire

{'33_SMITHSGloucester_aa_2021-09-30.pdf': ([None, None, None, None, None], 7),
 'Underwooedmeat_aa_2021-10-31.pdf': ([None, None, None, None, None], 7),
 '28_TrustPayment_2021.pdf': ([None, None, None, None, None], 7),
 'StougardenL_aa_2021-07-31.pdf': ([None, None, None, None, None], 7),
 'NCRFinancialSolutions_aa_2020-12-31.pdf': ([None, None, None, None, None],
  7),
 '35_LawsonsHoldings_aa_2021-06.pdf': ([None, None, None, None, None], 10),
 'JHsons_aa_2021-03-31.pdf': ([None, None, None, None, None], 11),
 'BreezeMG_aa_2021-12-31.pdf': ([None, None, None, None, None], 7),
 '31_Andrew_Brownsword_aa_2021-01-03.pdf': ([None, None, None, None, None], 7),
 'SpecialSteel_aa_2021-05-31.pdf': ([None, None, None, None, None], 7),
 'ACESAC_aa_2021-08-31.pdf': ([None, None, None, None, None], 13),
 '34_PukkaPies_aa_2021-05.pdf': ([None, None, None, None, None], 10),
 'SC145746_aa_2021-10-29.pdf': ([None, None, None, None, None], 8),
 '25_GREP1_aa_2021-09-30.pdf': ([None, None, None, None, No

In [2]:
%pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install IPython

Note: you may need to restart the kernel to use updated packages.


In [19]:
# build user interface

# widgets display:

#1. choice
import ipywidgets as widgets 
from IPython.display import display, Image
import fitz

#pdf_rag_model

pdf_paths = ['data/27_Kilnbridge_aa_2021-06-30.pdf', 'data/28_TrustPayment_2021.pdf', 'data/29_HotelFolk_aa_2021.pdf']

dropdown = widgets.Dropdown(options = pdf_paths, description = 'Select PDF:')
input_path = widgets.Text(placeholder='Enter PDF path', description = 'Custom Path:')
page_number_input = widgets.IntText(value =1, description = 'Page Number:')
button = widgets.Button(description = 'Process PDF')
output = widgets.Output()

def render_pdf_page_as_image(pdf_path, page_number):
    pdf_document = fitz.open(pdf_path)
    page_number = int(page_number)
    page = pdf_document.load_page(page_number-1)
    pix = page.get_pixmap()
    return pix

def on_button_click(b):
    with output:
        pdf_path = dropdown.value if dropdown.value else input_path.value
        #page_number = page_number_input.value
        result, page_number = pdf_rag_model(pdf_path)
        print(result)
        
        #render image and display PDF page as image
        image = render_pdf_page_as_image(pdf_path, page_number)
        display(Image(data=image.tobytes(), format='png'))
        
button.on_click(on_button_click)

display(dropdown)
display(input_path)
display(page_number_input)
display(button)
display(output)
        


Dropdown(description='Select PDF:', options=('data/27_Kilnbridge_aa_2021-06-30.pdf', 'data/28_TrustPayment_202…

Text(value='', description='Custom Path:', placeholder='Enter PDF path')

IntText(value=1, description='Page Number:')

Button(description='Process PDF', style=ButtonStyle())

Output()

In [18]:
image = render_pdf_as_image(pdf_path, page_number)
display(Image(data=image.tobytes(), format='png'))

NameError: name 'pdf_path' is not defined