In [1]:
import sys

# Colab only: restart the kernel automatically after installs so the environment can
# see the new packages. Outside Colab this cell does nothing; local credentials are
# expected to be discovered as described on
# https://cloud.google.com/docs/authentication/application-default-credentials
if "google.colab" in sys.modules:
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [2]:
from __future__ import annotations
import backoff
from tenacity import retry, stop_after_attempt, wait_random_exponential
from google.api_core.exceptions import ResourceExhausted
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import AlreadyExists
from google.cloud import documentai
import numpy as np
import glob
import os
from typing import Dict, List
import pandas as pd
from logging import error
import re
import textwrap
from typing import Tuple, List
import vertexai
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel
from PyPDF2 import PdfReader, PdfWriter
import json
import time
import numpy as np
import glob

2024-01-08 17:13:43.118666: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from set_processor import create_processor, process_document

In [7]:
from text_extraction_from_pdf import split_and_save_pdf, text_extraction_from_pdf
from text_chunking import text_to_sentences, text_to_paragraph, paragraphs_to_df
from create_embeddings import get_embedding, get_context_from_question, text_generation_model_with_backoff

## 1. Import validation data

In [8]:
# Load the validation table from the file named "df_csv" in the working directory.
df_companies = pd.read_csv(filepath_or_buffer="df_csv")

## 2. Process given text

In [9]:
# Report text to analyse: row 1, column 5 of the validation table
# (Breeze Motor's report, per the original note).
text = df_companies.iloc[1, 5]

# Step 1 - split the text into paragraph chunks and tabulate them.
# Author's note: with a chunk size of 40, chunk 3 holds the relevant information.
chunk_size = 20
sentence_chunks = text_to_paragraph(text, chunk_size)
df = paragraphs_to_df(sentence_chunks)
# Only for index 8 (special steel group) the last row was dropped as well:
# df = df.drop(df.index[-1])

# Step 2 - compute one embedding vector (Gecko, per the original note) per chunk and
# store it in the "embedding" column. This may take several minutes to complete.
get_embedding.counter = 0  # reset the counter attribute before the batch of calls
df["embedding"] = df["text"].apply(get_embedding)
df

Unnamed: 0,paragraph_number,text,embedding
0,1,Breeze Motor CompanyLimitedRegistered number: ...,"[0.002161432756111026, -0.01833339035511017, -..."
1,2,5th Floor Merck HouseSeldown LanePooleDorsetBH...,"[-0.013716379180550575, -0.015292800031602383,..."
2,3,Group Strategic Report1-5Director's Report6-8I...,"[2.6124604119104333e-05, -0.023879574611783028..."
3,4,BREEZE MOTOR COMPANY LIMITEDGROUP STRATEGIC RE...,"[0.00749810878187418, -0.02356719598174095, -0..."
4,5,to look to grow the business where the managem...,"[7.044868834782392e-05, 0.0037446788046509027,..."
...,...,...,...
109,110,"2,656,9222,851,545Bank overdrafts(14,370)14,37...","[0.026068534702062607, -0.0286362636834383, -0..."
110,111,"351,42630.Pension commitmentsThe Company and G...","[-0.007525721564888954, -0.03144488483667374, ..."
111,112,"£££Not later than 1 year1,057,698976,913893,84...","[0.007913406006991863, -0.015880122780799866, ..."
112,113,"6,275,7464,501,46732.Other financial commitmen...","[0.00831800140440464, -0.029883651062846184, -..."


In [12]:
# Quick probe: embed the full report text once and show the Python type that comes back.
full_text_embedding = get_embedding(text)
type(full_text_embedding)

list

## 3. Q&A

In [10]:
%%time
# your question for the documents
question = "Give me the scope 1 emissions"

# get the custom relevant chunks from all the chunks in vector store.
context, top_matched_df = get_context_from_question(
    question,
    vector_store=df,
    sort_index_value=3,  # Top N results to pick from embedding vector search
)
prompt = f""" Answer the question as precise as possible using the provided context. \n\n
            Context: \n {context}?\n
            Question: \n {question} \n
            Answer:
  
  """

# Call the PaLM API on the prompt.
print(question)
print("PaLM Predicted:", text_generation_model_with_backoff(prompt=prompt), "\n\n")
# top 5 data that has been picked by model based on user question. This becomes the context.
print(top_matched_df)

# df.iloc[2,1] + df.iloc [7,1]

# your question for the documents
question = "Give me the scope 2 emissions"

# get the custom relevant chunks from all the chunks in vector store.
context, top_matched_df = get_context_from_question(
    question,
    vector_store=df,
    sort_index_value=3,  # Top N results to pick from embedding vector search
)
prompt = f""" Answer the question as precise as possible using the provided context. \n\n
            Context: \n {context}?\n
            Question: \n {question} \n
            Answer:
  
  """

# Call the PaLM API on the prompt.
print(question)
print("PaLM Predicted:", text_generation_model_with_backoff(prompt=prompt), "\n\n")
# top 5 data that has been picked by model based on user question. This becomes the context.
print(top_matched_df)

# your question for the documents
question = "Give me the scope 3 emissions"

# get the custom relevant chunks from all the chunks in vector store.
context, top_matched_df = get_context_from_question(
    question,
    vector_store=df,
    sort_index_value=3,  # Top N results to pick from embedding vector search
)
prompt = f""" Answer the question as precise as possible using the provided context. \n\n
            Context: \n {context}?\n
            Question: \n {question} \n
            Answer:
  
  """

# Call the PaLM API on the prompt.
print(question)
print("PaLM Predicted:", text_generation_model_with_backoff(prompt=prompt), "\n\n")
# top 5 data that has been picked by model based on user question. This becomes the context.
print(top_matched_df)

# your question for the documents
question = "Give me the total carbon emissions"

# get the custom relevant chunks from all the chunks in vector store.
context, top_matched_df = get_context_from_question(
    question,
    vector_store=df,
    sort_index_value=3,  # Top N results to pick from embedding vector search
)
prompt = f""" Answer the question as precise as possible using the provided context. \n\n
            Context: \n {context}?\n
            Question: \n {question} \n
            Answer:
  
  """

# Call the PaLM API on the prompt.
print(question)
print("PaLM Predicted:", text_generation_model_with_backoff(prompt=prompt), "\n\n")
# top 5 data that has been picked by model based on user question. This becomes the context.
print(top_matched_df)

Give me the scope 1 emissions
PaLM Predicted:  The provided context does not mention anything about scope 1 emissions, so I cannot answer this question. 


    paragraph_number                                               text
22                23  20212020Note££Turnover499,000,30077,409,820Cos...
89                90  2021Company20202021££££Consignment stock1,330,...
87                88  Profit/(Loss)££Breeze (Southampton) Limited1,0...
Give me the scope 2 emissions
PaLM Predicted:  The provided context does not contain any information about scope 2 emissions. 


    paragraph_number                                               text
89                90  2021Company20202021££££Consignment stock1,330,...
22                23  20212020Note££Turnover499,000,30077,409,820Cos...
2                  3  Group Strategic Report1-5Director's Report6-8I...
Give me the scope 3 emissions
PaLM Predicted:  The provided context does not contain any information about scope 3 emissions. 


    paragr