In [1]:
import os
import re
import sys
sys.path.append('..')

import openai
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter

from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

from financial_qabot_table_reader.src.table2json_copy import extract_tables
from scripts.table_and_text_parser import start_blob_client, extract_blob_paths, parse_pdfs, page_text_and_tables

from dotenv import load_dotenv
load_dotenv() # load environment variables from .env

True

In [2]:
# Reload imported modules automatically before each cell executes, so edits to the
# local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Module-level configuration for the openai client used by the chat-completion cell.
openai.api_type="azure"
# openai.api_version="2023-09-15-preview"
openai.api_version="2023-05-15"
openai.api_base="https://use-gaa-openai-test1.openai.azure.com/"
# Fail fast with a KeyError when the key is missing instead of silently storing None
# (os.getenv) and hitting an opaque authentication error later; this also matches how
# the embeddings cell reads the same variable.
openai.api_key=os.environ['OPENAI_API_KEY']

In [4]:
# Create the blob container client through the project helper. Both settings are
# read with os.environ[...], so a missing variable raises KeyError here.
container_client = start_blob_client(
    azure_storage_connection_string=os.environ['AZURE_STORAGE_CONNECTION_STRING'],
    azure_storage_container_name=os.environ['AZURE_STORAGE_CONTAINER_NAME']
    )

In [5]:
# Collect the blob paths from the container, then hand them to the project's PDF parser.
# NOTE(review): parse_pdfs presumably returns one result dict per report — confirm
# against scripts/table_and_text_parser.
list_of_blob_paths = extract_blob_paths(container_client)
result_dicts = parse_pdfs(list_of_blob_paths)

In [6]:
# print('Number of reports:', len(result_dicts))
# print('\nReport names:')
# for result_dict in result_dicts:
    # print(result_dict.get('tables'))
    # print(result_dict.get('report_name'))

In [7]:
# Regroup the parsed results by page. The conversion function defined in this notebook
# iterates each element as {page_num: {'text': [...], 'tables': [...]}} — TODO confirm
# that shape against page_text_and_tables.
paged_text_and_tables = page_text_and_tables(result_dicts)

In [8]:
def convert_pages_to_table_docs(paged_text_and_tables):
    """Build one ``Document`` per table found in the paged parser output.

    Args:
        paged_text_and_tables: iterable of reports, each a mapping of
            ``page_num -> {'text': [str, ...], 'tables': [table, ...]}``.
            Each table must expose ``.values`` and ``.to_string()``
            (presumably a pandas DataFrame — confirm against the parser).
            A missing or ``None`` ``'text'`` / ``'tables'`` entry is treated
            as empty, and an empty report contributes no documents.

    Returns:
        list of ``Document`` objects. ``page_content`` is ``table.to_string()``.
        ``metadata`` holds ``'page_num'`` and ``'text'``: the page's text
        fragments, minus any fragment equal to a table cell, joined without
        a separator.

    The input is not modified: fragments are filtered on a per-page copy.
    Filtering is cumulative within a page, so the text stored with the second
    table on a page also excludes fragments matching the first table.
    """
    lang_doc_tables = []

    for report in paged_text_and_tables:
        for page_num, tables_and_text in report.items():
            # Copy so the caller's parsed text survives this call unchanged.
            remaining_text = list(tables_and_text.get('text') or [])

            for table in tables_and_text.get('tables') or []:
                # Read the cell array once per table, not once per text fragment.
                cell_values = table.values
                # Drop fragments that merely repeat a table cell, so the metadata
                # text does not duplicate the table content.
                remaining_text = [text for text in remaining_text if text not in cell_values]

                lang_doc_tables.append(Document(
                    page_content=table.to_string(),
                    metadata={'text': ''.join(remaining_text), 'page_num': page_num},
                ))

    return lang_doc_tables
    
# One Document per extracted table, built from the paged parser output.
lang_doc_tables = convert_pages_to_table_docs(paged_text_and_tables)

In [9]:
def cleanup_whitespace(s:str) -> str:
    """Collapse every run of whitespace in ``s`` into a single space.

    Newlines and tabs count as whitespace. Leading and trailing runs are
    reduced to one space, not stripped.

    Args:
        s: text to normalise.

    Returns:
        The normalised text.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about.
    return re.sub(r"\s+", " ", s)

def preprocess_docs(lang_chunks):
    """Normalise whitespace in each document's ``page_content``, in place.

    Args:
        lang_chunks: iterable of objects with a writable ``page_content``
            string attribute.

    Returns:
        The same ``lang_chunks`` object that was passed in; its elements have
        been updated rather than copied.
    """
    for chunk in lang_chunks:
        normalised = cleanup_whitespace(chunk.page_content)
        chunk.page_content = normalised
    return lang_chunks

# Normalise whitespace in every table document.
# NOTE(review): the result is bound to `lang_docs_tables` (extra "s"), while the
# splitter cell reads `lang_doc_tables`. This works only because preprocess_docs
# edits the documents in place and returns the same list object.
lang_docs_tables = preprocess_docs(lang_doc_tables)

In [10]:
# text_splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=0)
# NOTE(review): CharacterTextSplitter breaks text only at its separator (default "\n\n"
# per the LangChain docs — confirm for the installed version). cleanup_whitespace has
# already collapsed every newline into a single space, so documents longer than
# chunk_size are likely passed through unsplit; the recorded preview output shows a
# chunk of length 1740 despite chunk_size=400. Decide whether whole-table chunks are
# intended before changing this.
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=0)
lang_doc_tables_chunks = text_splitter.split_documents(lang_doc_tables)

In [11]:
# Preview the first four chunks (indices 0-3) with their length and metadata.
# Idea kept from an earlier draft: parse chunk.page_content back into a DataFrame via
# pd.read_csv(StringIO(...), sep=r'\s+') — not implemented.
for i, chunk in enumerate(lang_doc_tables_chunks[:4]):
    print('\nChunk:', i, 'Chunk length:', len(chunk.page_content), '\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)


Chunk: 0 Chunk length: 1740 
 0 Selected consolidated financial information(in Canadian $ millions, except per share amounts) Selected consolidated financial informationAs at or for the three months endedDec. 31 2023 Selected consolidated financial informationAs at or for the three months endedSept. 30 2023 Selected consolidated financial informationAs at or for the three months endedDec. 31 2022 (Restated) Selected consolidated financial informationFor the twelve months endedDec. 31 2023 Selected consolidated financial informationFor the twelve months endedDec. 31 2022 (Restated) 1 Base earnings 1,5 $ 971 $ 950 $ 894 $ 3,667 $ 3,318 2 Net earnings from continuing operations3 743 936 478 2,862 3,628 3 Net earnings - common shareholders 740 905 452 2,738 3,596 4 Per common share 5 Basic: 6 Base earnings2,5 1.04 1.02 0.96 3.94 3.56 7 Net earnings from continuing operations 0.80 1.01 0.51 3.07 3.89 8 Net earnings 0.79 0.97 0.48 2.94 3.86 9 Dividends paid 0.52 0.52 0.49 2.08 1.96 10 Book 

In [12]:
# Endpoint URL derived from the search service name held in the environment.
azure_search_endpoint = f"https://{os.environ['AZURE_AI_SEARCH_SERVICE_NAME']}.search.windows.net"

# Index-management client, authenticated with the API key from the environment.
search_client = SearchIndexClient(
    endpoint=azure_search_endpoint,
    credential=AzureKeyCredential(os.environ['AZURE_AI_SEARCH_KEY']),
)

In [13]:
# result = container_client.upload_documents(documents=lang_doc_tables_chunks)

In [14]:
# re-upload documents to index  
# def upload_docs_to_index(container_client, lang_docs):
#    DOCUMENT = {
#        "category": "Hotel",
#        "hotelId": "1000",
#        "rating": 4.0,
#        "rooms": [],
#        "hotelName": "Azure Inn",
#    }

# result = search_client.upload_documents(documents=lang_doc_tables_chunks)

# print("Upload of new document succeeded: {}".format(result[0].succeeded))    

In [15]:
# Embedding client. Apart from the deployment name, every connection setting is read
# with os.environ[...], so a missing variable raises KeyError here.
embeddings = OpenAIEmbeddings(
    deployment='text-embedding-ada-002-v2',
    openai_api_base=os.environ['OPENAI_API_BASE'],
    openai_api_type=os.environ['OPENAI_API_TYPE'],
    openai_api_key=os.environ['OPENAI_API_KEY'],
    openai_api_version=os.environ['OPENAI_API_VERSION'],
    # chunk_size = 1
    )

# Bound method that the vector-store cell passes as its embedding callable.
embedding_function=embeddings.embed_query

  warn_deprecated(


In [23]:
# We need to clear out the existing index before adding documents.
# Delete the index named by AZURE_AI_SEARCH_INDEX_NAME — the same setting the vector
# store is created with — instead of a hard-coded literal that can drift from it.
search_client.delete_index(os.environ['AZURE_AI_SEARCH_INDEX_NAME'])

# create new index
# name = "financial-reports"
# fields = [
#     SimpleField(
#         name="page_content",
#         type=SearchFieldDataType.String,
#         key=True,
#         filterable=True,
#     ),
# ]

# index = SearchIndex(name=name, fields=fields)
# result = search_client.create_index(index) # declaration is not necessary?

In [24]:
# azure_ai_search_name = 'nlp-ai-search1'
# Endpoint URL derived from the search service name held in the environment.
azure_search_endpoint = f"https://{os.environ['AZURE_AI_SEARCH_SERVICE_NAME']}.search.windows.net"

# Vector store over the environment-configured index; `embedding_function` is the
# callable it uses to embed text.
acs_vector_store = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=os.environ['AZURE_AI_SEARCH_KEY'],
    index_name=os.environ['AZURE_AI_SEARCH_INDEX_NAME'],
    embedding_function=embedding_function,
)

In [25]:
# Upload every chunk to the vector store. The call is the cell's last expression, so
# its return value is displayed; the recorded output is a list of id-like strings.
acs_vector_store.add_documents(documents=lang_doc_tables_chunks)

['MzJlZGI0ZmQtNjcxYy00MjhlLWI1NjItYWU0ODg4MjJlNGIy',
 'MmYwYjNjMGMtMzI0ZC00YjY0LWJlNjMtMzBhZmYzODEyY2Ex',
 'MGQ2MWQ1YzUtY2JmYy00NDAxLTljNjctMjkxNzNkMjUzZTAw',
 'NTFiYmM5OGMtNTI3MC00YzhjLTk0M2EtYTY2MDk5ZjQyODYy',
 'MDgzMDE4ZjktZmNkYS00MzZlLWExNWItNTlhNmU1YzdkMmVl',
 'YjI0MTY5ZTMtMGQzMC00NThlLTg3NjgtNzQwOThiZTY1Yzdk',
 'NDEzNTdhZDEtZTEyZC00YWE2LTg1M2YtODAwMTBmZDZjMWEz',
 'NDE4MWI3ODUtMmEyMC00YWEwLTljYmItODA2Zjk5ZDgwYzEy',
 'YTVhNWY1MzYtMzFmYi00NDE3LWE0MjEtMTViMmI3OTc3Mjcy',
 'MWY1OGY3MjEtYzk4OC00MzUzLTliMWMtNzExNmYxZDczY2Q5',
 'MDYyZDkyMjEtYTUwNi00ZGJiLTgxYWQtZGRiMWJmY2I5NDhi',
 'MzY3YmZmNzgtY2UwMC00NGM5LTkxZTgtMmI4M2JlOGZiMWIy',
 'Y2RmOTdmOTMtYjE2YS00YjQyLWFiOWEtNDBjNzMwYTQ0ZDg5',
 'MDFjNTg4NjUtZTY0My00YmNiLWJlNjMtZGRjZWRiMmZkMDAy',
 'NjlmYTg2ZTUtNzg5ZS00ZDk1LTk1MmEtYTU1ZGIxMzQ3YzI5',
 'M2Y2NWJlMGYtZTJlNi00ZjllLThjNGQtMTkwZDNiZmFiODM2',
 'N2NjZjU3ZGMtM2I2Mi00NmYwLWE1OWYtNTVlNmI2NWNmYzcy',
 'NDM2ZTUxNWYtNjJhNS00YjEzLTljN2MtYjc5ZGZjMWJmMDk2',
 'ODQ0NTQxNTItN2NmZi00MTljLTg3ZjMtYTFhZjRiMjJm

In [27]:
# input_type = 'List of JSONs'
# input_type = 'List of DataFrames'

query = """
What are the full year core earnings?
"""

# Fetch the three chunks most similar to the query.
returned_chunks = acs_vector_store.similarity_search(query=query, k=3, search_type="similarity")

# Show what was retrieved.
for i, chunk in enumerate(returned_chunks):
    print('Chunk', i, '\n', chunk.page_content, '\n')

# Texts of the retrieved chunks. The name `input` shadows the builtin; it is kept
# because the prompt cell interpolates this variable.
input = [chunk.page_content for chunk in returned_chunks]

Chunk 0 
 0 from core earnings: ($ millions) Quarterly Results4Q23 Quarterly Results3Q23 Quarterly Results4Q22 Full Year Results2023 Full Year Results2022 0 Core earnings 1 Asia $ 564 $ 522 $ 496 $ 2,048 $ 1,812 2 Canada 352 408 296 1,487 1,387 3 U.S. 474 442 408 1,759 1,566 4 Global Wealth and Asset Management 353 361 274 1,321 1,299 5 Corporate and Other 30 10 69 69 (263) 6 Total core earnings $ 1,773 $ 1,743 $ 1,543 $ 6,684 $ 5,801 7 Items excluded from core earnings: 8 Market experience gains (losses) (133) (1,022) (655) (1,790) (2,585) 9 Change in actuarial methods and assumptions that flow directly through income 119 (14) - 105 26 10 Restructuring charge (36) - - (36) - 11 Reinsurance transactions, tax-related items and other (64) 306 340 140 256 12 Net income attributed to shareholders / Transitional $ 1,659 $ 1,013 $ 1,228 $ 5,103 $ 3,498 

Chunk 1 
 0 Quarterly Results4Q23 Quarterly Results3Q23 Quarterly Results2Q23 Quarterly Results1Q23 Quarterly Results4Q22 Full Year Results

In [28]:
# Prompt for grounded question answering: the question sits between `$` markers and
# the retrieved data sits inside a triple-backtick fence.
# NOTE(review): `input` is the Python list built in the retrieval cell, so the f-string
# embeds its list repr (brackets and quotes included), not the joined chunk texts.
prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{input}
```
"""

# One system message plus the user prompt.
message_text = [{"role":"system","content":"You are an AI assistant that helps people find information."},
{"role": "user","content": prompt}]

# NOTE(review): openai.ChatCompletion looks like the pre-1.0 openai SDK interface —
# confirm the installed openai version before upgrading.
completion = openai.ChatCompletion.create(
  engine="gpt-4-32k", # model = "deployment_name" # try gpt-4
  messages = message_text,
  temperature=0.7, # 0.7
  max_tokens=800,
  top_p=0.95
)

# print(prompt)
# Last expression of the cell: the reply text of the first choice, shown as the output.
completion.get('choices')[0].get('message').get('content')

'The full year core earnings for 2023 are $6,684 million.'