In [1]:
import os
import re
import sys
sys.path.append('..')

import openai
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter

from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

from financial_qabot_table_reader.src.table2json_copy import extract_tables
from scripts.table_and_text_parser import parse_pdf, page_text_and_tables

from dotenv import load_dotenv
load_dotenv() # load environment variables from .env

True

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Configure the module-level openai settings for an Azure-hosted deployment.
openai.api_type="azure"
openai.api_version="2023-05-15"
# NOTE(review): the endpoint is hard-coded here, while the embeddings cell reads
# OPENAI_API_BASE from the environment — confirm both refer to the same resource.
openai.api_base="https://use-gaa-openai-test1.openai.azure.com/"
# os.getenv returns None when the variable is unset, so a missing key is not
# reported at this point.
openai.api_key=os.getenv('OPENAI_API_KEY')

In [4]:
# Create the storage-account client from the connection string held in the
# environment (os.environ[...] raises KeyError if the variable is not set).
blob_service_client = BlobServiceClient.from_connection_string(os.environ['AZURE_STORAGE_CONNECTION_STRING'])
# Derive a client for the container named by AZURE_STORAGE_CONTAINER_NAME.
container_client = blob_service_client.get_container_client(os.environ['AZURE_STORAGE_CONTAINER_NAME'])

In [5]:
# Print the name of every blob in the container, one per line.
for blob in container_client.list_blobs():
    print(blob.name)

MDA_GWO_2023_Q4.pdf
MFC_QPR_2023_Q4_EN.pdf
RBC_Preview_may_1_2023.pdf


In [6]:
def extract_blob_paths(container_client):
    """Return the HTTPS URL of every blob exposed by ``container_client``.

    Each URL is assembled from the AZURE_STORAGE_CONTAINER_ACCOUNT and
    AZURE_STORAGE_CONTAINER_NAME environment variables followed by the blob
    name. URLs are returned in the order ``list_blobs()`` yields the blobs.
    """
    def _url_for(blob_name):
        # The environment is consulted for every blob, mirroring a per-item lookup.
        account = os.environ["AZURE_STORAGE_CONTAINER_ACCOUNT"]
        container = os.environ['AZURE_STORAGE_CONTAINER_NAME']
        return "https://" + account + ".blob.core.windows.net/" + container + "/" + blob_name

    return [_url_for(item.name) for item in container_client.list_blobs()]

In [7]:
# Build the URL of every blob currently listed in the container.
list_of_blob_paths = extract_blob_paths(container_client)

In [8]:
def parse_pdfs(list_of_blob_paths):
    """Apply ``parse_pdf`` to each path and return the results in input order."""
    return [parse_pdf(blob_path) for blob_path in list_of_blob_paths]

In [9]:
# Parse every blob URL; one result is produced per path, in the same order.
result_dicts = parse_pdfs(list_of_blob_paths)

In [10]:
# Sanity check: result_dicts holds one parse result per report.
print('Number of reports:', len(result_dicts))

Number of reports: 3


In [11]:
# One page_text_and_tables(...) result per parsed report, in the same order as result_dicts.
paged_text_and_tables = [page_text_and_tables(parsed_report) for parsed_report in result_dicts]

In [12]:
def _page_text(report, page_num):
    """Return the concatenated text of page ``page_num``, or '' if the page is missing or has no text."""
    page = report.get(page_num)
    if page is None:
        return ''
    return ''.join(page.get('text') or '')


def convert_pages_to_table_docs(paged_text_and_tables):
    """Turn every table of every report into a ``Document`` with surrounding page text.

    Args:
        paged_text_and_tables: iterable of reports. Each report is a dict
            mapping an integer page number to a dict with a ``'text'`` entry
            (joined with ``''.join``) and a ``'tables'`` entry (an iterable of
            objects exposing ``to_string()``).

    Returns:
        A list with one ``Document`` per table. ``page_content`` is
        ``table.to_string()`` and ``metadata['text']`` is the text of the
        table's page together with the text of its neighbouring pages.

    Neighbouring pages that do not exist contribute an empty string. The
    previous implementation indexed ``report[page_num + 1]`` for every page
    after the first, which raised ``KeyError`` whenever the last page of a
    report contained a table. Its dedicated last-page branch was only reachable
    for single-page reports, where it then failed on ``report[page_num - 1]``.
    """
    tables_and_text_docs = []

    for report in paged_text_and_tables:
        for page_num, tables_and_text in report.items():
            tables = tables_and_text.get('tables')
            if tables is None:
                # A page without a 'tables' entry contributes no documents.
                continue

            current_text = _page_text(report, page_num)
            next_text = _page_text(report, page_num + 1)
            if page_num > 1:
                # previous + current + next; an absent neighbour is simply ''.
                metadata = _page_text(report, page_num - 1) + current_text + next_text
            else:
                # First page: keep the established next-then-current ordering so
                # metadata for inputs that already worked is unchanged.
                metadata = next_text + current_text

            for table in tables:
                tables_and_text_docs.append(
                    Document(page_content=table.to_string(),
                             metadata={'text': metadata}))

    return tables_and_text_docs
    
# One Document per table found across all reports.
list_of_table_docs = convert_pages_to_table_docs(paged_text_and_tables)

In [13]:
def cleanup_whitespace(s:str) -> str:
    """Collapse every run of whitespace characters in ``s`` into a single space.

    Leading and trailing whitespace is reduced to one space, not stripped.
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r"\s+", " ", s)

def preprocess_docs(lang_chunks):
    """Normalise whitespace in each document's ``page_content``.

    The documents are modified in place and the very same collection that was
    passed in is returned.
    """
    for document in lang_chunks:
        normalised = cleanup_whitespace(document.page_content)
        document.page_content = normalised
    return lang_chunks

# preprocess_docs rewrites page_content in place and returns the list it was given,
# so lang_chunks_clean and list_of_table_docs refer to the same (now cleaned) list.
lang_chunks_clean = preprocess_docs(list_of_table_docs)

In [14]:
# text_splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=20)
# NOTE(review): the documents were whitespace-normalised before this point, so they
# presumably contain no newlines; if the splitter relies on a newline separator it
# may leave them unsplit and longer than chunk_size — confirm.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=20)
# NOTE(review): lang_chunks is not referenced by any later cell shown here; the
# (commented-out) indexing call uses lang_chunks_clean instead — confirm which is intended.
lang_chunks = text_splitter.split_documents(list_of_table_docs)

In [15]:
# for i, chunk in enumerate(lang_chunks):
#     print('Chunk:', i, 'Chunk length:', len(chunk.page_content), '\n', chunk.page_content, '\n')

In [16]:
# Embedding client for the 'text-embedding-ada-002-v2' deployment. All connection
# settings come from the environment; os.environ[...] raises KeyError if one is unset.
embeddings = OpenAIEmbeddings(
    deployment='text-embedding-ada-002-v2',
    openai_api_base=os.environ['OPENAI_API_BASE'],
    openai_api_type=os.environ['OPENAI_API_TYPE'],
    openai_api_key=os.environ['OPENAI_API_KEY'],
    openai_api_version=os.environ['OPENAI_API_VERSION'],
    # chunk_size = 1
    )

# Bound embed_query method; passed to the vector store as its embedding function.
embedding_function=embeddings.embed_query

  warn_deprecated(


In [17]:
# Index field definitions handed to the AzureSearch vector store.
# Only the searchable/filterable "metadata" string field is active; the other
# definitions below are commented out.
fields = [
    # SimpleField(
    #     name="id",
    #     type=SearchFieldDataType.String,
    #     key=True,
    #     filterable=True,
    # ),
    # SearchableField(
    #     name="content",
    #     type=SearchFieldDataType.String,
    #     searchable=True,
    # ),
    # SearchField(
    #     name="content_vector",
    #     type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    #     searchable=True,
    #     vector_search_dimensions=len(embedding_function("Text")),
    #     vector_search_configuration="default", ##the "default" option is explained below
    # ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
        filterable=True,
    ),
    # # Additional field to store the title
    # SimpleField(
    #     name="policy_number",
    #     type=SearchFieldDataType.String,
    #     filterable=True,
    #     searchable=True,
    # )
]

In [18]:
# azure_ai_search_name = 'nlp-ai-search1'
# Endpoint URL built from the search service name held in the environment.
azure_search_endpoint = "https://" + os.environ['AZURE_AI_SEARCH_SERVICE_NAME'] + ".search.windows.net"

# Vector store bound to the index named by AZURE_AI_SEARCH_INDEX_NAME. It embeds text
# with embedding_function and is given the field definitions from the `fields` list.
acs_vector_store = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=os.environ['AZURE_AI_SEARCH_KEY'],
    index_name=os.environ['AZURE_AI_SEARCH_INDEX_NAME'],
    embedding_function=embedding_function,
    fields=fields,
)

In [19]:
# # [START delete_index]
# client = SearchIndexClient(service_endpoint, AzureKeyCredential(key))
# name = "hotels"
# client.delete_index(name)
# # [END delete_index]

In [20]:
# We need to clear out the existing index before adding documents.
# acs_vector_store.add_documents(documents=lang_chunks_clean)

In [23]:
# input_type = 'List of JSONs'
# input_type = 'List of DataFrames'

# Question to answer; the prompt cell interpolates this string as well.
query = """
What are the full year core earnings?
"""

# Ask the vector store for the single best match (k=1).
returned_chunks = acs_vector_store.similarity_search(
    query=query,
    k=1,
    search_type="similarity",
)

# Show each retrieved chunk for manual inspection.
for i, chunk in enumerate(returned_chunks):
    print('Chunk', i, '\n', chunk.page_content, '\n')

# Collect the raw chunk texts for the prompt.
# NOTE(review): `input` shadows the Python builtin of the same name. The prompt cell
# reads this variable, so a rename has to be made in both places at once.
input = []
for chunk in returned_chunks:
    input.append(chunk.page_content)

Chunk 0 
 0 None None None None None None 0 Core earnings 1 Asia $ 564 $ 522 $ 496 $ 2,048 $ 1,812 2 Canada 352 408 296 1,487 1,387 3 U.S. 474 442 408 1,759 1,566 4 Global Wealth and Asset Management 353 361 274 1,321 1,299 5 Corporate and Other 30 10 69 69 (263) 6 Total core earnings $ 1,773 $ 1,743 $ 1,543 $ 6,684 $ 5,801 7 Items excluded from core earnings: 8 Market experience gains (losses) (133) (1,022) (655) (1,790) (2,585) 9 Change in actuarial methods and assumptions that flow directly through income 119 (14) - 105 26 10 Restructuring charge (36) - - (36) - 11 Reinsurance transactions, tax-related items and other (64) 306 340 140 256 12 Net income attributed to shareholders / Transitional $ 1,659 $ 1,013 $ 1,228 $ 5,103 $ 3,498 



In [24]:
# Build the user prompt from the question and the retrieved chunk texts.
# `input` is a list, so {input} renders as the list's repr (brackets and quotes
# included) rather than as plain text.
# NOTE(review): the question is delimited with `$`, but the retrieved financial data
# shown above also contains `$` amounts — confirm the delimiter is unambiguous enough.
prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{input}
```
"""

# Two-message chat: a generic system instruction followed by the prompt built above.
message_text = [{"role":"system","content":"You are an AI assistant that helps people find information."},
{"role": "user","content": prompt}]

# Send the chat request using the module-level openai settings configured earlier.
completion = openai.ChatCompletion.create(
  engine="gpt-4-32k", # model = "deployment_name" # try gpt-4
  messages = message_text,
  temperature=0.7, # NOTE(review): non-zero temperature makes answers vary between runs — confirm intended
  max_tokens=800,
  top_p=0.95
)

# print(prompt)
# Last expression of the cell: displays the text of the first returned choice.
completion.get('choices')[0].get('message').get('content')

'The full year core earnings are $6,684.'