In [1]:
import os
import re

import openai
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

from financial_qabot_table_reader.src.table2json_copy import extract_tables
from table_and_text_parser import parse_pdf, page_text_and_tables

In [2]:
# Load settings from a local .env file; the cells below read them through os.environ / os.getenv.
load_dotenv()

True

In [3]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up.
%load_ext autoreload
%autoreload 2

In [4]:
# Module-level configuration of the openai client for an Azure OpenAI resource.
# NOTE(review): the endpoint and API version are hard-coded here, while the
# embeddings cell reads OPENAI_API_BASE / OPENAI_API_VERSION from the
# environment — confirm both are meant to point at the same resource.
openai.api_type = "azure"
openai.api_version = "2023-05-15"
openai.api_base = "https://use-gaa-openai-test1.openai.azure.com/"
# Yields None (no exception) when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [5]:
# Build the storage-account client from the connection string in the environment.
storage_connection_string = os.environ['AZURE_STORAGE_CONNECTION_STRING']
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
# Narrow the account client down to the container named in the environment.
storage_container_name = os.environ['AZURE_STORAGE_CONTAINER_NAME']
container_client = blob_service_client.get_container_client(storage_container_name)

In [6]:
# Print the name of every blob returned by the container listing.
for blob in container_client.list_blobs():
    print(blob.name)

MDA_GWO_2023_Q4.pdf
MFC_QPR_2023_Q4_EN.pdf
RBC_Preview_may_1_2023.pdf


In [7]:
def extract_blob_paths(container_client):
    """Return the https URL of every blob listed by ``container_client``.

    Each URL is assembled from the ``AZURE_STORAGE_CONTAINER_ACCOUNT`` and
    ``AZURE_STORAGE_CONTAINER_NAME`` environment variables followed by the
    blob's name. URLs come back in listing order; the blob name is appended
    verbatim (no URL-encoding is applied).
    """
    return [
        f"https://{os.environ['AZURE_STORAGE_CONTAINER_ACCOUNT']}"
        f".blob.core.windows.net/{os.environ['AZURE_STORAGE_CONTAINER_NAME']}"
        f"/{blob.name}"
        for blob in container_client.list_blobs()
    ]

In [8]:
# Build the URL of every blob in the container.
list_of_blob_paths = extract_blob_paths(container_client)

In [10]:
def parse_pdfs(list_of_blob_paths):
    """Run ``parse_pdf`` on each path and return the results in input order."""
    return [parse_pdf(blob_path) for blob_path in list_of_blob_paths]

In [11]:
# Parse every PDF; result_dicts holds one parse_pdf result per blob path.
result_dicts = parse_pdfs(list_of_blob_paths)

In [12]:
# One entry per parsed report, so the length is the number of reports processed.
print('Number of reports:', len(result_dicts))

Number of reports: 3


In [13]:
# Run page_text_and_tables over every parsed report, keeping report order.
paged_text_and_tables = [
    page_text_and_tables(parsed_report) for parsed_report in result_dicts
]

In [14]:
def _joined_page_text(report, page_num):
    """Return the concatenated text of ``page_num``, or '' if the page or its text is missing."""
    page = report.get(page_num)
    if not page:
        return ''
    return ''.join(page.get('text') or [])


def convert_pages_to_table_docs(paged_text_and_tables):
    """Wrap every table of every report in a ``Document`` with surrounding text.

    Args:
        paged_text_and_tables: iterable of reports. Each report is a dict keyed
            by integer page number whose values are dicts with a ``'text'``
            entry (pieces of text that get joined) and a ``'tables'`` entry
            (objects exposing ``to_string()``).

    Returns:
        A list with one ``Document`` per table. ``page_content`` is
        ``table.to_string()`` and ``metadata['text']`` is the text of the
        previous, current and next page concatenated in document order.
        Neighbouring pages that do not exist (before the first page, after the
        last page, or gaps in the numbering) contribute nothing.

    The earlier implementation tested ``page_num > 1`` before checking for the
    last page, so tables on the final page raised ``KeyError`` when the
    non-existent following page was looked up; single-page and empty reports
    failed as well. Looking the neighbours up defensively removes all of those
    failure modes.
    """
    tables_and_text_docs = []

    for report in paged_text_and_tables:
        for page_num, tables_and_text in report.items():
            # A missing or None 'tables' entry simply means the page has no tables.
            tables = tables_and_text.get('tables') or []
            if not tables:
                continue

            # Context is the same for every table on the page, so build it once.
            metadata = ''.join(
                _joined_page_text(report, neighbour)
                for neighbour in (page_num - 1, page_num, page_num + 1)
            )

            for table in tables:
                tables_and_text_docs.append(
                    Document(page_content=table.to_string(), metadata={'text': metadata})
                )

    return tables_and_text_docs
    
# Turn every table found in the paged reports into a Document.
list_of_table_docs = convert_pages_to_table_docs(paged_text_and_tables)

In [15]:
def cleanup_whitespace(s: str):
    """Collapse every run of whitespace in ``s`` into a single space.

    Leading and trailing runs are collapsed as well, not stripped.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", s)


def preprocess_docs(lang_chunks):
    """Normalise whitespace in the ``page_content`` of every document.

    The documents are modified in place and the very same list object is
    returned, so the caller's original list reflects the cleanup too.
    """
    for doc in lang_chunks:
        doc.page_content = cleanup_whitespace(doc.page_content)
    return lang_chunks

# preprocess_docs edits the documents in place and returns the same list, so
# lang_chunks_clean and list_of_table_docs refer to the same cleaned documents.
lang_chunks_clean = preprocess_docs(list_of_table_docs)

In [16]:
# Alternative splitter kept for reference:
# text_splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=20)
# NOTE(review): lang_chunks is not what gets indexed below — add_documents is
# called with lang_chunks_clean. Confirm which of the two is intended.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=20)
lang_chunks = text_splitter.split_documents(list_of_table_docs)

In [26]:
# for i, chunk in enumerate(lang_chunks):
#     print('Chunk:', i, 'Chunk length:', len(chunk.page_content), '\n', chunk.page_content, '\n')

In [18]:
# Connection settings for the embedding deployment; everything except the
# deployment name is read from the environment.
embedding_settings = {
    'deployment': 'text-embedding-ada-002-v2',
    'openai_api_base': os.environ['OPENAI_API_BASE'],
    'openai_api_type': os.environ['OPENAI_API_TYPE'],
    'openai_api_key': os.environ['OPENAI_API_KEY'],
    'openai_api_version': os.environ['OPENAI_API_VERSION'],
    # 'chunk_size': 1,
}
embeddings = OpenAIEmbeddings(**embedding_settings)

# Callable handed to the vector store to embed text.
embedding_function = embeddings.embed_query

  warn_deprecated(


In [20]:
# Field definitions passed to the AzureSearch vector store.
# NOTE(review): only the "metadata" field is active; the id / content /
# content_vector definitions are commented out. Presumably the target index
# already exists with those fields — confirm before running against a new index.
fields = [
    # SimpleField(
    #     name="id",
    #     type=SearchFieldDataType.String,
    #     key=True,
    #     filterable=True,
    # ),
    # SearchableField(
    #     name="content",
    #     type=SearchFieldDataType.String,
    #     searchable=True,
    # ),
    # SearchField(
    #     name="content_vector",
    #     type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    #     searchable=True,
    #     vector_search_dimensions=len(embedding_function("Text")),
    #     vector_search_configuration="default", ##the "default" option is explained below
    # ),
    # String field named "metadata", declared searchable and filterable.
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
        filterable=True,
    ),
    # # Additional field to store the title
    # SimpleField(
    #     name="policy_number",
    #     type=SearchFieldDataType.String,
    #     filterable=True,
    #     searchable=True,
    # )
]

In [21]:
# azure_ai_search_name = 'nlp-ai-search1'
# The endpoint is derived from the search service name held in the environment.
search_service_name = os.environ['AZURE_AI_SEARCH_SERVICE_NAME']
azure_search_endpoint = f"https://{search_service_name}.search.windows.net"

# Vector store bound to the index named in the environment; text is embedded
# with embedding_function and the index schema hints come from `fields`.
acs_vector_store = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=os.environ['AZURE_AI_SEARCH_KEY'],
    index_name=os.environ['AZURE_AI_SEARCH_INDEX_NAME'],
    embedding_function=embedding_function,
    fields=fields,
)

In [22]:
# # [START delete_index]
# client = SearchIndexClient(service_endpoint, AzureKeyCredential(key))
# name = "hotels"
# client.delete_index(name)
# # [END delete_index]

In [23]:
# NOTE: the existing index needs to be cleared out before adding documents;
# this cell does not do that, so re-running it adds the documents again.
# The documents indexed are lang_chunks_clean (whitespace-normalised table
# documents), not the split lang_chunks.
acs_vector_store.add_documents(documents=lang_chunks_clean)

['OGI3ZDQyZDItZWVmNi00NzI0LWFjOTYtOGRhNzkxYmQ5NzNh',
 'OTFhNjBiOTAtMGVhNi00MDc1LWI5YzktMDEwMTM2Y2EyNzJi',
 'Y2EwNzVmZDMtNTE1OS00YjU0LWJkNjktNDdmMjc5YWY0Y2Iy',
 'OWZlMzYzMjAtZjI4Ni00NmFlLWJmMjMtY2I3NDQ4MzUyMWI0',
 'NWUxNTE0Y2ItMjVlNC00YTI3LWEwN2UtMGUwOTk1NGU0YjVh',
 'ZWMwYzkxM2MtZDRkYS00ZTU2LThlZTQtOTNlMzQzOGM5MGRh',
 'N2U0NTVhMTgtNmQ2ZS00MzIxLTg1ZWYtOGMxMTUyZDRiMGFi',
 'ZDM4NmQyM2YtMDU3OC00ODQ3LWFiMWMtMzlkZTM4NjdiNmQ2',
 'ODU5YjZlYTYtZmUxOC00ZmJlLWEwNjgtYTdkMGNlNzFkM2Q3',
 'NzQ3OWQ3ZDMtODI3ZS00NzMxLWJlNTEtODIyOGQwNTRmNThk',
 'OWQ0ZjQzMjUtMzM5My00NDJkLWI1YmQtMzgzYjkwYWM4ODE1',
 'OGZmNmJmZjEtMTM3ZS00YWJhLWI3NmYtNTY4ZTQ2MTMxY2U1',
 'ODQzMGVmYTMtYjFmZi00MTY5LWJiMzgtZWIwNjQxNGYyYjgx',
 'NjA3NDIxYjAtMmFjYy00M2YzLWE0MjMtYjM2NTExM2NlOGM5',
 'OGQzMTVlMzctNmVkNC00NDI5LWE1NDAtZmU4MTk2ZWE0OTNk',
 'NzFkMzU4OTctMjRlZS00YTc3LWE2ZTAtNTI3MGU1MDVjOGRk',
 'MjY1MTY4NTYtZmFjZC00N2UwLWI3YzItMDRmN2RmY2RmOWFm',
 'OWI2N2Q5ODUtMmNjMC00MTM2LTgyMTctZWY0MDgxZGYyMjQ3',
 'YzA1ZTRlY2YtYzRhZS00NDAzLThkYzAtNmQ0YzdmMTk4

In [31]:
# input_type = 'List of JSONs'
# input_type = 'List of DataFrames'

query = """
What is the New Business CSM in Asia for 2023?
"""

# Fetch the single closest document (k=1) for the question.
returned_chunks = acs_vector_store.similarity_search(
    query=query, k=1, search_type="similarity"
)

# Show each hit together with its position in the result list.
for i, chunk in enumerate(returned_chunks):
    print('Chunk', i, '\n', chunk, '\n')

# Keep only the text of each hit; the prompt cell reads this list as `input`
# (the name shadows the builtin but is kept because that cell depends on it).
input = [hit.page_content for hit in returned_chunks]

Chunk 0 
 page_content='0 None None None None None None None None 0 New business CSM, net of NCI 1 Hong Kong $ 199 $ 167 $ 191 $ 119 $ 110 $ 676 $ 437 2 Japan 42 29 19 36 28 126 140 3 Asia Other 173 206 222 146 186 747 732 4 International High Net Worth 231 197 5 Mainland China 138 12 6 Singapore 244 189 7 Vietnam 87 305 8 Other Emerging Markets 47 29 9 Asia 414 402 432 301 324 1,549 1,309 10 Canada 70 51 57 46 47 224 199 11 U.S. 142 54 103 95 71 394 387 12 Total new business CSM net of NCI 626 507 592 442 442 2,167 1,895 13 Asia NCI 39 46 38 19 - 142 20 14 Total impact of new insurance business in CSM $ 665 $ 553 $ 630 $ 461 $ 442 $ 2,309 $ 1,915 15 New business CSM, net of NCI, CER adjustment(1),(2) 16 Hong Kong $ - $ 3 $ 3 $ 1 $ - $ 7 $ 20 17 Japan - (1) (1) (3) (1) (5) (10) 18 Asia Other - 5 (1) (3) 2 1 22 19 International High Net Worth 3 7 20 Mainland China - - 21 Singapore 2 13 22 Vietnam (2) 3 23 Other Emerging Markets (2) (1) 24 Asia - 7 1 (5) 1 3 32 25 Canada - - - - - - - 26

In [32]:
# Join the retrieved chunks into plain text. Interpolating the list itself
# would put its Python repr (brackets, quotes, escape sequences) into the
# prompt instead of the table text.
context = "\n\n".join(input)

prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{context}
```
"""

# Chat messages: a fixed system instruction plus the assembled prompt.
message_text = [{"role":"system","content":"You are an AI assistant that helps people find information."},
{"role": "user","content": prompt}]

# `engine` is the Azure OpenAI deployment name.
completion = openai.ChatCompletion.create(
  engine="gpt-4-32k", # model = "deployment_name" # try gpt-4
  messages = message_text,
  temperature=0.7, # 0.7
  max_tokens=800,
  top_p=0.95
)

# print(prompt)
# Last expression of the cell: display the text of the first returned choice.
completion.get('choices')[0].get('message').get('content')

"I'm sorry, but the data provided does not include information on the New Business CSM in Asia for 2023."