In [35]:
import os
import sys
sys.path.append('..')

import openai
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter

from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# from scripts.table_and_text_parser import start_blob_client, extract_blob_paths, parse_pdfs, page_text_and_tables

from financialqa.ingestion.parser import parse_pdfs, page_text_and_tables
from financialqa.ingestion.ingestionpipeline import IngestionPipeline

from dotenv import load_dotenv
load_dotenv() # load environment variables from .env

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
openai.api_type="azure"
openai.api_version="2023-05-15"
openai.api_base="https://use-gaa-openai-test1.openai.azure.com/"
openai.api_key=os.getenv('OPENAI_API_KEY')

In [20]:
ingestion_pipeline = IngestionPipeline()

In [39]:
container_client = ingestion_pipeline.get_blob_container_client()

In [200]:
report_contents = ingestion_pipeline.extract_report_contents(container_client)
report_contents.popitem()

19:51:42 INFO financialqa.ingestion.ingestionpipeline line 77  Extracting report contents...


('UNM_2023_Q4.pdf',
 {'company_name': 'UNM',
  'report_quarter': 'Q4',
  'report_blob_path': 'https://eliasstorage2.blob.core.windows.net/messiah-test/UNM_2023_Q4.pdf'})

In [201]:
for report_name in report_contents.keys():
    print(report_name)

AIA_2023_Q4.pdf
BHF_2023_Q1.pdf
BHF_2023_Q2.pdf
BHF_2023_Q3.pdf
IA_Financial_2023_Q2.pdf
IA_Financial_2023_Q3.pdf
LNC_2023_Q4.pdf
LNC_Lincoln_National_Corp_2023_Q1.pdf
LNC_Lincoln_National_Corp_2023_Q2.pdf
LNC_Lincoln_National_Corp_2023_Q3.pdf
MDA_AIA_2023_Q4.pdf
MDA_GWO_2023_Q4.pdf
MDA_IA_Financial_2023_Q1.pdf
MDA_IA_Financial_2023_Q2.pdf
MDA_IA_Financial_2023_Q3.pdf
MDA_IA_Financial_2023_Q4.pdf
MDA_PRU_2023_Q4.pdf
MDA_SLF_2023_Q4.pdf
MET_2023_Q4.pdf
MFC_2023_Q2.pdf
MFC_2023_Q3.pdf
MFC_2023_Q4.pdf
MFC_QPR_2023_Q4_EN.pdf
PFG_2023_Q4.pdf
PRU_2023_Q4.pdf
PRU_2023_Q5.pdf
PRU_2023_Q6.pdf
PRU_prudential-financial_2023_Q1.pdf
PRU_prudential-financial_2023_Q2.pdf
PRU_prudential-financial_2023_Q3.pdf
RBC_Preview_may_1_2023.pdf
SLF_2023_Q1.pdf
SLF_2023_Q2.pdf
SLF_2023_Q3.pdf
SLF_2023_Q4.pdf
SLF_4Q2023.pdf
SLF_MDNA_Q4_2023.pdf


In [202]:
result_dicts = parse_pdfs(report_contents)

19:52:21 INFO financialqa.ingestion.parser line 22  Parsing PDFs...


In [276]:
paged_text_and_tables = page_text_and_tables(result_dicts)

In [277]:
lang_doc_tables = ingestion_pipeline.convert_pages_to_table_docs(paged_text_and_tables)

20:30:18 INFO financialqa.ingestion.ingestionpipeline line 103  Converting pages to langchain table documents...


In [278]:
lang_doc_tables_chunks = ingestion_pipeline.chunk_docs(lang_doc_tables, chunk_size=400)

20:30:18 INFO financialqa.ingestion.ingestionpipeline line 130  Chunking langchain documents...


In [279]:
for i, chunk in enumerate(lang_doc_tables_chunks):
    print('\nChunk', i, 'table\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)
    if i == 2:
        break

print('\nTotal number of chunks:', len(lang_doc_tables_chunks))


Chunk 0 table
 0   ($ millions, unless otherwise stated)  Quarterly Results 4Q23  Quarterly Results 4Q22  Quarterly Results Change2,5  Full Year Results 2023  Full Year Results 2022  Full Year Results Change
0   Net Income attributed to shareholders                 $ 1,659                 $ 915 /                        81% /                 $ 5,103             $ (1,933) /                      nm /
1                          / Transitional                                         $ 1,228                          35%                                         $ 3,498                       40%
2                           Core Earnings                 $ 1,773                 $ 1,543                          15%                 $ 6,684                 $ 5,801                       13%
3                  EPS / Transitional ($)                  $ 0.86                $ 0.43 /                        97% /                  $ 2.61              $ (1.15) /                      nm /
4                  

In [280]:
search_client = ingestion_pipeline.get_search_client()

20:30:21 INFO financialqa.ingestion.ingestionpipeline line 140  Getting search client...


In [281]:
embedding_model = ingestion_pipeline.get_embedding_model()

20:30:22 INFO financialqa.ingestion.ingestionpipeline line 149  Getting OpenAI embedding model...


In [282]:
acs_vector_store = ingestion_pipeline.index_docs(
    search_client, lang_doc_tables_chunks, 
    embedding_model, create_new_index=True, add_docs=False
    )

20:30:23 INFO financialqa.ingestion.ingestionpipeline line 164  Uploading documents to Azure AI Search index...


Deleting existing index financial-reports in search service nlp-ai-search1
Creating new index financial-reports in search service nlp-ai-search1
Pushing documents to Azure vector store...
24 documents successfully indexed in 3 seconds


In [283]:
# from pandas.compat import StringIO

def process_chunks_for_input(returned_chunks):
    augmented_input = []
    for i, chunk in enumerate(returned_chunks):
        # print('Chunk', i, '\n', chunk.page_content, '\n')
        # print('Chunk', i, 'page content', '\n', chunk.page_content)
        # print('Chunk', i, 'metadata' '\n', chunk.metadata.get('text'), '\n')
        augmented_input.append([chunk.page_content])
        # augmented_input.append([chunk.page_content, chunk.metadata.get('text')])
    return augmented_input

# print('Chunk', i, '\n', chunk.text, '\n')
# for chunk in returned_chunks:
#     input.append([chunk.page_content. chunk.metadata])

In [284]:
query = """
What is the Full Year Core Earnings in Asia for 2023? 
"""

import logging
logging.disable(logging.WARNING)
returned_chunks = acs_vector_store.similarity_search(
    query=query,
    k=3,
    search_type="similarity",
)

# docs = acs_vector_store.similarity_search(
#     query=query,
#     k=3,
#     search_type="hybrid",
# )

# returned_chunks = acs_vector_store.similarity_search_with_relevance_scores(
#     query=query,
#     k=3,
#     score_threshold=0.80,
# )

for i, chunk in enumerate(returned_chunks):
    print('\nChunk', i, 'table\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)

input = process_chunks_for_input(returned_chunks)


Chunk 0 table
 0  (Canadian $ millions, post-tax and based on actual foreign exchange rates in effect in the applicable reporting period, unless otherwise stated)  2023 Asia  2023 Canada  2023 U.S.  2023 Global WAM  2023 Corporate and Other  2023 Total
0                                                                                                                          Core earnings (post-tax)    $ 2,048      $ 1,487    $ 1,759          $ 1,321                      $ 69     $ 6,684
1                                                                                                                                 CER adjustment(1)       (10)            -         15                7                         2          14
2                                                                                                               Core earnings, CER basis (post-tax)    $ 2,038      $ 1,487    $ 1,774          $ 1,328                      $ 71     $ 6,698
3                               

In [285]:
prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{input}
```
"""

message_text = [{"role":"system","content":"You are an AI assistant that helps people find information."},
{"role": "user","content": prompt}]

completion = openai.ChatCompletion.create(
  engine="gpt-4-32k", # model = "deployment_name" # try gpt-4
  messages = message_text,
  temperature=0.7, # 0.7
  max_tokens=800,
  top_p=0.95
)

# print(prompt)
completion.get('choices')[0].get('message').get('content')

'The Full Year Core Earnings in Asia for 2023 is $2,048 million.'