In [2]:
import logging
import os
import random
import sys

sys.path.append('..')

import openai
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter

from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# from scripts.table_and_text_parser import start_blob_client, extract_blob_paths, parse_pdfs, page_text_and_tables

from financialqa.ingestion.parser import parse_pdfs, page_text_and_tables
from financialqa.ingestion.ingestionpipeline import IngestionPipeline

from dotenv import load_dotenv
load_dotenv() # load environment variables from .env

%load_ext autoreload
%autoreload 2

In [3]:
# Point the `openai` SDK at the Azure OpenAI resource used by this notebook.
openai.api_type = "azure"
openai.api_version = "2023-05-15"
openai.api_base = "https://use-gaa-openai-test1.openai.azure.com/"

# The key is read from the environment (load_dotenv() runs in the imports cell).
# os.getenv returns None when the variable is absent, which would otherwise only
# surface as a failure on the first request; stop here with a clear message instead.
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
    )

In [4]:
# Build the project's IngestionPipeline; the cells below call its methods for
# blob access, report extraction, table-document conversion, chunking and indexing.
ingestion_pipeline = IngestionPipeline()

In [5]:
# Ask the pipeline for the Azure Storage blob container client; it is the input
# to extract_report_contents in the next cell.
container_client = ingestion_pipeline.get_blob_container_client()

17:36:01 INFO financialqa.ingestion.ingestionpipeline line 61  Getting Azure Storage blob container client...


In [6]:
# Extract the report contents reachable through the container client and list
# the blob path recorded for each report, followed by the report count.
report_contents = ingestion_pipeline.extract_report_contents(container_client)

# Only the per-report values are printed; the report-name keys are not needed here.
for report_details in report_contents.values():
    print(report_details.get('report_blob_path'))

print('\nNumber of reports:', len(report_contents))

17:36:01 INFO financialqa.ingestion.ingestionpipeline line 73  Extracting report contents...


https://eliasstorage2.blob.core.windows.net/messiah-test/Barclays_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/CIBC_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/Canaccord_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/Cormark_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/CreditSuisse_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/Desjardins_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/Evercore_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/GWO_2023_Q4.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/MFC_QPR_2023_Q4.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/NB_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/RBC_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/Scotiabank_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/mes

In [7]:
# Run the project parser (financialqa.ingestion.parser.parse_pdfs) over the
# extracted reports. The structure of the result is not visible in this notebook;
# it is consumed by page_text_and_tables in the next cell.
result_dicts = parse_pdfs(report_contents)

17:36:01 INFO financialqa.ingestion.parser line 22  Parsing PDFs...


In [19]:
# Post-process the parser output with page_text_and_tables.
# NOTE(review): presumably groups text and tables per page (going by the name) - confirm in the parser module.
paged_text_and_tables = page_text_and_tables(result_dicts)

In [20]:
# Convert the paged content into table documents via the pipeline; the result
# is counted in the next cell and chunked afterwards.
lang_doc_tables = ingestion_pipeline.convert_pages_to_table_docs(paged_text_and_tables)

In [21]:
# Sanity check: number of table documents produced by the conversion step.
print('Number of table documents:', len(lang_doc_tables))

Number of table documents: 315


In [22]:
# Split the table documents into chunks and report how many were produced.
# NOTE(review): the unit of chunk_size=1500 (tokens vs. characters) is decided
# inside IngestionPipeline.chunk_docs and is not visible here - confirm.
lang_doc_tables_chunks = ingestion_pipeline.chunk_docs(lang_doc_tables, chunk_size=1500)
print('Total number of chunks:', len(lang_doc_tables_chunks))

Total number of chunks: 754


In [12]:
# Preview the first three chunks (indices 0-2) together with their metadata.
# Pairing the chunks with range(3) ends the loop after the third one.
for i, chunk in zip(range(3), lang_doc_tables_chunks):
    print('\nChunk', i, 'table\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)


Chunk 0 table
 0            First Quarter Earnings   5
0                 Great-West Lifeco   9
1                Manulife Financial  11
2                Sun Life Financial  13
3  Q1 Outlook - Macro Factors Mixed  15
4                          Appendix  18
Chunk metadata:
 {'text': 'Barclays | L&H Insurers CONTENTS Mixed Capital Market Factors and New Accounting Standard to Muddy First Quarter Earnings 5', 'page_num': 3, 'company_name': 'Barclays', 'report_quarter': 'Q1', 'report_blob_path': 'https://eliasstorage2.blob.core.windows.net/messiah-test/Barclays_Preview_Q1.pdf', 'page_titles': '', 'page_headers': '', 'section_headers': '', 'page_footers': '13 April 2023'}

Chunk 1 table
 0                                 Company  Rating Old  Rating New  Price 12-Apr- 23  Price Target EPS FY1 (E) Old  Price Target EPS FY1 (E) New  Price Target EPS FY1 (E) %Chg  Price Target EPS FY1 (E) Old  Price Target EPS FY1 (E) New  Price Target EPS FY1 (E) %Chg  EPS FY2 (E) Old  EPS FY2 (E) New  EPS FY2 

In [23]:
# Get the search client from the pipeline; it is passed to index_docs below.
search_client = ingestion_pipeline.get_search_client()

In [14]:
# Get the embedding model from the pipeline (its log line describes it as the
# OpenAI embedding model); it is passed to index_docs below.
embedding_model = ingestion_pipeline.get_embedding_model()

17:39:53 INFO financialqa.ingestion.ingestionpipeline line 161  Getting OpenAI embedding model...


In [15]:
# Index the table chunks and keep the returned vector store for the similarity
# searches further down.
# WARNING: with create_new_index=True the captured run output shows the existing
# index being deleted and recreated, so re-running this cell is destructive.
# NOTE(review): add_docs=False is passed, yet the captured output reports documents
# being pushed - the exact meaning of this flag is not visible here; confirm in
# IngestionPipeline.index_docs.
acs_vector_store = ingestion_pipeline.index_docs(
    search_client, lang_doc_tables_chunks, 
    embedding_model, create_new_index=True, add_docs=False
)

17:39:53 INFO financialqa.ingestion.ingestionpipeline line 176  Uploading documents to Azure AI Search index...


Deleting existing index financial-reports in search service nlp-ai-search1
Creating new index financial-reports in search service nlp-ai-search1
Pushing documents to Azure vector store...
754 documents successfully indexed in 78 seconds


In [16]:
def process_chunks_for_input(returned_chunks):
    """Collect the text of retrieved chunks for use as prompt context.

    Args:
        returned_chunks: Iterable of objects exposing a ``page_content``
            attribute (e.g. the results of a vector-store search).

    Returns:
        A list holding one single-element list ``[page_content]`` per chunk,
        in the same order as the input.
    """
    return [[chunk.page_content] for chunk in returned_chunks]

In [41]:
# Question sent to the vector store here and to the chat model in the next cell.
query = """
What are the APE sales in the U.S. for 2022?
"""

# Suppress log records of severity WARNING and below only while the search runs.
# logging.disable() is process-wide, so it is undone in `finally`; otherwise every
# later cell (including re-runs of the ingestion cells) would lose its log output.
logging.disable(logging.WARNING)
try:
    # Pure vector similarity search returning the 3 closest chunks.
    # Alternatives that were tried during exploration:
    #   - search_type="hybrid"
    #   - similarity_search_with_relevance_scores(query=query, k=3, score_threshold=0.80)
    #   - filters="report_quarter eq 'Q1'"
    returned_chunks = acs_vector_store.similarity_search(
        query=query,
        k=3,
        search_type="similarity",
    )
finally:
    logging.disable(logging.NOTSET)

# Show each hit with the metadata stored alongside it in the index.
for i, chunk in enumerate(returned_chunks):
    print('Chunk', i, 'from page', chunk.metadata['page_num'], 'in report blob path:', chunk.metadata['report_blob_path'], '\n\n', chunk.page_content)
    print('\nChunk metadata text:\n', chunk.metadata['text'])
    print('\nChunk metadata page title:\n', chunk.metadata['page_titles'])
    print('\nChunk metadata page header:\n', chunk.metadata['page_headers'])
    print('\nChunk metadata section header:\n', chunk.metadata['section_headers'])
    print('\nChunk metadata page footer:\n', chunk.metadata['page_footers'], '\n-------------')

# NOTE: `input` shadows the builtin input(); the name is kept because the prompt
# cell below reads it.
input = process_chunks_for_input(returned_chunks)

Chunk 0 from page 2 in report blob path: https://eliasstorage2.blob.core.windows.net/messiah-test/MFC_QPR_2023_Q4.pdf 

 0   ($ millions, unless otherwise stated)  Quarterly Results 4Q23  Quarterly Results 4Q22  Quarterly Results Change  Full Year Results 2023  Full Year Results 2022  Full Year Results Change
1                              Asia (US$)                                                                                                                                                    
2   Net Income attributed to shareholders                   $ 452                 $ 231 /                     84% /                   $ 995                 $ 516 /                     43% /
3                          / Transitional                                             363                       22%                                             481                       73%
4                           Core Earnings                     414                     365                       14%    

In [42]:
# Render the retrieved chunks as plain text before building the prompt.
# `input` is a list of one-element lists (built by process_chunks_for_input);
# interpolating it directly would put the Python list repr into the prompt -
# brackets, quotes and newlines escaped as "\n" - which flattens the table
# layout the model has to read.
context = "\n\n".join(str(text) for parts in input for text in parts)

prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{context}
```
"""

message_text = [
    {"role": "system", "content": "You are an AI assistant that helps people find information."},
    {"role": "user", "content": prompt},
]

completion = openai.ChatCompletion.create(
    engine="gpt-4-32k",  # presumably the Azure deployment name - confirm against the resource
    messages=message_text,
    # The task is extracting figures from the supplied tables, so sampling
    # randomness is turned off to keep answers repeatable between runs.
    temperature=0,
    max_tokens=800,
    top_p=0.95,
)

# Last expression of the cell: the text of the first returned choice.
completion.get('choices')[0].get('message').get('content')

'The APE sales in Canada for the fourth quarter of 2023 are $141 million. The APE sales for all of 2023 in Canada are $416 million.'