In [55]:
import os
import sys
sys.path.append('..')

import openai

from src.financialqa.ingestion.parser import parse_pdfs, page_text_and_tables
from src.financialqa.ingestion.ingestionpipeline import IngestionPipeline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# openai.api_type="azure"
# openai.api_version="2023-05-15"
# openai.api_base="https://use-gaa-openai-test1.openai.azure.com/"
# openai.api_key=os.getenv('OPENAI_API_KEY')

In [57]:
# One pipeline instance, reused by the extraction, conversion, chunking and indexing cells below.
ingestion_pipeline = IngestionPipeline()

In [63]:
# Restrict this run to a handful of reports.
# NOTE(review): presumably an empty list (select_files = []) means "no restriction" —
# confirm against extract_report_contents before relying on it.
select_files = [
    'Desjardins_Preview_Q1.pdf',
    'RBC_Preview_Q1.pdf',
    'MFC_QPR_2023_Q4.pdf',
]
report_contents = ingestion_pipeline.extract_report_contents(select_files=select_files)

# List the blob path of every extracted report, then the total count.
for vals in report_contents.values():
    print(vals.get('report_blob_path'))

print('\nNumber of reports:', len(report_contents))

22:04:15 INFO src.financialqa.ingestion.ingestionpipeline line 70  Extracting report contents from Azure Blob Storage container...


https://eliasstorage2.blob.core.windows.net/messiah-test/Desjardins_Preview_Q1.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/MFC_QPR_2023_Q4.pdf
https://eliasstorage2.blob.core.windows.net/messiah-test/RBC_Preview_Q1.pdf

Number of reports: 3


In [64]:
# Parse the extracted reports; the parser module logs this step as
# "Parsing PDFs using Azure Document Intelligence...".
result_dicts = parse_pdfs(report_contents)

22:04:16 INFO src.financialqa.ingestion.parser line 22  Parsing PDFs using Azure Document Intelligence...


In [65]:
# Regroup the parse results per page (the parser module logs this step as "Paging text and tables...").
paged_text_and_tables = page_text_and_tables(result_dicts)

22:05:04 INFO src.financialqa.ingestion.parser line 51  Paging text and tables...


In [66]:
# Convert the paged output into table documents
# (the pipeline logs this step as "Converting pages to LangChain documents...").
lang_doc_tables = ingestion_pipeline.convert_pages_to_table_docs(paged_text_and_tables)

22:05:05 INFO src.financialqa.ingestion.ingestionpipeline line 95  Converting pages to LangChain documents...


In [67]:
# Sanity check: how many table documents the conversion step produced.
print('Number of table documents:', len(lang_doc_tables))

Number of table documents: 61


In [69]:
# Split the table documents into smaller chunks before indexing.
# NOTE(review): the unit of chunk_size=800 (characters vs. tokens) is decided inside chunk_docs — confirm.
lang_doc_tables_chunks = ingestion_pipeline.chunk_docs(lang_doc_tables, chunk_size=800)
print('Total number of chunks:', len(lang_doc_tables_chunks))

22:05:12 INFO src.financialqa.ingestion.ingestionpipeline line 137  Chunking LangChain Documents...


Total number of chunks: 311


In [70]:
# Preview at most the first three chunks (table text plus metadata) to eyeball the chunking result.
# Pairing with range(3) caps the loop without an explicit break.
for i, chunk in zip(range(3), lang_doc_tables_chunks):
    print('\nChunk', i, 'table\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)


Chunk 0 table
 0  Exhibit 1: Canadian lifecos-recommendations, target prices and key metrics Rank Company  \
2                               1 Sun Life Financial                                         
3                                     2 iA Financial                                         
4                               3 Manulife Financial                                         
5                                4 Great-West Lifeco                                         

0  Exhibit 1: Canadian lifecos-recommendations, target prices and key metrics  Rating  \
2                                            Buy-AAR                                    
3                                            Buy-AAR                                    
4                                           Hold-AAR                                    
5                                           Hold-AAR                                    

0  Exhibit 1: Canadian lifecos-recommendations, target prices and k

In [71]:
# Load the chunks into the Azure AI Search index and keep the returned vector store for querying.
# NOTE: overwrite_index=True is destructive — in the recorded run the existing index was
# overwritten before the new documents were added.
acs_vector_store = ingestion_pipeline.get_search_index(
    add_docs=lang_doc_tables_chunks, overwrite_index=True,
)

22:05:19 INFO src.financialqa.ingestion.ingestionpipeline line 145  Getting Azure AI Search index messiah_search


Found existing index with name messiah_search in search service nlp-ai-search1
Overwriting existing index messiah_search in search service nlp-ai-search1
Adding 311 new documents to messiah_search
311 new documents successfully added to index messiah_search in 34s


In [72]:
def process_chunks_for_input(returned_chunks):
    """Wrap each retrieved chunk's text in its own single-element list.

    Args:
        returned_chunks: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        list: One ``[page_content]`` list per chunk, in input order. Only the
        page content is forwarded; chunk metadata is not included.
    """
    return [[doc.page_content] for doc in returned_chunks]

In [75]:
import logging

# The question sent to the retriever and, later, to the LLM prompt.
# The triple-quoted form keeps a leading and a trailing newline in the string.
query = """
What is the Sun Life Financial Reporting Date?
"""

# Suppress every log record of severity WARNING and below from here on (process-wide).
logging.disable(logging.WARNING)

# Pure vector-similarity retrieval of the three closest chunks.
# Alternatives tried earlier and left out of the call:
#   - an extra argument  filters="report_quarter eq 'Q1'"
#   - search_type="hybrid"
#   - acs_vector_store.similarity_search_with_relevance_scores(query=query, k=3, score_threshold=0.80)
returned_chunks = acs_vector_store.similarity_search(query=query, k=3, search_type="similarity")

# Dump each hit together with the metadata fields stored alongside it.
for i, chunk in enumerate(returned_chunks):
    meta = chunk.metadata
    print('Chunk', i, 'from page', meta['page_num'], 'in report blob path:', meta['report_blob_path'], '\n\n', chunk.page_content)
    print('\nChunk metadata text:\n', meta['text'])
    print('\nChunk metadata page title:\n', meta['page_titles'])
    print('\nChunk metadata page header:\n', meta['page_headers'])
    print('\nChunk metadata section header:\n', meta['section_headers'])
    print('\nChunk metadata page footer:\n', meta['page_footers'], '\n-------------')

# NOTE: this name shadows the builtin input(); it is kept because the prompt cell reads `input`.
input = process_chunks_for_input(returned_chunks)

Chunk 0 from page 2 in report blob path: https://eliasstorage2.blob.core.windows.net/messiah-test/Desjardins_Preview_Q1.pdf 

 0  Exhibit 2: Canadian lifecos-1Q23 conference call details   \
2                                 Sun Life Financial           
3                                    Great West Life           
4                                       iA Financial           
5                                 Manulife Financial           
6                            Source: Company reports           

0  Exhibit 2: Canadian lifecos-1Q23 conference call details Reporting date  \
2                                          11-May-23                         
3                                           9-May-23                         
4                                          10-May-23                         
5                                          10-May-23                         
6                                                                            

0  Exhibit 2: Canad

In [79]:
# Grounded-QA prompt: the question goes between dollar signs, the retrieved data between
# triple backticks. `input` is the list of [page_content] lists from the retrieval cell;
# interpolating it embeds the list's repr in the prompt.
prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{input}
```
"""

message_text = [
    {"role": "system", "content": "You are an AI assistant that helps people find information."},
    {"role": "user", "content": prompt},
]

# NOTE(review): `engine` looks like the deployment-name keyword of the legacy (pre-1.0) openai SDK
# used with Azure; the Azure settings in the earlier cell are commented out, so confirm where
# api_type / api_base / api_key are configured.
completion = openai.ChatCompletion.create(
    engine="gpt-4-32k",
    messages=message_text,
    temperature=0.7,
    max_tokens=800,
    top_p=0.95,
)

# print(prompt)  # uncomment to inspect the rendered prompt
# Last expression of the cell: the text of the first returned choice.
completion.get('choices')[0].get('message').get('content')

'The Sun Life Financial Reporting Date is on 11-May-23.'