In [56]:
import os
import sys
sys.path.append('..')

import openai
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter

from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# from scripts.table_and_text_parser import start_blob_client, extract_blob_paths, parse_pdfs, page_text_and_tables

from financialqa.ingestion.parser import parse_pdfs, page_text_and_tables
from financialqa.ingestion.ingestionpipeline import IngestionPipeline

from dotenv import load_dotenv
load_dotenv() # load environment variables from .env

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# Configure the module-level openai client settings for an Azure endpoint.
openai.api_type = "azure"
# API version in use; "2023-09-15-preview" was left as a commented-out alternative.
openai.api_version = "2023-05-15"
openai.api_base = "https://use-gaa-openai-test1.openai.azure.com/"
# The key comes from the environment (see load_dotenv above); os.getenv
# returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [58]:
# Instantiate the project's IngestionPipeline. Later cells call it to obtain
# the blob container, search and embedding clients, and to chunk and index
# the parsed documents.
ingestion_pipeline = IngestionPipeline()

In [59]:
# Client for the blob container holding the source files; it is handed to
# extract_blob_paths below.
# NOTE(review): which storage account/container is used is decided inside
# IngestionPipeline — presumably from environment variables; confirm.
container_client = ingestion_pipeline.get_blob_container_client()

In [94]:
# Gather the blob paths from the container; blob_paths is the input to parse_pdfs.
# The recorded output lists one PDF file name per line, presumably printed by
# this call — confirm in IngestionPipeline.extract_blob_paths.
blob_paths = ingestion_pipeline.extract_blob_paths(container_client)

AIA_2023_Q4.pdf
BHF_2023_Q1.pdf
BHF_2023_Q2.pdf
BHF_2023_Q3.pdf
IA_Financial_2023_Q2.pdf
IA_Financial_2023_Q3.pdf
LNC_2023_Q4.pdf
LNC_Lincoln_National_Corp_2023_Q1.pdf
LNC_Lincoln_National_Corp_2023_Q2.pdf
LNC_Lincoln_National_Corp_2023_Q3.pdf
MDA_AIA_2023_Q4.pdf
MDA_GWO_2023_Q4.pdf
MDA_IA_Financial_2023_Q1.pdf
MDA_IA_Financial_2023_Q2.pdf
MDA_IA_Financial_2023_Q3.pdf
MDA_IA_Financial_2023_Q4.pdf
MDA_PRU_2023_Q4.pdf
MDA_SLF_2023_Q4.pdf
MET_2023_Q4.pdf
MFC_2023_Q2.pdf
MFC_2023_Q3.pdf
MFC_2023_Q4.pdf
MFC_QPR_2023_Q4_EN.pdf
PFG_2023_Q4.pdf
PRU_2023_Q4.pdf
PRU_2023_Q5.pdf
PRU_2023_Q6.pdf
PRU_prudential-financial_2023_Q1.pdf
PRU_prudential-financial_2023_Q2.pdf
PRU_prudential-financial_2023_Q3.pdf
RBC_Preview_may_1_2023.pdf
SLF_2023_Q1.pdf
SLF_2023_Q2.pdf
SLF_2023_Q3.pdf
SLF_2023_Q4.pdf
SLF_4Q2023.pdf
SLF_MDNA_Q4_2023.pdf
UNM_2023_Q4.pdf


In [63]:
# Parse the files referenced by blob_paths; the result is consumed by
# page_text_and_tables in the next cell.
# NOTE(review): the structure of result_dicts is defined in
# financialqa.ingestion.parser and is not visible here.
result_dicts = parse_pdfs(blob_paths)

In [64]:
# Turn the parser output into the per-page text/table structure that
# convert_pages_to_table_docs expects (presumably one entry per page — confirm
# in financialqa.ingestion.parser).
paged_text_and_tables = page_text_and_tables(result_dicts)

In [67]:
# Convert the paged content into document objects for chunking. Downstream
# cells read .page_content and .metadata on the resulting chunks, so these are
# presumably LangChain Document instances — confirm.
lang_doc_tables = ingestion_pipeline.convert_pages_to_table_docs(paged_text_and_tables)

In [74]:
# Split the documents into chunks for indexing.
# NOTE(review): whether chunk_size=800 counts tokens or characters depends on
# the splitter used inside IngestionPipeline.chunk_docs — TODO confirm.
lang_doc_tables_chunks = ingestion_pipeline.chunk_docs(lang_doc_tables, chunk_size=800)

In [150]:
# Sanity check: print the content and metadata of the first four chunks
# (indices 0-3), then the overall chunk count.
# TODO: open idea from the original notes — parse page_content back into a
# DataFrame (e.g. pd.read_csv(StringIO(...), sep=r'\s+')); not implemented.
for i, chunk in enumerate(lang_doc_tables_chunks[:4]):
    print('\nChunk', i, 'table\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)

print('\nTotal number of chunks:', len(lang_doc_tables_chunks))


Chunk 0 table
 0 Operator: Next, we'll go to Tom Gallagher with Evercore ISI. Your line is open.  
0                                                           Analyst, Evercore ISI  
Chunk metadata:
 {'text': 'Lincoln National Corp. (LNC) Q1 2023 Earnings Call C Corrected Transcript 10-May-2023 from those stress tests and the estimated potential credit loss and ratings migration impacts on capital in a stressed scenario. Ellen Gail Cooper President, Chief Executive Officer & Director, Lincoln National Corp. A Sure. So Ill take that question as well. So we have been talking for some time, first of all, about the fact that our investment portfolio, overall credit quality has been improving. And we saw, as Chris mentioned, the seventh consecutive quarter now of net positive ratings migration. We also have talked about the fact that we utilize a multi-manager framework across the entire portfolio. And so we leverage our external managers and their very professional advice in terms of how 

In [76]:
# Search-service client obtained from the pipeline; passed to index_docs below.
search_client = ingestion_pipeline.get_search_client()

In [77]:
# Embedding model obtained from the pipeline; passed to index_docs below
# (which model/deployment is used is decided inside IngestionPipeline).
embedding_model = ingestion_pipeline.get_embedding_model()

In [81]:
# Build the vector store used for retrieval in the query cell.
# WARNING: with create_new_index=True the recorded run deleted the existing
# "financial-reports" index before recreating it, so re-running this cell is
# destructive.
# NOTE(review): add_docs=False is passed here, yet the recorded output shows
# 3306 documents being pushed — the output may come from an earlier run with
# different arguments; confirm the flag semantics in IngestionPipeline.index_docs.
acs_vector_store = ingestion_pipeline.index_docs(search_client, lang_doc_tables_chunks, embedding_model, create_new_index=True, add_docs=False)

Existing index financial-reports in search service nlp-ai-search1
Deleting existing index financial-reports in search service nlp-ai-search1
Creating new index financial-reports in search service nlp-ai-search1
Pushing documents to Azure vector store...
3306 documents successfully indexed in 335 seconds


In [82]:
# from pandas.compat import StringIO

def process_chunks_for_input(returned_chunks):
    """Wrap each retrieved chunk's ``page_content`` in its own one-item list.

    Args:
        returned_chunks: iterable of objects exposing a ``page_content``
            attribute (e.g. the results of a similarity search).

    Returns:
        A list with one single-element list per chunk, in input order.
        Chunk metadata is not included in the result.
    """
    return [[chunk.page_content] for chunk in returned_chunks]

# print('Chunk', i, '\n', chunk.text, '\n')
# for chunk in returned_chunks:
#     input.append([chunk.page_content. chunk.metadata])

In [235]:
# Question to answer. The leading/trailing newlines and the space after "?"
# are part of the string that is embedded and later placed in the prompt.
query = "\nWhat is the Full Year Core Earnings in Asia for 2023? \n"

# Retrieve the top k=3 chunks with search_type="similarity".
# Alternatives previously sketched here: search_type="hybrid", or
# similarity_search_with_relevance_scores(query=query, k=3, score_threshold=0.80).
returned_chunks = acs_vector_store.similarity_search(query=query, k=3, search_type="similarity")

# Print each retrieved chunk and its metadata for manual inspection.
for i, chunk in enumerate(returned_chunks):
    print('\nChunk', i, 'table\n', chunk.page_content)
    print('Chunk metadata:\n', chunk.metadata)

# NOTE(review): `input` shadows the Python builtin. The name is kept because
# the prompt cell interpolates {input}; rename both together when refactoring.
input = process_chunks_for_input(returned_chunks)


Chunk 0 table
 0                                                from core earnings: ($ millions)  Quarterly Results 4Q23  Quarterly Results 3Q23  Quarterly Results 4Q22  Full Year Results 2023  Full Year Results 2022
0                                                                   Core earnings                                                                                                                        
1                                                                            Asia                   $ 564                   $ 522                   $ 496                 $ 2,048                 $ 1,812
2                                                                          Canada                     352                     408                     296                   1,487                   1,387
3                                                                            U.S.                     474                     442                     408                 
Chunk

In [236]:
# Build the prompt: the question is delimited by `$` lines and the retrieved
# data by triple backticks. {input} interpolates the str() of the `input`
# variable, which this notebook assigns from process_chunks_for_input (a list
# of single-item lists), so the model sees a Python list repr of the chunks.
# NOTE(review): `input` shadows the builtin; if the query cell has not been
# run, the builtin function's repr would be interpolated instead.
prompt = f"""
Answer the QUESTION enclosed in the dollar signs (i.e, $) from the data enclosed in triple backticks (i.e., ```).
Do not answer from memory. If you do not know an answer, just say I do not know.

QUESTION: 
$
{query}
$

```
{input}
```
"""

# Chat payload: a generic system message followed by the prompt as the user turn.
message_text = [{"role":"system","content":"You are an AI assistant that helps people find information."},
{"role": "user","content": prompt}]

# Request a chat completion. Sampling is non-deterministic with these settings
# (temperature=0.7, top_p=0.95), so repeated runs may word the answer differently.
completion = openai.ChatCompletion.create(
  engine="gpt-4-32k", # presumably the Azure deployment name (original note: also try gpt-4) — confirm
  messages = message_text,
  temperature=0.7, # sampling temperature
  max_tokens=800, # upper bound on the length of the generated answer
  top_p=0.95
)

# Uncomment to inspect the exact prompt that was sent:
# print(prompt)
# Last expression of the cell: the text of the first returned choice.
completion.get('choices')[0].get('message').get('content')

'The Full Year Core Earnings in Asia for 2023 is $ 2,048 million.'