In [1]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
from dotenv import load_dotenv
from sec_api import RenderApi, XbrlApi, ExtractorApi
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

load_dotenv()

endpoint = os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZURE_COGNITIVE_SEARCH_KEY"]) if len(os.environ["AZURE_COGNITIVE_SEARCH_KEY"]) > 0 else DefaultAzureCredential()
index_name = "sec-10k"
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ["AZURE_OPENAI_KEY"] if len(os.environ["AZURE_OPENAI_KEY"]) > 0 else None
azure_openai_embedding_deployment = "text-embedding-ada-002"


search_client = SearchClient(endpoint, index_name, credential=credential)


In [2]:
system_prompt = """
You're a helpful assistant that helps user understand company risks for insurance underwriting process.mbers, math and calculations are required.
Please explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.
Please present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list, and put in a section named "EVIDENCE COLLECTED".

Not only you should provide a direct answer, you would also justify the answer using facts provided in the context and your justification based on these facts.

Below is the DATA SOURCES AND EVIDENCES we collected about the company.

### START OF DATA SOURCES AND EVIDENCES
{{data_sources}}
### END OF DATA SOURCES AND EVIDENCES

### IMPORTANT NOTES for output
Your answer should be in JSON format with BOTH "Answer" and "Links" where your answers are based from.  Both "Answer" and "Links" keys are required for JSON output, MUST have them both.

Requirements for "Answer" field:
    Your answer MUST contain into 3 sections:
    - "SUMMERY" for conclusive answer to the question
    - "ANALYSIS" sections to provide detailed analysis and walk through how the conclusion is reached
    - "EVIDENCE COLLECTED" combine all items from "SOURCES AND EVIDENCES" section, where you used in your anlysis and answer
 
    Please stick to the fact, do not fabricate and answer don't know if you don't have an answer and be honest.
    Please be extra careful with numbers, math and when calculations are required, think step by step if numbers, math and calculations are required.

    Not only you should provide a direct answer, you would also justify the answer using facts provided in the context and your justification based on these facts.

Requirements for "Links" field:
    A list of links or source names if URL isn't available.
    Links could be URL for the link to the source documents or websites. It will be used to verify the sources by the users. leave it a empty list if no sources detected.
### END OF IMPORTANT NOTES for output 

### OUTPUT JSON format (All fields including "Answers", "SUMMARY", "ANALYSIS", "EVIDENCE COLLECTED", "SOURCES in JSON schema are STRICTLY MANDATORY. ):
{{output_json}}
### END OF JSON format

START OF USER QUESTION:
"""

output_json = """{
"Answer": {
    "SUMMARY":"<The conclusive answer for the question, this is a TEXT only field>",
    "ANALYSIS":"<provide detailed analysis and walk through how the conclusion is reached. This is where you explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.>",
    "EVIDENCE COLLECTED":"<combine all items from ""SOURCES AND EVIDENCES"" section, where you used in your anlysis and answer. This is where you present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list>"
    },
"Links": ["<MANDATORY field, Links should a list of TEXT only.>"]
}
"""

question_list =["What was the year founded?",
      "What type of real estate - owned vs. leased; residential v commercial; number locations",
      "Is the company U.S. based?",
      "What is the insured's business?",
      "Have there been any acquisitions in the last five years? Please provide details",
      "Have there been any divestures in the last five years? Please provide details.",
      "Any litigation mentioned? Please provide details",
      "Historical revenues for last 5 to 10 years. Does it include acquisitions and divestures?",
      "US vs Canada vs All other revenues",
      "Any medical, hospital or pharmaceutical exposures",
      "Any auto manufacturing exposure",
      "Any third party trucking or truck brokering exposure",
      "Any alcohol mfg and/or sales (includes mfg exposures and retail sales including stores and bars and resturants.",
      "PFAS exposures","Dams and/or mining exposures.",
      "Railroad exposures","Construction exposures",
      "Cannabis and/or tobacco exposures",
      "Any alcohol mfg and/or sales (includes mfg exposures and retail sales including stores and bars and restaurants.",
      "electrical generating for self or others",
      "Aviation exposures - aircraft owned or non owned; hangar",
      "Watercraft exposures - ships/wharf",
      "Does the insured have discontinued operations",
      "If yes, to above, please explain",
      "Comment on premises / operations exposure if applicable",
      "Comment on products / completed ops exposure",
      "Liquor liability?",
      "Provide an overall narrative about the operations (use dell risk to prompt it to show how detailed)",
      "Other Named Insureds: Provide an overall narrative about the operations (use dell risk to prompt it to show how detailed)"]

In [3]:
response_list = []

from openai import AzureOpenAI
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_KEY"),  
  api_version="2024-05-01-preview"
)

for q in question_list:
    vector_query = VectorizableTextQuery(text=q, k_nearest_neighbors=3, fields="vector", exhaustive=True)

    results = search_client.search(  
        search_text=None,  
        vector_queries= [vector_query],
        select=["parent_id", "chunk_id", "title", "chunk"],
        top=3
    )  
      
    #concatenate all the 10-Ks into a single string
    context = ""
    for result in results:  
        # print(f"Source: {result['title']}\nContent: {result['chunk']}\n\n")
        context = context + f"Source: {result['title']}\nContent: {result['chunk']}\n\n"

    sys_message = system_prompt.format(data_sources=context, output_json=output_json)


    # Generate the system message
    response = client.chat.completions.create(
    model="gpt-4o",
    messages = [{"role":"system", "content":sys_message},
                {"role":"user","content": q,}],
        max_tokens=2000,
        response_format={"type":"json_object"},
        )
    
    response_list.append({q:response.choices[0].message.content})

In [4]:
for a in response_list:
    for k in a.keys():
        print(k)
        print(a[k])
        print('='*100)

What was the year founded?
{
    "Answer": {
        "SUMMARY": "The year the company was founded is not available in the provided data.",
        "ANALYSIS": "Upon reviewing the provided data sources and evidences concerning the company, there is no specific mention of the year when the company was founded. The documents and information appear to focus on other aspects of the company's operations, financial data, or recent updates rather than historical or foundational details.",
        "EVIDENCE COLLECTED": "After thoroughly reviewing the available context, there is no explicit information regarding the founding year of the company. The provided data does not contain such historical detail."
    },
    "Links": []
}
What type of real estate - owned vs. leased; residential v commercial; number locations
{
  "Answer": {
    "SUMMARY": "The company primarily engages with commercial real estate and it owns rather than leases its properties. They have multiple locations for their busines

: 

In [55]:
a.items()

dict_items([('What was the year founded?', '{\n    "Answer": {\n        "SUMMERY": "The year founded is not specified in the provided data sources.",\n        "ANALYSIS": "Upon reviewing the data sources, the specific year the company was founded is not mentioned in the available details.",\n        "EVIDENCE COLLECTED": "There is no explicit evidence or mention of the founding year in the provided data sources."\n    },\n    "Links": []\n}')])