In [46]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
from dotenv import load_dotenv
from sec_api import RenderApi, XbrlApi, ExtractorApi
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery, VectorizedQuery

# Pull settings from a local .env file (if present) into the environment.
load_dotenv()

# Azure AI Search endpoint is mandatory: a missing variable raises KeyError.
endpoint = os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"]

# Key-based auth when a search key is configured, otherwise fall back to
# DefaultAzureCredential. `.get` keeps the fallback reachable when the variable
# is absent altogether, not only when it is set to an empty string.
_search_key = os.environ.get("AZURE_COGNITIVE_SEARCH_KEY", "")
credential = AzureKeyCredential(_search_key) if _search_key else DefaultAzureCredential()

# Azure OpenAI endpoint is mandatory; the key is optional (None when unset or empty).
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY") or None
azure_openai_embedding_deployment = "text-embedding-ada-002"

# System prompt template for the underwriting question-answering assistant.
# The literal marker `{{data_sources}}` shows where the retrieved evidence
# belongs. NOTE: `str.format` treats doubled braces as escaped literal braces,
# not as a replacement field, so the marker must be substituted as plain text.
system_prompt = """
You're a helpful assistant that helps user understand company risks for insurance underwriting process.
Please explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.
Please present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list, and put in a section named "EVIDENCE COLLECTED".

Not only you should provide a direct answer, you would also justify the answer using facts provided in the context and your justification based on these facts.

Below is the DATA SOURCES AND EVIDENCES we collected about the company.

### START OF DATA SOURCES AND EVIDENCES
{{data_sources}}
### END OF DATA SOURCES AND EVIDENCES

### IMPORTANT NOTES for output
Your answer should be in JSON format with BOTH "Answer" and "Links" where your answers are based from.  Both "Answer" and "Links" keys are required for JSON output, MUST have them both.

Requirements for "Answer" field:
    Your answer MUST contain into 3 sections:
    - "SUMMARY" for conclusive answer to the question
    - "ANALYSIS" sections to provide detailed analysis and walk through how the conclusion is reached
    - "EVIDENCE COLLECTED" combine all items from "SOURCES AND EVIDENCES" section, where you used in your anlysis and answer

    Please stick to the fact, do not fabricate and answer don't know if you don't have an answer and be honest.
    Please be extra careful with numbers, math and when calculations are required, think step by step if numbers, math and calculations are required.

    Not only you should provide a direct answer, you would also justify the answer using facts provided in the context and your justification based on these facts.

Requirements for "Links" field:
    A list of links or source names if URL isn't available.
    Links could be URL for the link to the source documents or websites. It will be used to verify the sources by the users. leave it a empty list if no sources detected.
### END OF IMPORTANT NOTES for output

### OUTPUT format
    "SUMMARY":"<The conclusive answer for the question, this is a TEXT only field>",
    "ANALYSIS":"<provide detailed analysis and walk through how the conclusion is reached. This is where you explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.>",
    "EVIDENCE COLLECTED":"<combine all items from ""SOURCES AND EVIDENCES"" section, where you used in your anlysis and answer. This is where you present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list>"


### END OF OUTPUT format

START OF USER QUESTION:
"""

# Template describing the JSON envelope expected from the model: an "Answer"
# object with "SUMMARY", "ANALYSIS" and "EVIDENCE COLLECTED" text fields, plus
# a "Links" list. It is an instruction template, not parseable JSON — the
# "Links" value is a prose directive rather than a literal list.
output_json = """{
"Answer": {
    "SUMMARY":"<The conclusive answer for the question, this is a TEXT only field>",
    "ANALYSIS":"<provide detailed analysis and walk through how the conclusion is reached. This is where you explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.>",
    "EVIDENCE COLLECTED":"<combine all items from ""SOURCES AND EVIDENCES"" section, where you used in your anlysis and answer. This is where you present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list>"
    },
"Links": [List sections based on the data sources, leave it a empty list if no sources provided. DO NOT create your own.]
}
"""

# question_list =["What was the year founded?",
#       "What type of real estate - owned vs. leased; residential v commercial; number locations",
#       "Is the company U.S. based?",
#       "What is the insured's business?",
#       "Have there been any acquisitions in the last five years? Please provide details",
#       "Have there been any divestures in the last five years? Please provide details.",
#       "Any litigation mentioned? Please provide details",
#       "Historical revenues for last 5 to 10 years. Does it include acquisitions and divestures?",
#       "US vs Canada vs All other revenues",
#       "Any medical, hospital or pharmaceutical exposures",
#       "Any auto manufacturing exposure",
#       "Any third party trucking or truck brokering exposure",
#       "Any alcohol mfg and/or sales (includes mfg exposures and retail sales including stores and bars and restaurants.",
#       "PFAS exposures","Dams and/or mining exposures.",
#       "Railroad exposures","Construction exposures",
#       "Cannabis and/or tobacco exposures",
#       "Any alcohol mfg and/or sales (includes mfg exposures and retail sales including stores and bars and restaurants.",
#       "electrical generating for self or others",
#       "Aviation exposures - aircraft owned or non owned; hangar",
#       "Watercraft exposures - ships/wharf",
#       "Does the insured have discontinued operations",
#       "If yes, to above, please explain",
#       "Comment on premises / operations exposure if applicable",
#       "Comment on products / completed ops exposure",
#       "Liquor liability?",
#       "Provide an overall narrative about the operations (use dell risk to prompt it to show how detailed)",
#       "Other Named Insureds: Provide an overall narrative about the operations (use dell risk to prompt it to show how detailed)"]
# Underwriting questions asked against the indexed filings, in run order.
# NOTE(review): several entries contain typos ("businss", "revenuses",
# "divesttures", "expsoure", "compnay"). They are kept verbatim here because
# each string is embedded for search, sent to the model, and written to the
# CSV "Question" column.
question_list = [
    "What is insured company's businss?",
    "Have there been any acquisitions in the last five years? Please provide details.",
    "Please provide the company's historical revenuses for the last 5 to 10 years. Does it include acquisitions and divesttures?",
    "Is there any auto manufacturing expsoure for the company?",
    "Is the company U.S. based?",
    "What was the year compnay was founded?",
    "Provide an overall narrative of the operations of the company.",
    "Please provide a geographical breakdown of U.S. vs. Canada vs all others.",
]

In [None]:
from openai import AzureOpenAI

# Search client bound to the index that every question is run against.
index_name = "excessive-casualty-index"
search_client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=credential,
)

# Model/deployment name passed to the embeddings call for each question.
embedding_model_name = "text-embedding-ada-002"

# Accumulates one {question: raw model response} dict per question.
response_list = []

# Azure OpenAI client, used for both the embeddings and chat completion calls.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_KEY"),
    api_version="2024-05-01-preview",
)

# For each question: embed it, retrieve the closest index chunks, ground the
# system prompt in those chunks, and collect the model's JSON answer.
for q in question_list:
    # NOTE(review): the `dimensions` request parameter is documented for
    # text-embedding-3 and later models — confirm this deployment accepts it.
    embedding = client.embeddings.create(input=q, model=embedding_model_name, dimensions=1536).data[0].embedding

    vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=5, fields="content_vector")

    # Vector-only query (no keyword text); at most the top 3 hits are returned.
    results = search_client.search(
        search_text=None,
        vector_queries=[vector_query],
        select=["fileurl", "section_name", "section_description", "section_chunk_id", "content"],
        top=3,
    )

    # Concatenate the retrieved chunks into one evidence string, each chunk
    # labelled with its source URL and section name.
    context = "".join(
        f"Source: {result['fileurl']} Section: {result['section_name']}\nContent: {result['content']}\n\n"
        for result in results
    )

    # The template marks the evidence slot with the literal text
    # "{{data_sources}}". `str.format` reads doubled braces as escaped braces
    # and would emit the literal "{data_sources}" without inserting anything,
    # so the marker is replaced as plain text instead. (The previous
    # `.format(...)` call also passed `output_json`, but the template has no
    # field for it, so it never reached the prompt either.)
    sys_message = system_prompt.replace("{{data_sources}}", context)

    # Ask the chat model for a JSON-object answer grounded in the evidence.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": sys_message},
            {"role": "user", "content": q},
        ],
        max_tokens=4000,
        response_format={"type": "json_object"},
    )

    response_list.append({q: response.choices[0].message.content})

# Persist every question/response pair to a CSV file with a header row.
import csv

# NOTE(review): `output_filename` is not assigned in the visible cells; it has
# to be defined before this runs — confirm where it is set.
# The encoding is pinned to UTF-8 because the responses are model-generated
# text and a platform default encoding may not be able to encode every
# character.
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Question', 'Response'])

    writer.writeheader()
    # Each entry in response_list is a single-key {question: response} dict.
    for entry in response_list:
        for question, answer in entry.items():
            writer.writerow({'Question': question, 'Response': answer})

Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm
Source: https://www.sec.gov/ix?doc=/A