# Create Index


In [9]:
# index_name = "excessive-casualty-conagra"
# tenK_filing_url="https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm"
company_name="conagra"
# tenK_filing_url="https://www.sec.gov/ix?doc=/Archives/edgar/data/0000732712/000073271225000006/vz-20241231.htm"  #verizon
tenK_filing_url="https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm"  #conagra

index_name = f"excessive-casualty-{company_name.lower()}"
output_filename=f"output/{index_name}.csv"
tenK_filename=f"output/{index_name}.json"
print(company_name)

conagra


In [10]:
from dotenv import load_dotenv
import os
load_dotenv()
### CREATE AZURE SEARCH INDEX [UPDATED FOR AZURE SEARCH SDK VERSION 11.6.0b5]

from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# VARIABLES - AZURE SEARCH

# INSTANTIATE THE AZURE SEARCH INDEX CLIENT AND DEFINE AZURE SEACH SCHEMA
fields = [
    SearchField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchField(name="fileurl", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="section_id", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="section_name", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="section_description", type=SearchFieldDataType.String),
    SearchField(name="section_chunk_id", type=SearchFieldDataType.Int32, filterable=True, sortable=True),
    SearchField(name="content", type=SearchFieldDataType.String),
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_profile_name="ec_hnsw_profile_01"),
]

# VECTOR SEARCH CONFIGURATION
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(
        name="ec_hnsw_algorithm_01",
        parameters=HnswParameters(
            m=4,
            ef_construction=400,
            ef_search=500,
            metric=VectorSearchAlgorithmMetric.COSINE))],
    profiles=[VectorSearchProfile(
        name="ec_hnsw_profile_01",
        algorithm_configuration_name="ec_hnsw_algorithm_01")])

# SEMANTIC SEARCH CONFIGURATION
semantic_config = SemanticConfiguration(
    name="ec_semantic_config_01",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="section_name"),
        content_fields=[SemanticField(field_name="content"), SemanticField(field_name="section_description")]))
semantic_search = SemanticSearch(configurations=[semantic_config])  

# CREATE THE AZURE SEARCH INDEX [INJECT FIELDS, VECTOR CONFIG, SEMANTIC CONFIG]
azure_search_index_client = SearchIndexClient(endpoint=os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"], 
            credential=AzureKeyCredential(os.environ["AZURE_COGNITIVE_SEARCH_KEY"]))
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)  
result = azure_search_index_client.create_or_update_index(index)  
print(f"{result.name} created")

excessive-casualty-conagra created


# Download SEC file

In [11]:

from sec_api import QueryApi
from sec_api import ExtractorApi
import os
import openai

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.models import QueryType

import html2text
from dotenv import load_dotenv
load_dotenv()


from openai import AzureOpenAI
import uuid

In [12]:
import pandas as pd

# LIST OF SECTIONS TO BE EXTRACTED FROM THE 10Ks. EACH SECTION IS A SEPARATE API CALL
# sections = ["1", "1A", "3", "7", "7A", "8", "15" ]
sections = ['1','1A', '1B', '2', '3', '4', '5', '6', '7', '7A', '8', '9', '9A', '9B', '10', '11', '12', '13', '14', '15']

sec_api_key = "0828bf6c6b1b743c93f03184fa831baf6347807f6ec4b036d6c86701a6c9e976"
section_description= pd.read_csv('excess_casualty_10k_prep_qbatch_v02_ning.csv', index_col=0)

section_description_dict={}
for item in section_description.iterrows():
    # section_description_dict
    # print(item[1])
    section_description_dict[item[0]]={'section_name':item[1]['section_name'],'section_description':item[1]['section_description']}

In [13]:
# DEFINE CHUNKING FUNCTION - BREAKS EACH SECTION'S MARKDOWN CONTENT INTO A LIST OF CHUNKS
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, 
    chunk_size=4000, 
    chunk_overlap=200)

# FUNCTION TO EXTRACT AND CHUNK EACH SECTION OF THE 10K
def tenK_extractor(fileurl):
    # LOOP THROUGH EACH PRE-DEFINED SECTION WHICH NEEDS TO BE EXTRACTED
    # & SEND THE URL AND SECTIONID TO SEC EXTRACTOR API
    # & CONVERT TO MARKDOWN, CHUNK INTO SMALLER PIECES, PERFORM SECTION DESCRIPTION LOOKUP, AND APPEND ONTO A DICT
    extractorApi = ExtractorApi(api_key=sec_api_key)
    sections_list = []
    for section in sections:
        # LOOKUP SECTION NAME AND DESCRIPTION FROM REFERENCE DATA BASED ON CURRECT SECTION ITEM
        section_description_lookup = section_description_dict.get(section, None)
        section_name = section_description_lookup["section_name"] if section_description_lookup!=None else None
        section_description = section_description_lookup["section_description"] if section_description_lookup!=None else None
        # EXTRACT SECTION CONTENT FROM 10K, CONVERT TO MARKDOWN, CHUNK, ASSIGN TO SECTION DICT
        section_dict = {}
        section_html = extractorApi.get_section(fileurl, section, "html")

        # markdown_content = process_html(section_html)
        markdown_content = html2text.html2text(section_html)
        chunks = md_splitter.split_text(markdown_content)

        section_dict["fileurl"] = fileurl
        section_dict["section_id"] = section
        section_dict["section_name"] = section_name
        section_dict["section_description"] = section_description
        section_dict["section_content_chunks"] = chunks
        sections_list.append(section_dict)
    # RETURN OUTPUT
    return sections_list


# FUNCTION TO GENERATE EMBEDDINGS [TAKES TEXT AND CREATES A VECTORIZED REPRESENTATION TO CAPTURE THE MEANING OF THE TEXT]
def get_embedding(text):
    
    client = AzureOpenAI(
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
    api_key=os.getenv("AZURE_OPENAI_KEY"),  
    api_version="2024-02-15-preview"
    )
    model=os.getenv("EMBEDDING_MODEL_NAME")
    embedding = client.embeddings.create(input=[text], model=model)
    return embedding.data[0].embedding

tenK_html_sections = tenK_extractor(tenK_filing_url)
for section in tenK_html_sections:
    print(section)

{'fileurl': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm', 'section_id': '1', 'section_name': 'Business Overview', 'section_description': 'This section provides a detailed description of the company’s business activities, including the company’s history, products and services, market environment, competitive landscape, and business model.', 'section_content_chunks': ['ITEM 1. BUSINESS\n\nGeneral Development of Business\n\nConagra Brands, Inc. (the "Company", "Conagra Brands", "we", "us", or "our"),\nheadquartered in Chicago, is one of North America\'s leading branded food\ncompanies. We combine a 100-year history of making quality food with agility\nand a relentless focus on collaboration and innovation. The company\'s\nportfolio is continuously evolving to satisfy consumers\' ever-changing food\npreferences. Conagra\'s brands include _Birds Eye_ ®, _Duncan Hines_ ®,\n_Healthy Choice_ ®, _Marie Callender \'s_®, _Reddi-wip_ ®, _Slim

In [14]:
# save tenK_html_sections to a JSON file
import json
with open(tenK_filename, 'w') as f:
    json.dump(tenK_html_sections, f)

In [15]:
tenK_sections = []
for tenK_html_section in tenK_html_sections:
    for idx, section_chunk in enumerate(tenK_html_section["section_content_chunks"]):
        # BUILD AZURE SEARCH DOCUMENTS (ONE PER CHUNK)
        tenK_section = {}
        tenK_section["id"] = str(uuid.uuid4())
        tenK_section["fileurl"] = tenK_html_section["fileurl"]
        tenK_section["section_id"] = tenK_html_section["section_id"]
        tenK_section["section_name"] = tenK_html_section["section_name"]
        tenK_section["section_description"] = tenK_html_section["section_description"]
        tenK_section["section_chunk_id"] = idx + 1
        tenK_section["content"] = section_chunk
        tenK_section["content_vector"] = get_embedding(section_chunk)
        tenK_sections.append(tenK_section)

for x in tenK_sections:
    print(x)

# Save tekK_sections to a JSON file under 'ConAgra' folder
import json

with open('ConAgra/tenK_sections.json', 'w') as f:
    json.dump(tenK_sections, f)



from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

azure_search_client = SearchClient(os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"], 
                                index_name,
                                AzureKeyCredential(os.environ["AZURE_COGNITIVE_SEARCH_KEY"]))
result = azure_search_client.upload_documents(documents=tenK_sections)

{'id': 'fe8233ca-bd12-4b50-b13a-ab07a86bedd5', 'fileurl': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm', 'section_id': '1', 'section_name': 'Business Overview', 'section_description': 'This section provides a detailed description of the company’s business activities, including the company’s history, products and services, market environment, competitive landscape, and business model.', 'section_chunk_id': 1, 'content': 'ITEM 1. BUSINESS\n\nGeneral Development of Business\n\nConagra Brands, Inc. (the "Company", "Conagra Brands", "we", "us", or "our"),\nheadquartered in Chicago, is one of North America\'s leading branded food\ncompanies. We combine a 100-year history of making quality food with agility\nand a relentless focus on collaboration and innovation. The company\'s\nportfolio is continuously evolving to satisfy consumers\' ever-changing food\npreferences. Conagra\'s brands include _Birds Eye_ ®, _Duncan Hines_ ®,\n_Healthy Ch

# Generate Answers

In [16]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
from dotenv import load_dotenv
from sec_api import RenderApi, XbrlApi, ExtractorApi
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery, VectorizedQuery

load_dotenv()

endpoint = os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZURE_COGNITIVE_SEARCH_KEY"]) if len(os.environ["AZURE_COGNITIVE_SEARCH_KEY"]) > 0 else DefaultAzureCredential()

azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ["AZURE_OPENAI_KEY"] if len(os.environ["AZURE_OPENAI_KEY"]) > 0 else None
azure_openai_embedding_deployment = "text-embedding-ada-002"

system_prompt = """
You're a helpful assistant that helps user understand company risks for insurance underwriting process.mbers, math and calculations are required.
Please explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.
Please present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list, and put in a section named "EVIDENCE COLLECTED".

Not only you should provide a direct answer, you would also justify the answer using facts provided in the context and your justification based on these facts.

Below is the DATA SOURCES AND EVIDENCES we collected about the company.

### START OF DATA SOURCES AND EVIDENCES
{{data_sources}}
### END OF DATA SOURCES AND EVIDENCES

### IMPORTANT NOTES for output
Requirements for "Answer" field:
    Your answer MUST contain into 3 sections:
    - "SUMMERY" for conclusive answer to the question
    - "ANALYSIS" sections to provide detailed analysis and walk through how the conclusion is reached
    - "EVIDENCE COLLECTED" combine all items from "SOURCES AND EVIDENCES" section, where you used in your anlysis and answer
 
    Please stick to the fact, do not fabricate and answer don't know if you don't have an answer and be honest.
    Please be extra careful with numbers, math and when calculations are required, think step by step if numbers, math and calculations are required.

    Not only you should provide a direct answer, you would also justify the answer using facts provided in the context and your justification based on these facts.

### END OF IMPORTANT NOTES for output 

### OUTPUT format 
    "SUMMARY":"<The conclusive answer for the question, this is a TEXT only field>",
    "ANALYSIS":"<provide detailed analysis and walk through how the conclusion is reached. This is where you explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.>",
    "EVIDENCE COLLECTED":"<combine all items from ""SOURCES AND EVIDENCES"" section, where you used in your anlysis and answer. This is where you present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list>"


### END OF JSON format

START OF USER QUESTION:
# """

# output_json = """{
# "Answer": {
#     "SUMMARY":"<The conclusive answer for the question, this is a TEXT only field>",
#     "ANALYSIS":"<provide detailed analysis and walk through how the conclusion is reached. This is where you explain your logic to help user understand why you come up with the answers, along with a list of evidences collected.>",
#     "EVIDENCE COLLECTED":"<combine all items from ""SOURCES AND EVIDENCES"" section, where you used in your anlysis and answer. This is where you present the original context of how you generate the answer along with SOURCE_NAME, numbered them in bullet list>"
#     },
# "Links": [List sections based on the data sources, leave it a empty list if no sources provided. DO NOT create your own.]
# }
# """

# question_list =["What was the year founded?",
#       "What type of real estate - owned vs. leased; residential v commercial; number locations",
#       "Is the company U.S. based?",
#       "What is the insured's business?",
#       "Have there been any acquisitions in the last five years? Please provide details",
#       "Have there been any divestures in the last five years? Please provide details.",
#       "Any litigation mentioned? Please provide details",
#       "Historical revenues for last 5 to 10 years. Does it include acquisitions and divestures?",
#       "US vs Canada vs All other revenues",
#       "Any medical, hospital or pharmaceutical exposures",
#       "Any auto manufacturing exposure",
#       "Any third party trucking or truck brokering exposure",
#       "Any alcohol mfg and/or sales (includes mfg exposures and retail sales including stores and bars and resturants.",
#       "PFAS exposures","Dams and/or mining exposures.",
#       "Railroad exposures","Construction exposures",
#       "Cannabis and/or tobacco exposures",
#       "Any alcohol mfg and/or sales (includes mfg exposures and retail sales including stores and bars and restaurants.",
#       "electrical generating for self or others",
#       "Aviation exposures - aircraft owned or non owned; hangar",
#       "Watercraft exposures - ships/wharf",
#       "Does the insured have discontinued operations",
#       "If yes, to above, please explain",
#       "Comment on premises / operations exposure if applicable",
#       "Comment on products / completed ops exposure",
#       "Liquor liability?",
#       "Provide an overall narrative about the operations (use dell risk to prompt it to show how detailed)",
#       "Other Named Insureds: Provide an overall narrative about the operations (use dell risk to prompt it to show how detailed)"]
question_list=["What is insured company's businss?",
"Have there been any acquisitions in the last five years? Please provide details.",
"Please provide the company's historical revenuses for the last 5 to 10 years. Does it include acquisitions and divesttures?",
"Is there any auto manufacturing expsoure for the company?",
"Is the company U.S. based?",
"What was the year compnay was founded?",
"Provide an overall narrative of the operations of the company.",
"Please provide a geographical breakdown of U.S. vs. Canada vs all others."]


search_client = SearchClient(endpoint, index_name, credential=credential)
embedding_model_name = "text-embedding-ada-002"

response_list = []

from openai import AzureOpenAI
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_KEY"),  
  api_version="2024-05-01-preview"
)

for q in question_list:
    # vector_query = VectorizableTextQuery(text=q, k_nearest_neighbors=5, fields="content_vector", exhaustive=True)
    embedding = client.embeddings.create(input=q, model=embedding_model_name, dimensions=1536).data[0].embedding

    vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=5, fields="content_vector")

    results = search_client.search(  
        search_text=None,  
        vector_queries= [vector_query],
        select=["fileurl", "section_name", "section_description", "section_chunk_id", "content"],
        top=3
    )  
    # print(results)
    #concatenate all the 10-Ks into a single string
    context = ""
    for result in results:  
        # print(f"Source: {result['fileurl']}")
        context = context + f"Source: {result['fileurl']} Section: {result['section_name']}\nContent: {result['content']}\n\n"
    print(context)
    sys_message = system_prompt.format(data_sources=context, output_json=output_json)


    # Generate the system message
    response = client.chat.completions.create(
    model="gpt-4o",
    messages = [{"role":"system", "content":sys_message},
                {"role":"user","content": q,}],
        max_tokens=4000,
        response_format={"type":"json_object"},
        )
    
    response_list.append({q:response.choices[0].message.content})

# now save the responses to a csv file
import csv
with open(output_filename, 'w', newline='') as csvfile:
    fieldnames = ['Question', 'Response']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for response in response_list:
        print(response)
        for key in response.keys():
            writer.writerow({'Question': key, 'Response': response[key]})

Source: https://www.sec.gov/ix?doc=/Archives/edgar/data/0000023217/000155837024009764/tmb-20240526x10k.htm Section: Risk Factors
Content: We are exposed to cybersecurity risk through our information systems and our
use of third-party information systems.

While we have experienced threats to our data and systems, to date, we are not
aware that we have experienced a breach that had a material impact on our
operations or business. Cyberattacks are occurring more frequently, are
constantly evolving in nature and are becoming more sophisticated.
Additionally, continued geopolitical turmoil, including the Russia-Ukraine
military conflict, has heightened the risk of cyberattacks. While we attempt
to continuously monitor and mitigate against cyber risks, including through
leveraging multi-sourced threat intelligence, investing in new technologies,
and developing third-party cybersecurity risk management capability in support
of strategic suppliers, we may incur significant costs in protecting