In [1]:
import os
from dotenv import load_dotenv
from sec_api import RenderApi, XbrlApi, ExtractorApi
load_dotenv()

# API key for sec-api.io, read from the environment (None when the variable is unset).
sec_key = os.environ.get("SEC_API_KEY")

# Alphabet identifier (presumably its SEC CIK; it also appears in the EDGAR path
# below) and the URL of the 10-K document whose file name ends in 20221231.
alphabet_code = "1652044"
apphabet_10k_url = (
    "https://www.sec.gov/Archives/edgar/data/1652044/"
    "000165204423000016/goog-20221231.htm"
)

# Alternative sec-api clients, currently disabled:
# xbrlApi = XbrlApi(sec_key)
# renderApi = RenderApi(api_key=sec_key)
# xbrl_json = xbrlApi.xbrl_to_json(htm_url=apphabet_10k_url)
# filing_10k_html = renderApi.get_filing(apphabet_10k_url)

# Client used to pull individual sections out of the filing.
extractorApi = ExtractorApi(sec_key)

# Section identifiers to extract, in filing order.
section_list = [
    "1", "1A", "1B", "2", "3", "4", "5", "6", "7", "7A",
    "8", "9", "9A", "9B", "10", "11", "12", "13", "14", "15",
]

# One-off download of every section as HTML into sec-10k/ (disabled so the
# notebook does not call the API again on every run):
# html_list = []
# for section in section_list:
#     html_list.append(extractorApi.get_section(apphabet_10k_url, section, "html"))
# for i, html in enumerate(html_list):
#     with open(f"sec-10k/item_{section_list[i]}.html", "w") as f:
#         f.write(html)

In [20]:
def clean_html(html, chunk_size=4000):
    """Strip presentational markup from an HTML string and split it into chunks.

    The document is parsed with BeautifulSoup's "html.parser". <script> and
    <style> elements are removed, and every remaining tag loses all of its
    attributes except ``href`` on <a> and ``src`` on <img>.

    Chunking walks only the direct children of the parsed document. A chunk is
    closed when an <hr> is reached and the text accumulated so far is longer
    than ``chunk_size``; that <hr> is dropped. An <hr> reached earlier is kept
    inside the current chunk. ``chunk_size`` is therefore the minimum length a
    chunk must exceed before a split is allowed, not an upper bound.

    Args:
        html: HTML source text.
        chunk_size: Number of characters a chunk must exceed before an <hr>
            ends it.

    Returns:
        A ``(cleaned_html, chunks)`` tuple: the cleaned document as a string
        and the list of chunk strings (empty when the document is empty).
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style tags together with their contents.
    for tag in soup(["script", "style"]):
        tag.decompose()

    # The only attribute that survives, per tag name; everything else is dropped.
    kept_attribute = {"a": "href", "img": "src"}
    for tag in soup.find_all(True):  # True matches every tag
        keep = kept_attribute.get(tag.name)
        tag.attrs = {keep: tag.attrs[keep]} if keep in tag.attrs else {}

    # Chunk once, after all attributes are cleaned. Previously this pass was
    # nested inside the loop above, so it was redone for every tag and left
    # `chunks` undefined for input without any tags.
    chunks = []
    current_chunk = ""
    for element in soup.contents:  # direct children only
        # getattr: tolerate node types that do not expose a tag name.
        is_rule = getattr(element, "name", None) == "hr"
        if is_rule and len(current_chunk) > chunk_size:
            chunks.append(current_chunk)
            current_chunk = ""
        else:
            current_chunk += str(element)

    # Keep whatever is left after the last split.
    if current_chunk:
        chunks.append(current_chunk)

    return str(soup), chunks

# Load all HTML files in sec-10k
import glob, os
from bs4 import BeautifulSoup

# Load all HTML files
html_files = glob.glob("sec-10k/*.html")

# Create the output directories for cleaned and chunked HTML files
os.makedirs("sec-10k-cleaned", exist_ok=True)
os.makedirs("sec-10k-chunked", exist_ok=True)

# Clean every file, then write the cleaned document and its chunks.
# UTF-8 is given explicitly so the result does not depend on the platform's
# default encoding (the upload cell also decodes these files as UTF-8).
for file in html_files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    cleaned_html, chunks = clean_html(text)

    base_name = os.path.basename(file)
    with open(f"sec-10k-cleaned/{base_name}", "w", encoding="utf-8") as f:
        f.write(cleaned_html)

    # File name without its ".html" extension, used as the chunk file prefix.
    stem = os.path.splitext(base_name)[0]
    for i, chunk in enumerate(chunks):
        with open(f"sec-10k-chunked/{stem}_chunk_{i}.html", "w", encoding="utf-8") as f:
            f.write(chunk)

In [14]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

# Azure AI Search endpoint (required).
endpoint = os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"]
# Use the admin key when one is configured; otherwise fall back to
# DefaultAzureCredential. `.get` makes the fallback work when the variable is
# missing as well as when it is empty (indexing os.environ raised KeyError).
credential = (
    AzureKeyCredential(os.environ["AZURE_COGNITIVE_SEARCH_KEY"])
    if os.environ.get("AZURE_COGNITIVE_SEARCH_KEY")
    else DefaultAzureCredential()
)
index_name = "sec-10k"
# Azure OpenAI endpoint (required) and optional key (None when unset or empty).
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY") or None
azure_openai_embedding_deployment = "text-embedding-ada-002"
# Blob storage holding the source documents (required).
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = "sec-10k"

In [None]:
from azure.storage.blob import BlobServiceClient  
import os
import html2text
# Container that receives the Markdown renditions of the filings.
# NOTE(review): this rebinds the notebook-wide `blob_container_name`, so cells
# executed afterwards see "sec-10k-markdown" — confirm that is intended.
blob_container_name = "sec-10k-markdown"

# Connect to Blob Storage and make sure the container exists.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)
if not container_client.exists():
    container_client.create_container()

# Convert each local file to Markdown and upload it under its original file
# name, skipping blobs that are already present.
documents_directory = "sec-10k"
for file in os.listdir(documents_directory):
    with open(os.path.join(documents_directory, file), "rb") as data:
        html_text = data.read().decode("utf-8")
        markdown_data = html2text.html2text(html_text)
        name = os.path.basename(file)
        if container_client.get_blob_client(name).exists():
            continue
        container_client.upload_blob(name=name, data=markdown_data)

print(f"Setup sample data in {blob_container_name}")

Setup sample data in sec-10k-markdown


: 

In [3]:
from azure.storage.blob import BlobServiceClient  
import os

# Open the target container, creating it on first use.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)
if not container_client.exists():
    container_client.create_container()

# Upload every local file unchanged; blobs that already exist are left alone.
documents_directory = "sec-10k"
for file in os.listdir(documents_directory):
    with open(os.path.join(documents_directory, file), "rb") as data:
        name = os.path.basename(file)
        if container_client.get_blob_client(name).exists():
            continue
        container_client.upload_blob(name=name, data=data)

print(f"Setup sample data in {blob_container_name}")

Setup sample data in sec-10k


In [4]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)
from azure.search.documents.indexes._generated.models import NativeBlobSoftDeleteDeletionDetectionPolicy

# Register the blob container as an indexer data source (created or updated in place).
indexer_client = SearchIndexerClient(endpoint, credential)

# The container the indexer reads from.
container = SearchIndexerDataContainer(name=blob_container_name)

data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container,
    # Deletion detection based on the storage account's native blob soft delete.
    data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
)

data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

Data source 'sec-10k-blob' created or updated


In [5]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Define and create (or update) the search index.
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Schema: `chunk_id` is the document key, `parent_id` and `title` identify the
# source document, `chunk` holds the text and `vector` its embedding.
fields = [
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        # Presumably the output size of the embedding deployment — confirm if
        # the deployment changes.
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Two algorithms, both using cosine similarity.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# One profile per algorithm; both use the same vectorizer.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myOpenAI",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
    vectorizer="myOpenAI",
)

# Vectorizer backed by the Azure OpenAI embedding deployment.
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        api_key=azure_openai_key,
    ),
)

# Configure the vector search configuration
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration that uses the chunk text as its content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="chunk")]
    ),
)

# Create the semantic search with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")


sec-10k created


In [6]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset
)

# Build the skillset: split each document into pages, embed every page, and
# project the pages into the index as individual documents.
skillset_name = f"{index_name}-skillset"

# Step 1: split /document/content into overlapping pages, output as "pages".
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    context="/document",
    text_split_mode="pages",
    maximum_page_length=2000,
    page_overlap_length=500,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Step 2: one embedding per page, stored as "vector" on that page.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=azure_openai_endpoint,
    deployment_id=azure_openai_embedding_deployment,
    api_key=azure_openai_key,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

# Step 3: map each page to the index fields chunk / vector / title.
page_selector = SearchIndexerIndexProjectionSelector(
    target_index_name=index_name,
    parent_key_field_name="parent_id",
    source_context="/document/pages/*",
    mappings=[
        InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
        InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
        InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
    ],
)
index_projections = SearchIndexerIndexProjections(
    selectors=[page_selector],
    # Only the projected pages are indexed, not the parent documents.
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skill, embedding_skill],
    index_projections=index_projections,
)

client = SearchIndexerClient(endpoint, credential)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")


sec-10k-skillset created


In [7]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# Define the indexer that ties data source, skillset and index together.
indexer_name = f"{index_name}-indexer"

# Copy the blob's metadata_storage_name into the index's title field so that
# search results can show which file a hit came from.
title_mapping = FieldMapping(
    source_field_name="metadata_storage_name",
    target_field_name="title",
)

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=skillset_name,
    target_index_name=index_name,
    field_mappings=[title_mapping],
)

# Create or update the indexer, then start a run.
indexer_client = SearchIndexerClient(endpoint, credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)
indexer_client.run_indexer(indexer_name)

print(f' {indexer_name} is created and running. If queries return no results, please wait a bit and try again.')


 sec-10k-indexer is created and running. If queries return no results, please wait a bit and try again.


In [10]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure vector search: no keyword text, only the vectorized form of the question.
query = "What was the year founded?"

search_client = SearchClient(endpoint, index_name, credential=credential)

# The service vectorizes the text itself; the query targets the "vector" field.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=3,
    fields="vector",
    exhaustive=True,
)
# To send a pre-computed embedding instead of letting the service vectorize:
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    top=1,
)

# Print each hit as "<label>: <value>" lines.
printed_fields = (
    ("parent_id", "parent_id"),
    ("chunk_id", "chunk_id"),
    ("Score", "@search.score"),
    ("Content", "chunk"),
)
for result in results:
    for label, key in printed_fields:
        print(f"{label}: {result[key]}")


parent_id: aHR0cHM6Ly9kYXRhc2NpZW5jZWRhdGFzdG9yZS5ibG9iLmNvcmUud2luZG93cy5uZXQvc2VjLTEway9pdGVtXzE1Lmh0bWw1
chunk_id: 4153ce67524c_aHR0cHM6Ly9kYXRhc2NpZW5jZWRhdGFzdG9yZS5ibG9iLmNvcmUud2luZG93cy5uZXQvc2VjLTEway9pdGVtXzE1Lmh0bWw1_pages_103
Score: 0.8049527
Content: M</span><span style="color:#000000;font-family:'Arial',sans-serif;font-size:8pt;font-weight:400;line-height:100%">ATHER </span></div></td><td colspan="3" style="padding:2px 1pt;text-align:left;vertical-align:bottom"><span style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-weight:400;line-height:100%">Director</span></td><td colspan="3" style="padding:2px 1pt;text-align:center;vertical-align:bottom"><span style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-weight:400;line-height:100%">February 2, 2023</span></td></tr><tr><td colspan="3" style="border-top:1pt solid #000000;padding:2px 1pt;text-align:center;vertical-align:top"><span style="color:#000000;font-family:'Arial',sans-serif;font-