Extract data from forms with Azure Document Intelligence \
https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-custom?view=doc-intel-3.1.0&tabs=fott \
https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/python/FormRecognizer/rest/python-train-extract.md \
Training https://learn.microsoft.com/en-us/training/modules/work-form-recognizer/?source=recommendations

In [None]:
# Print a fixed greeting (presumably a quick check that the kernel is running).
print("Hello World!")

In [None]:
# Speech to text: transcribe a local WAV file with the "whisper-1" model.
from openai import OpenAI

client = OpenAI()
# Forward slashes work on every platform, including Windows.
audio_file_path = "../data/speech/yearwiththesaints_00_anonymous.wav"

# Open the audio inside a context manager so the file handle is closed as
# soon as the request has been sent, even if the API call raises.
with open(audio_file_path, "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="text",
    )

# `transcript` is reused by the summarization cell.
print(transcript)

In [None]:
# Summarize: ask the chat model for a summary of `transcript`.
from openai import OpenAI

client = OpenAI()

# The transcript is embedded in the user message between "###" delimiters.
user_prompt = f"Could you please summarize the transcript ### {transcript} ###"
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": user_prompt},
]
response = client.chat.completions.create(
    messages=conversation,
    model="gpt-3.5-turbo",
)

# Only the first choice's message text is printed.
summary_text = response.choices[0].message.content
print(summary_text)

In [None]:
# Evaluate the first choice's message text as the cell's last expression.
response.choices[0].message.content

In [None]:
# Async usage
import asyncio
import json
from openai import AsyncOpenAI

# Asynchronous client used by `main` below; no arguments are passed, so the
# API key comes from the default described in the comments.
client = AsyncOpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    # api_key="My API Key",
)

async def show_json(obj):
    """Display *obj* as parsed JSON.

    The object is serialized with its ``model_dump_json()`` method and parsed
    back with ``json.loads`` before being handed to ``display``.
    """
    serialized = obj.model_dump_json()
    parsed = json.loads(serialized)
    display(parsed)

async def main() -> None:
    """Request one chat completion and display the response as JSON."""
    request = {
        "messages": [{"role": "user", "content": "Say this is a test"}],
        "model": "gpt-3.5-turbo",
    }
    chat_completion = await client.chat.completions.create(**request)
    await show_json(chat_completion)


# The coroutine is awaited directly with a top-level `await`; the
# `asyncio.run` form is kept commented out (presumably for use outside a
# notebook, where top-level `await` is not available).
# asyncio.run(main())
await main()

In [None]:
# Streaming Responses
from openai import OpenAI

client = OpenAI()

# stream=True makes `create` return an iterable of chunks instead of one
# complete response.
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)
for part in stream:
    # Each chunk holds only a fragment of the reply, so suppress print's
    # default newline to keep the fragments on one line. `or ""` covers
    # chunks whose delta has no content.
    print(part.choices[0].delta.content or "", end="", flush=True)
print()  # terminate the line once the stream is exhausted

In [None]:
# The async client uses the exact same interface.
from openai import AsyncOpenAI

client = AsyncOpenAI()

stream = await client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)
async for part in stream:
    print(part.choices[0].delta.content or "")

In [None]:
# step 1 read PDF to memory
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# The service endpoint and API key are read from environment variables;
# a missing variable raises KeyError here.
endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

# Document to analyze. `analyze_read` takes either a local path (used here)
# or an https:// URL such as the commented-out alternative.
file_path = "../data/pdf/wmt-20230430_extract info.pdf"
# file_path = "https://drive.google.com/file/d/1SZq75rs0H2GhGH6ok3IumqAA10_yFSY5/view?usp=drive_link"

# formatting function
def format_polygon(polygon):
    """Format a sequence of points as comma-separated ``[x, y]`` pairs.

    Each point must expose ``x`` and ``y`` attributes. An empty or ``None``
    polygon yields the placeholder ``"N/A"``.
    """
    if polygon:
        return ", ".join(f"[{point.x}, {point.y}]" for point in polygon)
    return "N/A"


def analyze_read(file_path):
    """Run the ``prebuilt-read`` model on a document and print what it found.

    Uses the module-level ``endpoint`` and ``key`` to build the client.

    Args:
        file_path: An ``https://`` URL of the document, or the path of a
            local file.

    Returns:
        The analysis result obtained from the poller, after its content,
        handwriting styles, and per-page lines and words have been printed.
    """
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Treat the argument as remote only when it *starts* with the URL scheme;
    # a substring test would misroute a local path that merely contains
    # "https://" somewhere inside it.
    if file_path.startswith("https://"):
        poller = document_analysis_client.begin_analyze_document_from_url(
            "prebuilt-read", file_path
        )
    else:
        with open(file_path, "rb") as f:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-read", document=f, locale="en-US"
            )

    result = poller.result()

    print("Document contains content: ", result.content)

    # One line is printed per style entry in the result.
    for style in result.styles:
        kind = "handwritten" if style.is_handwritten else "no handwritten"
        print(f"Document contains {kind} content")

    for page in result.pages:
        print(f"----Analyzing Read from page #{page.page_number}----")
        print(
            f"Page has width: {page.width} and height: {page.height}, "
            f"measured with unit: {page.unit}"
        )

        for line_idx, line in enumerate(page.lines):
            print(
                f"...Line # {line_idx} has text content '{line.content}' "
                f"within bounding box '{format_polygon(line.polygon)}'"
            )

        for word in page.words:
            print(
                f"...Word '{word.content}' has a confidence of {word.confidence}"
            )

    print("----------------------------------------")
    return result


# Run the analysis on `file_path` when this code executes as the main module;
# the returned analysis is kept in `result`.
if __name__ == "__main__":
    result = analyze_read(file_path)
   

In [51]:
! pip install chardet beautifulsoup4



In [1]:
from bs4 import BeautifulSoup
import chardet

# Local HTML file that is loaded and parsed below.
file_path = '../data/html/wmt-20230430.html'

def load_local_html(file_path):
    """Parse a local HTML file into a ``BeautifulSoup`` tree.

    The file's raw bytes are passed to ``chardet.detect`` to guess the
    encoding; the file is then read as text with that encoding and parsed
    with the ``html.parser`` backend.

    Args:
        file_path: Path of the HTML file to load.

    Returns:
        The parsed ``BeautifulSoup`` object.
    """
    with open(file_path, 'rb') as raw_file:
        detection = chardet.detect(raw_file.read())

    # chardet reports an encoding of None when it cannot make a guess (for
    # example on an empty file). Passing None to open() would silently use
    # the platform default encoding, so fall back to UTF-8 explicitly.
    encoding = detection['encoding'] or 'utf-8'

    with open(file_path, 'r', encoding=encoding) as html_file:
        html_content = html_file.read()

    return BeautifulSoup(html_content, 'html.parser')

soup = load_local_html(file_path)

# Write the text of every <span> element in the parsed document to a UTF-8
# text file, each entry followed by a blank line.
outfile_path = '../data/txt/wmt-20230430.txt'
with open(outfile_path, 'w', encoding='utf-8') as out:
    out.writelines(span.get_text() + '\n\n' for span in soup.find_all('span'))
        

In [3]:
# Called without a filter, so the cell output lists the tags of the whole
# parsed document.
soup.find_all()

[<html xml:lang="en-US" xmlns="http://www.w3.org/1999/xhtml" xmlns:country="http://xbrl.sec.gov/country/2022" xmlns:dei="http://xbrl.sec.gov/dei/2022" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns:ixt="http://www.xbrl.org/inlineXBRL/transformation/2020-02-12" xmlns:ixt-sec="http://www.sec.gov/inlineXBRL/transformation/2015-08-31" xmlns:link="http://www.xbrl.org/2003/linkbase" xmlns:srt="http://fasb.org/srt/2022" xmlns:us-gaap="http://fasb.org/us-gaap/2022" xmlns:wmt="http://www.walmart.com/20230430" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="text/html" http-equiv="Content-Type"/>
 <title>wmt-20230430</title></head><body><div style="display:none"><ix:header><ix:hidden><ix:nonnumeric contextref="ic42b5b

In [21]:
import pathlib
import os
import hashlib

# Compute the MD5 hex digest of a file's bytes.
dst = "C:/Repository/azure/azureai/src/app.py"

# Read inside a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(dst, "rb") as source_file:
    checksum = hashlib.md5(source_file.read()).hexdigest()

# Last expression of the cell, so the notebook shows the digest.
checksum


'a9e936dff473630e19d2de0f377ceb5f'

In [None]:
# ask one by one
ask = "Can you find context relevent to {Consolidated net income}?"
# ask = "Can you find context relevent to {Cash, cash equivalents and restricted cash at end of period} as of April 30, 2023?"
# ask = "Can you find context relevent to {Rating agency for Long-term debt}?"
# ask = "find context relevent to {legal proceedings and certain regulatory matters}"
print("===========================\n" + "Query: " + ask + "\n")

memories = await kernel.memory.search_async(memory_collection_name, ask, limit=1, min_relevance_score=0.77)

i = 0
for memory in memories:
    i += 1
    print(f"Result {i}:")
    # print("  URL:     : " + memory.id)
    print("  Title    : " + memory.description)
    print("  Relevance: " + str(memory.relevance))
    print()

In [None]:
# Include all questions
async def search_memory_examples(kernel: sk.Kernel) -> None:
    """Run a fixed list of sample queries and print the best hit for each.

    Every question is searched in the module-level ``memory_collection_name``
    collection with ``limit=1`` and ``min_relevance_score=0.77``.

    Args:
        kernel: Kernel whose ``memory`` is searched.
    """
    questions = [
        "find context relevent to {Rating agency for Long-term debt}",
        "find context relevent to {Cash, cash equivalents and restricted cash at end of period}",
        "find context relevent to {Consolidated net income}",
        "find context relevent to {legal proceedings and certain regulatory matters}",
    ]

    for question in questions:
        print("===========================\n" + "Query: " + question + "\n")
        memories = await kernel.memory.search_async(
            memory_collection_name, question, limit=1, min_relevance_score=0.77
        )

        # The relevance floor can filter out every candidate; indexing [0]
        # on an empty result would raise IndexError and abort the remaining
        # questions.
        if not memories:
            print("No result met the minimum relevance score.\n")
            continue

        best = memories[0]
        print("Relevance: " + str(best.relevance))
        print(f"Answer: {best.description}\n")

In [None]:
# Run every sample query against the notebook's `kernel`.
await search_memory_examples(kernel)

In [None]:
class VectorSearchEngineSkill:
    """
    A vector search engine skill.

    Exposes one function, ``searchAsync``, that looks a query up in the
    module-level ``memory_collection_name`` collection through the
    module-level ``kernel``.
    """
    # Imported in the class body so the decorators and the annotation below
    # resolve from the class namespace.
    from semantic_kernel.orchestration.sk_context import SKContext
    from semantic_kernel.skill_definition import sk_function, sk_function_context_parameter

    @sk_function(
        description="find context relevent to Consolidated net income or \
            Rating agency for Long-term debt or \
            legal proceedings or \
            cash equivalents and restricted cash at end of period \
            ", name="searchAsync"
    )
    @sk_function_context_parameter(
        name="query",
        description="The search query",
    )
    async def search_async(self, query: str, context: SKContext) -> str:
        """Return the description of the best memory match for *query*.

        Args:
            query: The search text; when empty, the ``query`` context
                variable is used instead.
            context: Context supplying the fallback ``query`` variable.

        Returns:
            The description of the top match, or an empty string when no
            memory meets the minimum relevance score.
        """
        query = query or context.variables.get("query")
        ask = f"find context relevent to {query}"

        memories = await kernel.memory.search_async(memory_collection_name, ask, limit=1, min_relevance_score=0.77)

        # The relevance floor can leave nothing to return; indexing [0] on an
        # empty result would raise IndexError inside the skill call.
        if not memories:
            return ""

        best = memories[0]
        print(best.relevance)
        return best.description

In [3]:
# The atlassian-python-api distribution installs the module `atlassian`.
from atlassian import Bitbucket


Collecting atlassian-python-api
  Downloading atlassian_python_api-3.41.4-py3-none-any.whl.metadata (8.7 kB)
Downloading atlassian_python_api-3.41.4-py3-none-any.whl (168 kB)
   ---------------------------------------- 0.0/168.6 kB ? eta -:--:--
   ------- -------------------------------- 30.7/168.6 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 168.6/168.6 kB 2.5 MB/s eta 0:00:00
Installing collected packages: atlassian-python-api
Successfully installed atlassian-python-api-3.41.4
