In [None]:
import os, re
from pathlib import Path
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load settings from the local ".env" file into the process environment;
# the next cell reads FR_ENDPOINT and FR_KEY via os.getenv.
load_dotenv(".env")

In [None]:
# Service endpoint and API key come from the environment (see the .env file
# loaded in the first cell).
endpoint = os.getenv("FR_ENDPOINT")
key = os.getenv("FR_KEY")

# os.getenv returns None for unset variables. Fail here with an actionable
# message instead of passing None (or an empty string) on to the SDK.
_missing_settings = [
    name for name, value in (("FR_ENDPOINT", endpoint), ("FR_KEY", key)) if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in .env or in the process environment."
    )

# Instantiate the DocumentIntelligenceClient used by the extraction cells below.
document_analysis_client = DocumentIntelligenceClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

<h3>Document Intelligence - Text Extraction by Layout Model
</h3>


├── pdf_dir  
│&emsp; &emsp; ├── text_dir  
│&emsp; &emsp; └── pdf files  
└── this script  

In [None]:
import os

# Splitter shared by the chunking cells below. It is built with the
# tiktoken-based constructor, so chunk_size and chunk_overlap are given
# for the 'o200k_base' encoding.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=500,
    chunk_size=4000,
    encoding_name="o200k_base",
)

# Read a text file and split it into chunk files of the configured token size.
def splitChunkFile(filepath, splitter=None):
    """Split a UTF-8 text file into numbered chunk files.

    The chunks are written to a ``chunks`` directory located next to
    ``filepath`` (created if necessary) and are named
    ``<basename>-<index>.txt``, where ``<basename>`` is the input file name
    without its extension and ``<index>`` starts at 0.

    Args:
        filepath: Path of the UTF-8 text file to split.
        splitter: Optional object exposing ``split_text(str) -> list[str]``.
            Defaults to the notebook-level ``text_splitter``.

    Returns:
        None. The function is used for its side effect of writing files.
    """
    if splitter is None:
        splitter = text_splitter

    # os.path.join keeps the output relative to the input file. Plain string
    # concatenation would turn an empty dirname (bare file name) into the
    # absolute path "/chunks/".
    output_dir = os.path.join(os.path.dirname(filepath), "chunks")
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the input file even if reading fails.
    with open(filepath, 'r', encoding='UTF-8') as f:
        data = f.read()

    # The base name does not change between chunks, so compute it once.
    basename = os.path.splitext(os.path.basename(filepath))[0]

    # Write one output file per chunk.
    for i, chunkedtext in enumerate(splitter.split_text(data)):
        outputfilepath = os.path.join(output_dir, basename + "-" + str(i) + ".txt")
        with open(outputfilepath, 'w', encoding='UTF-8') as fo:
            fo.write(chunkedtext)

    return

<h5>Document Intelligence - Layout Model によるテキストの抽出
</h5>

In [None]:
# Input PDFs live in ./pdf; the extracted text chunks go to ./pdf/text.
pdf_dir = Path('./pdf')
text_dir = Path('./text')
text_path = pdf_dir / text_dir
os.makedirs(text_path, exist_ok=True)

# Only the files directly inside pdf_dir are processed (first os.walk entry).
pdf_names = next(os.walk(pdf_dir))[2]

for pdf_name in pdf_names:
    with open(os.path.join(pdf_dir, pdf_name), "rb") as pdf_stream:
        # Run the prebuilt layout model on the raw bytes and wait for the result.
        analysis = document_analysis_client.begin_analyze_document(
            "prebuilt-layout",
            analyze_request=pdf_stream,
            content_type="application/octet-stream",
        ).result()

        # Remove the selection-mark placeholders from the extracted content.
        text = analysis.content
        for marker in (":unselected:", ":selected:"):
            text = text.replace(marker, "")

        # Write one "<pdf name without extension>_<index>.txt" file per chunk.
        stem = os.path.splitext(pdf_name)[0]
        for index, piece in enumerate(text_splitter.split_text(text)):
            target = os.path.join(text_path, f"{stem}_{index}.txt")
            with open(target, 'w', encoding='UTF-8') as chunk_file:
                chunk_file.write(piece)



<h5>Document Intelligence - Layout Model によるテキストの抽出 (Markdown)
</h5>

In [None]:
def convert_markdown_headings(markdown_text):
    """Rewrite underlined (setext-style) headings as ``#``-prefixed headings.

    A non-blank line followed by a line of three or more ``=`` becomes
    ``# <line>``; a non-blank line followed by a line of three or more ``-``
    becomes ``## <line>``. The underline row is removed.

    A ``---`` or ``===`` row that follows a blank (or whitespace-only) line is
    left untouched, so horizontal rules are not turned into empty headings.

    Args:
        markdown_text: Markdown document as a single string.

    Returns:
        The converted Markdown string.
    """
    # ".*\S.*" requires at least one non-whitespace character on the title
    # line. "." and "\S" never match a newline, so the title stays on one line.

    # Convert "===" headers to "#"
    markdown_text = re.sub(r'^(.*\S.*)\n={3,}$', r'# \1', markdown_text, flags=re.MULTILINE)

    # Convert "---" headers to "##"
    markdown_text = re.sub(r'^(.*\S.*)\n-{3,}$', r'## \1', markdown_text, flags=re.MULTILINE)

    return markdown_text

In [None]:
# Analyze a single PDF with the layout model, requesting Markdown output,
# and save the result as ./text/<pdf stem>.md.
input_path = Path('./pdf/serviceInfo.pdf')
output_path = Path('./text')
os.makedirs(output_path, exist_ok=True)

with open(input_path, "rb") as pdf_stream:
    markdown_result = document_analysis_client.begin_analyze_document(
        "prebuilt-layout",
        analyze_request=pdf_stream,
        content_type="application/octet-stream",
        output_content_format=ContentFormat.MARKDOWN,
    ).result()

# Normalize underlined headings to "#"/"##" form before saving.
text = convert_markdown_headings(markdown_result.content)

out_fname = input_path.stem + ".md"
with open(os.path.join(output_path, out_fname), 'w', encoding='utf-8') as outfile:
    outfile.write(text)