In [None]:
import os, re
from pathlib import Path
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat, AnalyzeOutputOption, AnalyzeResult
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv(".env")

In [None]:
# Read the Document Intelligence endpoint and key from the environment
# (load_dotenv(".env") in the import cell has already been given the chance
# to populate them).
endpoint = os.getenv("FR_ENDPOINT")
key = os.getenv("FR_KEY")

# Fail fast with an actionable message when a setting is missing or empty,
# instead of letting a None value fail later inside the SDK.
missing_settings = [
    name for name, value in (("FR_ENDPOINT", endpoint), ("FR_KEY", key)) if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Define them in .env or in the process environment."
    )

# Instantiate the DocumentIntelligenceClient used by the analysis cells below.
document_analysis_client = DocumentIntelligenceClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

<h3>Document Intelligence - Text Extraction by Layout Model
</h3>


├── pdf_dir  
│&emsp; &emsp; ├── text_dir  
│&emsp; &emsp; └── pdf files  
└── this notebook

In [None]:
import os

# Recursive splitter used for the plain-text chunking in this notebook.
# Lengths are measured in tokens of the named tiktoken encoding rather than
# in characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="o200k_base",
    chunk_size=4000,    # token budget per chunk
    chunk_overlap=500,  # tokens repeated between consecutive chunks
)

#テキストファイルを読み込んで、指定のトークン数のチャンクファイルに分割します。Load the text file and split it into chunk files with the specified number of tokens.
#この関数は本ノートブック内で使用されていません。This function is not used within this notebook.
def splitChunkFile(filepath, splitter=None):
    """Split a UTF-8 text file into chunk files written next to it.

    The file is read in full, split with ``splitter.split_text`` and every
    piece is written to ``<directory of filepath>/chunks/<stem>-<index>.txt``,
    where ``<index>`` counts from 0. The ``chunks`` directory is created when
    it does not exist yet.

    Args:
        filepath: Path of the text file to split.
        splitter: Object exposing ``split_text(str)`` that returns an iterable
            of strings. Defaults to the notebook-level ``text_splitter`` as it
            is bound at call time.

    Returns:
        None. The chunks are only written to disk.
    """
    if splitter is None:
        # Looked up lazily: the notebook rebinds ``text_splitter`` in a later
        # cell, so callers that need a specific splitter should pass it in.
        splitter = text_splitter

    # os.path.join keeps the output relative when ``filepath`` has no directory
    # part; plain string concatenation would produce "/chunks/" at the
    # filesystem root in that case.
    output_dir = os.path.join(os.path.dirname(filepath), "chunks")
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the source file even if reading raises.
    with open(filepath, 'r', encoding='UTF-8') as source:
        data = source.read()

    # The stem does not depend on the chunk, so compute it once.
    basename = os.path.splitext(os.path.basename(filepath))[0]

    # Write one output file per chunk.
    for i, chunkedtext in enumerate(splitter.split_text(data)):
        outputfilepath = os.path.join(output_dir, f"{basename}-{i}.txt")
        with open(outputfilepath, 'w', encoding='UTF-8') as fo:
            fo.write(chunkedtext)

    return

<h5>Document Intelligence - Layout Model によるテキストの抽出
</h5>

In [None]:
# Source documents are read from ./pdf; the extracted text chunks are written
# to ./pdf/text.
pdf_dir = Path('./pdf')
text_dir = Path('./text')
text_path = pdf_dir / text_dir
os.makedirs(text_path, exist_ok=True)

# NOTE: this cell expects text_splitter to return plain strings. A later cell
# rebinds text_splitter to a MarkdownHeaderTextSplitter, so re-run the splitter
# cell above before re-running this one.

# Only files directly inside pdf_dir are processed (sub-directories such as
# text_path are not descended into); sorting makes the order reproducible.
for pdf in sorted(next(os.walk(pdf_dir))[2]):

    # Send the raw bytes to the prebuilt layout model and wait for the result.
    # The input handle is only needed for the request, so it is released
    # before any chunk is written.
    with open(pdf_dir / pdf, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", body=f, content_type="application/octet-stream"
        )
        result = poller.result()

    # Remove the selection-mark tokens from the extracted content.
    text = result.content.replace(":unselected:", "").replace(":selected:", "")

    # The output stem is the same for every chunk of this document.
    basename = os.path.splitext(os.path.basename(pdf))[0]

    # One "<stem>_<index>.txt" file per chunk.
    for i, chunkedtext in enumerate(text_splitter.split_text(text)):
        outputfilepath = text_path / f"{basename}_{i}.txt"
        with open(outputfilepath, 'w', encoding='UTF-8') as fo:
            fo.write(chunkedtext)



### 以下は参考のため記載しています。

<h5>Document Intelligence - Layout Model によるテキストの抽出 (Markdown)
</h5>

In [None]:
import os, re
from pathlib import Path

# Unify the format of headings in markdown text
def convert_markdown_headings(markdown_text):
    """Rewrite setext-style (underlined) headings as ATX-style (``#``) headings.

    A line underlined by a line of three or more ``=`` becomes ``# <line>``;
    a line underlined by three or more ``-`` becomes ``## <line>``. When the
    line above the ruler is empty or whitespace-only the text is left as it
    is, so a ``---`` horizontal rule following a blank line is not turned into
    an empty ``## `` heading.

    Args:
        markdown_text: Markdown document as a single string.

    Returns:
        The document with the matching headings rewritten.
    """
    def _as_atx(prefix):
        # Build the re.sub callback for one heading level.
        def replace(match):
            title = match.group(1)
            if not title.strip():
                # No heading text above the ruler: keep the original lines.
                return match.group(0)
            return f"{prefix} {title}"
        return replace

    # Convert "===" headers to "#"
    markdown_text = re.sub(r'^(.*?)\n={3,}$', _as_atx('#'), markdown_text, flags=re.MULTILINE)

    # Convert "---" headers to "##"
    markdown_text = re.sub(r'^(.*?)\n-{3,}$', _as_atx('##'), markdown_text, flags=re.MULTILINE)

    return markdown_text

In [None]:
# Source documents are read from ./pdf; one Markdown file per document is
# written to ./pdf/text2.
pdf_dir = Path('./pdf')
text_dir = Path('./text2')
text_path = pdf_dir / text_dir
os.makedirs(text_path, exist_ok=True)

# Only files directly inside pdf_dir are processed; sorting makes the order
# reproducible.
for pdf in sorted(next(os.walk(pdf_dir))[2]):

    # Ask the prebuilt layout model for Markdown output and wait for it.
    # The input handle is released before the output file is written.
    with open(pdf_dir / pdf, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout",
            body=f,
            content_type="application/octet-stream",
            output_content_format=DocumentContentFormat.MARKDOWN,
        )
        result = poller.result()

    # Normalise underlined headings to the "#" form before saving.
    md_content = convert_markdown_headings(result.content)

    # "<stem>.md" keeps the source file name without its extension.
    title = Path(pdf).stem
    out_fname = title + ".md"
    with open(text_path / out_fname, 'w', encoding='utf-8') as outfile:
        outfile.write(md_content)

### Chunk Markdown documents

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
# Split the document into chunks based on Markdown headers.
# Only level-1 headings start a new chunk; add ("##", "Header 2") and/or
# ("###", "Header 3") to the list to split on deeper levels as well.
headers_to_split_on = [("#", "Header 1")]

# strip_headers=False keeps the headers in the splits.
# NOTE: this rebinds text_splitter, replacing the token-based splitter defined
# earlier in the notebook for every cell that runs afterwards.
text_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

In [None]:
# Markdown files are read from ./pdf/text2 and their per-header chunks are
# written to ./pdf/chunks.
pdf_dir = Path('./pdf')
text_dir = Path('./text2')
chunks_dir = Path('./chunks')
text_path = pdf_dir / text_dir
chunks_path = pdf_dir / chunks_dir
os.makedirs(chunks_path, exist_ok=True)

# Relies on text_splitter being the MarkdownHeaderTextSplitter configured in
# the previous cell: each split must expose .page_content.
for text in next(os.walk(text_path))[2]:

    with open(text_path / text, 'r', encoding="utf-8") as f:
        content = f.read()
        splits = text_splitter.split_text(content)

        # Every chunk of this file shares the same stem.
        title = Path(text).stem

        # One "<stem>_<index>.md" file per split.
        for i, split in enumerate(splits):
            out_fname = f"{title}_{i}.md"
            with open(chunks_path / out_fname, 'w', encoding='utf-8') as outfile:
                outfile.write(split.page_content)


### 図の取り出し

In [None]:
# Sample image to analyse. AnalyzeOutputOption.FIGURES is requested so that
# the figures can be fetched in the next cell.
path_to_sample_document = "./Large Language Models A Survey‾02.jpeg"

with open(path_to_sample_document, "rb") as image_file:
    poller = document_analysis_client.begin_analyze_document(
        "prebuilt-layout",
        body=image_file,
        content_type="application/octet-stream",
        output=[AnalyzeOutputOption.FIGURES],
    )

# Wait for the analysis to finish.
result: AnalyzeResult = poller.result()

# Kept for the next cell, which passes it as result_id when fetching figures.
operation_id = poller.details["operation_id"]

In [None]:
# Fetch every figure reported by the analysis and save it as a PNG file in
# the current working directory.
if not result.figures:
    print("No figures found.")
else:
    for figure in result.figures:
        if not figure.id:
            # Figures without an id are skipped; the id is needed for the request.
            continue

        response = document_analysis_client.get_analyze_result_figure(
            model_id=result.model_id, result_id=operation_id, figure_id=figure.id
        )

        # The response is iterated and its pieces are written in order.
        with open(f"Large Language Models A Survey‾{figure.id}.png", "wb") as writer:
            writer.writelines(response)