In [1]:
import os, json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import numpy as np

load_dotenv(".env")

True

In [2]:
# Service endpoint and API key are read from the environment;
# a missing variable raises KeyError right here.
endpoint = os.environ["FR_ENDPOINT"]
key = os.environ["FR_KEY"]

# Client used for every analysis call in this notebook.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

<h5>Document Intelligence - Text Extraction by Layout Model
</h5>


├── pdf_dir  
│&emsp; &emsp; ├── text_dir  
│&emsp; &emsp; └── pdf files  
└── this script

In [3]:
# Input PDFs are read from pdf_dir; extracted text goes to pdf_dir/text_dir.
pdf_dir = Path("./pdf")
text_dir = Path("./text")
text_path = pdf_dir / text_dir

# Create the output folder (and pdf_dir itself) if it does not exist yet.
text_path.mkdir(parents=True, exist_ok=True)

def format_out_doc(resultPages: list, title: str):
    r"""Collect the text of each analysed page into a dict keyed by page.

    Args:
        resultPages: Page objects exposing ``page_number`` and ``lines``;
            each line exposes its text as ``content``.
        title: Prefix used for the keys of the returned dict.

    Returns:
        dict mapping ``"<title>_<page_number formatted with '02'>"`` to the
        page text: the line contents joined with "\n" after each one, every
        "\n" then preceded by an added space, and every ":unselected:"
        marker replaced by a single space.
    """
    def page_text(page):
        # Order matters: terminate the lines first, then pad the newlines,
        # then blank out the ":unselected:" markers.
        joined = "".join(line.content + "\n" for line in page.lines)
        return joined.replace('\n', ' \n').replace(":unselected:", " ")

    return {
        title + "_" + format(page.page_number, "02"): page_text(page)
        for page in resultPages
    }



# Analyse every PDF directly inside pdf_dir with the prebuilt layout model and
# save the per-page text as JSON in text_path/<pdf stem>.txt.
# Only *.pdf files are submitted, so stray files in the folder (.DS_Store,
# Thumbs.db, ...) are skipped instead of being sent to the service. Names are
# sorted so the processing order is the same on every run. A missing pdf_dir
# raises FileNotFoundError from iterdir().
pdf_names = sorted(
    entry.name
    for entry in Path(pdf_dir).iterdir()
    if entry.is_file() and entry.suffix.lower() == ".pdf"
)

for pdf in pdf_names:
    # Read the bytes up front so the input handle is closed before the
    # service call is made.
    data_bytes = Path(pdf_dir, pdf).read_bytes()

    poller = document_analysis_client.begin_analyze_document("prebuilt-layout", data_bytes)
    result = poller.result()

    # One dict per document: "<pdf stem>_<page number>" -> page text.
    title = Path(pdf).stem
    text = format_out_doc(result.pages, title)

    out_fname = title + ".txt"
    with open(os.path.join(text_path, out_fname), 'w', encoding='utf-8') as outfile:
        # ensure_ascii=False writes non-ASCII characters as-is instead of
        # \uXXXX escapes.
        json.dump(text, outfile, ensure_ascii=False)

## TEST: do not run the cells below

In [5]:
# Sample document analysed by the rest of this cell.
pdf_dir, pdf = Path("./pdf"), "fukuoka-garbage.pdf"

# NOTE(review): same definition as in the batch-extraction cell; keep the two
# copies in sync.
def format_out_doc(resultPages: list, title: str):
    r"""Collect the text of each analysed page into a dict keyed by page.

    Args:
        resultPages: Page objects exposing ``page_number`` and ``lines``;
            each line exposes its text as ``content``.
        title: Prefix used for the keys of the returned dict.

    Returns:
        dict mapping ``"<title>_<page_number formatted with '02'>"`` to the
        page text: the line contents joined with "\n" after each one, every
        "\n" then preceded by an added space, and every ":unselected:"
        marker replaced by a single space.
    """
    def page_text(page):
        # Order matters: terminate the lines first, then pad the newlines,
        # then blank out the ":unselected:" markers.
        joined = "".join(line.content + "\n" for line in page.lines)
        return joined.replace('\n', ' \n').replace(":unselected:", " ")

    return {
        title + "_" + format(page.page_number, "02"): page_text(page)
        for page in resultPages
    }

# Read the sample PDF into memory; the handle is only needed for the read.
with open(Path(pdf_dir, pdf), "rb") as f:
    data_bytes = f.read()

# Run the prebuilt layout model and wait for the analysis to finish.
poller = document_analysis_client.begin_analyze_document("prebuilt-layout", data_bytes)
result = poller.result()

# Per-page dict, keyed "<pdf stem>_<page number>".
title = Path(pdf).stem
text = format_out_doc(result.pages, title)

# From here on `text` is the JSON string form of that dict; the following
# cells parse it back with json.loads.
text = json.dumps(text, ensure_ascii=False)
text

'{"fukuoka-garbage_01": "Flow of waste disposal in Fukuoka City \\nGarbage in Fukuoka City is divided into four categories: burnable garbage, non-combustible garbage, empty bottles and PET bottles, and oversized garbage. \\nAlthough it may seem that the amount is less than in other municipalities, the garbage collected is actually further divided into nine categories at each treatment f \\n燃えるごみ用 \\n69452 \\n4 52 \\n300135 \\n500PIR \\n69458 \\nPeriodic \\ncollection \\nBurnable \\ngarbage \\nPeriodic \\ncollection \\nNon-burnable \\ngarbage \\nApplication \\nrequired \\nBulky \\nPeriodic \\ncollection \\nEmpty Bottle \\nPet Bottle \\nPower Generation \\nand Heat Supply \\nIncineration ash \\nIncineration at \\ndepreciation facilities \\nCombustible \\n① \\nLandfill \\nron-combustib \\nmaterials \\n2 \\nCrushing and sorting \\nat the Recycling \\nIron \\n③ \\n(Non-combustible) \\nAluminium \\n④ \\nnon-usable items \\n(Flammable materials) \\n⑤ \\nUsable items (furniture, etc.) \\nReuse

In [6]:
# Parse the JSON string back into a Python object and display it.
content_dict = json.loads(text)
content_dict


{'fukuoka-garbage_01': 'Flow of waste disposal in Fukuoka City \nGarbage in Fukuoka City is divided into four categories: burnable garbage, non-combustible garbage, empty bottles and PET bottles, and oversized garbage. \nAlthough it may seem that the amount is less than in other municipalities, the garbage collected is actually further divided into nine categories at each treatment f \n燃えるごみ用 \n69452 \n4 52 \n300135 \n500PIR \n69458 \nPeriodic \ncollection \nBurnable \ngarbage \nPeriodic \ncollection \nNon-burnable \ngarbage \nApplication \nrequired \nBulky \nPeriodic \ncollection \nEmpty Bottle \nPet Bottle \nPower Generation \nand Heat Supply \nIncineration ash \nIncineration at \ndepreciation facilities \nCombustible \n① \nLandfill \nron-combustib \nmaterials \n2 \nCrushing and sorting \nat the Recycling \nIron \n③ \n(Non-combustible) \nAluminium \n④ \nnon-usable items \n(Flammable materials) \n⑤ \nUsable items (furniture, etc.) \nReuse \nRecycle \nSorting and \npackaging at a \nsor

In [7]:
def remove_page_format(text: str):
    """Flatten the per-page JSON document into one single-line string.

    Args:
        text: JSON text of an object whose values are the page strings.

    Returns:
        The page strings concatenated in the object's key order, with every
        newline replaced by a space.
    """
    pages = json.loads(text)
    merged = "".join(pages[key] for key in pages)
    return merged.replace('\n', ' ')

# Print the sample document with the page structure and newlines removed.
print(remove_page_format(text))

Flow of waste disposal in Fukuoka City  Garbage in Fukuoka City is divided into four categories: burnable garbage, non-combustible garbage, empty bottles and PET bottles, and oversized garbage.  Although it may seem that the amount is less than in other municipalities, the garbage collected is actually further divided into nine categories at each treatment f  燃えるごみ用  69452  4 52  300135  500PIR  69458  Periodic  collection  Burnable  garbage  Periodic  collection  Non-burnable  garbage  Application  required  Bulky  Periodic  collection  Empty Bottle  Pet Bottle  Power Generation  and Heat Supply  Incineration ash  Incineration at  depreciation facilities  Combustible  ①  Landfill  ron-combustib  materials  2  Crushing and sorting  at the Recycling  Iron  ③  (Non-combustible)  Aluminium  ④  non-usable items  (Flammable materials)  ⑤  Usable items (furniture, etc.)  Reuse  Recycle  Sorting and  packaging at a  sorting facility  L  Empty bottle  E  colourless  ⑥  brown  ⑦  PET bottle  ot