# はじめに
本ノートは、Adobe InDesignで2段組に組版されたPDFファイルを対象に、テキストデータの抽出や、機械学習用データセットの作成を支援するツール群です。
基本的に上から下へ、Ctrl+Enterキーを押していくことで実行が進みます。

# PDFからxmlの抽出
PDFに含まれるテキストを、XML形式で取り出します。このXMLには文字だけでなく、文字の位置やサイズの情報も含まれています。

In [None]:
import fileupload
from IPython.display import display
# Upload widget shared by the whole notebook: the later cells build every
# input and output path from `uploader.filename`.
uploader = fileupload.FileUploadWidget()

def _handle_upload(change):
    """Save an uploaded file to disk and print a short size report.

    ``change['owner']`` is the object whose ``filename`` and ``data``
    attributes are read: ``data`` is written unchanged, in binary mode, to
    a file named ``filename``, and its length is then reported in kB.
    """
    widget = change['owner']
    target = widget.filename
    payload = widget.data
    with open(target, 'wb') as out:
        out.write(payload)
    size_kb = len(payload) / 2**10
    print('Uploaded `{}` ({:.2f} kB)'.format(target, size_kb))

# Register _handle_upload as the observer for the widget's 'data' attribute.
uploader.observe(_handle_upload, names='data')
# The message (Japanese) asks the user to press Browse and pick the PDF to process.
print("Browseボタンを押して、処理したいPDFファイルを登録して下さい")
# Render the widget in the notebook output.
display(uploader)

In [34]:
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
from tqdm import tqdm
# Convert the uploaded PDF into one XML file per page, written to
# "<pdf name>_xml/001.xml", "002.xml", ... (1-based, zero-padded).
xml_dir = uploader.filename + "_xml"
os.makedirs(xml_dir, exist_ok=True)

rsrcmgr = PDFResourceManager()
laparams = LAParams()
# Let the layout analysis consider vertical writing as well.
laparams.detect_vertical = True

# Open the PDF to process; the context manager closes the handle even when a
# page fails to convert.
with open(uploader.filename, 'rb') as fp:
    for page_no, page in tqdm(enumerate(PDFPage.get_pages(fp), start=1)):
        with BytesIO() as rettxt:
            # A fresh converter per page, so each output file holds one page.
            # NOTE(review): the converter is intentionally NOT close()d here.
            # Presumably close() would append a closing </pages> tag, while
            # the later cells only strip the opening <pages> tag before
            # parsing -- confirm against pdfminer before changing this.
            device = XMLConverter(rsrcmgr, rettxt, codec='utf-8',
                                  laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            content = rettxt.getvalue().decode('utf-8')
        xml_path = os.path.join(xml_dir, '{:03d}.xml'.format(page_no))
        # Write UTF-8 explicitly: the later cells read these files back with
        # encoding="utf-8", so the locale default must not be used here.
        with open(xml_path, mode='w', encoding='utf-8') as wf:
            wf.write(content)

440it [00:53,  8.18it/s]


# xmlからテキストの抽出

xmlのままでは人間にとっては読みづらいので、テキスト化します。

In [42]:
import glob
import xml.etree.ElementTree as ET
import numpy as np
import sys
import os
from tqdm import tqdm

import re

# Control characters that XML 1.0 does not allow. They would make
# ET.fromstring() fail, so they are removed before parsing.
xml_illegal_chars = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Turn every per-page XML file into a plain-text file
# "<pdf name>_txt/NNN.txt" with the text boxes in reading order.
xml_dir = uploader.filename + "_xml"
txt_dir = uploader.filename + "_txt"
xmlfilelist = sorted(glob.glob(os.path.join(xml_dir, "*")))
os.makedirs(txt_dir, exist_ok=True)

for xmlpath in tqdm(xmlfilelist):
    xmlname = os.path.basename(xmlpath)

    with open(xmlpath, encoding="utf-8") as f:
        xmlstrings = f.read()
    # Drop the opening <pages> wrapper so that the single page element
    # becomes the document root.
    xmlstrings = xmlstrings.replace("<pages>", "")
    xmlstrings = xml_illegal_chars.sub("", xmlstrings)
    root = ET.fromstring(xmlstrings)

    # The last two bbox values of the root are used as page width / height.
    _, _, width, height = [float(x) for x in root.get("bbox").split(",")]

    resstringlist = []  # text of each text box
    cordlist = []       # sort key of each text box (same order)
    for textbox in root.findall('textbox'):
        cords = [float(x) for x in textbox.get("bbox").split(",")]
        # NOTE(review): assumes bbox is (x0, y0, x1, y1) with the origin at
        # the bottom-left, so these are distances from the right / top edge.
        x1 = width - int(cords[0])
        y1 = height - int(cords[3])
        # Boxes with y1 in the first half of the page sort by x1 alone; the
        # others get +width so that they always sort after the first group.
        cordlist.append(x1 if y1 < height / 2 else x1 + width)

        chars = []
        for textline in textbox.findall('textline'):
            for text in textline.findall('text'):
                if text.text:
                    # Text containing "cid" (presumably pdfminer's "(cid:N)"
                    # marker for unmapped glyphs) becomes a full-width "？".
                    chars.append(
                        "？" if "cid" in text.text else text.text.rstrip())
        resstringlist.append("".join(chars))

    txtname = xmlname[:-4] + ".txt"
    # Explicit UTF-8 so the output does not depend on the locale encoding.
    with open(os.path.join(txt_dir, txtname), mode='w',
              encoding='utf-8') as wf:
        # Stable sort: boxes with equal keys keep their document order.
        for index in np.argsort(np.array(cordlist), kind="stable"):
            wf.write(resstringlist[index])
import shutil
from IPython.display import FileLink

# Pack the "<pdf name>_txt" directory into "<pdf name>_txt.zip" and show a
# download link for the archive in the notebook output.
archive_base = uploader.filename + "_txt"
shutil.make_archive(archive_base, 'zip', root_dir=archive_base)
local_file = FileLink(
    archive_base + ".zip",
    result_html_prefix="作成したテキストをダウンロードする-> ",
)
display(local_file)

100%|██████████| 440/440 [00:04<00:00, 101.29it/s]


# ※以下は機械学習用のツール群です

# PDFの画像化

In [54]:
#!pip3 install pdf2image
import os
from pathlib import Path
from pdf2image import convert_from_path
# Convert the PDF into images (150 dpi).
pdf_path = Path(uploader.filename)
pages = convert_from_path(str(pdf_path), 150)

img_dir = uploader.filename + "_img"
os.makedirs(img_dir, exist_ok=True)

# Save the pages one at a time as 1-based, zero-padded JPEG files
# (001.jpg, 002.jpg, ...).
for page_no, page in tqdm(enumerate(pages, start=1)):
    image_path = img_dir + os.sep + '{:03d}.jpg'.format(page_no)
    page.save(image_path, "JPEG")

440it [00:18, 24.01it/s]


# XML文字位置情報を利用したYOLO形式データセットへの変換
2段組の組版PDFを1段ごとに分割してYOLO形式に変換します。

In [None]:
import cv2
import pandas as pd
import json
import re

# Control characters that XML 1.0 does not allow. They would make
# ET.fromstring() fail, so they are removed before parsing.
xml_illegal_chars = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Build a YOLO-style dataset: every page image is cut into an upper and a
# lower half ("_1" / "_2"), and each half gets a label file with one line per
# character: "<class id> <center x> <center y> <width> <height>", where the
# four box values are fractions of the half image's size.
chardic = {}  # character -> class id (ids start at 1)
xml_dir = uploader.filename + "_xml"
imgdiv_dir = uploader.filename + "_imgdiv"
labelsdiv_dir = uploader.filename + "_labelsdiv"
os.makedirs(imgdiv_dir, exist_ok=True)
os.makedirs(labelsdiv_dir, exist_ok=True)

# Sorted so that class ids are handed out in a reproducible order.
for imgpath in tqdm(sorted(glob.glob(uploader.filename + "_img/*"))):
    img = cv2.imread(imgpath)
    imgheight, imgwidth = img.shape[:2]
    halfheight = imgheight // 2
    imgname = os.path.basename(imgpath)
    stem = imgname[:-4]

    # The XML of a page shares the image's base name.
    xmlpath = os.path.join(xml_dir, stem + ".xml")
    with open(xmlpath, encoding="utf-8") as f:
        xmlstrings = f.read()
    # Drop the opening <pages> wrapper so that the single page element
    # becomes the document root.
    xmlstrings = xmlstrings.replace("<pages>", "")
    xmlstrings = xmlstrings.replace("^H", "")
    xmlstrings = xml_illegal_chars.sub("", xmlstrings)

    root = ET.fromstring(xmlstrings)
    # The last two bbox values of the root are used as page width / height.
    _, _, xmlwidth, xmlheight = [
        float(x) for x in root.get("bbox").split(",")]

    labels1 = []  # label lines for the upper half
    labels2 = []  # label lines for the lower half

    for textbox in root.findall('textbox'):
        for textline in textbox.findall('textline'):
            for text in textline.findall('text'):
                if not text.text or text.text.rstrip() == "":
                    continue
                cords = [float(x) for x in text.get("bbox").split(",")]
                # Scale page coordinates to image pixels. The vertical flip
                # must use the IMAGE height: the values are already pixels.
                # NOTE(review): assumes bbox is (x0, y0, x1, y1) with the
                # origin at the bottom-left.
                x1 = int(cords[0] / xmlwidth * imgwidth)
                x2 = int(cords[2] / xmlwidth * imgwidth)
                y2 = imgheight - int(cords[1] / xmlheight * imgheight)
                y1 = imgheight - int(cords[3] / xmlheight * imgheight)

                # Text containing "cid" (presumably pdfminer's "(cid:N)"
                # marker for unmapped glyphs) is mapped to "?".
                textchar = "?" if "cid" in text.text else text.text.rstrip()
                if textchar not in chardic:
                    chardic[textchar] = len(chardic) + 1

                if y1 < halfheight:
                    labels = labels1
                else:
                    # Shift into the lower half image's own coordinates.
                    labels = labels2
                    y1 -= halfheight
                    y2 -= halfheight

                centerx = (x1 + x2) / (2 * imgwidth)
                centery = (y1 + y2) / (halfheight * 2)
                widthp = (x2 - x1) / imgwidth
                heightp = (y2 - y1) / halfheight
                # "%3f" is a minimum field width of 3, not 3 decimals: the
                # values are written with the default 6 decimal places.
                textstring = "%3f %3f %3f %3f" % (
                    centerx, centery, widthp, heightp)
                labels.append(str(chardic[textchar]) + " " + textstring)

    cv2.imwrite(os.path.join(imgdiv_dir, stem + "_1.jpg"),
                img[:halfheight, :, :])
    cv2.imwrite(os.path.join(imgdiv_dir, stem + "_2.jpg"),
                img[halfheight:, :, :])
    # Build each frame once from the collected lines (DataFrame.append no
    # longer exists in pandas 2 and copied the frame on every call).
    pd.DataFrame({"id": labels1}).to_csv(
        os.path.join(labelsdiv_dir, stem + "_1.txt"),
        index=False, header=False)
    pd.DataFrame({"id": labels2}).to_csv(
        os.path.join(labelsdiv_dir, stem + "_2.txt"),
        index=False, header=False)

# Save the character -> class id table. ensure_ascii=False writes the
# characters themselves, so the file is opened as UTF-8 explicitly.
with open(uploader.filename + '_encodedic.json', 'w', encoding='utf-8') as f:
    json.dump(chardic, f, ensure_ascii=False)