In [13]:
import pandas as pd
from glob import glob
import xml.etree.ElementTree as ET

In [14]:
from xml.dom import minidom
import re

In [15]:
def prettify(rough_string):
    """Return an XML document re-indented with tabs under the fixed TEI header.

    The input is parsed with ``minidom`` and pretty-printed using one tab per
    nesting level. The XML declaration and any ``xml-model`` processing
    instructions at the start of the pretty-printed text are replaced by a
    single fixed header (UTF-8 declaration plus the two ``tei_all.rng``
    ``xml-model`` instructions), so running the function on its own output
    does not duplicate the header. Whitespace-only lines are removed.

    Args:
        rough_string: A well-formed XML document (``str`` or ``bytes``).

    Returns:
        The formatted document as ``str``.

    Raises:
        xml.parsers.expat.ExpatError: If ``rough_string`` is not well-formed.
    """
    reparsed = minidom.parseString(rough_string)
    xml_declaration = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml"
	schematypens="http://purl.oclc.org/dsdl/schematron"?>'''

    # Get the pretty-printed XML first.
    pretty = reparsed.toprettyxml(indent="\t")

    # Replace the prolog. Strip the declaration written by minidom together
    # with any xml-model instructions the input already had, then prepend the
    # header exactly once. (A bare `<\?xml.*?\?>` substitution would also hit
    # every `<?xml-model ...?>` and repeat the whole header for each of them.)
    pretty = re.sub(
        r'\A(?:\s*<\?xml(?:-model)?\s.*?\?>)*\s*', '', pretty, flags=re.DOTALL
    )
    pretty = xml_declaration + "\n" + pretty

    # Clean-up of the layout produced by toprettyxml.
    # Remove trailing tabs/spaces together with the newline that follows them.
    pretty = re.sub(r"[\t ]+\n", "", pretty)
    # Collapse consecutive newlines (including whitespace-only lines) into one.
    pretty = re.sub(r"\n\s*\n", "\n", pretty)

    # Restore escaped double quotes, but only in character data: anything
    # between '<' and '>' is matched as a whole and written back untouched, so
    # an attribute value containing &quot; stays well-formed.
    pretty = re.sub(
        r"(<[^>]*>)|&quot;", lambda m: m.group(1) or '"', pretty
    )

    return pretty

ET.register_namespace('', "http://www.tei-c.org/ns/1.0")  # register TEI as the default (unprefixed) namespace

In [16]:
new_path = "./data/カタログデータ - 画像のID一覧_OCR.csv"

new_df = pd.read_csv(new_path)

# Maps each selected item ID to the list of its image paths, in CSV row order.
new_map = {}

for _, row in new_df.iterrows():
    item_id = row["o:item"]

    # Rows without an item ID cannot be assigned to any document.
    if pd.isna(item_id):
        continue

    # Keep only the two blue-folder items and every ID starting with "Tursko5".
    is_selected = (
        item_id in ("TurskoBlueFolder-001", "TurskoBlueFolder-002")
        or item_id.startswith("Tursko5")
    )
    if not is_selected:
        continue

    # Append the ".JPG" extension to the path stored in the CSV.
    new_map.setdefault(item_id, []).append(row["画像のファイルパス"] + ".JPG")


In [17]:
# Display the item-ID -> image-path mapping for a visual check.
new_map

{'Tursko5-Cover': ['Day 11/IMG_0116.JPG',
  'Day 11/IMG_0117.JPG',
  'Day 14/IMG_0039.JPG',
  'Day 14/IMG_0040.JPG'],
 'TurskoBlueFolder-001': ['Day 14/IMG_0034.JPG', 'Day 14/IMG_0035.JPG'],
 'TurskoBlueFolder-002': ['Day 14/IMG_0036.JPG', 'Day 14/IMG_0037.JPG'],
 'Tursko5-001': ['Day 14/IMG_0042.JPG', 'Day 14/IMG_0043.JPG'],
 'Tursko5-002': ['Day 14/IMG_0044.JPG', 'Day 14/IMG_0045.JPG'],
 'Tursko5-003': ['Day 14/IMG_0046.JPG', 'Day 14/IMG_0047.JPG'],
 'Tursko5-004': ['Day 14/IMG_0048.JPG', 'Day 14/IMG_0049.JPG'],
 'Tursko5-005': ['Day 14/IMG_0052.JPG', 'Day 14/IMG_0053.JPG'],
 'Tursko5-006': ['Day 14/IMG_0054.JPG', 'Day 14/IMG_0055.JPG'],
 'Tursko5-007': ['Day 14/IMG_0056.JPG', 'Day 14/IMG_0057.JPG'],
 'Tursko5-008': ['Day 14/IMG_0058.JPG', 'Day 14/IMG_0059.JPG'],
 'Tursko5-009': ['Day 14/IMG_0062.JPG', 'Day 14/IMG_0063.JPG'],
 'Tursko5-010': ['Day 14/IMG_0064.JPG', 'Day 14/IMG_0065.JPG'],
 'Tursko5-011': ['Day 14/IMG_0066.JPG', 'Day 14/IMG_0067.JPG'],
 'Tursko5-012': ['Day 14/IMG_006

In [21]:
from tqdm import tqdm

# ElementTree spells a namespaced tag as "{namespace-uri}local-name".
TEI_TAG_PREFIX = "{http://www.tei-c.org/ns/1.0}"

# Skeleton TEI document; the title and the facsimile surfaces are filled in
# per item below. It is the same for every item, so it is defined once.
raw_tei_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml"
	schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
	<teiHeader>
		<fileDesc>
			<titleStmt>
				<title></title>
			</titleStmt>
			<publicationStmt>
				<p>Publication Information</p>
			</publicationStmt>
			<sourceDesc>
				<p/>
			</sourceDesc>
		</fileDesc>
	</teiHeader>
	<facsimile>
	</facsimile>
	<text>
		<body>
        <p/>
		</body>
	</text>
</TEI>
'''

# Write one TEI file per item: the item ID becomes the title and every image
# becomes a <surface><graphic url="..."/></surface> entry under <facsimile>.
for item_id in tqdm(new_map):
    images = new_map[item_id]

    # Parse a fresh copy of the skeleton for each item.
    root = ET.fromstring(raw_tei_xml)

    root.find(f".//{TEI_TAG_PREFIX}title").text = item_id

    facsimile = root.find(f".//{TEI_TAG_PREFIX}facsimile")
    for image in images:
        # Create the new elements in the TEI namespace explicitly, so the
        # tree is correct on its own rather than relying on the default
        # namespace being serialised without a prefix.
        surface = ET.SubElement(facsimile, f"{TEI_TAG_PREFIX}surface")
        ET.SubElement(
            surface, f"{TEI_TAG_PREFIX}graphic", {"url": f"../../../{image}"}
        )

    # The header written by prettify declares UTF-8, so the file must be
    # encoded as UTF-8 whatever the platform default is.
    with open(f"../docs/tei3/{item_id}.xml", "w", encoding="utf-8") as f:
        f.write(prettify(ET.tostring(root, encoding="utf-8").decode("utf-8")))


100%|██████████| 608/608 [00:00<00:00, 980.58it/s] 
