In [243]:
import pandas as pd
from glob import glob
import xml.etree.ElementTree as ET
import json

In [244]:
from xml.dom import minidom
import re
def prettify(rough_string):
    """Return ``rough_string`` re-serialized by minidom with tab indentation.

    The raw ``toprettyxml`` output is post-processed to remove the blank and
    whitespace-only lines that appear when the input was already indented.

    Args:
        rough_string: XML document as a string (or bytes) accepted by
            ``minidom.parseString``.

    Returns:
        The pretty-printed XML text.

    Note:
        ``&quot;`` is replaced by a literal double quote across the whole
        serialized string, not only inside text nodes.
    """
    document = minidom.parseString(rough_string)
    text = document.toprettyxml(indent="\t")

    # Drop trailing tabs/spaces together with the newline that follows them.
    text = re.sub(r"[\t ]+\n", "", text)
    # Remove an empty line left between two tags.
    text = text.replace(">\n\n\t<", ">\n\t<")
    # Collapse consecutive newlines (including whitespace-only lines) into one.
    text = re.sub(r"\n\s*\n", "\n", text)

    # Restore escaped double quotes.
    return text.replace("&quot;", "\"")

In [245]:
# CSV that relates the current items to the newly photographed images.
old_path = "./data/カタログデータ - 新規画像撮影の対応表.csv"
old_df = pd.read_csv(old_path)

# old_map:       "現在のアイテムID" value -> {"id_new", "files", "folder"}
# old_map_extra: each "|"-separated entry of "画像一覧" -> "現在のフォルダ" value
old_map = {}
old_map_extra = {}

for _, record in old_df.iterrows():
    image_list = record["画像一覧"]

    # Only rows that actually list images contribute to either map.
    if not pd.isna(image_list):
        basenames = image_list.split("|")
        folder = record["現在のフォルダ"]

        old_map[record["現在のアイテムID"]] = {
            "id_new": record["dcterms:identifer"],
            "files": basenames,
            "folder": folder,
        }

        # Every listed image points back at the folder of its row.
        old_map_extra.update(dict.fromkeys(basenames, folder))

# old_map

In [246]:
# old_map_extra

In [247]:
# CSV listing image file paths together with the item ("o:item") they belong to.
new_path = "./data/カタログデータ - 画像のID一覧_OCR.csv"

new_df = pd.read_csv(new_path)

# new_map: "o:item" value -> list of image paths (with ".JPG" appended), in CSV row order.
new_map = {}

for _, row in new_df.iterrows():
    # Named item_id rather than id so the builtin id() is not shadowed
    # for every later cell of the notebook.
    item_id = row["o:item"]

    # Rows without an item id cannot be attached to anything; skip them.
    if pd.isna(item_id):
        continue

    image_path = row["画像のファイルパス"] + ".JPG"
    new_map.setdefault(item_id, []).append(image_path)

# new_map


In [248]:
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_ID_ATTR = "{http://www.w3.org/XML/1998/namespace}id"
ZONE_COORD_ATTRS = ("ulx", "uly", "lrx", "lry")


def tei_tag(tag):
    """Return ``tag`` qualified with the TEI namespace in ElementTree ``{uri}tag`` form."""
    return "{%s}%s" % (TEI_NS, tag)


def load_canvas_labels(manifest_url):
    """Return ``{canvas "@id": canvas "label"}`` for the local copy of a manifest.

    The local file is ``../docs/old/iiif/<segment>/manifest.json`` where
    ``<segment>`` is the second-to-last ``/``-separated part of ``manifest_url``.
    """
    manifest_path = "../docs/old/iiif/" + manifest_url.split("/")[-2] + "/manifest.json"
    # JSON is read as UTF-8 explicitly so the result does not depend on the locale.
    with open(manifest_path, "r", encoding="utf-8") as f:
        manifest_json = json.load(f)
    return {
        canvas["@id"]: canvas["label"]
        for canvas in manifest_json["sequences"][0]["canvases"]
    }


def copy_zone(zone):
    """Return a new ``zone`` element with the coordinates and xml:id of ``zone``.

    Each attribute is copied from the attribute of the same name and only when
    it is present, so the copy never carries a ``None`` value (which
    ElementTree cannot serialize).
    """
    zone_new = ET.Element("zone")
    for attr in ZONE_COORD_ATTRS + (XML_ID_ATTR,):
        value = zone.get(attr)
        if value is not None:
            zone_new.set(attr, value)
    return zone_new


# Narrow the glob pattern to a single file name when debugging one document.
files = glob("../docs/tei3_original/*.xml")
files.sort()

# Register TEI as the default namespace so it is serialized without a prefix.
ET.register_namespace('', TEI_NS)

# Graphics that could not be resolved: either the canvas has no "@value" label
# ("canvas_id" entries) or the image basename has no known folder ("basename" entries).
missings = []

# Kept for the inspection cells below; nothing is appended here because
# failures are allowed to raise.
errors = []

for file in files:
    tree = ET.parse(file)
    root = tree.getroot()

    facsimile = root.find(".//" + tei_tag("facsimile"))

    for surface_grp in root.findall(".//" + tei_tag("surfaceGrp")):
        # The "facs" attribute holds the manifest URL of this group.
        canvas_map = load_canvas_labels(surface_grp.get("facs"))

        for surface in surface_grp.findall(".//" + tei_tag("surface")):
            graphics = surface.findall(".//" + tei_tag("graphic"))
            zones = surface.findall(".//" + tei_tag("zone"))

            surface_new = ET.Element("surface")
            facsimile.append(surface_new)

            # Zones are copied once per surface (after its first resolved graphic)
            # so that xml:id values are not duplicated when a surface has
            # several graphics.
            zones_copied = False

            for graphic in graphics:
                canvas_id = graphic.get("n")

                label = canvas_map.get(canvas_id)
                if not isinstance(label, dict) or "@value" not in label:
                    missings.append({
                        "file": file,
                        "canvas_id": canvas_id
                    })
                    continue

                filename = label["@value"]
                file_basename = filename.split(".")[0]

                old_folder = old_map_extra.get(file_basename)
                if old_folder is None:
                    missings.append({
                        "file": file,
                        "basename": file_basename
                    })
                    continue

                graphic_new = ET.Element("graphic")
                graphic_new.set("url", "../../../" + old_folder.replace(" ", "%20") + "/" + filename)
                surface_new.append(graphic_new)

                if not zones_copied:
                    for zone in zones:
                        surface_new.append(copy_zone(zone))
                    zones_copied = True

                # Newly photographed images: basename -> old_map entry -> new id -> new_map.
                # Each lookup is guarded on the map it actually indexes.
                # NOTE(review): the previous guard tested `file_basename not in new_map`
                # before indexing old_map; confirm the intended condition.
                old_info = old_map.get(file_basename)
                if old_info is None:
                    continue

                new_files = new_map.get(old_info["id_new"])
                if new_files is None:
                    continue

                for new_file in new_files:
                    # A separate name keeps surface_new bound to the current surface
                    # for any remaining graphics.
                    extra_surface = ET.Element("surface")
                    facsimile.append(extra_surface)

                    extra_graphic = ET.Element("graphic")
                    extra_graphic.set("url", "../../../" + new_file.replace(" ", "%20"))
                    extra_surface.append(extra_graphic)

        # The original group is replaced by the flat surfaces appended above.
        facsimile.remove(surface_grp)

    opath = file.replace("tei3_original", "tei3")
    # Written as UTF-8 explicitly: the serialized document declares no other encoding.
    with open(opath, "w", encoding="utf-8") as f:
        f.write(prettify(ET.tostring(root, encoding="utf-8").decode("utf-8")))





TypeError: cannot serialize None (type NoneType)

In [221]:
len(missings)

1

In [222]:
missings

[{'file': '../docs/tei3_original/DSCN1969.xml',
  'canvas_id': 'https://diyhistory.org/public/phr2/iiif/1499/canvas/p3'}]

In [223]:
len(errors)

3

In [224]:
errors

[{'file': '../docs/tei3_original/DSCN0216.xml',
  'error': TypeError('cannot serialize None (type NoneType)')},
 {'file': '../docs/tei3_original/DSCN0241.xml',
  'error': TypeError('cannot serialize None (type NoneType)')},
 {'file': '../docs/tei3_original/DSCN0253.xml',
  'error': TypeError('cannot serialize None (type NoneType)')}]