In [1]:
"""
Récupère toutes les balises xml des fichiers XML Raw
"""

from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict

elements = defaultdict(set)
attributes = defaultdict(set)

raw_dir = Path("../raw_files")
for xml_file in raw_dir.glob("*.xml"):
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        stack = [(root, None)]
        while stack:
            elem, parent = stack.pop()
            if parent:
                elements[parent].add(elem.tag)
            for attr in elem.attrib:
                attributes[elem.tag].add(attr)
            stack.extend((child, elem.tag) for child in elem)
    except Exception as e:
        print(f"Erreur avec {xml_file}: {e}")

all_tags = set(elements.keys()) | {tag for children in elements.values() for tag in children}

print("<!ELEMENT Root ANY>")
for tag in sorted(all_tags):
    children = elements[tag]
    if children:
        children_str = ", ".join(sorted(children))
        print(f"<!ELEMENT {tag} ({children_str})*>")
    else:
        print(f"<!ELEMENT {tag} (#PCDATA)>")
    if attributes[tag]:
        attrs = " ".join(f"{a} CDATA #IMPLIED" for a in attributes[tag])
        print(f"<!ATTLIST {tag} {attrs}>")


<!ELEMENT Root ANY>
<!ELEMENT Article (chap, col, folio, line, margin, margin_car, margin_left, margin_right, ms, supralinear, text, vacat_car, verse_nb)*>
<!ELEMENT Root (Article)*>
<!ELEMENT chap (margin_car, superscript, vacat_car)*>
<!ELEMENT col (#PCDATA)>
<!ELEMENT folio (#PCDATA)>
<!ELEMENT greek (#PCDATA)>
<!ELEMENT line (Article)*>
<!ELEMENT margin (line, margin_car, margin_reconstructed, margin_supralinear, parallel, superscript, verse_nb)*>
<!ELEMENT margin_car (Article)*>
<!ELEMENT margin_infralinear (#PCDATA)>
<!ELEMENT margin_left (margin_car)*>
<!ELEMENT margin_reconstructed (#PCDATA)>
<!ELEMENT margin_right (margin_car)*>
<!ELEMENT margin_supralinear (#PCDATA)>
<!ELEMENT ms (#PCDATA)>
<!ELEMENT parallel (#PCDATA)>
<!ELEMENT superscript (#PCDATA)>
<!ELEMENT supralinear (#PCDATA)>
<!ELEMENT text (Article, folio, greek, line, margin_car, margin_infralinear, margin_reconstructed, margin_supralinear, parallel, superscript, supralinear, vacat_car, verse_nb)*>
<!ELEMENT vacat_

In [2]:
"""
Récupère toutes les balises xml des fichiers XML Tei_files
"""

from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict

elements = defaultdict(set)
attributes = defaultdict(set)

raw_dir = Path("../xml_tei_revised")
for xml_file in raw_dir.glob("*.xml"):
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        stack = [(root, None)]
        while stack:
            elem, parent = stack.pop()
            if parent:
                elements[parent].add(elem.tag)
            for attr in elem.attrib:
                attributes[elem.tag].add(attr)
            stack.extend((child, elem.tag) for child in elem)
    except Exception as e:
        print(f"Erreur avec {xml_file}: {e}")

all_tags = set(elements.keys()) | {tag for children in elements.values() for tag in children}

print("<!ELEMENT Root ANY>")
for tag in sorted(all_tags):
    children = elements[tag]
    if children:
        children_str = ", ".join(sorted(children))
        print(f"<!ELEMENT {tag} ({children_str})*>")
    else:
        print(f"<!ELEMENT {tag} (#PCDATA)>")
    if attributes[tag]:
        attrs = " ".join(f"{a} CDATA #IMPLIED" for a in attributes[tag])
        print(f"<!ATTLIST {tag} {attrs}>")

<!ELEMENT Root ANY>
<!ELEMENT del (#PCDATA)>
<!ATTLIST del rend CDATA #IMPLIED>
<!ELEMENT div (div, line, margin, space, stich, w)*>
<!ATTLIST div folio CDATA #IMPLIED type CDATA #IMPLIED line CDATA #IMPLIED n CDATA #IMPLIED>
<!ELEMENT g (#PCDATA)>
<!ATTLIST g type CDATA #IMPLIED>
<!ELEMENT hi (#PCDATA)>
<!ATTLIST hi rend CDATA #IMPLIED>
<!ELEMENT line (w)*>
<!ATTLIST line folio CDATA #IMPLIED col CDATA #IMPLIED n CDATA #IMPLIED>
<!ELEMENT margin (div, w)*>
<!ATTLIST margin folio CDATA #IMPLIED type CDATA #IMPLIED line CDATA #IMPLIED>
<!ELEMENT ms (div, margin)*>
<!ATTLIST ms name CDATA #IMPLIED>
<!ELEMENT root (ms)*>
<!ELEMENT space (#PCDATA)>
<!ATTLIST space extent CDATA #IMPLIED unit CDATA #IMPLIED>
<!ELEMENT stich (#PCDATA)>
<!ELEMENT w (del, g, hi, space, stich)*>
<!ATTLIST w reconsctructed CDATA #IMPLIED reconstructed CDATA #IMPLIED>
