# Transform XML into TEI


In [1]:
import re
import os
from lxml import etree

[x] <article ID=""> : <entry xml:id="">
[x] <Nat_Art>Trévoux 1743</Nat_Art> : removed
[x] <G><vedette>ABAISER,</vedette></G> : <form type="lemma"><orth>ABAISER</orth></form><pc>,</pc> # sortir la virgule ou le point
[x] <svedet> : <form><orth>
[x] <cat_gra> : <gramGrp>
[x] {x} : <pb n="x"/>
[x] <G> : removed
[x] <S> : removed
[x] <I> : removed
[x] <auteur> : removed
[x] <cit> : removed
[x] <oeuvre> : removed
[x] <REM> : <g ref="#manicule-glyph"/>
[x] <dom>termes de Fauconnerie</dom> : <usg type="domain" ana="#fauconnerie">termes de Fauconnerie</usg>
[] <usg type="domain" >termes de Fauconnerie</usg>: ajouter l'attribut <usg type="domain" ana="#fauconnerie">termes de Fauconnerie</usg>
[x] ajout des balises <p> : après la vedette et à chaque retour chariot à l'intérieur d'une entry
[x]<form type="lemmaGrp"> : intègre le <form type="lemma"> et le <gramGrp>
[] <tab> : ?
[] etym ?

In [2]:
def add_citation(content):
    lines = content.split('\n')
    new_content = []
    for line in lines:
   
        if '<cit>' in line:
            line = line.replace('<cit>', '<cit type="example"><quote>')
        if '</cit>' in line:
            line = line.replace('</cit>', '') + '</quote></cit>'

        new_content.append(line)
    return '\n'.join(new_content)


def add_paragraph(content):
    # read file line by line, search for <entry>, then for lines not starting with an element until line starting with </entry>
    # add <p> before the first line not starting with an element
    # and </p> before the line starting with </entry>
    lines = content.split('\n')
    new_content = []
    inside_entry = False
    inside_tab = False
    inside_citation = False
    for line in lines:
        stripped_line = line.strip()
        if '<cit' in stripped_line:
            inside_citation = True
        if '</cit>' in stripped_line:
            inside_citation = False
            new_content.append(line)
            continue
        if stripped_line.startswith('<tab>'):
            inside_tab = True
        elif stripped_line.startswith('</tab>'):
            inside_tab = False
            new_content.append(line)
            continue
        if stripped_line.startswith('<entry'):
            inside_entry = True
            new_content.append(line)
        elif inside_entry and stripped_line.startswith('</entry'):
            inside_entry = False
            new_content.append(line)
        elif inside_entry:
            # TODO and not inside citation
            if not inside_tab and not inside_citation and not (stripped_line.startswith('<') and stripped_line.endswith('>')) and not '<form type="lemma"' in stripped_line:
                new_content.append('<p>')
                new_content.append("          "+line)
                new_content.append("        </p>")
            else:
                new_content.append("        "+line)
        else:
            new_content.append(line)
    return '\n'.join(new_content)

def add_lemmaGrp(content):
    lines = content.split('\n')
    new_content = []
    inside_entry = False
    for line in lines:
        stripped_line = line.strip()
        if stripped_line.startswith('<entry'):
            inside_entry = True
            new_content.append(line)
            new_content.append('<form type="lemmaGrp">')
        elif inside_entry and stripped_line.startswith('</entry'):
            inside_entry = False
            new_content.append(line)
        elif inside_entry:
            if '<form type="lemma"' in stripped_line or stripped_line.startswith('<orth>') or stripped_line.startswith('<pb ') or stripped_line.startswith('</form>') or stripped_line.startswith('<pc>') or stripped_line.startswith('<g ref'):
                new_content.append(line)    
            else:
                if stripped_line.startswith('<gramGrp>'):
                    line = line.replace('</gramGrp>', '</gramGrp></form>\n', 1)
                    new_content.append(line)
                else:
                    new_content.append('</form>')
                    new_content.append(line)
                inside_entry = False
        else:
            new_content.append(line)
    return '\n'.join(new_content)



def xml_2_tei(content):

    content = content.replace('\n<Nat_Art>Trévoux 1743</Nat_Art>', '')
    content = content.replace('<article ID=', '<entry id=').replace('</article>', '</entry>')
    content = content.replace('<cat_gra>', '<gramGrp>').replace('</cat_gra>', '</gramGrp>')

    # vedette
    content = content.replace(',</vedette>', '</vedette><pc>,</pc>\n').replace('.</vedette>', '</vedette><pc>.</pc>\n')
    content = content.replace('<vedette>', '<form type="lemma">\n<orth>').replace('</vedette>', '</orth>\n</form>\n')
    
    # page break : problème fin de colonne et pas de page...
    content = re.sub(r'{([0-9]+)}', r'<pb n="\1"/>', content)
    # find: <pb n="146"/> dans TR1.tei
    # déplacer les page break, par exemple ceux qui sont directement dans <entry> ou <form> ?

    # problème position balise manicule ?
    content = content.replace('<REM />', '<g ref="#manicule-glyph"/>')

    # gramGrp
    content = add_lemmaGrp(content)
    
    content = add_citation(content)
    # paragraph
    content = add_paragraph(content)

    # svedet
    content = content.replace(',</svedet>', '</svedet><pc>,</pc>\n').replace('.</svedet>', '</svedet><pc>.</pc>\n')
    content = content.replace('<svedet>', '<form>\n<orth>').replace('</svedet>', '</orth>\n</form>\n')

    # dom
    content = content.replace(',</dom>', '</dom>\n<pc>,</pc>\n').replace('.</dom>', '</dom>\n<pc>.</pc>\n')
    content = content.replace('<dom>', '<usg type="domain">').replace('</dom>', '</usg>')
    # <usg type="domain" ana="#fauconnerie">termes de Fauconnerie</usg>
    
    return content


In [3]:
inputpath = os.path.join('..', 'data', '1743_LeRobert', 'xml')
outputpath = os.path.join('..', 'data', '1743_LeRobert', 'tei')
if not os.path.exists(outputpath):
    os.makedirs(outputpath)

for filename in sorted(os.listdir(inputpath)):
    if not filename.endswith('.xml'):
        continue
    with open(os.path.join(inputpath,filename), 'r', encoding='utf-8') as f:
        content = f.read()

        # suppression de certaines balises (solution temporaire)
        for tag in ['G', 'S', 'I', 'auteur', 'oeuvre', 'Etym']: # 'cit'
            content = content.replace(f'<{tag}>', f'').replace(f'</{tag}>', f'')
        
        content = xml_2_tei(content)

        with open(f'{os.path.join(outputpath, filename[:-4])}.tei', 'w', encoding='utf-8') as f:
            f.write(content)

In [4]:
# update paragraph with n=, improve indentation

inputpath = os.path.join('..', 'data', '1743_LeRobert', 'tei')

for filename in sorted(os.listdir(inputpath)):
    try:
        parser = etree.XMLParser(collect_ids=False, encoding='utf-8') 
        root = etree.parse(os.path.join(inputpath, filename), parser=parser).getroot()    

        print(filename)
        volume = filename[2:3]

        for entry in root.findall('.//entry', namespaces=root.nsmap):
            # add attribute type="mainEntry" to <entry>
            entry.set('type', 'mainEntry')

            # if <paragraph> exists and contains <form>, add an <entry type="relatedEntry"> as parent for this <paragraph> and the following ones (until another paragraph with form or end of entry)
            paragraphs = entry.findall('.//p', namespaces=root.nsmap)
            new_paragraphs = []
            current_related_entry = None
            for paragraph in paragraphs:
                form = paragraph.find('.//form', namespaces=root.nsmap)
                if form is not None:
                    # create new relatedEntry
                    current_related_entry = etree.Element('entry', type='relatedEntry')
                    entry.append(current_related_entry)
                    
                if current_related_entry is not None:
                    current_related_entry.append(paragraph)

        for entry in root.findall('.//entry', namespaces=root.nsmap):
            for i, subordinate in enumerate(entry.findall('.//paragraph', namespaces=root.nsmap)):                
                # add n attribute to paragraph
                subordinate.set('n', str(i+1))

        etree.indent(root, space="  ", level=0)
        # write the modified XML back to a file
        with open(os.path.join(inputpath, filename), 'wb') as f:
            f.write(etree.tostring(root, pretty_print=True, xml_declaration=True, encoding='UTF-8'))


    except etree.XMLSyntaxError as e:
        print(f"Erreur de syntaxe XML : {e}")




Erreur de syntaxe XML : Document is empty, line 1, column 1 (.DS_Store, line 1)
TR1.tei
TR2.tei
TR3.tei
TR4.tei
TR5.tei
TR6.tei


## Check if the XML are valid

In [6]:
inputpath = os.path.join('..', 'data', '1743_LeRobert', 'tei')

for filename in sorted(os.listdir(inputpath)):
    try:
        parser = etree.XMLParser(collect_ids=False, encoding='utf-8') 
        root = etree.parse(os.path.join(inputpath, filename), parser=parser).getroot()    
        #print(root.nsmap)
        print(filename)
        # compte les balise <entry>
        entry_count = len(root.findall('.//entry[@type="mainEntry"]', namespaces=root.nsmap))
        print(f"Nombre d'entrées : {entry_count}")
        # compte les balise <p>
        paragraph_count = len(root.findall('.//p', namespaces=root.nsmap))
        print(f"Nombre de paragraphes : {paragraph_count}")
        
    except etree.XMLSyntaxError as e:
        print(f"Erreur de syntaxe XML : {e}")

Erreur de syntaxe XML : Document is empty, line 1, column 1 (.DS_Store, line 1)
TR1.tei
Nombre d'entrées : 8244
Nombre de paragraphes : 19458
TR2.tei
Nombre d'entrées : 10146
Nombre de paragraphes : 20813
TR3.tei
Nombre d'entrées : 9818
Nombre de paragraphes : 21404
TR4.tei
Nombre d'entrées : 9470
Nombre de paragraphes : 20244
TR5.tei
Nombre d'entrées : 10702
Nombre de paragraphes : 19547
TR6.tei
Nombre d'entrées : 5586
Nombre de paragraphes : 10591
