# Convert TEI into TSV


In [24]:
import re
import os
from lxml import etree
import pandas as pd

In [30]:
def clean_text(text):
    """Return *text* with whitespace normalised.

    Leading and trailing whitespace is dropped, and every internal run of
    whitespace (spaces, tabs, newlines, ...) is collapsed to one space.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty leading/trailing pieces, so a single join does the whole job.
    words = text.split()
    return " ".join(words)

def extract_text(element):
    """Return the whitespace-normalised text content of *element*.

    The element's own text, the text of all its descendants and the tail
    text following each descendant are concatenated in document order, and
    the result is passed through ``clean_text`` exactly once.

    Normalising only at the end matters: cleaning each child separately
    would strip the spaces at the edges of the child's content, gluing it
    to the neighbouring text (``<hi>Lettre </hi>de`` -> ``Lettrede``).

    *element* may be any object exposing ``.text``, ``.tail`` and iteration
    over its children (as ElementTree-style elements do).
    """
    def _raw_text(node):
        # Gather the pieces in a list and join once instead of growing a
        # string with repeated ``+=``.
        pieces = [node.text or ""]
        for child in node:
            pieces.append(_raw_text(child))
            if child.tail:
                pieces.append(child.tail)
        return "".join(pieces)

    return clean_text(_raw_text(element))



In [34]:
# Directory containing the TEI files to convert.
inputpath = os.path.join('..', 'data', '1743_LeRobert', 'tei')

# One row per <sense> element found inside an <entry>:
# volume | entry | entry_lemma | subordinate | subordinate_lemma | subordinate_domain | content
data = []

for filename in sorted(os.listdir(inputpath)):
    # Only the parsing step is guarded: a file with an XML syntax error is
    # reported and skipped, and the remaining files are still processed.
    try:
        parser = etree.XMLParser(collect_ids=False, encoding='utf-8')
        root = etree.parse(os.path.join(inputpath, filename), parser=parser).getroot()
    except etree.XMLSyntaxError as e:
        print(f"Erreur de syntaxe XML : {e}")
        continue

    print(filename)
    # The volume identifier is the third character of the file name.
    volume = filename[2:3]
    # Read the namespace map once per file rather than on every lookup.
    nsmap = root.nsmap

    for entry in root.findall('.//entry', namespaces=nsmap):
        entry_id = entry.get('id')

        form = entry.find('.//form[@type="lemma"]/orth', namespaces=nsmap)
        if form is not None:
            entry_lemma = form.text
        else:
            print("Forme : non trouvée, entry:", entry_id)
            entry_lemma = None

        for i, subordinate in enumerate(entry.findall('.//sense', namespaces=nsmap)):
            form = subordinate.find('.//form/orth', namespaces=nsmap)
            subordinate_lemma = form.text if form is not None else None

            usg = subordinate.find('.//usg[@type="domain"]', namespaces=nsmap)
            subordinate_domain = usg.text if usg is not None else None

            # Extract the text content from the subordinate element
            content = extract_text(subordinate)

            # The first sense is prefixed with the entry lemma. The lemma can
            # be None (no lemma form found, or an <orth> with no leading
            # text); concatenating None would raise TypeError and abort the
            # whole conversion, so the prefix is skipped in that case.
            if i == 0 and entry_lemma is not None:
                content = entry_lemma + " " + content

            data.append([
                volume,
                entry_id,
                entry_lemma,
                i + 1,  # 1-based position of the sense within its entry
                subordinate_lemma,
                subordinate_domain,
                content,
            ])


# convert data into a dataframe

df = pd.DataFrame(data, columns=['volume', 'entry', 'entry_lemma', 'subordinate', 'subordinate_lemma', 'subordinate_domain', 'content'])
df.head()


TR1.tei
TR2.tei
TR3.tei
TR4.tei
TR5.tei
TR6.tei


Unnamed: 0,volume,entry,entry_lemma,subordinate,subordinate_lemma,subordinate_domain,content
0,1,250000010,A,1,,,A est la première Lettre de l'Alphabet Françoi...
1,1,250000010,A,2,,,C'est inutilement que la plupart des Grammairi...
2,1,250000010,A,3,,,"A se prononce du gozier, ce qui ne rend pas ce..."
3,1,250000010,A,4,,,Le son de l'a est ordinairement un son clair. ...
4,1,250000010,A,5,,,Le son de l'a est un de ceux que les muets for...


In [35]:
# Write the table as a tab-separated UTF-8 file, without the DataFrame index column.
df.to_csv(
    os.path.join('..', 'data', '1743_LeRobert', 'Trevoux1743.tsv'),
    sep='\t',
    index=False,
    encoding='utf-8',
)

In [36]:
# Show the dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(135004, 7)