# Convert TEI into TSV


In [14]:
import re
import os
from lxml import etree
import pandas as pd

In [15]:
def clean_text(text):
    # Remove leading and trailing whitespace
    text = text.strip()
    # Replace newlines with a space
    text = text.replace("\n", " ")
    # Split the text into words and join them with a single space
    text = " ".join(text.split())
    return text

def extract_text(element):
    text = element.text or ""
    for child in element:
        text += extract_text(child)
        if child.tail:
            text += child.tail
    return clean_text(text)



In [52]:
inputpath = os.path.join('..', 'data', '1743_LeRobert', 'tei')

data = []

for filename in sorted(os.listdir(inputpath)):
    try:
        parser = etree.XMLParser(collect_ids=False, encoding='utf-8') 
        root = etree.parse(os.path.join(inputpath, filename), parser=parser).getroot()    
        #print(root.nsmap)
        print(filename)
        volume = filename[2:3]

        for entry in root.findall('.//entry[@type="mainEntry"]', namespaces=root.nsmap):
            
            id = 1
            entry_id = entry.get('id')
            form = entry.find('.//form[@type="lemma"]/orth', namespaces=root.nsmap)
            if form is not None:
                entry_lemma = form.text
            else:
                print("Forme : non trouvée, entry:", entry_id)
                entry_lemma = None
            
            for i, paragraph in enumerate(entry.findall('./p', namespaces=root.nsmap)):
                usg = None
                content = ""
                form = paragraph.find('.//form/orth', namespaces=root.nsmap)
                if form is not None:
                    paragraph_lemma = form.text
                else:
                    paragraph_lemma = None
                
                # get number of usg with type="domain" in the paragraph
                num_usg_domain = len(paragraph.findall('.//usg[@type="domain"]', namespaces=root.nsmap))

                usg = paragraph.find('.//usg[@type="domain"]', namespaces=root.nsmap)
                if usg is not None:
                    paragraph_domain = usg.text
                else:
                    paragraph_domain = None

                # Extract the text content from the paragraph element
                content = extract_text(paragraph)

                if i == 0:
                    content = entry_lemma + " " + content

                #entry_content += content + "\n\n"

                #if entry_content != "":
                row = [volume, entry_id, entry_lemma, id, 'mainEntry', i+1, paragraph_domain, content, num_usg_domain]
                #volume | entry | entry_lemma | paragraph | paragraph_domain | content
                data.append(row)
            

            for relatedEntry in entry.findall('.//entry[@type="relatedEntry"]', namespaces=root.nsmap):
                id += 1
            
                form = relatedEntry.find('.//form/orth', namespaces=root.nsmap)
                if form is not None:
                    paragraph_lemma = form.text
                else:
                    paragraph_lemma = None
                
                for i, paragraph in enumerate(relatedEntry.findall('.//p', namespaces=root.nsmap)):
                    usg = None
                    content = ""

                    form = paragraph.find('.//form/orth', namespaces=root.nsmap)
                    if form is not None:
                        paragraph_lemma = form.text
                    else:
                        paragraph_lemma = None
                    # get number of usg with type="domain" in the paragraph
                    num_usg_domain = len(paragraph.findall('.//usg[@type="domain"]', namespaces=root.nsmap))

                    usg = paragraph.find('.//usg[@type="domain"]', namespaces=root.nsmap)
                    if usg is not None:
                        paragraph_domain = usg.text
                    else:
                        paragraph_domain = None

                    # Extract the text content from the paragraph element
                    content = extract_text(paragraph)

                    if i == 0:
                        content = entry_lemma + " " + content

                    #relatedEntry_content += content + "\n\n"

                    row = [volume, entry_id, entry_lemma, id, 'relatedEntry', i+1, paragraph_domain, content, num_usg_domain]
                    #volume | entry | entry_lemma | subordinate | subordinate_domain | content
                    data.append(row)
                    
                #print(row)
        
    except etree.XMLSyntaxError as e:
        print(f"Erreur de syntaxe XML : {e}")


# convert data into a dataframe

df = pd.DataFrame(data, columns=['volume', 'entry', 'head', 'subEntryId', 'type', 'paragraphId', 'srcDomain', 'text', 'numUsgDomain'])
df.head()


Erreur de syntaxe XML : Document is empty, line 1, column 1 (.DS_Store, line 1)
TR1.tei
TR2.tei
TR3.tei
TR4.tei
TR5.tei
TR6.tei


Unnamed: 0,volume,entry,head,subEntryId,type,paragraphId,srcDomain,text,numUsgDomain
0,1,250000010,A,1,mainEntry,1,,A est la première Lettre de l'Alphabet Françoi...,0
1,1,250000010,A,1,mainEntry,2,,C'est inutilement que la plupart des Grammairi...,0
2,1,250000010,A,1,mainEntry,3,,"A se prononce du gozier, ce qui ne rend pas ce...",0
3,1,250000010,A,1,mainEntry,4,,Le son de l'a est ordinairement un son clair. ...,0
4,1,250000010,A,1,mainEntry,5,,Le son de l'a est un de ceux que les muets for...,0


In [None]:
### !!! CODE BELOW IS FOR SUBENTRY LEVEL (NOT PARAGRAPH LEVEL)
inputpath = os.path.join('..', 'data', '1743_LeRobert', 'tei')

data = []

for filename in sorted(os.listdir(inputpath)):
    try:
        parser = etree.XMLParser(collect_ids=False, encoding='utf-8') 
        root = etree.parse(os.path.join(inputpath, filename), parser=parser).getroot()    
        #print(root.nsmap)
        print(filename)
        volume = filename[2:3]

        for entry in root.findall('.//entry[@type="mainEntry"]', namespaces=root.nsmap):
            usg = None
            id = 1
            entry_id = entry.get('id')
            form = entry.find('.//form[@type="lemma"]/orth', namespaces=root.nsmap)
            if form is not None:
                entry_lemma = form.text
            else:
                print("Forme : non trouvée, entry:", entry_id)
                entry_lemma = None
            entry_content = ""
            for i, paragraph in enumerate(entry.findall('./p', namespaces=root.nsmap)):
                
                form = paragraph.find('.//form/orth', namespaces=root.nsmap)
                if form is not None:
                    paragraph_lemma = form.text
                else:
                    paragraph_lemma = None
                if usg is None:
                    usg = paragraph.find('.//usg[@type="domain"]', namespaces=root.nsmap)
                    if usg is not None:
                        paragraph_domain = usg.text
                    else:
                        paragraph_domain = None

                # Extract the text content from the paragraph element
                content = extract_text(paragraph)

                if i == 0:
                    content = entry_lemma + " " + content

                entry_content += content + "\n\n"

            if entry_content != "":
                row = [volume, entry_id, entry_lemma, id, 'mainEntry', paragraph_domain, entry_content]
                #volume | entry | entry_lemma | paragraph | paragraph_domain | content
                data.append(row)
                

            for relatedEntry in entry.findall('.//entry[@type="relatedEntry"]', namespaces=root.nsmap):
                id += 1
                usg = None
                relatedEntry_content = ""
                form = relatedEntry.find('.//form/orth', namespaces=root.nsmap)
                if form is not None:
                    paragraph_lemma = form.text
                else:
                    paragraph_lemma = None
                if usg is None:
                    usg = paragraph.find('.//usg[@type="domain"]', namespaces=root.nsmap)
                    if usg is not None:
                        paragraph_domain = usg.text
                    else:
                        paragraph_domain = None

                for i, paragraph in enumerate(relatedEntry.findall('.//p', namespaces=root.nsmap)):
                    
                    form = paragraph.find('.//form/orth', namespaces=root.nsmap)
                    if form is not None:
                        paragraph_lemma = form.text
                    else:
                        paragraph_lemma = None

                    usg = paragraph.find('.//usg[@type="domain"]', namespaces=root.nsmap)
                    if usg is not None:
                        paragraph_domain = usg.text
                    else:
                        paragraph_domain = None

                    # Extract the text content from the paragraph element
                    content = extract_text(paragraph)

                    if i == 0:
                        content = entry_lemma + " " + content

                    relatedEntry_content += content + "\n\n"

                row = [volume, entry_id, entry_lemma, id, 'relatedEntry', paragraph_domain, relatedEntry_content]
                #volume | entry | entry_lemma | subordinate | subordinate_domain | content
                data.append(row)
                    
                #print(row)
        
    except etree.XMLSyntaxError as e:
        print(f"Erreur de syntaxe XML : {e}")


# convert data into a dataframe

df = pd.DataFrame(data, columns=['volume', 'entry', 'head', 'subEntryId', 'type', 'srcDomain', 'text'])
df.head()


Erreur de syntaxe XML : Document is empty, line 1, column 1 (.DS_Store, line 1)
TR1.tei
TR2.tei
TR3.tei
TR4.tei
TR5.tei
TR6.tei


Unnamed: 0,volume,entry,head,subEntryId,type,srcDomain,text
0,1,250000010,A,1,mainEntry,,A est la première Lettre de l'Alphabet Françoi...
1,1,250000020,AAHUS,1,mainEntry,,AAHUS Aahusium. Ville de l'Evéché de Munster. ...
2,1,250000030,AAR,1,mainEntry,,"AAR ou AHR. subst. Aara, Abrinca. Rivière d'Al..."
3,1,250000040,AAR,1,mainEntry,,"AAR Arula ou Arola, & non pas Arosa, comme on ..."
4,1,250000050,AARBRER,1,mainEntry,,AARBRER Terme ancien qui n'est pas aujourd'hui...


In [17]:
df.shape

(81374, 8)

In [29]:
df.shape

(112053, 8)

In [31]:
df.head(70)

Unnamed: 0,volume,entry,head,subEntryId,type,paragraphId,srcDomain,text
0,1,250000010,A,1,mainEntry,1,,A est la première Lettre de l'Alphabet Françoi...
1,1,250000010,A,1,mainEntry,2,,C'est inutilement que la plupart des Grammairi...
2,1,250000010,A,1,mainEntry,3,,"A se prononce du gozier, ce qui ne rend pas ce..."
3,1,250000010,A,1,mainEntry,4,,Le son de l'a est ordinairement un son clair. ...
4,1,250000010,A,1,mainEntry,5,,Le son de l'a est un de ceux que les muets for...
...,...,...,...,...,...,...,...,...
65,1,250000150,ABADIR,1,mainEntry,4,,Priscien rapporte qu'Abaddir étoit aussi le no...
66,1,250000160,ABAEUZ,1,mainEntry,1,Terme de Coutume,ABAEUZ Terme de Coutume . Biens abaeuz. Bona v...
67,1,250000170,ABAJOUR,1,mainEntry,1,Terme d'Architecture,"ABAJOUR Terme d'Architecture , Spiraculum, esp..."
68,1,250000170,ABAJOUR,1,mainEntry,2,,Ce mot est composé du verbe abattre & du nom j...


In [53]:
df['book'] = 'DUFLT_1743'

#df.rename(columns={"entry_lemma": "head", "content": "text", "subordinate_domain":"src-domain"}, inplace=True)
df['numero'] = df.groupby('volume')['entry'].transform(lambda x: pd.factorize(x)[0] + 1)

df = df[['book', 'volume', 'numero', 'head', 'subEntryId',  'type', 'paragraphId','srcDomain', 'text', 'numUsgDomain']]


In [54]:
df.head(60)

Unnamed: 0,book,volume,numero,head,subEntryId,type,paragraphId,srcDomain,text,numUsgDomain
0,DUFLT_1743,1,1,A,1,mainEntry,1,,A est la première Lettre de l'Alphabet Françoi...,0
1,DUFLT_1743,1,1,A,1,mainEntry,2,,C'est inutilement que la plupart des Grammairi...,0
2,DUFLT_1743,1,1,A,1,mainEntry,3,,"A se prononce du gozier, ce qui ne rend pas ce...",0
3,DUFLT_1743,1,1,A,1,mainEntry,4,,Le son de l'a est ordinairement un son clair. ...,0
4,DUFLT_1743,1,1,A,1,mainEntry,5,,Le son de l'a est un de ceux que les muets for...,0
5,DUFLT_1743,1,1,A,1,mainEntry,6,,"A devant un e, avec lequel il fait une diphton...",0
6,DUFLT_1743,1,1,A,1,mainEntry,7,,"A devant un i, ou devant un y, avec lequel il ...",0
7,DUFLT_1743,1,1,A,1,mainEntry,8,,A devant o & ne faisant qu'une même syllabe av...,0
8,DUFLT_1743,1,1,A,1,mainEntry,9,,"A devant u se prononce comme un o, comme dans ...",0
9,DUFLT_1743,1,1,A,1,mainEntry,10,,A devant y a le meme son que devant i ; il fau...,0


In [60]:
df.to_csv(os.path.join('..', 'data', '1743_LeRobert', 'Trevoux1743_paragraphs_260210.tsv'), sep='\t', index=False, encoding='utf-8')
df.to_excel(os.path.join('..', 'data', '1743_LeRobert', 'Trevoux1743_paragraphs_260210.xlsx'), index=False)


In [57]:
df.shape

(112053, 10)