In [1]:
from nlppipe.nlppipe import NLPPipe
from factextraction.model.model import SyntacticIndex
import joblib 
from gensim.models import KeyedVectors
import pkg_resources
from glob import glob 
import re 
import os 
from multiprocessing.dummy import Pool
from tqdm import tqdm 
from bs4 import BeautifulSoup 
import json
import copy
from itertools import groupby
%load_ext autoreload
%autoreload 2

# Semantic Annotation

In [2]:
# Resources handed to the fact extractor. Everything is read from disk
# relative to the current working directory.
factextractor_config = {
    "extractor": SyntacticIndex('data/EMONTAL-facts'),
    # NOTE(review): joblib.load unpickles the file; only load classifier files you trust.
    "classifier": joblib.load('data/EMONTAL-facts/model/mlClassifier/clf.joblib'),
    # Word embeddings, opened memory-mapped in read-only mode (mmap='r').
    'we': KeyedVectors.load('data/embeddings/conceptnet_fr-19.08-clean_simpler', mmap='r')
}
# Directory of the HeidelTime standalone distribution, resolved through pkg_resources.
heidelpath = pkg_resources.resource_filename(__name__, f"heideltimetagger/heideltime-standalone")


# heidelpath = '/Volumes/T7/THESE/EMONTAL/semanticannotation/main/heideltimetagger/heideltime-standalone'
# HeidelTime settings; the config file and the jar are both looked up under `heidelpath`.
heideltime_config = {
    "heidelpath": heidelpath,
    "filepath": '',
    "lg": 'FRENCH',
    "dct": '',
    "doctype": 'NEWS',
    "configpath": f'{heidelpath}/config.props',
    "heideltime": f'{heidelpath}/de.unihd.dbs.heideltime.standalone.jar',
    "infer_day": True
}

# Pipeline object used by every later cell of the notebook.
nlppipe = NLPPipe(model='fr_dep_news_trf', heideltime_config=heideltime_config, factextractor_config=factextractor_config)

2024-02-05 11:01:58 gensim.utils INFO: loading KeyedVectors object from data/embeddings/conceptnet_fr-19.08-clean_simpler
2024-02-05 11:01:58 gensim.utils INFO: loading vectors from data/embeddings/conceptnet_fr-19.08-clean_simpler.vectors.npy with mmap=r
2024-02-05 11:01:59 gensim.utils INFO: KeyedVectors lifecycle event {'fname': 'data/embeddings/conceptnet_fr-19.08-clean_simpler', 'datetime': '2024-02-05T11:01:59.001419', 'gensim': '4.3.2', 'python': '3.11.7 (main, Dec 15 2023, 12:09:04) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
  from .autonotebook import tqdm as notebook_tqdm


Available components:  ['transformer', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer']


In [29]:
import pandas as pd

# Load the semantic index; the first CSV column becomes the row index.
# NOTE(review): absolute path on an external volume — not portable to another machine.
df = pd.read_csv('/Volumes/T7/THESE/EMONTAL/semanticannotation/main/data/EMONTAL-facts/model/semanticIndex/index/semanticIndex.csv', index_col=0)
# df.set_index('Unnamed: 0', inplace=True)
# Show the per-relation scores of two anchors.
df.loc[['naître_VERB', '12_NUM']]

Unnamed: 0,occupation,capitalOf,country,placeOfBirth,nextInBodyWater,spouse,memberOf,sharesBordersWith,dateOfBirth,dateOfDeath,inception,headOfGoverment,educatedAt
naître_VERB,0.004283,0.0,0.0,0.461793,0.0,0.002059,0.002322,0.0,0.74886,0.475305,0.001916,0.002072,0.002113
12_NUM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.632992,0.602644,0.485944,0.0,0.0


In [35]:
# Harmonic mean of two hand-entered scores.
# NOTE(review): `stats` is only imported by a later cell (`from scipy import stats`),
# so this cell fails when the notebook is run top to bottom on a fresh kernel.
stats.hmean([0.706, 0.602])

0.6498654434250764

In [34]:
from scipy import stats

# Restrict the index to two anchors and the two date relations, then combine
# the anchor scores of each relation (column-wise) with a harmonic mean.
anchors = ['naître_VERB', '12_NUM']
date_relations = ['dateOfBirth', 'dateOfDeath']
sel = df.loc[anchors, date_relations]
sel.apply(stats.hmean, axis=0)

dateOfBirth    0.686068
dateOfDeath    0.531453
dtype: float64

In [20]:
import pyvis
from pyvis.network import Network

# Draw the first graph stored under "naître_VERB" in the syntactic index.
n = Network(directed=True, notebook=True)

g = nlppipe.factextractor.extractor.syntacticIndex["naître_VERB"][0]['graph']

# One pyvis node per graph node, labelled with the node's 'text' attribute.
for node_id, node_attrs in g.nodes(data=True):
    n.add_node(node_id, label=node_attrs['text'])

# One directed edge per graph edge, labelled with its 'dep' attribute.
for edge in g.edges(data=True):
    print(edge)
    source, target = edge[0], edge[1]
    edge_attrs = edge[-1]
    n.add_edge(source, target, label=edge_attrs['dep'], physics=True)

n.toggle_physics(True)
n.show('naitre.html')

(10, 7, {'dep': 'nsubj', 'width': 1})
(10, 22, {'dep': 'obl:mod', 'width': 1})
naitre.html


In [32]:
# Read the JSON corpus and send the 'sent' field of every entry through the pipeline.
with open('data/emontal-ner-er/corpus/graph_1.json', 'r', encoding='utf-8') as handle:
    data = json.load(handle)
corpus = [entry['sent'] for entry in data['content']]
proc_corpus = nlppipe.process_corpus(corpus)

In [59]:

# For every processed document, write its REL spans (ents.json) and the
# relations attached to them (rels.json) under graph_data_emontal/<doc index>/.
for i, doc in enumerate(proc_corpus):
    os.makedirs(f'graph_data_emontal/{i}', exist_ok=True)

    ents = []
    rels = []
    tmp_rels = []
    for x, ent in enumerate(doc.spans['REL']):
        # One entity record per REL span; the id combines doc index, span index and type.
        ents.append(
            {   
                'id': f"{i}_{x}_{ent._.ENT_type}",
                'text': ent.text,
                'type': ent._.ENT_type
             }
        )

        for rel in ent._.ENT_relations:
            # Shallow copy so the keys added below do not end up in the
            # dict stored on the span itself.
            y = copy.copy(rel)
            y['ent'] = ent.text
            y['ent_type'] = ent._.ENT_type
            y['ent_id'] = f"{i}_{x}_{ent._.ENT_type}"
            tmp_rels.append(y)

    # groupby only merges consecutive items, hence the sort on the same key first.
    tmp_rels.sort(key=lambda x: x['id'])
    for k, group in groupby(tmp_rels, key=lambda x: x['id']):
        # Merge all copies sharing a relation id into one record: the j-th
        # member supplies ent_<j+1>*, while the shared fields are rewritten by
        # each member, so the last one wins.
        rel = {}
        for j, g in enumerate(group):
            rel['id'] = f"{i}_{g['id']}"
            rel['relation'] = g['relation']
            rel['score'] = g['score']
            rel['rule'] = g['rule']
            rel['anchor'] = g['anchor']
            rel['nodes'] = g['nodes']
            rel[f'ent_{j+1}'] = g['ent']
            rel[f'ent_{j+1}_type'] = g['ent_type']
            rel[f'ent_{j+1}_id'] = g['ent_id']
            rel['sent'] = doc.text
        rels.append(rel)

    with open(f'graph_data_emontal/{i}/ents.json', 'w', encoding='utf-8') as f:
        json.dump(ents, f, indent=4)
    
    with open(f'graph_data_emontal/{i}/rels.json', 'w', encoding='utf-8') as f:
        json.dump(rels, f, indent=4)
    print(ents)
    print(rels)
    
    # print()
    # print(ents)

[]
[]
[{'id': '1_0_Q5', 'text': 'Bernard', 'type': 'Q5'}, {'id': '1_1_Misc', 'text': 'homme', 'type': 'Misc'}]
[{'id': '1_1_occupation', 'relation': 'occupation', 'score': 1.0, 'rule': 'semantic', 'anchor': 'homme_NOUN', 'nodes': [0, 3], 'ent_1': 'Bernard', 'ent_1_type': 'Q5', 'ent_1_id': '1_0_Q5', 'sent': 'Bernard était un homme de cinq pieds neuf pouces, d’une cinquantaine d’années, dont la taille était droite et très menue, le port roide et assuré, la physionomie d’une imperturbable austérité que n’avait jamais égarée un sourire.', 'ent_2': 'homme', 'ent_2_type': 'Misc', 'ent_2_id': '1_1_Misc'}, {'id': '1_2_occupation', 'relation': 'occupation', 'score': 1.0, 'rule': 'semantic', 'anchor': 'homme_NOUN', 'nodes': [0, 3], 'ent_1': 'Bernard', 'ent_1_type': 'Q5', 'ent_1_id': '1_0_Q5', 'sent': 'Bernard était un homme de cinq pieds neuf pouces, d’une cinquantaine d’années, dont la taille était droite et très menue, le port roide et assuré, la physionomie d’une imperturbable austérité que n

In [31]:


# Flatten the relations of every REL span into self-contained records
# (relation fields + span text, type, offsets and sentence).
# NOTE(review): `all_rels` is never filled — the `extend` call further down
# is commented out — so only the last document's `rels` survives the loop.
all_rels = []
for doc in proc_corpus:
    rels = []
    for rel in doc.spans['REL']:
        rel_data = rel._.ENT_relations
        print(rel)
        print(rel_data)

        for x in rel_data:
            # Shallow copy so the span's own relation dict is left untouched.
            y = copy.copy(x)
            y['text'] = rel.text
            y['type'] = rel._.ENT_type
            y['root_node'] = rel._.ENT_root_node
            # Fix: the original lines ended with a stray comma, which stored
            # 1-tuples such as (3,) instead of the plain offsets.
            y['start'] = rel._.ENT_start
            y['end'] = rel._.ENT_end
            y['char_start'] = rel._.ENT_charstart
            y['char_end'] = rel._.ENT_charend
            y['sentence'] = doc.text
            rels.append(y)
        # rels.extend(rel_data)
        # print(data)
    # for x in rels:
    #     print(x)
    # print(rels)
    # all_rels.extend(rels)
    # print()
            
    # rels.sort(key=lambda x: x['id'])
    # for k, group in groupby(rels, key=lambda x: x['id']):
    #     print(k)
    #     for g in group:
    #         print(g)
            
# for x in all_rels:
#     print(x)



Bernard
[{'id': '1_occupation', 'relation': 'occupation', 'score': 1.0, 'rule': 'semantic', 'anchor': 'homme_NOUN', 'nodes': [0, 3], 'text': 'homme', 'type': 'Misc', 'root_node': 3, 'start': (3,), 'end': 4, 'char_start': (17,), 'char_end': 22}, {'id': '2_occupation', 'relation': 'occupation', 'score': 1.0, 'rule': 'semantic', 'anchor': 'homme_NOUN', 'nodes': [0, 3], 'text': 'homme', 'type': 'Misc', 'root_node': 3, 'start': (3,), 'end': 4, 'char_start': (17,), 'char_end': 22}]
homme
[{'id': '1_occupation', 'relation': 'occupation', 'score': 1.0, 'rule': 'semantic', 'anchor': 'homme_NOUN', 'nodes': [0, 3], 'text': 'homme', 'type': 'Misc', 'root_node': 3, 'start': (3,), 'end': 4, 'char_start': (17,), 'char_end': 22}, {'id': '2_occupation', 'relation': 'occupation', 'score': 1.0, 'rule': 'semantic', 'anchor': 'homme_NOUN', 'nodes': [0, 3], 'text': 'homme', 'type': 'Misc', 'root_node': 3, 'start': (3,), 'end': 4, 'char_start': (17,), 'char_end': 22}]
Giromagny
[{'id': '1_educatedAt', 'relat

In [4]:
all_files = glob('/Volumes/T7/THESE/CORPUS/**/**_docbook.xml', recursive=True)
len(all_files)

8856

In [30]:
final_files = glob('/Volumes/T7/THESE/CORPUS/**/**_docbook-final.xml', recursive=True)
len(final_files)

6136

In [4]:
# emontal corpus
def filterPath(filepath:str, file_list):
    """Tell whether a DocBook file is absent from a list of known paths.

    The ``_docbook.xml`` suffix is removed from ``filepath`` and the result
    is looked up in ``file_list``.

    :param filepath: path of a ``*_docbook.xml`` file
    :param file_list: container of paths written without that suffix
    :return: ``False`` when the stripped path is in ``file_list``, ``True`` otherwise
    """
    stem = filepath.replace('_docbook.xml', '')
    return stem not in file_list

# Paths (without the ".spacy" extension) of every corpus already serialized.
spacy_files = glob('/Volumes/T7/THESE/CORPUS/**/**.spacy', recursive=True)
spacy_files = [x.replace('.spacy', '') for x in spacy_files]
# Set copy for the lookups below: constant-time membership instead of
# scanning the whole list once per DocBook file.
processed_stems = set(spacy_files)

# DocBook files that have no serialized counterpart yet. Built as a list,
# like the other file-selection cell: a bare filter() object is a one-shot
# iterator on which len(all_files) fails.
all_files = glob('/Volumes/T7/THESE/CORPUS/**/**_docbook.xml', recursive=True)
all_files = [x for x in all_files if filterPath(x, processed_stems)]

In [7]:
# lla dataset
def filterPath(filepath:str, file_list):
    """Tell whether a DocBook file's identifier is absent from ``file_list``.

    Only the last ``/``-separated component of ``filepath`` is considered,
    with its ``_docbook.xml`` suffix removed.

    :param filepath: path of a ``*_docbook.xml`` file
    :param file_list: container of bare document identifiers
    :return: ``False`` when the identifier is in ``file_list``, ``True`` otherwise
    """
    basename = filepath.rsplit('/', 1)[-1]
    document_id = basename.replace('_docbook.xml', '')
    return document_id not in file_list

# Identifiers (file name without ".xml") of the documents in the LLA test split.
lladataset = glob('/Volumes/T7/THESE/CORPUS/Logical-Layout-Analysis-Dataset/test/**/bpt**.xml', recursive=True)
lla_filenames = [path.split('/')[-1].replace('.xml', '') for path in lladataset]

# Keep only the DocBook files whose identifier belongs to that split:
# filterPath() answers False for listed names, hence the negation.
all_files = [
    path
    for path in glob('/Volumes/T7/THESE/CORPUS/**/**_docbook.xml', recursive=True)
    if not filterPath(path, lla_filenames)
]


In [8]:
len(all_files)

7

In [4]:
# Opening <sent>/<title> tag, its raw attribute string, and the text that
# follows it up to the next "<".
# Fix: the text class used to be [^</], which also stopped at the first "/"
# and truncated sentences such as "1/2 kilo" or any URL.
sent_re = re.compile(r'<(sent|title)([^>]*)>([^<]*)')
# Value of the id="..." attribute. The look-behind rejects attributes whose
# name merely ends in "id" (block_id="..."), which the old pattern matched too.
id_re = re.compile(r'(?<![\w-])id="([^"]+)"')

def extractData(file):
    """Extract the text and id of every ``<sent>``/``<title>`` tag.

    Works on the raw markup with regular expressions; only the text located
    between the opening tag and the next ``<`` is returned.

    :param file: content of a DocBook file, as one string
    :return: ``(data, metadata)`` — two parallel lists: the texts, and one
        ``{"tag_id": ..., "tag_type": ...}`` dict per text
    """
    data, metadata = [], []
    for tag_type, attributes, sent in sent_re.findall(file):
        # Tags without any attribute (e.g. a bare <title>) are ignored.
        if not attributes:
            continue
        id_match = id_re.search(attributes)
        if id_match is None:
            # Attributes but no id: there is nothing to key the text on, so
            # skip the tag instead of failing with AttributeError.
            continue
        data.append(sent)
        metadata.append({"tag_id": id_match.group(1), "tag_type": tag_type})

    return data, metadata

In [29]:

def processFiles(filepath):
    """Run the NLP pipeline over the sentences and titles of one DocBook file.

    :param filepath: path of a ``*_docbook.xml`` file
    :return: the value returned by ``nlppipe.process_corpus`` for the
        extracted texts
    """
    print(f'Processing {filepath}')

    # Path without the "_docbook.xml" suffix; only used in the final log line.
    stem = filepath.replace('_docbook.xml', '')

    with open(filepath, encoding='utf-8') as handle:
        raw_xml = handle.read()

    sentences, metadata = extractData(raw_xml)
    processed = nlppipe.process_corpus(corpus=sentences, list_metadata=metadata, thresh=0, fuzzyMatch=False)

    print(f'Processing {stem} done !')

    return processed

# n_core = 2
# with Pool(n_core) as p :
#     p.map(processFiles, all_files)


In [41]:
# Process every selected file and dump each REL span, with its relations,
# as one JSON file under test/<document id>/.
os.makedirs('test', exist_ok=True)

for filepath in all_files:
    filename = os.path.basename(filepath).replace('_docbook.xml', '')
    os.makedirs(f"test/{filename}", exist_ok=True)
    proc = processFiles(filepath)

    for doc_idx, doc in enumerate(proc):
        for i, rel in enumerate(doc.spans['REL']):
            data = {
                'text': rel.text,
                'type': rel._.ENT_type,
                'root_node': rel._.ENT_root_node,
                'start': rel._.ENT_start,
                'end': rel._.ENT_end,
                'char_start': rel._.ENT_charstart,
                'char_end': rel._.ENT_charend,
                'relations': rel._.ENT_relations
            }
            # Fix: `i` restarts at 0 for every doc, so naming the file after
            # `i` alone made each doc overwrite the files of the previous
            # ones. The doc index keeps the names unique.
            with open(f'test/{filename}/{doc_idx}_{i}.json', 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)


Processing /Volumes/T7/THESE/CORPUS/fond_comtois/cb328152109/bpt6k65474640/bpt6k65474640_docbook.xml
Processing /Volumes/T7/THESE/CORPUS/fond_comtois/cb328152109/bpt6k65474640/bpt6k65474640 done !
Processing /Volumes/T7/THESE/CORPUS/fond_comtois/cb328362537/bpt6k927011z/bpt6k927011z_docbook.xml


KeyboardInterrupt: 

In [28]:
len(proc)

588

oui, les voilà bien, les vrais bénéfices de guerre, que nul ne saurait vous ravir, et qui constituent la richesse de vos âmes, par l'offrande que vous faites à Dieu de vos fatigues, de vos souffrances, et, s'il le fallait même, de votre vie.
Bonnes nouvelles reçues des anciens membres du groupe : Lamboley, Naillod, Moyse, Jodon, Picon,
Alexandre Blondeau et Benjamin Bourgeois ont fait emplette : le premier d'un petit Pierre, le second d'une Bernadette.
C'est digne d'un membre de la J. C , et surtout du groupe de Notre-Dame de Guyans.
Il nous fait hon neur et nous sert de modèle : trente-trois communions pendant six mois sur le front.
Après avoir énormément souffert des brûlures occasionnées par les gaz vésicants, Henri Bonnet va mieux.
C'est sur l'emplacement du château des seigneurs de Montmahoux qu'a été érigée la croix monumentale autour de laquelle se grouperont, dans un pèlerinage de réparation et d'actions de grâces, les membres de la J. C de notre canton rescapés de la Grande Gu

In [6]:
spacy_files = glob('/Volumes/T7/THESE/CORPUS/**/**.spacy', recursive=True)
spacy_files = [x for x in spacy_files if 'backup' not in x]
len(spacy_files)

8856

In [8]:
len(list_done)

8859

In [7]:
from tqdm import tqdm

# done.txt holds one processed .spacy path per line.
# Fix: split('\n') returned an extra empty entry for the final newline, and
# that entry was written back as a blank line, so the list grew by one on
# every read/write cycle. Blank lines are now dropped.
with open('done.txt', 'r', encoding='utf-8') as f:
    list_done = [line for line in f.read().splitlines() if line]

# Set copy for the filter below: constant-time membership instead of a list
# scan per file. `list_done` stays a list because relation() appends to it.
done_paths = set(list_done)
spacy_files = [x for x in spacy_files if x not in done_paths]
print(len(spacy_files))

def relation(spacy_file):
    """Re-run fact extraction on a serialized corpus and save it back in place.

    :param spacy_file: path of the ``.spacy`` file to load, update and overwrite
    """
    print(spacy_file)

    docs = nlppipe.load_from_disk(spacy_file)
    for single_doc in docs:
        nlppipe.factExtraction(single_doc, thresh=0, fuzzyMatch=True)
    nlppipe.save2disk(docs, savepath=spacy_file)

    print(f'{spacy_file} done')

    # Remember the path in the notebook-level list of finished files.
    list_done.append(spacy_file)
    # with open(f"{filepath[0]}/done.txt", 'w', encoding='utf-8') as f:
    #     f.write('done')

# func_rel = relation
# for x in tqdm(spacy_files):
#     func_rel(x)

# relation(spacy_files[1])

# `Pool` is imported from multiprocessing.dummy, i.e. a pool of threads:
# the workers all use the same `nlppipe` and `list_done` objects.
n_core = 3
with Pool(n_core) as p :
    p.map(relation, spacy_files)

0


In [6]:
# Persist the list of processed paths, one per line.
with open('done.txt', 'w', encoding='utf-8') as out:
    out.writelines(f"{line}\n" for line in list_done)

In [14]:
doc[i].user_data

{'tag_id': 'sent_1',
 'tag_type': 'sent',
 ('._.', 'TIMEX_type', 32, 36): 'DATE',
 ('._.', 'TIMEX_value', 32, 36): '1937',
 ('._.', 'TIMEX_span', 32, 36): (32, 36)}

## Conversion to XML DocBook

In [9]:
# Serialized corpora and DocBook sources of the "fond*" collections,
# leaving out every path that mentions "backup".
spacy_files = [
    path
    for path in glob('/Volumes/T7/THESE/CORPUS/fond**/**/**.spacy', recursive=True)
    if 'backup' not in path
]
print(len(spacy_files))

docbook_files = [
    path
    for path in glob('/Volumes/T7/THESE/CORPUS/fond**/**/**docbook.xml', recursive=True)
    if 'backup' not in path
]

print(len(docbook_files))

8769
8769


In [10]:
# all_ents = [x for x in doc[i].ents] + [x for x in doc[i].spans['TIMEX']] 

# rel = [x for x in doc[i].spans['REL']]
def getAllEnts(doc):
    """Collect the spans of ``doc`` to tag, merging nested ones.

    The candidates are ``doc.ents`` followed by ``doc.spans['TIMEX']`` and
    ``doc.spans['REL']``. Their labels are rewritten in place on the span
    objects (``Q5`` -> ``pers``, ``Property:P625`` -> ``loc``,
    ``TIMEX`` -> ``time``, ``Misc`` -> ``misc``).

    Nesting is resolved on character offsets, and only against spans that
    carry relations (truthy ``_.ENT_relations``): the enclosing span is kept,
    and when the kept span is the current candidate it takes over the
    relations of the span it encloses.

    :param doc: document exposing ``ents`` and the ``TIMEX``/``REL`` span groups
    :return: list of the retained spans
    """
    
    all_ents = [x for x in doc.ents] + [x for x in doc.spans['TIMEX']] + [x for x in doc.spans['REL']]

    # Normalise labels; this modifies the spans themselves, not copies.
    for x in all_ents:
        if x.label_ == 'Q5':
            x.label_ = 'pers'
        elif x.label_ == 'Property:P625':
            x.label_ = 'loc'
        elif x.label_ == 'TIMEX':
            x.label_ = 'time'
        elif x.label_ == 'Misc':
            x.label_ = 'misc'

    new_list = []

    # Consume the candidates one by one until none is left.
    while True:

        if not all_ents:
            break
        else:
            candidate = all_ents.pop(0)
            # Spans made redundant by the candidate kept in this round.
            filter_list = []

            for y in all_ents:
                rel = y._.ENT_relations
                # Spans without relations never replace or get absorbed.
                if rel:
                    # candidate lies within y: y becomes the candidate
                    if (candidate.start_char >= y.start_char) & (candidate.end_char <= y.end_char):
                        
                        filter_list.append(candidate)
                        # NOTE(review): y is not removed from all_ents here, so it is
                        # popped again later; the `not in new_list` test below is what
                        # keeps it from being added twice.
                        candidate = y

                    # y lies within candidate: keep candidate, which takes y's relations
                    elif (y.start_char >= candidate.start_char) & (y.end_char <= candidate.end_char):
                        candidate._.ENT_relations = rel

                        filter_list.append(y)
            if candidate not in new_list:
                new_list.append(candidate)
            # Drop the absorbed spans before the next round.
            all_ents = [x for x in all_ents if x not in filter_list]
    return new_list


# for x in ent_list:
#     print(x)
#     print(x._.ENT_relations)
#     print()


In [11]:
def compareEnt(c, x):
    """Return ``True`` when both entities share the same text and the same label.

    :param c: first entity (needs ``text`` and ``label_``)
    :param x: second entity (needs ``text`` and ``label_``)
    :return: ``True`` if text and label are both equal, ``False`` otherwise
    """
    return bool(c.text == x.text and c.label_ == x.label_)

# def setEntId(list_ent, base_id = ''):

#     i = 1

#     while True:
#         if not list_ent:
#             break 
#         else: 
#             # c = list_ent.pop(0)
#             c = list_ent[0]
#             # print('c', c)
#             # condition
#             identicals = [x for x in list_ent[1:] if compareEnt(c, x)]
#             identicals = [c] + identicals
#             # print('identicals', identicals)

#             for x in identicals:
#                 # mapping[x[0]] = f"{x[1].label_}_{i}"
#                 x._.ENT_id = f"{base_id}{x.label_}_{i}"

#             # filtering
#             list_ent = [x for x in list_ent if x not in identicals]
#             i += 1

def setEntId(list_ent):
    """Give every entity a ``<label>_<position>`` id, positions starting at 1.

    The id is stored on ``ent._.ENT_id``; nothing is returned.

    :param list_ent: entities to number, in the order the ids should follow
    """
    for position, entity in enumerate(list_ent, start=1):
        entity._.ENT_id = f"{entity.label_}_{position}"


In [12]:
def assignIdtoArticleEnts(article, doc):
    """Number the entities found in the sentences and titles of one article.

    :param article: parsed markup element whose ``sent``/``title`` tags are read
    :param doc: processed documents, each carrying ``user_data['tag_id']``
    """
    # Ids of the <sent>/<title> tags of the article: entities were extracted
    # from the text of those tags.
    tag_ids = [tag['id'] for tag in article.find_all(['sent', 'title'])]

    # Documents that were built from one of those tags.
    matching_docs = [single for single in doc if single.user_data['tag_id'] in tag_ids]

    # Gather their entities in document order, then number them in one pass.
    article_ents = []
    for single in matching_docs:
        article_ents.extend(getAllEnts(single))

    setEntId(article_ents)


In [13]:

def taggifyEntities(doc, id_start):
    """Rewrite the text of ``doc`` with every entity wrapped in an ``<ent>`` tag.

    Each entity returned by ``getAllEnts`` becomes
    ``<ent id="<label>_<n>" type="<label>">...</ent>``; its relations, if
    any, are emitted as ``<rel .../>`` elements placed before the entity
    text. Text outside the entities is copied unchanged.

    :param doc: processed document exposing ``text`` and the spans read by
        ``getAllEnts``
    :param id_start: number of entities already tagged in earlier documents;
        numbering continues from it
    :return: ``(new_text, id_start)`` — the tagged text and the counter
        increased by the number of entities tagged here. When the document
        has no entity, the text and the counter come back unchanged.
    """
    text = doc.text

    # Entities are inserted from left to right.
    all_ents = getAllEnts(doc)
    all_ents.sort(key=lambda x: x.start_char)

    if not all_ents:
        return text, id_start

    pieces = []
    # End offset of the last entity written; text before it is already copied.
    start_position = 0

    for i, ent in enumerate(all_ents):
        label = ent.label_
        ent_id = f"{label}_{i + 1 + id_start}"

        # NOTE(review): getAllEnts renames the 'TIMEX' label to 'time' before
        # this point, so this branch may never trigger — confirm the intent.
        if label == 'TIMEX':
            label = ent._.TIMEX_type

        # Plain text between the previous entity and this one.
        pieces.append(text[start_position:ent.start_char])

        new_tag = f'<ent id="{ent_id}" type="{label}">\n'
        for rel in ent._.ENT_relations or []:
            rel_tag = f"<rel id=\"{rel['id']}\" type=\"{rel['relation']}\" score=\"{rel['score']}\" rule=\"{rel['rule']}\" anchor=\"{rel['anchor']}\"/>"
            new_tag += f"{rel_tag}\n"
        new_tag += f'{ent.text}</ent>'
        pieces.append(new_tag)

        start_position = ent.end_char

    # Fix: the text following the last entity used to be dropped, which
    # truncated every tagged sentence right after its final </ent>.
    pieces.append(text[start_position:])

    return ''.join(pieces), id_start + len(all_ents)


In [14]:
import cchardet

# def procDocBook(docbook_file, spacy_file):
def procDocBook(tuple_file):
    """Inject the entity tags of a serialized corpus into its DocBook file.

    Every document loaded from the ``.spacy`` file is matched with the
    markup element carrying the same id, and the text of that element is
    replaced by the tagged text. The result is written next to the source
    as ``*-final.xml``.

    :param tuple_file: ``(docbook_file, spacy_file)`` pair of paths
    """
    docbook_file, spacy_file = tuple_file[0], tuple_file[1]

    # NOTE(review): 'lxml' is Beautiful Soup's HTML parser; the printed output
    # kept in this notebook starts with <html><body>, which suggests the
    # wrapper ends up in the written file. Consider the XML parser
    # ('lxml-xml') after checking it on the corpus.
    with open(docbook_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    doc = nlppipe.load_from_disk(spacy_file)

    print(docbook_file)

    # Running entity counter, so that ids are unique across the whole file.
    id_start = 0

    for d in doc:
        d_id = d.user_data['tag_id']
        tag = soup.find(id=d_id)
        # Fix: a missing element, or an element without a single text child
        # (tag.string is None), used to raise AttributeError and abort the
        # whole pool run. Such documents are now reported and skipped.
        if tag is None or tag.string is None:
            print(f'{docbook_file}: no replaceable text for id "{d_id}", skipped')
            continue
        new_text, id_start = taggifyEntities(d, id_start)
        # replace_with is the current name of the deprecated replaceWith alias.
        tag.string.replace_with(new_text)

    new_file = docbook_file.replace('.xml', '-final.xml')

    # formatter=None writes the inserted <ent>/<rel> markup without escaping it.
    with open(new_file, 'w', encoding='utf-8') as f:
        f.write(soup.prettify(formatter=None))

    print(docbook_file, 'done !')

    # print(soup.prettify(formatter=None))
    # return soup_text
    
    # break

In [15]:
# Outputs already written by a previous run.
list_final = glob('/Volumes/T7/THESE/CORPUS/fond**/**/**docbook-final.xml', recursive=True)
final_done = set(list_final)
available_spacy = set(spacy_files)

# Fix: the pairs used to come from zip(docbook_files, spacy_files), i.e. from
# the positions in two independent glob() results whose order is not
# guaranteed. Each DocBook file is now paired with the .spacy file that has
# the same stem ("<stem>_docbook.xml" <-> "<stem>.spacy", the same
# correspondence the file-selection cell relies on).
proc = []
for x in docbook_files:
    y = x.replace('_docbook.xml', '.spacy')
    if y not in available_spacy:
        # No serialized corpus for this file: nothing to inject.
        continue
    final = x.replace('.xml', '-final.xml')
    if final not in final_done:
        proc.append((x, y))
len(proc)

3746

In [21]:
# NOTE(review): this replaces the filtered `proc` list built in the previous
# cell with every (docbook, spacy) pair, so files that already have a
# "-final.xml" output are processed again. zip() pairs the two lists purely
# by position and returns an iterator that can be consumed only once.
proc = zip(docbook_files, spacy_files)

In [16]:
# `Pool` is imported from multiprocessing.dummy (thread pool): six worker
# threads call procDocBook on the (docbook, spacy) pairs.
with Pool(6) as p:
    p.map(procDocBook, proc)

/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb42746557k/bpt6k929028h/bpt6k929028h_docbook.xml
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb42746557k/bpt6k929028h/bpt6k929028h_docbook.xml done !
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb34459439z/bpt6k9801280s/bpt6k9801280s_docbook.xml
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb42746557k/bpt6k929029w/bpt6k929029w_docbook.xml
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb42746557k/bpt6k929029w/bpt6k929029w_docbook.xml done !
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb344175853/bpt6k65283452/bpt6k65283452_docbook.xml
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb34459439z/bpt6k9801280s/bpt6k9801280s_docbook.xml done !
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb344175853/bpt6k65283452/bpt6k65283452_docbook.xml done !
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb42746557k/bpt6k929030t/bpt6k929030t_docbook.xml
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb42746557k/bpt6k929030t/bpt6k929030t_docbook.xml done !
/Volumes/T7/THESE/CORPUS/fond_bourgogne/cb344175853/bpt6k

2024-01-16 20:16:34 root ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/g9/npfr2mks4118dkv7g8ckgccr0000gn/T/ipykernel_98554/2186805975.py", line 2, in <module>
    p.map(procDocBook, proc)
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/multiprocessing/pool.py", line 364, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/multiprocessing/pool.py", line 765, in get
    self.wait(timeout)
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/multiprocessing/pool.py", line 762, in wait
    self._event.wait(timeout)
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/threading.py", line 574, in wait
    signaled = self._cond.wait(timeout)
  File "/Users/nicola

2024-01-16 20:16:36 root INFO: 
Unfortunately, your original traceback can not be constructed.




During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/site-packages/IPython/core/ultratb.py", line 248, in wrapped
    return f(*args, **kwargs)
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/site-packages/IPython/core/ultratb.py", line 281, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/inspect.py", line 1541, in getinnerframes
    frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
  File "/Users/nicolasgutehrle/opt/anaconda3/envs/corpus/lib/python3.9/inspect.py", line 1499, in getframeinfo
    filena

/Volumes/T7/THESE/CORPUS/fond_comtois/cb32778245r/bpt6k947572f/bpt6k947572f_docbook.xml done !


In [99]:
print()

<?xml version="1.0" encoding="utf-8"?>
<html>
 <body>
  <document>
   <metadata>
    <ark>
     bpt6k97105757
    </ark>
    <identifier>
     https://gallica.bnf.fr/ark:/12148/bpt6k97105757
    </identifier>
    <date>
     1937
    </date>
    <title>
     Annales d'Igé en Mâconnais : recueil de documents et de matériaux pour servir à l'histoire de cette commune
    </title>
    <contributor>
     Morgand, Arthur Alfred (1897-1979). Préfacier
    </contributor>
    <publisher>
     (Mâcon)
    </publisher>
    <languague>
    </languague>
    <creator>
     Jacquet, Pierre (1875-1933). Auteur du texte
    </creator>
    <source>
     Bibliothèque municipale de Dijon, 2016-113718
    </source>
    <typedoc>
     fascicule
    </typedoc>
    <nqamoyen>
     99.02
    </nqamoyen>
    <dewey>
     9
    </dewey>
    <image_url>
     https://gallica.bnf.fr/ark:/12148/bpt6k97105757/highres
    </image_url>
   </metadata>
   <content>
    <page id="5">
     <header block_id="PAG_00000005_TB

In [100]:
# Dump the parsed document to test.xml; formatter=None writes the markup unescaped.
# NOTE(review): no notebook-level `soup` is defined in the visible cells (it is
# local to procDocBook), so this cell relies on leftover kernel state.
with open('test.xml', 'w', encoding='utf-8') as f:
    f.write(soup.prettify(formatter=None))