# HTR performed on LTK-290

In [1]:
import html
import os
import re
import xml.etree.ElementTree as ET
from collections import Counter
from os.path import join

import nltk
import pandas as pd
import stanza
from nltk import word_tokenize, sent_tokenize
from tqdm import tqdm

# Prefix map for ElementTree path lookups: 'a' stands for the ALTO v4 XML namespace.
ns = dict(a='http://www.loc.gov/standards/alto/ns-v4#')

In [2]:
# Accumulator for the recognised text lines of all pages (extended page by page further down).
full_text = list()

In [3]:
def get_text(path, namespaces=None):
    """Read one ALTO XML file and return its text lines with their bounding boxes.

    Parameters
    ----------
    path : str
        Location of the ALTO XML file (read as UTF-8).
    namespaces : dict, optional
        Prefix-to-URI map used for the element lookups; the prefix ``a`` must
        point at the ALTO namespace. Defaults to the notebook-level ``ns``.

    Returns
    -------
    list of tuple
        One ``(content, hpos, vpos, width, height)`` tuple per ``TextLine``
        that has a ``String`` child, in document order. ``content`` is the
        ``CONTENT`` attribute of the first ``String``; the four coordinates
        are integers taken from the ``TextLine`` attributes.

    Raises
    ------
    OSError
        If the file cannot be opened.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    TypeError
        If a ``TextLine`` lacks one of HPOS/VPOS/WIDTH/HEIGHT.
    """
    if namespaces is None:
        namespaces = ns

    def _as_int(value):
        # Coordinates may be serialised as "12" or "12.0"; accept both.
        return int(float(value))

    # The context manager releases the file handle even when parsing fails.
    with open(path, encoding='utf-8') as xml_handle:
        root = ET.fromstring(xml_handle.read())

    # No truth test on `root`: Element truthiness only reflects whether it has
    # children and is deprecated. A childless root simply yields no pages.
    text_data = []
    for page in root.findall('a:Layout/a:Page', namespaces):
        for line in page.findall('a:PrintSpace/a:TextBlock/a:TextLine', namespaces):
            htr = line.find('a:String', namespaces)
            if htr is None:
                # A line without any String element carries no text to report.
                continue
            text_data.append((
                htr.get('CONTENT'),
                _as_int(line.get('HPOS')),
                _as_int(line.get('VPOS')),
                _as_int(line.get('WIDTH')),
                _as_int(line.get('HEIGHT')),
            ))
    return text_data



In [4]:
# Directory containing the ALTO XML files of the export.
path = 'export_job_9796675/2429581/LTK_290/alto'
# Keep only names with a real ".xml" extension (the old pattern `xml$` also
# accepted e.g. "fooxml"), and sort them: os.listdir() returns entries in
# arbitrary order, while the pages are meant to be processed in name order.
files = sorted(join(path, file) for file in os.listdir(path) if file.endswith('.xml'))

In [10]:
# Shared opening of every generated HTML page: document start, inline CSS and
# the opening <body> tag. Page content is written between `head` and `closer`.
head = '''<html>
<head>
<style>

li {
  font-size: 2em;
}

div.text {
  font-size: 2em;
}

div.image {
  margin-bottom: 50px;
}
  
body {
    margin-left: 10%;
    margin-right: 10%;
    font-family: Helvetica,Candara,Geneva;
    background-color: #F5F5F5;
}  

h2 {
  font-size: 3em;
}

</style>
<meta charset="utf-8">

</head>
<body>
'''

# Shared ending of every generated page. It must close <body> and then <html>;
# the previous version emitted a stray </head> and never closed <html>.
closer = '''
</body>
</html>
'''

In [11]:
# Build <directory>/index.html: a list with one link per ALTO file, and record
# the label derived from each file name in `file_label` for the page cell below.
file_label = dict()

directory = 'Pages'
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

# The context manager closes index.html even if one of the writes fails.
with open(os.path.join(directory, 'index.html'), 'w', encoding='utf-8') as out:
    out.write(head)
    out.write('<h2>Folia</h2>')
    out.write('<ul>')

    for file in sorted(files):
        # The label is the third "_"-separated part of the file name,
        # without its ".xml" extension.
        label = re.split(r'_', os.path.basename(file))[2]
        label = re.sub('[.]xml$', '', label)
        file_label[file] = label

        out.write('<li>')
        out.write(f'<a href="{label.strip()}.html" >{label}</a>')
        out.write('</li>')

    out.write('</ul>')
    out.write(closer)


# Write one HTML page per ALTO file. Each page shows every recognised line above
# the matching crop of the scan, followed by the full page image and its text.
# The recognised lines of all pages are collected in `full_text`.

# Start from an empty list so that re-running this cell does not append the
# same pages to `full_text` a second time.
full_text = []

# Sort once and index into this same list for the "Next" link. The loop used to
# iterate over sorted(files) while looking up files[i+1] in the unsorted list,
# so "Next" could point to the wrong page.
sorted_files = sorted(files)

for i, file in enumerate(sorted_files):

    lines_on_page = []
    page_label = file_label[file]

    # The item id is the second "_"-separated part of the file name.
    file_id = re.split(r'_', os.path.basename(file))[1]
    iiif_base = f'https://iiif.universiteitleiden.nl/iiif/2/hdl:1887.1%252Fitem:{file_id}'

    # Link to the following page (none on the last one). The target uses the
    # stripped label, exactly like the file name the page is written to.
    if i < len(sorted_files) - 1:
        label_next = file_label[sorted_files[i + 1]].strip()
        next_link = f'<div class="text"> <a href="{label_next}.html">Next</a> </div>'
    else:
        next_link = ''

    page_path = os.path.join(directory, f'{page_label.strip()}.html')
    # The context manager closes the page even if get_text() or a write fails.
    with open(page_path, 'w', encoding='utf-8') as out:
        out.write(head)

        # An <a> element cannot be self-closed in HTML; close it explicitly.
        out.write(f'<a name="{page_label}"></a>')
        out.write(f'<h2><a target="_new" href="{iiif_base}/full/full/0/default.jpg">{page_label}</a></h2><br/>')

        for line_text, hpos, vpos, width, height in get_text(file):
            lines_on_page.append(line_text)
            # IIIF region request: crop of the scan that holds this text line.
            iiif_url = f'{iiif_base}/{hpos},{vpos},{width},{height}/full/0/default.jpg'
            # Escape the transcription so characters such as & or < cannot
            # break the generated markup.
            out.write(f'<div class="text">{html.escape(line_text or "")}</div>\n\n')
            out.write(f'<div class="image"><img src="{iiif_url}"></div>\n')

        if next_link:
            out.write(next_link)

        out.write('<h2>Full text</h2>')
        out.write(f'<div class="image"><img src="{iiif_base}/full/800,/0/default.jpg" /></div>')

        out.write('<div class="text">')
        for line_text in lines_on_page:
            out.write(f'{html.escape(line_text or "")}<br/>')
        out.write('</div>')

        if next_link:
            out.write(next_link)

        out.write(closer)

    # Keep the raw (unescaped) text for the analysis cells that follow.
    full_text.extend(lines_on_page)

The HTR was carried out in Transkribus using the Dutch Demeter model. The results can be found [on the bookandbyte server](https://bookandbyte.universiteitleiden.nl/DATH/LTK290).

## Named Entity Recognition

In [8]:
# Dutch stanza pipeline restricted to tokenisation and named-entity recognition.
nlp = stanza.Pipeline(lang='nl', processors='tokenize,ner')

# Run the pipeline on each recognised line separately and keep every entity
# as a [text, type] pair, in the order in which it was found.
entities = []

for text_line in full_text:
    annotated = nlp(text_line)
    entities.extend([entity.text, entity.type] for entity in annotated.ents)


2024-05-24 16:19:22 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| ner       | conll02 |

2024-05-24 16:19:22 INFO: Use device: cpu
2024-05-24 16:19:22 INFO: Loading: tokenize
2024-05-24 16:19:22 INFO: Loading: ner
2024-05-24 16:19:22 INFO: Done loading processors!


In [9]:
# Show only the entities tagged as a person (PER) or an organisation (ORG).
for ne in entities:
    if ne[1] in ('PER', 'ORG'):
        print(ne)

['A-H', 'ORG']
['N.2', 'ORG']
['E.', 'ORG']
['1.o', 'ORG']
['DU:peuw', 'ORG']
['D', 'ORG']
['E.', 'ORG']
['Jod', 'ORG']
['UHEd', 'ORG']
['E.', 'ORG']
['Wesen', 'PER']
['E.F.', 'ORG']
['A.I.S.', 'ORG']
['DUVIOUVIJ', 'ORG']
['I', 'ORG']
['D', 'ORG']
['D', 'ORG']
['Heer', 'PER']
['Getijdeboek', 'ORG']
['Verdam', 'ORG']
['Getijdeboek', 'ORG']
['Goth.', 'ORG']
['Ghebet', 'ORG']
['Vigilie', 'ORG']
['Wijsheit', 'ORG']
['J.', 'ORG']
['Schultens', 'ORG']
['Catal', 'ORG']
['RIJ', 'PER']
['Sʼ', 'ORG']
['Octauc', 'ORG']
["S'", 'ORG']
['Octᵃ', 'ORG']
['Symon', 'ORG']
['Dertienendach', 'ORG']
['Ysidorus', 'ORG']
['Lucianus', 'ORG']
['Paulus', 'PER']
['xiij', 'ORG']
['paeus', 'ORG']
['mᵃr', 'ORG']
['Octᵃ', 'ORG']
['dertienendach', 'ORG']
['Maurus', 'ORG']
['Martellus', 'ORG']
['pacus', 'ORG']
['Lelyveld', 'ORG']
['Secretaris', 'ORG']
['Prisca', 'ORG']
['Maurius', 'ORG']
['Fabiaen', 'ORG']
['Agniet', 'ORG']
['Vincencius', 'ORG']
['Emerenciana', 'ORG']
['Thymotheus apostel', 'ORG']
['Pouwels', 'ORG']
[