In [1]:
import openai
import os
import re
import xml.etree.ElementTree as ET
from PIL import Image
import requests
from io import BytesIO
from dotenv import load_dotenv, find_dotenv

# Load variables from a .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())

# Hand the key from the environment to the openai module, then build the
# shared client object that the completion helpers below call.
openai.api_key = os.environ.get('OPENAI_API_KEY')
client = openai.OpenAI()

def get_completion(prompt, model="gpt-4"):
    """Send `prompt` as a single user message and return the reply text.

    The request goes through the module-level `client` with temperature 0.

    Args:
        prompt: Text placed in the content of the one user message.
        model: Model name passed to the chat completions endpoint.

    Returns:
        The message content of the first choice in the response.
    """
    reply = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

def get_completion_text( prompt, text, model="gpt-4"):
    """Ask the model to apply `prompt` to `text`, with `text` fenced in backticks.

    The user message is `prompt` immediately followed by `text` wrapped in
    triple backticks; the request itself is made by `get_completion`.

    Args:
        prompt: Instruction text placed before the fenced block.
        text: The text the instruction refers to.
        model: Model name passed on to `get_completion`.

    Returns:
        The reply text returned by `get_completion`.
    """
    fenced = "```" + text + "```"
    return get_completion(prompt + fenced, model=model)

def readtext( fname):
    """Return the text of a PAGE XML file, one text line per output line.

    For every TextLine, the Unicode child of its last TextEquiv is used.
    Elements without text are skipped; every kept line ends with a newline.

    Args:
        fname: Path (or file object) accepted by ElementTree's parse().

    Returns:
        The concatenated line texts as one string ("" when nothing matched).
    """
    namespaces = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    document = ET.parse(fname).getroot()
    unicode_nodes = document.iterfind(
        './/page:TextLine/page:TextEquiv[last()]/page:Unicode', namespaces
    )
    return "".join(
        node.text + "\n" for node in unicode_nodes if node.text is not None
    )

def createhtml(title):
    """Build a minimal HTML element tree and return its root.

    The tree has a <head> holding a <title> with the given text and a
    stylesheet <link> to 'style.css', followed by an empty <body>.

    Args:
        title: Text for the <title> element.

    Returns:
        The root 'html' Element.
    """
    page = ET.Element('html')
    page_head = ET.SubElement(page, 'head')
    ET.SubElement(page, 'body')

    ET.SubElement(page_head, 'title').text = title
    ET.SubElement(page_head, 'link', {'rel': 'stylesheet', 'href': 'style.css'})

    return page

# Split a coordinate string into points and return them as a list of [x, y] pairs.
def split_coords( coords):
    """Parse a points string such as "829,255 2975,3886" into integer pairs.

    Points are separated by whitespace and each point is "x,y". Any run of
    whitespace (including leading/trailing) counts as one separator, so a
    doubled space or trailing newline no longer breaks parsing.

    Args:
        coords: The points string.

    Returns:
        A list of [x, y] lists of ints, in input order; [] for a blank string.

    Raises:
        ValueError: If a coordinate is not an integer.
        IndexError: If a point has no comma-separated second value.
    """
    points = []
    for point in coords.split():
        parts = point.split(",")
        points.append([int(parts[0]), int(parts[1])])
    return points

def get_boundaries( coords):
    """Return the bounding box of a sequence of [x, y] points.

    Args:
        coords: Non-empty sequence of points indexable as point[0], point[1].

    Returns:
        A tuple (min_x, min_y, max_x, max_y).
    """
    xs = [point[0] for point in coords]
    ys = [point[1] for point in coords]
    left, top = min(xs), min(ys)
    right, bottom = max(xs), max(ys)
    return (left, top, right, bottom)

def load_image( url, timeout=30):
    """Download the image at `url` and open it with PIL.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to requests.get as its timeout, so a stalled
            server cannot hang the notebook indefinitely.

    Returns:
        The object returned by PIL's Image.open for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error; otherwise the error page body would be
    # handed to PIL and surface as a confusing "cannot identify image" error.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


In [20]:
# PAGE XML files to translate; the commented-out names are kept for reference.
fnames = [ 
 #   "NL-HaNA_1.04.02_4994_0003-groot.xml" , "NL-HaNA_1.04.02_4994_0004-groot.xml"
 #   , "NL-HaNA_1.04.02_4994_0005-groot.xml",
 #   "NL-HaNA_1.04.02_4994_0006-groot.xml"
 #   , "NL-HaNA_1.04.02_4994_0007-groot.xml", "NL-HaNA_1.04.02_4994_0008-groot.xml"
    "NL-HaNA_1.04.02_1098_0554-groot.xml"
    ]

# The HTML document is titled after the last file in the list.
# Python lists have no .last() method, so index with [-1].
fname = fnames[-1]
html = createhtml( fname)
body = html.find( 'body')

text = ""
# Instruction sent to the model ahead of the fenced page text.
prompt = """
de tekst tussen backticks is uit een VOC brief van de 1629. Vertaal de tekst tussen drie backticks naar het Nederlands. Er kunnen
schrijffouten in de tekst zitten. Er worden synoniemen gebruikt, bijvoorbeeld Jerommus, Jermmus Conteeles, Gerommus en Jeronius zijn 
allemaal Jeronimus Cornelisz. Namen zijn meestal geschreven als voornaam achternaam beroep. het woord Item betekent een nieuwe aanklacht.
geef de vertaling zonder backticks of quotes.
"""

# For each page: read its text, ask the model for a translation, and append
# a heading (the file name) plus a paragraph (the reply) to the document body.
for fname in fnames:
    text = readtext( fname)
    response = get_completion_text( prompt, text)

    ET.SubElement( body, 'h1').text = fname
    ET.SubElement( body, 'p').text = response


ET.ElementTree( html).write( 'inv 1098.html')


In [56]:
def get_word( image, coords):
    """Return the part of `image` inside `coords`.

    Args:
        image: Object with a crop() method (a PIL image in this notebook).
        coords: Box passed unchanged to image.crop().

    Returns:
        Whatever image.crop(coords) returns.
    """
    cropped = image.crop(coords)
    return cropped

# PAGE XML file whose page scan is fetched and cropped below.
fname = "NL-HaNA_1.04.02_1098_0554-groot.xml"

# Namespace map so the XPath queries can use the 'page:' prefix.
ns = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

tree = ET.parse( fname)
root = tree.getroot()

# Walk every TranskribusMetadata element found below the root.
for metadata in root.iterfind( './/page:TranskribusMetadata', ns):
        # The 'imgUrl' attribute of the metadata element is passed to load_image.
        url = metadata.get('imgUrl')
        img = load_image( url)

        # NOTE(review): the search starts at root, so this is the FIRST Coords
        # element anywhere in the document, not a specific text line - confirm
        # that this is the intended element.
        coordsElt = root.find( './/page:Coords', ns)
        coords = split_coords( coordsElt.get('points'))
        print( get_boundaries( coords))

        # Crop the scan to the bounding box of those points and display it.
        woord = get_word( img, get_boundaries( coords))
        woord.show()

        

(829, 255, 2975, 3886)


In [2]:
# Cut a rectangular region out of the image.
def get_region( image, coords):
    """Return the part of `image` inside `coords`.

    Args:
        image: Object with a crop() method (a PIL image in this notebook).
        coords: Box passed unchanged to image.crop().

    Returns:
        Whatever image.crop(coords) returns.
    """
    region_image = image.crop(coords)
    return region_image

def get_rawtext( region):
    """Yield the Unicode element of each text line below `region`.

    For every TextLine, the Unicode child of its last TextEquiv is yielded.
    Note that the elements themselves are yielded, not their text; callers
    read `.text` (which may be None) on each one.

    Args:
        region: ElementTree element to search below.

    Yields:
        The matching Unicode elements in document order.
    """
    ns = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    yield from region.iterfind( './/page:TextLine/page:TextEquiv[last()]/page:Unicode', ns)

#process the text regions in the xml file
def process_text_regions(root, prompt, dir, outfile):
    """Write an HTML page with each text region's lines and the model's reply.

    For every TextRegion below `root` that contains non-blank text, the page
    gets a <ul class="origineel"> with one <li> per line, followed by a
    <p class="transcriptie"> holding the reply of get_completion_text for
    that region's text. The prompt is stored as a comment in <head>.

    Regions without any non-blank text are skipped, so no request is made
    for them and no empty list is written.

    Args:
        root: Root element of a parsed PAGE XML document.
        prompt: Instruction text sent to the model with each region's text.
        dir: Output directory; created if it does not exist.
        outfile: File name of the HTML page inside `dir`; also used as title.
    """
    ns = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

    # Region cropping is currently disabled, so the page scan is not
    # downloaded (it used to be fetched on every call and then never used).
    # To re-enable cropping, load the scan once here and crop per region:
    #     metadata = root.find( './/page:TranskribusMetadata', ns)
    #     img = load_image( metadata.get('imgUrl'))
    #     coords = split_coords( region.find( './/page:Coords', ns).get('points'))
    #     regionimg = get_region( img, get_boundaries( coords))
    #     regionimg.save( dir + f"region_{region.get('id')}.png")
    #     imageref = ET.SubElement( body, 'img')
    #     imageref.attrib['src'] = f"region_{region.get('id')}.png"

    newhtml = createhtml( outfile)

    # Keep the prompt with the output so the page documents how it was made.
    head = newhtml.find( 'head')
    head.append( ET.Comment(prompt))

    body = newhtml.find( 'body')
    for region in root.iterfind( './/page:TextRegion', ns):
        lines = [
            textline.text
            for textline in get_rawtext( region)
            if textline.text and textline.text.strip()
        ]
        if not lines:
            # Nothing transcribed here: asking the model about an empty
            # block would only cost a request and produce made-up text.
            continue

        par = ET.SubElement( body, 'ul')
        par.text = ""
        par.attrib['class'] = 'origineel'
        for line_text in lines:
            ET.SubElement( par, 'li').text = line_text

        rawtext = "".join( line_text + "\n" for line_text in lines)
        response = get_completion_text(prompt, rawtext)
        trans = ET.SubElement( body, 'p')
        trans.attrib['class'] = 'transcriptie'
        trans.text = response

    # Join instead of concatenating so `dir` works with or without a trailing
    # separator, and make sure the directory exists before writing.
    if dir:
        os.makedirs( dir, exist_ok=True)
    ET.ElementTree( newhtml).write( os.path.join( dir, outfile))

# Run process_text_regions(root, prompt, dir, outfile) for every listed file;
# each page is written to ./output/ under the same name with an .html suffix.
# Other file names kept for reference:
# "NL-HaNA_1.04.02_4994_0004-groot.xml","NL-HaNA_1.04.02_4994_0005-groot.xml",
filenames = [ "NL-HaNA_1.04.02_4994_0007-groot.xml" ]

# The prompt is the same for every file, so read it once. The encoding is
# given explicitly so the result does not depend on the platform default.
with open('prompt.txt', 'r', encoding='utf-8') as file:
    prompt = file.read()

for fname in filenames:
    tree = ET.parse(fname)
    root = tree.getroot()
    process_text_regions(root, prompt, './output/', fname.replace( ".xml", ".html"))

        

In [74]:
def get_tagged_text( region):
    """Yield (tag name, tagged text) pairs from text lines that tag a person.

    Only TextLine elements whose 'custom' attribute contains 'person' are
    considered. On such a line, every entry of the exact form
    `name {offset:N; length:M;}` is yielded, whatever its name, as the
    slice [N, N+M) of the line's text.
    NOTE(review): tags on lines without any 'person' entry are skipped -
    confirm this filter is intended.

    Lines that have no Unicode text are skipped rather than raising.

    Args:
        region: ElementTree element to search below.

    Yields:
        Tuples (tag_name, text_slice).
    """
    ns = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    tag_pattern = re.compile( r'(\w+) \{offset:(\d+); length:(\d+);\}')
    for textline in region.iterfind('.//page:TextLine[@custom]', ns):
        custom = textline.get('custom')
        if 'person' not in custom:
            continue

        # Text of the last TextEquiv; may be absent or empty on some lines.
        unicode_elt = textline.find( 'page:TextEquiv[last()]/page:Unicode', ns)
        if unicode_elt is None or not unicode_elt.text:
            continue
        line_text = unicode_elt.text

        for name, offset, length in tag_pattern.findall( custom):
            start = int(offset)
            yield( name, line_text[start:start + int(length)])
                      
# PAGE XML file(s) to scan for tagged text.
filenames = [ "NL-HaNA_1.04.02_4994_0007-groot.xml" ]  

# Print every pair that get_tagged_text yields for each file. The loop
# variable is named `person`, but each item is a (tag name, text) tuple and
# the tag name is not always 'person'.
for fname in filenames:
    tree = ET.parse(fname)
    root = tree.getroot()
    for person in get_tagged_text( root):
            print(person)


('person', 'Jeromius Corneeles')
('place', 'haerlem')
('person', 'Jan Hendrickx')
('place', 'bremen')
('person', 'Jan hendrickx')
('person', 'anneken heydens')
('person', 'hans hardens')
('person', 'hilleken herdens')
('person', 'Hans hardensz')
('date', '12 Julij')
('person', 'Jeronims')
('person', 'leendert michielsen')
