# OCR Pipeline: From Legacy through PAGE.xml to XML-TEI
## The data flow includes the following suggested steps, which can be modified as needed:
1. Convert the physical and logical layout data (PRXML) produced by the Olive software into PAGE XML (pxml) format. The conversion excludes the publication text, which is assumed to contain errors.
2. Upload the pxml files and the scans of the document to the Transkribus server.
3. Run Baseline detection for the document on Transkribus server.
4. Download the document.
5. Modify and extend the baseline coordinates.
6. Upload the document.
7. Run HTR for the document on Transkribus server.
8. Download the document.
9. Convert and combine the document data into TEI format.

## Convert Abbyy Olive document layout into PageXML format

In [1]:
from TkbsDocument import Document

# Folder holding the legacy Abbyy/Olive export, and the folder that receives
# the converted PAGE XML output.
abbyy_folder = "C:\\TkbsTest\\0105\\"
pxml_folder = "C:\\TkbsTest\\PXML_Converted\\"

# Load the legacy data into a Document, then export it in Transkribus format.
p = Document()
p.load_aby_data(abbyy_folder)
p.export_tkbs_format(pxml_folder)


## Upload the PageXML files to the Transkribus server
For Transkribus documentation: https://transkribus.eu/wiki/index.php/Upload_via_REST_API

In [8]:
from TkbsApiClient import TranskribusClient

# NOTE: placeholder credentials - replace them before running and never
# commit real ones to the notebook.
user = "user.name@mailserver.com"
key = "password"
collection = "2455"
tkbs = TranskribusClient(sServerUrl='https://transkribus.eu/TrpServerTesting')
tkbs.auth_login(user, key, True)
print("session id: " + tkbs.getSessionId())

# Local paths of the two page images and their PAGE XML files.
loadfolder = "C:\\TkbsTest\\PXML_Converted\\"
img1 = loadfolder + "Pg001_150.png"
xml1 = loadfolder + "Pg001_150.pxml"
img2 = loadfolder + "Pg002_150.png"
xml2 = loadfolder + "Pg002_150.pxml"

# Upload descriptor: document metadata plus one entry per page; the file
# names here must match the names given for the uploaded files below.
jstring = '{"md": {"title": "HaZfirah Load Test", "author": "Nurit", "description": "Test"}\
    , "pageList": {"pages": [\
    {"fileName": "Pg001_150.png", "pageXmlName": "Pg001_150.pxml", "pageNr": 1}, \
    {"fileName": "Pg002_150.png", "pageXmlName": "Pg002_150.pxml", "pageNr": 2}]}}'

# Open the files in a `with` block so every handle is closed as soon as the
# upload call returns, including when the request raises.
with open(img1, 'rb') as imgobject1, \
        open(img2, 'rb') as imgobject2, \
        open(xml1, 'rb') as xmlobject1, \
        open(xml2, 'rb') as xmlobject2:
    # One dict per page: (file name, open binary handle, content type) for
    # the image and for its PAGE XML.
    pfiles = [
        {'img': ('Pg001_150.png', imgobject1, 'application/octet-stream'), 'xml': ('Pg001_150.pxml', xmlobject1, 'application/octet-stream')},
        {'img': ('Pg002_150.png', imgobject2, 'application/octet-stream'), 'xml': ('Pg002_150.pxml', xmlobject2, 'application/octet-stream')},
    ]
    response = tkbs.createDocFromImages(collection, jstring, pfiles)
print(response)

session id: FDE3D402AE64865E0751EAD9C7C9A0BB
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><trpUpload><md><docId>-1</docId><title>HaZfirah Load Test</title><author>Nurit</author><uploadTimestamp>0</uploadTimestamp><uploaderId>0</uploaderId><nrOfPages>0</nrOfPages><collectionList/></md><pageList><pages><fileName>Pg001_150.png</fileName><pageXmlName>Pg001_150.pxml</pageXmlName><pageUploaded>true</pageUploaded><pageNr>1</pageNr></pages><pages><fileName>Pg002_150.png</fileName><pageXmlName>Pg002_150.pxml</pageXmlName><pageUploaded>true</pageUploaded><pageNr>2</pageNr></pages></pageList><uploadId>7057</uploadId><created>2019-04-21T15:30:28.831+02:00</created><finished>2019-04-21T15:30:35.529+02:00</finished><userId>17463</userId><userName>nurit.grd@gmail.com</userName><nrOfPagesTotal>2</nrOfPagesTotal><uploadType>JSON</uploadType><jobId>15692</jobId><colId>2455</colId></trpUpload>


## Run Baseline Detection
For Transkribus documentation: https://transkribus.eu/wiki/index.php/Layout_Analysis_API

In [9]:
# json is needed for the local payload check below; it is imported here so
# the cell runs on its own.
import json

docid = "5577"
pageid = "18432"

# Selection descriptor naming the single page (pageid) of document docid
# that layout analysis should process.
doc_parts = '{\
   "docList" : {\
      "docs" : [ {\
         "docId" : ' + str(docid) + ',\
         "pageList" : {\
            "pages" : [ {\
               "pageId" : ' + str(pageid) + '} ]\
         }\
      } ]\
   }\
}'
# Parse the payload locally first so malformed JSON fails here instead of
# being sent to the server.
d = json.loads(doc_parts)
# Block segmentation off, line segmentation on.
response = tkbs.analyzeLayout(colId=collection, docPagesJson=doc_parts, bBlockSeg=False, bLineSeg=True)
print(response)

<?xml version="1.0" encoding="UTF-8" standalone="yes"?><trpJobStatuses><trpJobStatus><jobId>15693</jobId><docId>5577</docId><pageNr>-1</pageNr><type>Layout analysis (CITlabAdvancedLaJob: lines)</type><state>CREATED</state><success>false</success><description></description><userName>nurit.grd@gmail.com</userName><userId>17463</userId><createTime>1555853486706</createTime><startTime>0</startTime><endTime>0</endTime><jobData>#Sun Apr 21 15:31:26 CEST 2019
doWordSeg=false
doLineSeg=true
docDescs.0=&lt;?xml version\="1.0" encoding\="UTF-8" standalone\="yes"?&gt;\n&lt;documentSelectionDescriptor&gt;\n    &lt;docId&gt;5577&lt;/docId&gt;\n    &lt;pageList&gt;\n        &lt;pages&gt;\n            &lt;pageId&gt;18432&lt;/pageId&gt;\n        &lt;/pages&gt;\n    &lt;/pageList&gt;\n&lt;/documentSelectionDescriptor&gt;\n
doPolygonToBaseline=false
doBaselineToPolygon=false
doBlockSeg=false
</jobData><resumable>false</resumable><jobImpl>CITlabAdvancedLaJobMultiThread</jobImpl><created>2019-04-21T15:31:

## Download Transkribus Document

In [10]:
# The target folder name carries the collection id and the document id.
download_folder = "C:\\TkbsTest\\Baseline_" + str(collection) + "_" + str(docid)
response = tkbs.download_document(collection, docid, download_folder)
print(response)

(1544434628769, ['Pg001_150', 'Pg002_150'])


## Extend Baseline Coordinates

In [13]:
import os
import xml.etree.cElementTree as ET

def edit_pg_baseline(pgfile, addpoints):
    """Extend every baseline in a PAGE XML file by *addpoints* pixels per side.

    For each ``Baseline`` element found exactly four levels below the root
    element, a new point is prepended whose x is the first point's x minus
    *addpoints* (clamped at 0, because PAGE ``points`` values must be
    non-negative integers), and a new point is appended whose x is the last
    point's x plus *addpoints*. The y values of the added points are copied
    from the first and last point respectively.

    The file is rewritten in place (UTF-8, with an XML declaration) only when
    at least one baseline was modified; a message is printed in that case.

    Args:
        pgfile: Path of the PAGE XML file to edit in place.
        addpoints: Number of pixels to extend each baseline on both ends.

    Raises:
        ValueError: If a coordinate is not an integer "x,y" pair.
    """
    # Register the PAGE namespace as the default one so the file is written
    # back without generated "ns0:" prefixes.
    ET.register_namespace('', "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15")
    tree = ET.ElementTree(file=pgfile)
    xml_changed = False
    for myelement in tree.iterfind('*/*/*/*'):
        if not myelement.tag.endswith("Baseline"):
            continue
        # split() without an argument tolerates repeated or trailing blanks;
        # a missing or empty attribute leaves nothing to extend.
        points_list = (myelement.get('points') or "").split()
        if not points_list:
            continue
        first_x, first_y = points_list[0].split(",")
        last_x, last_y = points_list[-1].split(",")
        # Clamp at 0: a baseline starting near the left edge must not get a
        # negative x coordinate.
        startpoint = str(max(int(first_x) - addpoints, 0)) + "," + first_y
        endpoint = str(int(last_x) + addpoints) + "," + last_y
        myelement.set('points', " ".join([startpoint] + points_list + [endpoint]))
        xml_changed = True
    if xml_changed:
        # Write UTF-8 with a declaration rather than the us-ascii default.
        tree.write(pgfile, encoding="utf-8", xml_declaration=True)
        print("BASELINE CHANGED: " + pgfile)


# Walk the downloaded document folder and extend the baselines of every
# page file, i.e. every file named Pg*.pxml (compared case-insensitively).
for root, dirs, files in os.walk("C:\\TkbsTest\\Baseline_21675_86775"):
    for fname in files:
        upper_name = fname.upper()
        if not (upper_name.endswith(".PXML") and upper_name.startswith("PG")):
            continue
        fullname = root + "\\" + fname
        edit_pg_baseline(fullname, 10)



BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg001_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg002_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg003_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg004_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg005_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg006_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg007_150.pxml
BASELINE CHANGED: C:\TkbsTest\Baseline_21675_86775\Pg008_150.pxml


## Run OCR
For Transkribus documentation: https://transkribus.eu/wiki/index.php/HTR#Recognition

In [15]:
# Ids of the document and page to recognise; kept as strings because they
# are concatenated directly into the JSON text below.
docid = "5577"
pageid = "18432"
# Empty string: no dictionary name is supplied to the recognition call.
dictionaryName = ""
# Id of the HTR model passed to htrRnnDecode.
HTRmodelid = "41"

# JSON descriptor naming the single page (pageid) of document docid.
jstring = '{\
   "docId" : ' + docid + ',\
   "pageList" : {\
      "pages" : [ {\
         "pageId" : ' + pageid + '\
      } ]\
   }\
}'
      
# Submit the recognition request. The recorded run printed a bare number
# (presumably the id of the created job - confirm against the client).
response = tkbs.htrRnnDecode(collection, HTRmodelid, dictionaryName, docid, jstring, bDictTemp=False)
print(response)

15695
