In [1]:
# Import the modules used in this notebook

import re
from xml.sax.saxutils import escape

import pandas as pd

In [2]:
# Preview the source TSV ledger that will be converted into a TEI-compliant XML file
dataframe = pd.read_csv('Ward_ledger_HMS_all.tsv', sep='\t')
dataframe


Unnamed: 0,PageNum,Head,Place,Date,Name,Service,Ref,Pound,Silling,Pence
0,3A,Comus,Barrow,1934-08-01,Leslie & Godwin Ld,Insurance for 1 month L12500,,4.0,14.0,5.0
1,,,,1934-08-30,Leslie & Godwin Ld,Insurance to Devonport to Barrow for 3 months ...,,149.0,15.0,3.0
2,,,,1934-08-30,Leslie & Godwin Ld,Insurance under Collision Clause,,4.0,15.0,7.0
3,,,,1934-08-30,Leslie & Godwin Ld,Insurance on Anchors & Cables,,,10.0,6.0
4,,,,1934-09-05,United Towing Co Ld,Towage to Barrow per Tug Seaman,,345.0,,
5,,,,1934-09-09,Trinity House,Inward Pilotage,,8.0,13.0,3.0
6,,,,1934-09-15,Customs & Excise,Walney Dues 1312 Tons,,5.0,9.0,4.0
7,,,,1934-09-05,Trinity House,Outward Pilotage,,2.0,16.0,
8,,,,1934-09-09,LM & S Rly,Towage Basin to Dock,,15.0,,
9,,,,1934-10-04,LM & S Rly,Carriage on Lamps sold to Devonport,,,15.0,4.0


In [3]:
# Define a function to automatically mark up the TSV file in conformity with the TEI

def generate_DEPCHA_TEI_from_TSV(input_file, output_file):
    """Convert a ledger TSV file into a TEI XML file carrying bookkeeping markup.

    The first line of ``input_file`` is treated as a header and skipped. Every
    other non-blank line is split on tabs and read positionally as: page
    number, head, place, date, name, service, ref, pound, shilling, pence.
    Rows with fewer than ten columns are padded with empty strings, so missing
    trailing fields are simply omitted from the output.

    A row with a non-empty page number closes the currently open table (if
    any) and starts a new ``<div>`` holding an ``<fw type="pageNum">`` and a
    ``<table>`` whose ``<head>`` is built from the head and place columns.
    Each row is written as a ``<row ana="#bk_entry">``; the pound, shilling
    and pence cells are only written when the corresponding field is non-empty.

    Every field is XML-escaped before being written, and closing tags are only
    emitted for tables that were actually opened, so the result stays
    well-formed for header-only input and for rows that precede the first
    page number (those are placed in an unlabelled table).

    Parameters
    ----------
    input_file : str
        Path of the UTF-8 encoded TSV file to read.
    output_file : str
        Path of the XML file to write; it is opened in ``'w'`` mode, so an
        existing file is overwritten.

    Returns
    -------
    None

    Notes
    -----
    Exceptions raised by ``open`` or while decoding the input propagate to the
    caller. The input is read completely before the output file is opened.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    # Characters stripped when deriving identifier-like values (firm reference,
    # commodity) from already-escaped text.
    noise = re.compile('&amp;| |,')
    table_open = False

    with open(output_file, 'w', encoding='utf-8') as output_f:
        output_f.write(_TEI_PROLOGUE)

        # lines[0] is the column header row.
        for line in lines[1:]:
            # rstrip() also removes trailing tabs, i.e. trailing empty columns;
            # they are restored by the padding below.
            line = line.rstrip()
            if not line:
                continue

            cols = line.split('\t')[:_COLUMN_COUNT]
            cols += [''] * (_COLUMN_COUNT - len(cols))

            # Dates containing a slash lose their last three characters.
            # NOTE(review): the intended input format is not documented here;
            # presumably a "/xx" suffix is being dropped - confirm.
            if '/' in cols[3]:
                cols[3] = cols[3][:-3]

            (page_num, head, place, date, firm_name,
             service, ref, pound, shilling, pence) = map(_xml_escape, cols)

            # Reference name for the firm: the escaped name without noise.
            firm_ref = '#' + noise.sub('', firm_name)
            # The commodity is the first word of the service description.
            commodity = noise.sub('', service.split(' ')[0])

            if page_num != '':
                # A page number marks the start of a new page/table.
                if table_open:
                    output_f.write('</table></div></div>\n')
                output_f.write(f'<div><fw type="pageNum">{page_num}</fw>\n')
                output_f.write(f'<div><table><head>{head}<placeName>{place}</placeName></head>\n')
                table_open = True
            elif not table_open:
                # Row before any page number: open an unlabelled table so the
                # row (and the closing tags) still nest correctly.
                output_f.write('<div>\n<div><table>\n')
                table_open = True

            output_f.write(f'<row ana="#bk_entry"><cell><date ana="#bk_when" when="{date}"/></cell>\n')
            output_f.write(f'<cell><name ana="#bk_to" ref="{firm_ref}">{firm_name}</name>\n')
            output_f.write(f'<measure ana="#bk_service #bk_to" commodity="{commodity}">{service}</measure></cell>\n')
            output_f.write(f'<cell><rs>{ref}</rs></cell>\n')
            for amount, unit in ((pound, 'pound'), (shilling, 'shilling'), (pence, 'pence')):
                if amount != '':
                    output_f.write(f'<cell><measure ana="#bk_money #bk_from" commodity="Currency" quantity="{amount}" unit="{unit}">{amount}</measure></cell>\n')
            output_f.write('</row>')

        if table_open:
            output_f.write('</table></div></div>')
        output_f.write('</body></text></TEI>')


# Number of positional columns read from each TSV row.
_COLUMN_COUNT = 10


def _xml_escape(text):
    """Escape &, <, > and double quotes for use in XML text and quoted attributes."""
    return escape(text, {'"': '&quot;'})


# Fixed XML prologue, TEI header and opening <text>/<body> tags.
_TEI_PROLOGUE = """<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml"
	schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
      <teiHeader>
            <fileDesc>
                  <titleStmt>
                        <title>Title</title>
                  </titleStmt>
                  <publicationStmt>
                        <publisher>KCL MADH</publisher>
                        <idno type="PID">o:depcha.ward_ledger.1</idno>
                        <availability>
                              <p/>
                        </availability>
                        <date when="2019-05-25">May 25, 2019</date>
                  </publicationStmt>
                  <sourceDesc>
                        <p>Accounts of <orgName ana="#bk_from" ref="ThosWardLtd">Thos W. Ward Ltd</orgName> from <date ana="#bk_when"
                                    when="1934-06-01">June 1934</date>. 
                        </p>
                  </sourceDesc>
            </fileDesc>
      </teiHeader>
      <text>
            <body>"""


In [4]:
# Run the conversion: read the ledger TSV and write the TEI XML file next to it

generate_DEPCHA_TEI_from_TSV(
    input_file='Ward_ledger_HMS_all.tsv',
    output_file='Ward_ledger_HMS_all.xml',
)