In [225]:
import json
import os

# Load the two JSON inputs. Each file is opened in a `with` block so the
# handle is closed as soon as parsing finishes, and decoded explicitly as
# UTF-8 (the JSON interchange encoding) rather than the platform default.
with open('imagemaps.js', encoding='utf-8') as json_file:
    imagemaps = json.load(json_file)

with open('150-inscriptions.js', encoding='utf-8') as json_file:
    inscriptions = json.load(json_file)



In [226]:
from _ctypes import PyObj_FromPtr
import json
import re

class NoIndent:
    """Thin wrapper that stores an arbitrary object in ``self.value``.

    It carries no behaviour of its own; it only tags the wrapped value so
    other code can recognise it with ``isinstance``.
    """

    def __init__(self, value):
        # Keep a reference to the wrapped object exactly as given.
        self.value = value


class MyEncoder(json.JSONEncoder):
    """JSON encoder that renders ``NoIndent``-wrapped values compactly.

    While the standard encoder runs, each ``NoIndent`` instance is replaced
    by a placeholder string built from ``FORMAT_SPEC`` and the instance's
    ``id()``. ``encode`` then swaps every quoted placeholder for the
    ``json.dumps`` output of the wrapped value, which is produced without
    indentation.

    Wrapped objects are remembered in a per-encode registry instead of being
    recovered from their memory address, so a plain data string that merely
    looks like a placeholder is left untouched rather than being treated as
    a pointer.
    """
    FORMAT_SPEC = '@@{}@@'
    regex = re.compile(FORMAT_SPEC.format(r'(\d+)'))

    def __init__(self, **kwargs):
        # Save copy of any keyword argument values needed for use here.
        self.__sort_keys = kwargs.get('sort_keys', None)
        # Maps id(obj) -> NoIndent instance seen by default() during the
        # current encode() call. Holding the reference keeps the id valid.
        self.__wrapped = {}
        super(MyEncoder, self).__init__(**kwargs)

    def default(self, obj):
        """Return a placeholder string for ``NoIndent`` objects.

        Anything else is delegated to the base class, which raises
        ``TypeError`` for unsupported types.
        """
        if isinstance(obj, NoIndent):
            self.__wrapped[id(obj)] = obj
            return self.FORMAT_SPEC.format(id(obj))
        return super(MyEncoder, self).default(obj)

    def encode(self, obj):
        """Encode ``obj`` to a JSON string, expanding ``NoIndent`` placeholders."""
        format_spec = self.FORMAT_SPEC  # Local var to expedite access.
        self.__wrapped.clear()  # Start from a clean registry on every call.
        try:
            json_repr = super(MyEncoder, self).encode(obj)  # Default JSON.

            # Replace any marked-up object ids in the JSON repr with the
            # value returned from the json.dumps() of the corresponding
            # wrapped Python object.
            for match in self.regex.finditer(json_repr):
                obj_id = int(match.group(1))
                no_indent = self.__wrapped.get(obj_id)
                if no_indent is None:
                    # Not one of our placeholders (e.g. a data string that
                    # happens to match the pattern): leave it as it is.
                    continue
                json_obj_repr = json.dumps(no_indent.value,
                                           sort_keys=self.__sort_keys,
                                           ensure_ascii=False)

                # Replace the quoted placeholder with the compact JSON
                # representation of the wrapped object.
                json_repr = json_repr.replace(
                    '"{}"'.format(format_spec.format(obj_id)), json_obj_repr)

            return json_repr
        finally:
            # Do not keep wrapped objects alive after encoding is finished.
            self.__wrapped.clear()


In [227]:
# For every image-map entry whose "img" value contains "Inscription", collect
# the x-coordinate of each of its areas, in the order the areas are listed.
# The dictionary key is the part of "img" before the first '-', with its
# first seven characters dropped (presumably a fixed prefix; verify against
# the image names). These positions are used later to guess line breaks.
glyph_x_positions = {
    entry["img"].split('-')[0][7:]: [area["coords"]["x"] for area in entry["areas"]]
    for entry in imagemaps
    if "Inscription" in entry["img"]
}


In [228]:
# Glyph sequence of every inscription, keyed by inscription name. Newlines
# and the '\U0001076b' glyph are stripped before the text is split into
# individual characters.
inscription_glyphs = {
    entry["name"]: list(entry["parsedInscription"].replace('\n', '').replace('\U0001076b', ""))
    for entry in inscriptions["inscriptions"]
}
# For every inscription, the indices at which the 'separator' glyph occurs.
sep_indices = {
    name: [pos for pos, glyph in enumerate(glyphs) if glyph == "𐄁"]
    for name, glyphs in inscription_glyphs.items()
}


In [229]:
# The separator glyph ("𐄁") has no recorded position in the image maps, so a
# stand-in x-position is inserted for it: the position of the glyph just
# before it, or 0 when the separator is the very first glyph.
# Note: the lists inside glyph_x_positions are modified in place, so running
# this cell a second time inserts the stand-in positions again.
for k, v in sep_indices.items():
    # Nothing to do without separators or without positions for this name.
    if not v or k not in glyph_x_positions:
        continue
    gxp = glyph_x_positions[k]
    if not gxp:
        continue
    for i in v:
        if i <= len(gxp):
            p = 0 if i <= 0 else gxp[i - 1]
            gxp.insert(i, p)
        else:
            # The index lies beyond the recorded positions; report and move on.
            print("Skipping " + str(i) + " in " + k)


In [230]:
# Probable line-start indices for each inscription. An index is taken as the
# start of a line when it is the first glyph, or when its x-position plus 5
# is still smaller than the previous glyph's x-position (i.e. the position
# jumped back to the left).
newline_indices = {
    name: [idx for idx, x in enumerate(xs) if idx == 0 or (x + 5) < xs[idx - 1]]
    for name, xs in glyph_x_positions.items()
}


In [239]:
# Spot check for one inscription: its inferred line starts, followed by each
# glyph paired with (index, x-position).
(newline_indices["HT123+124a"],
 list(zip(inscription_glyphs["HT123+124a"],
          enumerate(glyph_x_positions["HT123+124a"]))))

([0, 7, 14, 23, 32, 38, 46, 52, 60],
 [('𐘸', (0, 24)),
  ('𐘳', (1, 75)),
  ('𐘚', (2, 128)),
  ('𐄁', (3, 128)),
  ('𐙋', (4, 181)),
  ('𐄒', (5, 242)),
  ('𐄇', (6, 255)),
  ('𐙜', (7, 18)),
  ('𐄎', (8, 52)),
  ('𐝃', (9, 114)),
  ('𐘸', (10, 146)),
  ('𐘁', (11, 194)),
  ('𐄇', (12, 227)),
  ('𐝎', (13, 242)),
  ('𐘫', (14, 10)),
  ('𐙍', (15, 71)),
  ('𐙋', (16, 112)),
  ('𐄒', (17, 162)),
  ('𐄇', (18, 200)),
  ('𐝆', (19, 198)),
  ('𐙜', (20, 227)),
  ('𐄎', (21, 265)),
  ('𐝕', (22, 273)),
  ('𐘸', (23, 11)),
  ('𐘁', (24, 36)),
  ('𐝎', (25, 69)),
  ('𐘞', (26, 94)),
  ('𐘘', (27, 131)),
  ('𐙋', (28, 170)),
  ('𐄐', (29, 220)),
  ('𐄌', (30, 251)),
  ('𐙜', (31, 275)),
  ('𐄊', (32, 14)),
  ('𐝀', (33, 47)),
  ('𐘸', (34, 89)),
  ('𐘁', (35, 132)),
  ('𐝕', (36, 172)),
  ('𐘀', (37, 201)),
  ('𐘹', (38, 14)),
  ('𐙋', (39, 66)),
  ('𐄐', (40, 118)),
  ('𐄋', (41, 164)),
  ('𐙜', (42, 181)),
  ('𐄊', (43, 219)),
  ('𐝃', (44, 250)),
  ('𐘸', (45, 284)),
  ('𐘁', (46, 30)),
  ('𐝕', (47, 75)),
  ('𐙂', (48, 125)),
  ('𐘁', (4

In [232]:
# Get the inscriptions with uncertain readings.
# At this stage the glyph lists keep the '𐝫' glyph (only newlines are
# removed), because the lengths below are compared before '𐝫' is dropped.
inscription_glyphs = {i["name"]: list(i["parsedInscription"].replace('\n', ''))
                      for i in inscriptions["inscriptions"]}

# Keep only the lines that mention a doubtful or erased reading. The file is
# read inside a `with` block so the handle is closed once the lines are read.
with open('000-UncertainReadings.txt') as rf:
    rf_lines = [l.strip() for l in rf if any(s in l for s in ("doubtful", "erased"))]

# Each kept line is tab-separated: field 0 is the key, field 1 a
# comma-separated list of entries; 'eol' entries are discarded.
uncertains = {fields[0]: [s for s in fields[1].split(',') if s != 'eol']
              for fields in (l.split('\t') for l in rf_lines)}

# Count how many match the actual number of glyphs in the inscription. These are the only
# ones we can actually use
r = [len(v) == len(inscription_glyphs[k])
     for k, v in uncertains.items()
     if k in inscription_glyphs]
print(r.count(True), r.count(False))

# Limit to ones where the number of glyphs in `uncertains` matches the number of glyphs in the
# inscription
uncertains = {k: v for k, v in uncertains.items()
              if k in inscription_glyphs and len(v) == len(inscription_glyphs[k])}
print("Before filtering out erasures(𐝫)", len(uncertains))

# Now we need to remove the items in the `uncertains` lists that refer to '𐝫' (i.e. '\U0001076b')
# in the inscription, as the inscription we will tabulate will not contain '𐝫' glyphs.
# Every remaining key is a known inscription whose list has the same length
# as its glyph list, so the two can be walked in step. A new dict is built
# instead of reassigning values while iterating over the old one.
uncertains = {k: [u for u, g in zip(v, inscription_glyphs[k]) if g != '\U0001076b']
              for k, v in uncertains.items()}

# Rebuild the glyph lists without '𐝫' and re-check that the lengths still agree.
inscription_glyphs = {i["name"]: list(i["parsedInscription"].replace('\n', '').replace('\U0001076b', ""))
                      for i in inscriptions["inscriptions"]}
uncertains = {k: v for k, v in uncertains.items()
              if k in inscription_glyphs and len(v) == len(inscription_glyphs[k])}
print("After filtering out erasures(𐝫)", len(uncertains))


267 108
Before filtering out erasures(𐝫) 267
After filtering out erasures(𐝫) 267


In [233]:
# Read the dimensions table. Every line must hold exactly three tab-separated
# fields (name, catalogue entry, raw dimensions); any other field count
# raises ValueError when the line is unpacked below.
# The file is read inside a `with` block so the handle is closed afterwards.
with open('dimensions-gorila-ocr-cleanedup.txt') as inf:
    in_lines = inf.readlines()

# Keys are the names with all spaces removed; values are stripped of
# surrounding whitespace (including the trailing newline).
raw_dimensions = { i.replace(' ',''): v.strip() for i,_,v in [l.split('\t') for l in in_lines] }
catalogue = { i.replace(' ',''): v.strip() for i,v,_ in [l.split('\t') for l in in_lines] }


In [234]:
# Structured dimensions for every entry whose raw text splits on 'x' into
# exactly three parts (length, height, thickness). Decimal commas become
# points, and a literal 'cm' is removed from the thickness part.
dimensions = {
    name: {
        "length": length.strip().replace(',', '.'),
        "height": height.strip().replace(',', '.'),
        "thickness": thickness.replace(',', '.').replace('cm', '').strip(),
        "unit": "cm",
        "source": "GORILA OCR",
    }
    for name, (length, height, thickness) in (
        (name, raw.split('x'))
        for name, raw in raw_dimensions.items()
        if len(raw.split('x')) == 3
    )
}


In [237]:
# Glyphs of the first inscription, with newlines and "𐝫" removed.
tabulation = list(
    inscriptions["inscriptions"][0]["parsedInscription"].replace('\n', '').replace("𐝫", "")
)

# Get the word each glyph is in: split every word of the first inscription
# into glyphs (dropping words that end up empty), then emit the word's index
# once per glyph it contains.
words = [
    glyph_list
    for glyph_list in (
        list(word.replace('\n', '').replace("𐝫", ""))
        for word in inscriptions["inscriptions"][0]["words"]
    )
    if glyph_list
]
word_for_glyph = [word_index for word_index, glyph_list in enumerate(words) for _ in glyph_list]


In [236]:
"""
Create the final output files
"""
import re
import copy

# Reload the inscriptions. The file is decoded explicitly as UTF-8 and closed
# as soon as it has been parsed.
with open('150-inscriptions.js', encoding='utf-8') as json_file:
    inscriptions = json.load(json_file)

of = "inscriptions" + os.sep
# Make sure the per-inscription output directory exists before writing to it.
os.makedirs(of, exist_ok=True)

supplements = []
for inscription in inscriptions["inscriptions"]:
    name = inscription["name"]
    supplement = {}
    # Name with every 'a', 'b', 'c' and 'd' removed; used as the key into the
    # catalogue and dimension tables.
    lookup_name = re.sub(r'[abcd]', '', name)

    # Get the inscription as a list of glyphs
    glyphs = list(inscription["parsedInscription"].replace('\n', '').replace("𐝫", ""))

    # Words with newlines and "𐝫" removed; words that become empty are dropped.
    words = [w for w in [w.replace('\n', '').replace("𐝫", "")
                         for w in inscription["words"]] if w]
    # Get the word each glyph is in: the word's index, repeated once per glyph.
    word_for_glyph = [i for i, w in enumerate(words) for c in w]

    # Entries in `uncertains` are keyed by inscription name and were
    # length-checked against that inscription's glyphs, so the exact name is
    # tried first. The stripped name is kept as a fallback so that anything
    # the previous lookup found is still found.
    if name in uncertains:
        readings = uncertains[name]
    else:
        readings = uncertains.get(lookup_name)

    # Work out the (begin, end) glyph indices of every line. With fewer than
    # two known line starts the whole inscription is treated as one line.
    line_starts = newline_indices.get(name, [])
    if len(line_starts) > 1:
        bounds = list(zip(line_starts, line_starts[1:] + [len(glyphs)]))
    else:
        bounds = [(0, len(glyphs))]

    # One NoIndent-wrapped list per line; every item is a
    # (glyph, uncertain-reading, word-index) triple. zip() stops at the
    # shortest of the three inputs.
    supplement["tabulation"] = [
        NoIndent(list(zip(glyphs[b:e],
                          readings[b:e] if readings is not None else [''] * (e - b),
                          word_for_glyph[b:e])))
        for b, e in bounds
    ]

    supplement["tabulatedWords"] = NoIndent(words)

    supplement["catalogue"] = catalogue.get(lookup_name, "")
    supplement["dimensionsRaw"] = raw_dimensions.get(lookup_name, "")
    supplement["dimensions"] = dimensions.get(lookup_name, "")
    supplement["name"] = name
    supplements.append(supplement)

    # ensure_ascii=False writes the glyphs themselves, so the file must be
    # opened with an encoding that can represent them.
    with open(of + name + ".json", "w", encoding="utf-8") as output_file:
        output_file.write(json.dumps(supplement, cls=MyEncoder, sort_keys=True, indent=2, ensure_ascii=False))

# Combined file holding the supplement of every inscription.
with open("supplement.json", "w", encoding="utf-8") as output_file:
    output_file.write(json.dumps(supplements, cls=MyEncoder, sort_keys=True, indent=2, ensure_ascii=False))

    