# Data cleaning

Producing sign mappings from the OCHRE sign list and Susanne's recommendations. Originally, data cleaning was built into the initial Luigi workflow, but it's better to do it all at once and then feed the correct labels into the training workflow.

In [33]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import json
from tqdm import tqdm

In [34]:
# Read the top-level keys (the raw sign labels) from the original archive.
# The context manager closes the file even if listing the keys fails.
with h5py.File("/local/ecw/deepscribe-data/pfa/a_pfa.h5", "r") as archive:
    all_signs = list(archive.keys())
print(len(all_signs))

278


In [35]:
# Trim surrounding whitespace from every raw label.
no_whitespace = list(map(str.strip, all_signs))

# Load the OCHRE reading -> sign lookup used for the first, crude pass.
with open("/local/ecw/deepscribe/notebooks/readings_to_signs.json") as infile:
    reading_map = json.load(infile)

# Labels that are absent from the lookup pass through unchanged.
readings_mapped = [
    reading_map.get(sign, sign)
    for sign in no_whitespace
]
print(f"{len(np.unique(readings_mapped))} unique signs remaining")

195 unique signs remaining


In [38]:
# Hand-written corrections applied on top of the OCHRE mapping.
# Each entry maps an old sign label -> the correct sign label.
additional_rules = {
    # rules from Susanne's email on 6/25/2020
    "HAL": "ḪAL",
    "na": "NA",
    "hal": "ḪAL",
    "GEŠ": "GIŠ",
    "1": "DIŠ",
    "2": "MIN",
    "10": "U",
    "20": "MAN",
    # rules from Eddie's extrapolations
    "HA": "ḪA",
    "ha": "ḪA",
    "hu": "ḪU",  # from ḫu
    # rules correcting Unicode errors; the keys are the mis-encoded labels
    # exactly as written here and must not be "repaired"
    "N÷TA": "NÍTA",
    "P÷R": "PÍR",  # double-check this.
    "Z÷D": "ZÍD",  # double-check this.
    "m°n": "SAL",  # from "mín"
    "p°r": "PÍR",  # from pír
    "°b": "TUM",  # from íb
    "°p": "TUM",  # from íp
    "z°": "ZÍ",  # from zí
    "k†n": "GÁN",  # from kán
    "p†r": "BAR",  # from pár
    "r†b": "GAL",  # from ráb
    "r†p": "GAL",  # from ráp
    "k£m": "NE",  # from kúm
    "£": "Ú",  # from ú
}

# Signs without a correction rule are kept as they are.
rules_applied = [
    additional_rules.get(sign, sign)
    for sign in readings_mapped
]
print(f"{len(np.unique(rules_applied))} unique signs remaining")

179 unique signs remaining


In [39]:
# Final sign map: original archive label -> fully cleaned sign label.
# The two lists are paired position by position.
final_map = dict(zip(all_signs, rules_applied))

In [40]:
# Transforming signs in the dataset: copy every image from the original
# archive into a new archive, stored under its cleaned sign label.
#
# Both files are opened with context managers so they are closed even if the
# copy fails part-way through. The source is opened first so that a missing
# or unreadable input does not truncate the output file.
with h5py.File("/local/ecw/deepscribe-data/pfa/a_pfa.h5", "r") as original_archive:
    with h5py.File("/local/ecw/deepscribe-data/pfa/a_pfa_cleaned.h5", "w") as new_archive:
        for label in tqdm(
            original_archive.keys(), desc="Mapping readings to signs"
        ):
            # final_map is keyed by the archive's original (unstripped) labels,
            # so look the label up as-is. Stripping it first can raise KeyError
            # for a label that carries surrounding whitespace.
            sign_name = final_map[label]
            # Several original labels can map to the same sign, so reuse the
            # group when it already exists.
            group = new_archive.require_group(sign_name)
            source_group = original_archive[label]
            # assigning all images to the same group
            for img in source_group.keys():
                source_dset = source_group[img]
                new_dset = group.create_dataset(img, data=np.array(source_dset))

                # carry the per-image attributes over to the copied dataset
                for key, val in source_dset.attrs.items():
                    new_dset.attrs[key] = val

Mapping readings to signs: 100%|██████████| 278/278 [05:46<00:00,  1.25s/it]
