Import relevant stuff

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

from collections import Counter
import operator
import argparse
import json
import re

Instructions for running this as a regular .py from the command line (in case it will ever be necessary to combine scripts by using bash)

In [2]:
"""
From the command line:

   python add-normalizations.py -i [input csvs] -j [ogsl json] -a [atf file] -l [output log]

"""

'\nFrom the command line:\n\n   python add-normalizations.py -i [input csvs] -j [ogsl json] -a [atf file] -l [output log]\n\n'

Define default files and their paths. This script requires atf-file with normalized names; json that contain OGSL sign list; input files i.e. the filtered .csv files. Log file is used for storing mismatching names for debugging.

Also define sign and name dictionaries, define tags that are being fetched from the .csv. Generate also tags with [] in the beginning for later filtering purposes.

In [3]:
# Default files and paths
atf = '../Names_normalized/Ur 03 PD PNs.atf'
json_ = '../ogsl/ogsl-sl.json' 
input_ = ['../filtered_%i_with_neighbors.csv' % i for i in range(1,11)] 
log = 'add-normalizations.log'

# Sign and name normalization dicts
SIGNS = {}
NAMES = {}
TAGS = ('PN', 'DN', 'RN', 'GN', 'SN', 'TN') #, 'TN', 'WN', 'RN', 'MN', 'LN', 'EN', 'ON', 'KN')
BRTAGS = tuple(['[]%s' % t for t in TAGS])

Basic IO-operations.

In [4]:
def readfile(filename):
    with open(filename, 'r', encoding="utf-8") as data:
        if filename.endswith('.json'):
            return json.load(data)
        else:
            return data.read().splitlines()

def writefile(filename, text):
    with open(filename, 'w', encoding='utf-8') as data:
        data.write(text)
    print('wrote\t%s' % filename)
    

Define command line arguments.

In [5]:
def get_args():
    ap = argparse.ArgumentParser(description='Test')
    ap.add_argument('-j', '--json')
    ap.add_argument('-i', '--input', nargs='*')
    ap.add_argument('-a', '--atf')
    ap.add_argument('-l', '--log')
    return ap.parse_args()


Normalization functions.

    normalize() removes determinatives and does basic transliteration conversion operations.
    normalize_atf() changes pos-tags and signs that have different indices (e.g. ₓ -> ₁), fix tabs
    remove_tags() removes all name pos-tags

In [6]:
def normalize(string):
    string = re.sub('\{.+?\}', '', string.lower())
    string = string.replace('ṣ', 'sy')
    SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉šŋ", "0123456789cg")
    return string.translate(SUB)

def normalize_atf(string):
    string = string.replace('[]SN', '[]GN')
    string = string.replace('nigarX', 'nigar')
    string = re.sub('\t+', '\t', string)
    return string

def remove_tags(string):
    return re.sub('\[\][A-Z]N', '', string)

Map transliterated words into abstract sign values, for example lugal-uru-da and lugal-iri-da will become LUGAL^IRI^DA as IRI has readings uru and iri.

Sub-function hyphenate() normalizes all sign separators into dashes. This will help splitting transliterated words into individual signs.

In [7]:
def map_signs(xlit):
    """ Map all sign values in string to name representation, i.e.
    lugal-uru-da --> LUGAL^IRI^DA """
    def hyphenate(s):
        return re.sub('[\.:-]', '-', normalize(s))
    
    return '^'.join([SIGNS.get(sign, '$') for sign in hyphenate(xlit).split('-')])

This function calculates frequencies for different mismatches.

In [8]:
def make_log(mismatches):
    return ['%i\t%s' % (v, k) for k, v in sorted(Counter(mismatches).items(),
                                                 key=operator.itemgetter(1))]


ATF parser. Parsing includes following steps:

    1. define typical sumerian morphemes and their surface representations
    2. iterate ATF line by line and pass it throught normalize_atf()
    3. fetch transliteration and normalization
    4. use BRTAGS for matching only those names that contain wanted pos-tags
    5. remove tags from normalizations
    6. use map_signs() function to convert transliteration into abstract sign representation.
    7. generate three copies of each transliterated name:
        A) a copy with typical morphemes removed (lugal-dúr-ta --> lugal-dúr); morphemic endings are only removed
        if the the surface representations of morphemes are not found from the normalized name: e.g. PN-ke4 has its
        /ak+e/ = ke4 removed only if the normalization does not end in /ke/. This should ensure that this process
        does not produce ambiguity.
        B) a copy with typical morphemes added (lugal-dúr --> lugal-dúr-ta, lugal-dúr-ra etc.); this tries to ensure
        that the name dictionary will have different morphological forms for names and we do not have to do 
        morphological parsing in the CSV file where surface forms are not present (and we cannot know if e.g. -ta
        is a part of the name or not). This produces impossible morpheme combinations for names that already have
        case endings, but it should not matter as they will not match with anything; e.g. lugal-dúr-ta-ta.
        C) original transliteration (lugal-dúr)
        
        save these representations in NAMES dictionary

In [9]:
def parse_atf(filename):

    morphs = {'-ta': 'ta',
              '-ce3': 'še',
              '-ra': 'ra',
              '-ke4': 'ke',
              '-ar': 'ar'}

    for line in readfile(filename):
        line = normalize_atf(line)
        elems = line.split('\t')
        if len(elems) > 2:
            _, xlit, norm = elems[0:3]
            if norm.endswith(BRTAGS):
                """ Remove or add morphology """
                #norm = remove_tags(norm)
                for morph in morphs:
                    if xlit.endswith(morph) and not norm.endswith(morphs[morph]):
                        key = map_signs(xlit.rstrip(morph))
                        NAMES[key] = norm
                
                    key = map_signs(xlit + morph)
                    NAMES[key] = norm

                """ Add name as it is """
                key = map_signs(xlit)
                NAMES[key] = norm

JSON parser. Read OGSL sign list in JSON format to Python dict and normalize them to match the ATF file by using normalize() function.

In [10]:
def parse_json(filename):
    """ Build a sign dictionary based on JSON file by mapping
    normalized sign values to value(SIGN) notation """
    ogsl = readfile(filename)
    for sign in ogsl['signs'].keys():
        vals = ogsl['signs'][sign].get('values', None)
        if vals is not None:
            for v in vals:
                SIGNS[normalize(v)] = sign


CSV parser. CSV parsing includes following steps

    1. check if there are multiple input files or just one. If one, make sure it will not be a str().
    2. iterate all lines in every file
    3. define header for new CSV files
    4. split CSV by commas
    5. fetch all names that have predefined TAGS
    6. normalize transliteration by using normalize() and remove_tags()
    7. use map_signs() to convert transliteration into abstract sign value representation
    8. try to find corresponding abstract representation of the name from NAMES dictionary; if it cannot be found,
       return empty string
    9. append found normalizations into CSV between lemma and id_text
    10. write new CSV file (same path as the input, but mark them with _normalized suffix)

In the end, write a log file that contains all mismatches and their frequencies. Print a short summary of these.

In [11]:
def parse_csv(filename):
    """ Read all input CSV-files. Normalize all transliterated names """
    no = 0
    mismatches = []
    yes = 0
    
    if isinstance(filename, str):
        filename = [filename]

    for f in filename:
        #,Unnamed: 0,lemma,id_text,id_line,id_word,label,prof?,role?,family?,number?,commodity?,P_Number,neighbors
        csv_output = ['id,unnamed,lemma,normalizaton,id_text,id_line,id_word,label,prof?,role?,family?,number?,commodity?,P_Number,neighbors']
        for line in readfile(f)[1:]:
            norm = ''
            fields = line.split(',')
            id_ = fields[0]
            unnamed = fields[1]
            lemma = fields[2]
            rest = fields[3:]
            if lemma.endswith(TAGS):
                key = map_signs(normalize(remove_tags(lemma)))
                norm = NAMES.get(key, '')
                if not norm:
                    no += 1
                    mismatches.append(lemma)
                else:
                    yes += 1
            csv_output.append(','.join([id_, unnamed, lemma, norm] + rest))
        writefile(f.replace('with_neighbors', 'with_neighbors_normalized'), '\n'.join(csv_output))

    """ Print mismatches for debugging purposes """
    writefile(log, '\n'.join(make_log(mismatches)))

    print('mismatches %i | unique %i | matches %i | see %s for info' %\
              (no, len(set(mismatches)), yes, log))




Get command line arguments if given, else use predefined files and paths (apparently Jupyter does not like argparse, thus set commandline = True if there is need to use it)

In [12]:
commandline = False

if commandline:
    args = get_args()

    if args.log is not None:
        log = args.log

    if args.json is None:
        parse_json(json_)
    else:
        parse_json(args.json)

    if args.atf is None:
        parse_atf(atf)
    else:
        parse_atf(args.atf)

    if args.input is None:
        parse_csv(input_)
    else:
        parse_csv(args.input)
else:
    parse_json(json_)
    parse_atf(atf)
    parse_csv(input_)


wrote	../filtered_1_with_neighbors_normalized.csv
wrote	../filtered_2_with_neighbors_normalized.csv
wrote	../filtered_3_with_neighbors_normalized.csv
wrote	../filtered_4_with_neighbors_normalized.csv
wrote	../filtered_5_with_neighbors_normalized.csv
wrote	../filtered_6_with_neighbors_normalized.csv
wrote	../filtered_7_with_neighbors_normalized.csv
wrote	../filtered_8_with_neighbors_normalized.csv
wrote	../filtered_9_with_neighbors_normalized.csv
wrote	../filtered_10_with_neighbors_normalized.csv
wrote	add-normalizations.log
mismatches 1595 | unique 1067 | matches 69756 | see add-normalizations.log for info
