In [1]:
import pandas as pd
import re

In [2]:
with open('data/query_text_16_12_29-052645.txt', mode = 'r', encoding = 'utf8') as f:
    linelist = f.read().splitlines()

# Clean
first remove all flags etc., including brackets and half brackets


In [3]:
def clean(data, oldlist, new=""):
    for i in range(len(oldlist)):
        data = data.replace(oldlist[i],new)
    return data

In [4]:
remlist = ["`", "´", "/", "]", "[", "!", "?", "<", ">", "(", ")"]
lines = [clean(line, remlist) for line in linelist]

# Remove Editorial Lines and Headers
Header lines begin with a digit. Editorial remarks begin with a line indicator (like data lines), followed by one or more TABs, followed by #.

In [5]:
lines = [line for line in lines if not line[0].isdigit()]
lines = [line for line in lines if not "\t#" in line]

# Extract Names
In BDTNS names start with a capital or with {d}. The list comprehension iterates through the list of lines. It iterates through each line by splitting the line into words, testing whether the word begins with a capital or with {d}. The result is a list.

In [6]:
names = [word for line in lines for word in line.split() if word[0].isupper() or word.startswith('{d}')]
len(names)

891214

# Remove All Upper Case
Remove words that are entirely in upper case (for instance LU2.SU)

In [7]:
names =[word for word in names if not word.isupper()]
len(names)

838636

# Remove Place Names
Place names usually end with {ki}, or contain {ki} as in Eridu{ki}-ga ('in Eridu'). All words containing{ki} are removed - this might include a few PNs.

In [8]:
#names = [word for word in names if not word.endswith('{ki}')]
#names = [word for word in names if not '{ki}' in word]
#len(names)

# Remove Incomplete Entries
Remove names that have ellipsis (damage) or illegible signs.

In [9]:
names = [word for word in names if not '...' in word]
names = [word for word in names if not '-x' in word.lower() 
         and not 'x-' in word.lower() 
         and not '.x' in word.lower() 
         and not 'x.' in word.lower()
        and not '}x' in word.lower()
        and not 'x{' in word.lower()]
len(names)

809322

# Remove duplicates
Reduce the list to a unique set.

In [10]:
total_names = len(names)
#keep the total number of names for statistics
names = set(names)
len(names)

30146

# Sort Alphabetically

In [11]:
names = sorted(names)
names

['A-AH-{d}Utu',
 'A-AMA-a2-a',
 'A-AN-ba-az',
 'A-Ab-ba-ge-na-ta',
 'A-Ad-da',
 'A-Ad-da-kal-la',
 'A-Ad-da-mu',
 'A-An-na-hi-li-bi',
 'A-CIR-da',
 'A-DU-ba-bi',
 'A-DU-gam-ma',
 'A-DU-la-URU',
 'A-DU.DU-ce3',
 'A-DU.DU-e',
 'A-DU.DU-ta',
 'A-GU4-na-mu',
 'A-KA-a',
 'A-KU-um',
 'A-KU.KU-lum',
 'A-KU.KU-ta',
 'A-LUM-ma',
 'A-NE-i3-zi',
 'A-NE-ku-bi',
 'A-NI-ma',
 'A-NI-sig5',
 'A-NI-ta',
 'A-PA4-u2-a',
 'A-TU5-ce3',
 'A-U.E2-nu-tuku',
 'A-a',
 'A-a-',
 'A-a-Kal-la',
 'A-a-NI',
 'A-a-UN-e-ba-ab-du7',
 'A-a-a',
 'A-a-ab-ba',
 'A-a-ar',
 'A-a-ba',
 'A-a-ba-ni',
 'A-a-ba-ta',
 'A-a-bad3',
 'A-a-bad3-da-ri2-a',
 'A-a-bar-ra',
 'A-a-bi-ta',
 'A-a-bi2-du10',
 'A-a-ca3-mu',
 'A-a-ce3',
 'A-a-dingir',
 'A-a-dingir-mu',
 'A-a-dingir-mu-ce3',
 'A-a-dingir-mu-ta',
 'A-a-dingir-ta',
 'A-a-du10-ga',
 'A-a-e',
 'A-a-ga',
 'A-a-ga-mu',
 'A-a-ga-ta',
 'A-a-ge-na',
 'A-a-ge-na-ta',
 'A-a-gi-na',
 'A-a-gir15',
 'A-a-gir15{ir}',
 'A-a-gu-ni',
 'A-a-ha-ma-ti',
 'A-a-i3-li2',
 'A-a-i3-li2-cu',
 'A-a-i3-li2-s

# Create DataFrame
The DataFrame has two columns: column 1 has the original transliteration (as in BDTNS); column 2 starts out with the same data, but this data is transformed into ORACC compatible lemmatization

In [12]:
df = pd.DataFrame(names)

In [13]:
df.columns = ['Transliteration']

In [14]:
df['Normalized'] = df['Transliteration']

# Shin, Emphatic T, and Emphatic S
Replace c by š, C by Š, ty by ṭ, etc.

In [15]:
signs = {'c': 'š',
        'C': 'Š',
        'ty': 'ṭ',
        'TY': 'Ṭ',
        'sy': 'ṣ',
        'SY': 'Ṣ'}
for key in signs:
    df['Normalized'] = [word.replace(key, signs[key]) for word in df['Normalized']]

# Sign Reading Substitution
The conventions for reading signs in BDTNS differs from ORACC: BDTNS does not distinguish between G and nasal G (ŋ), and BDTNS uses short readings (`ku3`) where ORACC uses long readings (`kug`).

The following function (`signreplace()`) replaces a BDTNS reading with an ORACC reading. The regular expression uses `\\b` (before and after the sign) to indicate word boundaries, so that replacing `sag` by `saŋ` does not find `sag2` etc. Word boundaries (as defined by the `regex` module) include `-`, `.`, `{`, and `}`.

Since names are capitalized (as in `Dingir-nu-me-a`) each sign-replacement is run twice: once in lower case (`dingir`, replaced by `diŋir`) and once capitalized (`Dingir`, replaced by `Diŋir`).  

In [16]:
def signreplace(old, new, data):
    old_cap = old.capitalize()
    new_cap = new.capitalize()
    data = re.sub('\\b'+old_cap+'\\b', new_cap, data)
    data = re.sub('\\b'+old+'\\b', new, data)
    return data

# Dictionary of signs with canonical ORACC reading
Preliminary list of "short" vs. "long" sign readings (`du11` vs. `dug4`) and sign readings with nasal G (ŋ). The list does *not* include `mu` : `ŋu10`, because that is valid *only* at the end of a word. It is necessary to first remove morphological suffixes such as -ta, -še3, etc., which happens in the next phase.

In [17]:
bdtns_oracc = {'ag2': 'aŋ2',
               'balag': 'balaŋ', 
               'dagal': 'daŋal', 
               'dingir': 'diŋir',
               'eridu': 'eridug',
               'ga2': 'ŋa2', 
               'gar': 'ŋar',
               'geštin': 'ŋeštin',
               'gir2': 'ŋir2',
               'gir3': 'ŋiri3',
               'giri3': 'ŋiri3',
               'giš': 'ŋeš',
               'giškim': 'ŋiškim',
               'gišnimbar' : 'ŋešnimbar',
               'hun': 'huŋ',
               'kin': 'kiŋ2', 
               'nig2': 'niŋ2',
               'nigin': 'niŋin',
               'nigin2': 'niŋin2',
               'pisan': 'bisaŋ',
               'pirig': 'piriŋ',
               'sag': 'saŋ',
               'sanga': 'saŋŋa',
               'šeg3': 'šeŋ3', 
               'šeg6': 'šeŋ6',
               'umbisag': 'umbisaŋ',
               'uri2': 'urim2',
               'uri5': 'urim5',
               'uru': 'iri',
               
               'bara2' : 'barag',
               'du10' : 'dug3',
               'du11' : 'dug4',
               'gu4' : 'gud',
               'kala' : 'kalag',
               'ku3' : 'kug',
               'ku5': 'kud',
              # 'lu5' : 'lul',
               'sa6' : 'sag9',
               'ša6' : 'sag9',
               'za3' : 'zag'
           } 

In [18]:
for key in bdtns_oracc:
    df['Normalized'] = [signreplace(key, bdtns_oracc[key], word) for word in df['Normalized']]
#normalized = [signreplace(key, bdtns_oracc[key], word) for key in bdtns_oracc for word in df['Normalized']]


# Capitalize god names
God names (as part of personal names) are not consistently capitalized (as in `{d}utu-ki-ag2`). The first character after `{d}` must be a capital. The `regex` for doing so was found [here](http://stackoverflow.com/questions/8934477/making-letters-uppercase-using-re-sub-in-python).

In [19]:
df['Normalized'] = [re.sub('{d}([a-zšŋ])', lambda match: '{d}'+'{}'.format(match.group(1).upper()), word) 
                    for word in df['Normalized']]

In [20]:
df

Unnamed: 0,Transliteration,Normalized
0,A-AH-{d}Utu,A-AH-{d}Utu
1,A-AMA-a2-a,A-AMA-a2-a
2,A-AN-ba-az,A-AN-ba-az
3,A-Ab-ba-ge-na-ta,A-Ab-ba-ge-na-ta
4,A-Ad-da,A-Ad-da
5,A-Ad-da-kal-la,A-Ad-da-kal-la
6,A-Ad-da-mu,A-Ad-da-mu
7,A-An-na-hi-li-bi,A-An-na-hi-li-bi
8,A-CIR-da,A-ŠIR-da
9,A-DU-ba-bi,A-DU-ba-bi


# Remove Morphology
The following lines remove morphology that can (almost) unambigously be identified, namely `-ta` (ablative); `ke4` (genitive + ergative) and `-še3`. The genitive element of `-ke4` (`-k`) is kept when it immediately follows a vowel because in such cases it usually belongs to the name. Note that `-ra` (dative) is ambiguous, since it may be part of a name ending in /r/ like `Šul-gi-ra` (in the genitive). After removing these morphemes, word-final `-mu` is replaced by `-ŋu10`.

In [21]:
df['Normalized'] = [word[:-3] if word.endswith('-ta') else word for word in df['Normalized']]
df['Normalized'] = [word[:-4] if word.endswith('-še3') else word for word in df['Normalized']]
df['Normalized'] = [word[:-2] if word.endswith('a|e|i|u' + '-ke4') else word for word in df['Normalized']]
df['Normalized'] = [word[:-4] if word.endswith('-ke4') else word for word in df['Normalized']]
df['Normalized'] = [word[:-2]+'ŋu10' if word.endswith('-mu') else word for word in df['Normalized']]

df[10000:10050]

Unnamed: 0,Transliteration,Normalized
10000,Ha-ba-lu5-ge2,Ha-ba-lu5-ge2
10001,Ha-ba-lu5-ge2-ka-ce3,Ha-ba-lu5-ge2-ka
10002,Ha-ba-lu5-ge2-me-ec2,Ha-ba-lu5-ge2-me-eš2
10003,Ha-ba-lu5-ge2-ra,Ha-ba-lu5-ge2-ra
10004,Ha-ba-lu5-ge2-ta,Ha-ba-lu5-ge2
10005,Ha-ba-lu5-la,Ha-ba-lu5-la
10006,Ha-ba-lum,Ha-ba-lum
10007,Ha-ba-mu,Ha-ba-ŋu10
10008,Ha-ba-sa6,Ha-ba-sag9
10009,Ha-ba-sa6-ga,Ha-ba-sag9-ga


# Names with Genitive -k
Names such as `Nin-ŋir2-su` contain a (hidden) genitive morpheme `-(a)k` that only appears when followed by a vowel, as in `Nin-ŋir2-su-ke4`. In the preceding `Nin-ŋir2-su-ke4` has been shortened to `Nin-ŋir2-su-k` (removing the ergative morpheme). The presence of `Nin-ŋir2-su-k` in the list proves that `Nin-ŋir2-su` has a hidden geneitive and should be normalized `Ninŋirsuk`. The following cell tests for the existence of such instances, if yes, the `-k` is added to the normalized form of the name.

In [22]:
df['Normalized'] = [word + '-k' if word + '-k' in df.Normalized.values else word for word in df['Normalized']]

df[10000:10050]

Unnamed: 0,Transliteration,Normalized
10000,Ha-ba-lu5-ge2,Ha-ba-lu5-ge2
10001,Ha-ba-lu5-ge2-ka-ce3,Ha-ba-lu5-ge2-ka
10002,Ha-ba-lu5-ge2-me-ec2,Ha-ba-lu5-ge2-me-eš2
10003,Ha-ba-lu5-ge2-ra,Ha-ba-lu5-ge2-ra
10004,Ha-ba-lu5-ge2-ta,Ha-ba-lu5-ge2
10005,Ha-ba-lu5-la,Ha-ba-lu5-la
10006,Ha-ba-lum,Ha-ba-lum
10007,Ha-ba-mu,Ha-ba-ŋu10
10008,Ha-ba-sa6,Ha-ba-sag9
10009,Ha-ba-sa6-ga,Ha-ba-sag9-ga


# Replace a-a with aya
Replace a-a and A-a with aya and Aya, but only between word boundaries. For this we can use the function `signreplace()` defined above.


In [23]:
df['Normalized'] = [signreplace('a-a', 'aya', word) for word in df['Normalized']]
df[:100]

Unnamed: 0,Transliteration,Normalized
0,A-AH-{d}Utu,A-AH-{d}Utu
1,A-AMA-a2-a,A-AMA-a2-a
2,A-AN-ba-az,A-AN-ba-az
3,A-Ab-ba-ge-na-ta,A-Ab-ba-ge-na
4,A-Ad-da,A-Ad-da
5,A-Ad-da-kal-la,A-Ad-da-kal-la
6,A-Ad-da-mu,A-Ad-da-ŋu10
7,A-An-na-hi-li-bi,A-An-na-hi-li-bi
8,A-CIR-da,A-ŠIR-da
9,A-DU-ba-bi,A-DU-ba-bi


# Remove dashes and sign index numbers
In order to produce a normalized form of the name, sign separators (dashes) and sign index numbers are removed. In first instance this is done *only* if there are no uppercase characters further on in the name (uppercase may indicate a logogram). Secondarily we will consider instances where the uppercase letter is dues to a god name (as in `A-bu-um-{d}Dumu-zi`)

In [24]:
remove = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ':']
df['Normalized'] = [clean(word, remove) if word[1:].islower() 
                    or (word.startswith('{d}') and word[4:].islower()) else word for word in df['Normalized']]


# Theophoric Names
Theophoric names, if the god name is not at the beginning of the word, will have a capital in the middle of the word (as in `A-ba-{d}Dumu-zi-gen7`) and are thus ignored by the previous cell (no removal of dashes etc.). The following cell tests for the presence of `{d}` in the middle of the word (not at position 0) and splits that name into two. Each half may start with a capital, but should be lower case otherwise. If that is the case the dashes and index numbers are removed.

In [25]:
df['Normalized'] = [clean(word, remove) if ('{d}' in word[1:] 
              and word.split('{d}')[0][1:].islower() and word.split('{d}')[1][1:].islower()) else word 
              for word in df['Normalized']]
df[df['Normalized'].str.contains('{d}Dumu')]

Unnamed: 0,Transliteration,Normalized
344,A-ba-{d}Dumu-zi-gen7,Aba{d}Dumuzigen
517,A-bu-um-{d}Dumu-zi,Abuum{d}Dumuzi
3019,Arad2-{d}Dumu-zi,Arad{d}Dumuzi
5512,Cu-{d}Dumu-zi,Šu{d}Dumuzi
9087,Geme2-{d}Dumu-zi,Geme{d}Dumuzi
9088,Geme2-{d}Dumu-zi-da,Geme{d}Dumuzida
9089,Geme2-{d}Dumu-zi-de2,Geme{d}Dumuzide
9090,Geme2-{d}Dumu-zi-me,Geme{d}Dumuzime
9237,Geme2{d}Dumu-zi,Geme{d}Dumuzi
12349,Inim-{d}Dumu-zi,Inim{d}Dumuzi


# Theophoric Names with {d} twice
Names of the pattern `{d}Amar-{d}Suen` are still not normalized, because of the capital in position 3 (in `Amar`).

In [26]:
df['Normalized'] = [clean(word, remove) if ('{d}' in word[1:] and word.startswith('{d}')
              and word.split('{d}')[1][1:].islower() and word.split('{d}')[2][1:].islower()) else word 
              for word in df['Normalized']]
df[df['Transliteration'].str.contains('Suen')]

Unnamed: 0,Transliteration,Normalized
459,A-bi2-{d}Suen,Abi{d}Suen
1029,A-li2-id-{d}Suen,Aliid{d}Suen
1080,A-ma-an-{d}Suen,Amaan{d}Suen
1132,A-mur-{d}Suen,Amur{d}Suen
1133,A-mur-{d}Suen-ce3,Amur{d}Suen
1182,A-na-{d}Suen-tak2-la-ku,Ana{d}Suentaklaku
1294,A-ra-zu-{d}I-bi2-{d}Suen-ka-ce3-pa3-da,Arazu{d}Ibi{d}Suenkašepada
1295,A-ra-zu-{d}I-bi2-{d}Suen-na-pa3-da,Arazu{d}Ibi{d}Suennapada
1826,AK-{d}Suen,AK-{d}Suen
2647,Amar-{d}Suen,Amar{d}Suen


# Theophoric Names without {d}
God names such as `I-šum` and `E2-a` (and others?) are usually not preceded by `{d}`. We can submit those to a similar test, splitting the name at a dash when followed by `I-šum` and `E2-a`, followed by a word boundary `\\b` (using a positive lookahead regex). If the two halves of the split are both lower case after the initial character, remove dashes and index numbers. If other gods are found that are usually not preceded by `{d}` they can be added to the list `gods_no_d`.

In [27]:
gods_no_d = ['I-šum', 'E2-a', 'Er3-ra', 'A-šur5']
for god in gods_no_d:
    df['Normalized'] = [clean(word, remove) if (god in word[2:] 
              and re.split('-(?=' + god + '\\b)', word)[0][1:].islower() 
              and re.split('-(?=' + god + '\\b)', word)[1][1:].islower()) else word 
              for word in df['Normalized']]
df[df['Transliteration'].str.contains('E2-a')]

Unnamed: 0,Transliteration,Normalized
1116,A-mur-E2-a,AmurEa
1117,A-mur-E2-a-ta,AmurEa
1142,A-na-ab-E2-a,AnaabEa
1146,A-na-at-E2-a,AnaatEa
1460,A-zar3-E2-a,AzarEa
2029,Ab-ba-E2-a,AbbaEa
4337,Ca-at-E2-a,ŠaatEa
5031,Cu-E2-a,ŠuEa
5032,Cu-E2-a-ta,ŠuEa
5715,DE2-a,DE2-a


# Replace uu by u, etc.
In the normalization replace double vowels and double consonants with single ones (replace `Abuum` with `Abum` and `Abasagga` with `Abasaga`), but not at the beginning or the end of a word.


In [28]:
df['Normalized'] = [re.sub('\\B([a-z])\\1\\B', '\\1', word) for word in df['Normalized']]

# Alef between Vowels
Put alef between lowercase vowels. Note: for some reason the Alef is represented as an Ayin (`ʾ`) in the code cell. It does. however, correctly produce an Alef (`ʿ`) in the output.

In [29]:
vowel_combis = {"ae":"aʾe",
          "ai": "aʾi",
          "au": "aʾu",
          "ea": "eʾa",
          "ei": "eʾi",
          "eu": "eʾu",
          "ia": "iʾa",
          "ie": "iʾe",
          "iu": "iʾu",
          "ua": "uʾa",
          "ue": "uʾe",
          "ui": "uʾi"
         }



In [30]:
for key in vowel_combis:
    df['Normalized'] = [word.replace(key, vowel_combis[key]) for word in df['Normalized']]
df

Unnamed: 0,Transliteration,Normalized
0,A-AH-{d}Utu,A-AH-{d}Utu
1,A-AMA-a2-a,A-AMA-a2-a
2,A-AN-ba-az,A-AN-ba-az
3,A-Ab-ba-ge-na-ta,A-Ab-ba-ge-na
4,A-Ad-da,A-Ad-da
5,A-Ad-da-kal-la,A-Ad-da-kal-la
6,A-Ad-da-mu,A-Ad-da-ŋu10
7,A-An-na-hi-li-bi,A-An-na-hi-li-bi
8,A-CIR-da,A-ŠIR-da
9,A-DU-ba-bi,A-DU-ba-bi


# Assign Proper Noun Classes
Proper noun classes include:
    - RN       Royal Name
    - DN       Divine Name
    - PN       Personal Name
    - SN       Settlement Name
    - GN       Geographical Name (larger Geographical units such as states)
    - TN       Temple Name
    - ON       Object Name (such as divine vessels and chariots)
    - FN       Field Name

The class is indicated after square brackets after the name (as in `Utu[]DN`).

There are only 5 royal names: `UrNammak`, `Šulgir`, `AmarSuen`, `ŠuSuen`, and `IbbiSuen`.

Settlement names are followed by the determinative {ki}.

Divine names are preceded by the determinative {d}.

The rest are considered to be Personal Names. Field names, Temple names, and Geographical names etc. can usually not  be recognized unambiguously.

The first line of the code in the cell below add `[]PN` to each entry in the Normalization, turning every entry into a Personal Name. Subsequent lines replace `PN` with `SN`, `DN`, or `RN` where appropriate.

A certain amount of error is unavoidable. Personal names are often preceded by `{d}` because they contain a divine name. Similarly, god names may contain place names (as in `{d}Nin-Urim5{ki}`: Lady of Ur).

In [31]:
df['Normalized'] = [word+'[]PN' for word in df['Normalized']]
# Settlement names
df['Normalized'] = [word[:-2]+'SN' if '{ki}' in word else word for word in df['Normalized']]

In [32]:
# Divine names
df['Normalized'] = [word[:-2]+'DN' if word.startswith('{d}') and not '{d}' in word[4:] 
                    else word for word in df['Normalized']]

In [33]:
# Royal names
Shulgi = ['Šulgira', 'Šulgi', '{d}Šulgira', '{d}Šulgi']
AmarSuen = ['Amar{d}Suʾenra', 'Amar{d}Suʾen', 'Amar{d}Suʾenka', '{d}Amar{d}Suʾenra', 
            '{d}Amar{d}Suʾen', '{d}Amar{d}Suʾenka']
ShuSuen = ['Šu{d}Suʾen', 'Šu{d}Suʾenra', 'Šu{d}Suʾenka',
          '{d}Šu{d}Suʾen', '{d}Šu{d}Suʾenra', '{d}Šu{d}Suʾenka']
IbbiSuen = ['Ibi{d}Suʾen', 'Ibi{d}Suʾenra', 'Ibi{d}Suʾenka',
          '{d}Ibi{d}Suʾen', '{d}Ibi{d}Suʾenra', '{d}Ibi{d}Suʾenka']
df['Normalized'] = ['Šulgir[]RN' if word[:-4] in Shulgi else word for word in df['Normalized']]
df['Normalized'] = ['AmarSuʾen[]RN' if word[:-4] in AmarSuen else word for word in df['Normalized']]
df['Normalized'] = ['ŠuSuʾen[]RN' if word[:-4] in ShuSuen else word for word in df['Normalized']]
df['Normalized'] = ['IbbiSuʾen[]RN' if word[:-4] in IbbiSuen else word for word in df['Normalized']]
df['Normalized'] = ['UrNammak[]RN' if 'Ur{d}Nama' in word else word for word in df['Normalized']]

In [34]:
df[df['Normalized'].str.contains('[]RN', regex=False)]

Unnamed: 0,Transliteration,Normalized
2647,Amar-{d}Suen,AmarSuʾen[]RN
2651,Amar-{d}Suen-ka,AmarSuʾen[]RN
2652,Amar-{d}Suen-ka-ce3,AmarSuʾen[]RN
2653,Amar-{d}Suen-ke4,AmarSuʾen[]RN
2657,Amar-{d}Suen-ra,AmarSuʾen[]RN
5623,Cu-{d}Suen,ŠuSuʾen[]RN
5625,Cu-{d}Suen-ce3,ŠuSuʾen[]RN
5627,Cu-{d}Suen-ka,ŠuSuʾen[]RN
5628,Cu-{d}Suen-ke4,ŠuSuʾen[]RN
5630,Cu-{d}Suen-ra,ŠuSuʾen[]RN


# Remove Determinatives
Remove determinatives, but only from those names that have been normalized (do not contain `-` or `.` anymore).

In [35]:
df['Normalized'] = [re.sub('{.+?}', '', word) if not '-' in word and not '.' in word 
                    else word for word in df['Normalized']]
df

Unnamed: 0,Transliteration,Normalized
0,A-AH-{d}Utu,A-AH-{d}Utu[]PN
1,A-AMA-a2-a,A-AMA-a2-a[]PN
2,A-AN-ba-az,A-AN-ba-az[]PN
3,A-Ab-ba-ge-na-ta,A-Ab-ba-ge-na[]PN
4,A-Ad-da,A-Ad-da[]PN
5,A-Ad-da-kal-la,A-Ad-da-kal-la[]PN
6,A-Ad-da-mu,A-Ad-da-ŋu10[]PN
7,A-An-na-hi-li-bi,A-An-na-hi-li-bi[]PN
8,A-CIR-da,A-ŠIR-da[]PN
9,A-DU-ba-bi,A-DU-ba-bi[]PN


# Replace numbers by Index numbers
In names that could not be normalized automatically, replace numbers by index numbers and dashes (`-`) by dots (`.`).

In [36]:
numbers_index = {'0':'₀',
               '1': '₁',
               '2':'₂',
               '3':'₃',
               '4':'₄',
               '5':'₅',
               '6':'₆',
               '7':'₇',
               '8':'₈',
               '9':'₉',
                '-': '.'}

In [37]:
for key in numbers_index:
    df['Normalized'] = [word.replace(key, numbers_index[key]) for word in df['Normalized']]
df

Unnamed: 0,Transliteration,Normalized
0,A-AH-{d}Utu,A.AH.{d}Utu[]PN
1,A-AMA-a2-a,A.AMA.a₂.a[]PN
2,A-AN-ba-az,A.AN.ba.az[]PN
3,A-Ab-ba-ge-na-ta,A.Ab.ba.ge.na[]PN
4,A-Ad-da,A.Ad.da[]PN
5,A-Ad-da-kal-la,A.Ad.da.kal.la[]PN
6,A-Ad-da-mu,A.Ad.da.ŋu₁₀[]PN
7,A-An-na-hi-li-bi,A.An.na.hi.li.bi[]PN
8,A-CIR-da,A.ŠIR.da[]PN
9,A-DU-ba-bi,A.DU.ba.bi[]PN


# Statistics
In the current set, how many name forms and how many names? How many name forms could not be normalized?

In [38]:
not_norm_df = df[df['Normalized'].str.contains('.', regex=False)]
norm_df = df[~df['Normalized'].str.contains('.', regex=False)]
not_normalized = len(not_norm_df)
norm = len(norm_df)
norm_set = len(set(norm_df['Normalized']))
names_forms = len(df)
print('Name Instances ' + str(total_names))
print('Name forms: ' + str(names_forms))
print('Name forms Normalized: ' + str(norm) + "; representing " + str(norm_set) + " different names.")
print('Name forms not normalized: ' + str(not_normalized))

Name Instances 809322
Name forms: 30146
Name forms Normalized: 23224; representing 17121 different names.
Name forms not normalized: 6922


# Utamišaram
The Ur III name `Utamišaram` appears in many different spellings and name forms. How did our script do with him?


In [40]:
mica = df[df['Transliteration'].str.contains('mi-ca')]
Utamicaram = mica[mica['Transliteration'].str[0]=='U']
Utamicaram

Unnamed: 0,Transliteration,Normalized
22313,U2-da-mi-ca-ra-am,Udamišaram[]PN
22412,U2-ta-mi-car-ra-am,Utamišaram[]PN
22413,U2-ta-mi-car-ra-am-ta,Utamišaram[]PN
22414,U2-ta-mi-car-ru-um,Utamišarum[]PN
22415,U2-ta1-mi-car-ra-am-ta,Utamišaram[]PN
22422,U2-ta2-mi-ca-ra-am,Utamišaram[]PN
22423,U2-ta2-mi-car-MI-ra-am,U₂.ta₂.mi.šar.MI.ra.am[]PN
22424,U2-ta2-mi-car-am,Utamišaram[]PN
22425,U2-ta2-mi-car-am-ta,Utamišaram[]PN
22426,U2-ta2-mi-car-ra-am,Utamišaram[]PN


Turns out that 10 different spellings/forms are correctly identified with `Utamišaram`; others are normalized as `Udamišaram` or `Utamišarum`, or, in one case `U₂.ta₂.mi.šar.MI.ra.am` (apparently an ancient spelling mistake). Such entries need to be corrected by hand; further research shows that in one case `U2-ta-mi-car-um-ta` is a modern mistake (= `Utamišaram`); the other cases of `Utamišarum` are correctly transliterated and represent a variant form of the same name refering to the same person (an official in the Drehem administration) as `Utamišaram`.

# Save to File
Note that the data is encoded in `utf-16` (rather than `utf-8`) and the separator is a `TAB` (instead of comma). This way Excel can open the file without problem. However, if another program (such as `emacs`) is used for editing it is better to use `utf-8` and perhaps a different separator.

In [44]:
df.to_csv('output/UrIII-Name_au.csv', sep = '\t', encoding='utf-16')