In [27]:
# Import Libraries
import re
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from collections import OrderedDict, Counter
import pandas as pd

# Extract Transactions

In [28]:
# Files listing the ids of the documents to pull out of the raw data.
DREHEM_IDS = 'drehem_annotate_ids.txt'
QUEEN_ARCHIVES_IDS = 'queen_archives_pids.txt'
QUEEN_OIP_IDS = 'oip_pids.txt'

# Label name -> substrings whose presence in a line assigns that label.
# Matching is plain substring containment, so singular stems also hit plurals.
labels = {
    "domesticated_animal": ["ox", "cow", "sheep", "goat", "lamb"],
    # "mountain" is listed to catch "mountain <animal>" glosses.
    "wild_animal": ["bear", "gazelle", "mountain"],
    # Looked for before the domesticated / wild animal terms.
    "dead_animal": ["[die]"],
    "leather_object": ["boots", "sandals"],
    "precious_object": ["copper", "bronze", "silver", "gold"],
    "wool": ["wool"],
    # No keywords are defined for this label.
    "queens_archive": [],
}

class Transaction:
    """Text of a single transaction, identified by ``p_id``.

    ``lines`` is filled by whoever builds the object; ``lemmas`` and ``label``
    are filled by ``get_lemmatization`` and ``set_label`` respectively.
    """

    # Order in which label categories are tried against each line.
    LABEL_PRIORITY = (
        "dead_animal",
        "wild_animal",
        "domesticated_animal",
        "leather_object",
        "precious_object",
        "wool",
    )

    def __init__(self, p_id):
        self.p_id = p_id
        self.lines = list()
        self.lemmas = OrderedDict()  # Maps a text line to the "#lem" line that follows it
        self.label = {}  # Maps label name to the list of text that triggered it

    def get_lemmatization(self):
        """Pair each numbered text line with the ``#lem`` line directly after it.

        Scanning starts at the first line beginning with ``"1."`` (or at the
        top when there is none). A pair is recorded when a non-empty line
        starts with a numeric character and the next line starts with
        ``"#lem"``; both lines are then skipped.

        Returns:
            The ``self.lemmas`` ordered mapping (text line -> lemma line).
            Because the text line is the key, a repeated identical text line
            overwrites the earlier entry.
        """
        lines = self.lines
        idx = next((i for i, text in enumerate(lines) if text.startswith("1.")), 0)
        last = len(lines) - 1  # the final line has no successor to pair with
        while idx < last:
            text, following = lines[idx], lines[idx + 1]
            if text and text[0].isnumeric() and following.startswith("#lem"):
                self.lemmas[text] = following
                idx += 2
            else:
                idx += 1
        return self.lemmas

    def set_label(self, label_terms=None):
        """Label the transaction from the first line that contains any keyword.

        Lines are visited in order. For each line the categories in
        ``LABEL_PRIORITY`` are tried in turn, and the first category with a
        keyword contained in the line wins; scanning then stops.

        NOTE(review): priority is therefore applied per line, not across the
        whole transaction -- a "[die]" on a later line does not override an
        animal term on an earlier one. Confirm this is the intended rule.

        Args:
            label_terms: Optional mapping of label name -> list of substrings.
                Defaults to the module-level ``labels`` table. Categories
                missing from the mapping are treated as having no keywords.

        Returns:
            ``{label: [matching_line]}``, or ``{"Unknown": [self.lines]}``
            when no line matches. The same dict is stored in ``self.label``.
        """
        if label_terms is None:
            label_terms = labels

        found = {}
        for line in self.lines:
            matched = next(
                (
                    name
                    for name in self.LABEL_PRIORITY
                    if any(term in line for term in label_terms.get(name, ()))
                ),
                None,
            )
            if matched is not None:
                found[matched] = [line]
                break

        # If none match, label as "Unknown" and keep the full text for review.
        if not found:
            found["Unknown"] = [self.lines]
        self.label = found
        return found
            
    
# Read ORACC files to find transactions with p_ids in `ids`
def read_files(subdir, ids, num_files=15):
    """Collect the transactions whose ids are listed in ``ids``.

    Reads ``<subdir>p001.atf`` .. ``<subdir>pNNN.atf`` in order. A line
    starting with ``&P`` opens a new document; its id is the first
    whitespace-separated token without the leading ``&``. Lines following a
    wanted header are appended (stripped) to that transaction's ``lines``
    until the next ``&P`` header or the end of the file.

    Args:
        subdir: Path prefix that is concatenated directly with the file name,
            so it must end with a path separator.
        ids: Iterable of ids to extract. It is NOT modified (it used to be
            emptied in place, which broke re-running the calling cell). Each
            occurrence of an id in ``ids`` matches at most one header.
        num_files: Number of sequentially numbered files to read (default 15).

    Returns:
        List of ``Transaction`` objects in the order they were encountered.
    """
    # Private tally of how many times each id may still be matched; Counter
    # returns 0 for unknown ids without inserting them.
    remaining = Counter(ids)
    transactions = list()

    for i in range(1, num_files + 1):
        file_name = "{}p{:03d}.atf".format(subdir, i)
        curr_transaction = None

        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if not line.startswith('&P'):
                    # Body line: keep it only while inside a wanted document.
                    if curr_transaction:
                        curr_transaction.lines.append(line)
                    continue

                # Any header closes the document that was being collected.
                if curr_transaction:
                    transactions.append(curr_transaction)

                p_id = line.split()[0][1:]
                if remaining[p_id] > 0:
                    remaining[p_id] -= 1
                    curr_transaction = Transaction(p_id)
                else:
                    curr_transaction = None

        # Flush a document that ran to the end of the file.
        if curr_transaction:
            transactions.append(curr_transaction)

    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    """Read ``file`` (UTF-8) and return its lines, each stripped of surrounding whitespace.

    Every line is kept, including lines that are empty after stripping.
    """
    with open(file, encoding="utf8") as handle:
        return [raw.strip() for raw in handle]

In [29]:
# Load the three id lists, one per id file.
list_drehem_ids, list_queen_ids, list_oip_queen_ids = (
    get_drehem_ids(id_file)
    for id_file in (DREHEM_IDS, QUEEN_ARCHIVES_IDS, QUEEN_OIP_IDS)
)
complete_list = [*list_drehem_ids, *list_queen_ids, *list_oip_queen_ids]

# Extract transactions for: every listed id, the queen's-archive ids, and the
# OIP ids -- read in that order.
all_transactions, queen_training_list, queen_test_set = [
    read_files("raw-data/", id_list)
    for id_list in (complete_list, list_queen_ids, list_oip_queen_ids)
]

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 646


In [30]:
# `transactions` was not assigned by any cell above, so this cell raised a
# NameError on a fresh kernel. Bind it explicitly.
# NOTE(review): assumes the full set (`all_transactions`) is what should be
# annotated -- confirm it should not be restricted to the Drehem ids only.
transactions = all_transactions

s = pd.Series(list())  # kept so the name `s` stays defined; not used in this cell
file_name = "annotations.csv"

# Write each transaction id followed by its (text line, "#lem" line) pairs.
# Opening with 'w' truncates the file, so the empty value-counts CSV that was
# previously written to the same path just before this was redundant.
with open(file_name, 'w', encoding="utf8") as file:
    for t in transactions:
        file.write(t.p_id + "\n")
        for k, v in t.get_lemmatization().items():
            file.write(k + "\n")
            file.write(v + "\n")

# (id, label dict) for every transaction; set_label also stores the label on t.
lst = [(t.p_id, t.set_label()) for t in transactions]


# Look at total unknowns

In [31]:
# Print every (id, label) pair and keep the ones labelled "Unknown".
unknowns = []
for item in lst:
    p_id, found = item
    print(p_id, found)
    print("\n")
    if "Unknown" in found:
        unknowns.append(item)

P100041 {'domesticated_animal': ['#lem: n; udu[sheep]']}


P100189 {'domesticated_animal': ['#lem: n; udu[sheep]; niga[fattened]']}


P100190 {'domesticated_animal': ['#lem: n; sila[lamb]']}


P100191 {'domesticated_animal': ['#lem: n; mašgal[goat]; niga[fattened]; n; udu[sheep]']}


P100211 {'domesticated_animal': ['#lem: n; udu[sheep]; a[water]; u; sag[rare]']}


P100214 {'domesticated_animal': ['#lem: n; udu[sheep]; niga[fattened]; n; sila[lamb]; niga[fattened]']}


P100215 {'domesticated_animal': ['#lem: n; mašgal[goat]; niga[fattened]']}


P100217 {'wild_animal': ['#lem: n; amar[young]; mašda[gazelle]; DN']}


P100218 {'domesticated_animal': ['#lem: n; mašgal[goat]']}


P100219 {'domesticated_animal': ['#lem: n; sila[lamb]']}


P100220 {'domesticated_animal': ['#lem: n; mašgal[goat]; niga[fattened]']}


P100221 {'domesticated_animal': ['#lem: n; gud[ox]']}


P100222 {'domesticated_animal': ['#lem: n; lal[small]; n; udu[sheep]']}


P100223 {'domesticated_animal': ['#lem: n; udu[she

In [32]:
# Print unknowns
print("Transactions with unknown labels:\n")

for item in unknowns:
    print(item[0], item[1])
    print("\n")

# One entry is printed per unknown, so the count is simply the list length.
num = len(unknowns)

print("Number without labels:", num)

Transactions with unknown labels:

P100230 {'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(diš) lulim-nita₂', '#lem: n; X', '2. šu-gid₂ e₂-muhaldim', '#lem: šugid[~animal]; emuhaldim[kitchen]', '3. mu gar₃-du-ne-še₃', '#lem: mu[year]; gardu[soldier]', '4. dingir-dan sukkal maškim', '#lem: PN; sukkal[secretary]; maškim[administrator]', '5. iti u₄ 1(u) 6(diš) ba-zal', '#lem: itud[moon]; ud[sun]; n; n; zal[pass]', '6. ki lu₂-dingir-ra-ta', '#lem: ki[place]; PN', '@reverse', '1. ba-zi', '#lem: PN', '$ 1 line blank', '2. iti še-sag₁₁-ku₅', '#lem: itud[moon]; MN', '3. mu en eridu{ki} ba-hun', '#lem: mu[year]; en[priest]; GN; huŋ[hire]', '@left', '1. 1(diš)', '#lem: n']]}


P100292 {'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 2(ban₂) zi₃-gu', '#lem: n; zidgu[flour]', '2. sa₂-du₁₁-še₃', '#lem: sadug[offerings]', '3. ki lu₂-du₁₀-ga-mu#-ta#', '#lem: ki[place]; PN', '4.

# Multinomial Naive Bayes Classifier
For classifying queen's archives transactions