In [1]:
# Import Libraries
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from collections import OrderedDict, Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score

# Extract Transactions

In [2]:
# Text files listing the P-numbers (one per line) of the tablets in each subset.
DREHEM_IDS = 'drehem_annotate_ids.txt'
QUEEN_ARCHIVES_IDS = 'queen_archives_pids.txt'
QUEEN_OIP_IDS = 'oip_pids.txt'

# Keyword lists used by Transaction.set_label(): a label applies to a line when
# any of its keywords occurs in that line as a substring.
labels = {
    # account for plural
    "domesticated_animal": ["ox", "cow", "sheep", "goat", "lamb"],
    # account for "mountain animal" and plural
    "wild_animal": ["bear", "gazelle", "mountain"],
    # find "die" before finding domesticated or wild
    "dead_animal": ["[die]"],
    "leather_object": ["boots", "sandals"],
    "precious_object": ["copper", "bronze", "silver", "gold"],
    "wool": ["wool"],
    # no keywords defined for this label
    "queens_archive": [],
}

class Transaction:
    """One tablet's worth of lines read from an ``.atf`` file.

    The object is filled in by the file reader (which appends to ``lines``)
    and then analysed with ``get_lemmatization``, ``get_sumerian_lemma`` and
    ``set_label``.
    """

    # Matches from the first space up to the last "[gloss]" whose gloss is
    # made only of lowercase a-z letters.
    _LEMMA_SPAN_RE = re.compile(r" .*\[[a-z]+\]")

    # Order in which labels are tried on each line by set_label().
    _LABEL_PRIORITY = (
        "dead_animal",          # Priority 1
        "wild_animal",          # Priority 2
        "domesticated_animal",  # Priority 3
        "leather_object",       # Priority 4: leather, precious object, wool
        "precious_object",
        "wool",
    )

    def __init__(self, p_id):
        self.p_id = p_id
        self.lines = list()          # Stripped lines of the tablet, in file order
        self.lemmas = OrderedDict()  # Maps Sumerian text to its lemmatized form
        self.label = {}              # Maps label to List of defining text
        self.sumerian_lemmas = []    # Lemma words extracted by get_sumerian_lemma()

    # Create mapping of Sumerian text to its lemmatized form
    def get_lemmatization(self):
        """Pair every numbered text line with the ``#lem`` line that follows it.

        Scanning starts at the first line beginning with ``"1."`` (or at the
        first line when there is none). A line qualifies when its first
        character is numeric and the next line starts with ``"#lem"``.

        Returns:
            The ``self.lemmas`` OrderedDict (text line -> ``#lem`` line).
            Because the text line is the key, two identical text lines share
            a single entry (the later ``#lem`` line wins).
        """
        idx = 0
        for pos, text in enumerate(self.lines):
            if text.startswith("1."):
                idx = pos
                break

        last = len(self.lines) - 1
        while idx < last:
            current, following = self.lines[idx], self.lines[idx + 1]
            if current and current[0].isnumeric() and following.startswith("#lem"):
                self.lemmas[current] = following
                idx += 2  # skip the #lem line that was just consumed
            else:
                idx += 1

        return self.lemmas

    # Get Sumerian lemmatized text only
    def get_sumerian_lemma(self):
        """Extract the lemma words (the text before each ``[gloss]``).

        Reads ``self.lemmas``, so ``get_lemmatization()`` must have been called
        first; otherwise the result is empty. The list is rebuilt from scratch
        on every call, so calling this repeatedly does not duplicate entries.

        Returns:
            ``self.sumerian_lemmas``: list of lemma strings in tablet order.
        """
        # Reset this instance's list. (Previously this reset the list of a
        # global named ``item``, which only worked inside a loop whose loop
        # variable happened to have that name.)
        self.sumerian_lemmas = []
        for lem_line in self.lemmas.values():
            match = self._LEMMA_SPAN_RE.search(lem_line)
            if match is None:
                continue
            for piece in match.group(0).split(";"):
                # Entries without a gloss (e.g. "n", "PN") carry no "[".
                if "[" in piece:
                    self.sumerian_lemmas.append(piece[:piece.index("[")].strip())
        return self.sumerian_lemmas

    # Find the most likely label
    def set_label(self, label_keywords=None):
        """Assign a label from the first line that contains any label keyword.

        Args:
            label_keywords: optional dict mapping label name -> list of
                keyword substrings. Defaults to the module-level ``labels``
                dictionary. Pass it explicitly if ``labels`` has been rebound
                to something else.

        Returns:
            The dict stored in ``self.label``: either ``{label: [line]}`` for
            the first matching line, or ``{"Unknown": [self.lines]}`` when no
            line matches.

        Raises:
            TypeError: if the keyword table is not a dict.
        """
        if label_keywords is None:
            label_keywords = labels
        if not isinstance(label_keywords, dict):
            raise TypeError(
                "label keywords must be a dict mapping label -> list of keywords; "
                "got %s (was the global `labels` rebound?)" % type(label_keywords).__name__
            )

        # NOTE(review): lines are scanned in file order and the scan stops at
        # the first line matching *any* label, so the priority order only
        # decides between labels matching the same line. A "[die]" line that
        # comes after an animal line is never reached - confirm this is the
        # intended meaning of "priority".
        # Matching is plain substring containment on every line of the tablet.
        found = {}
        for line in self.lines:
            hit = next(
                (
                    name
                    for name in self._LABEL_PRIORITY
                    if any(keyword in line for keyword in label_keywords.get(name, ()))
                ),
                None,
            )
            if hit is not None:
                found[hit] = [line]
                break

        # If none match, label as "Unknown"
        if not found:
            found["Unknown"] = [self.lines]
        self.label = found
        return found
            
    
# Read ORACC files to find transactions with p_ids in `ids`
def read_files(subdir, ids, num_files=15):
    """Collect the transactions whose P-number is listed in ``ids``.

    Reads ``<subdir>p001.atf`` ... ``<subdir>p<num_files>.atf`` (numbers are
    zero-padded to three digits). A line starting with ``&P`` begins a new
    tablet; its first whitespace-separated token, minus the leading ``&``, is
    the P-number. All following lines up to the next ``&P`` line are attached
    to the tablet if its P-number is wanted.

    Args:
        subdir: path prefix, including the trailing separator
            (e.g. ``"raw-data/"``).
        ids: iterable of P-number strings to collect. It is NOT modified.
            Each listed occurrence of an ID is matched at most once.
        num_files: how many numbered ``.atf`` files to read (default 15).

    Returns:
        List of ``Transaction`` objects in the order they were encountered.

    Side effects:
        Prints each file name as it is opened and the final transaction count.
        Errors from ``open`` (e.g. a missing file) propagate to the caller.
    """
    # Work on a private tally so the caller's list is left untouched; calling
    # this function twice with the same list now gives the same result.
    remaining = Counter(ids)
    transactions = []

    for file_no in range(1, num_files + 1):
        file_name = subdir + "p" + str(file_no).zfill(3) + ".atf"
        curr_transaction = None

        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if line.startswith('&P'):
                    # A new tablet header closes whatever tablet was open.
                    if curr_transaction:
                        transactions.append(curr_transaction)
                    p_id = line.split()[0][1:]
                    if remaining[p_id] > 0:
                        remaining[p_id] -= 1
                        curr_transaction = Transaction(p_id)
                    else:
                        curr_transaction = None
                elif curr_transaction:
                    curr_transaction.lines.append(line)

        # The last tablet of a file has no following header to close it.
        if curr_transaction:
            transactions.append(curr_transaction)

    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    """Read ``file`` (UTF-8) and return its lines, each stripped of surrounding whitespace.

    Every line yields one list entry, so a blank line yields an empty string.
    """
    with open(file, encoding="utf8") as handle:
        return [raw.strip() for raw in handle]

In [3]:
# Load the P-number list of each subset (general Drehem, Queen's Archives
# training, Queen's Archives OIP test).
list_drehem_ids, list_queen_ids, list_oip_queen_ids = (
    get_drehem_ids(id_file)
    for id_file in (DREHEM_IDS, QUEEN_ARCHIVES_IDS, QUEEN_OIP_IDS)
)

# Read each subset in its own pass over the raw files (rather than one pass
# over a combined ID list) so the three collections stay separate.
non_queen_list, queen_training_list, queen_test_set = (
    read_files("raw-data/", id_list)
    for id_list in (list_drehem_ids, list_queen_ids, list_oip_queen_ids)
)

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 314
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 212
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf

In [4]:
# Label every Queen's Archives training tablet with the keyword rules, echoing
# each text line with its lemmatization, and count the tablets left unlabeled.
queen_unknowns = 0

for item in queen_training_list:
    print(item.p_id)
    for text_line, lem_line in item.get_lemmatization().items():
        print(text_line)
        print(lem_line)
    item.set_label()
    print("Hardcode label: ", item.label)
    print("\n")
    if "Unknown" in item.label:
        queen_unknowns += 1

print("Number unknown: ", queen_unknowns, "out of ", len(queen_training_list))
    

P101878
1. 1(diš) šah₂-NE-tur-nita₂-geš-gi-gur₄
#lem: n; X
2. 2(diš) uz-tur
#lem: n; PN
3. 3(diš) ir₇{mušen}
#lem: n; irsaŋ[pigeon]
4. 1(u) 4(diš) tu-gur₄{mušen}
#lem: n; n; tugur[dove]
5. ma₂!-an-na-še₃
#lem: ON
6. iti-ta u₄ 2(u) 4(diš) ba-ta-zal
#lem: itud[moon]; ud[sun]; n; n; zal[pass]
1. zi-ga a₂-bi₂-la-tum
#lem: ziga[expenditure]; PN
2. iti a₂#-ki-ti
#lem: itud[moon]; akiti[festival]
3. mu ša-aš-ru{ki} ba-hul
#lem: mu[year]; GN; hulu[bad]
Hardcode label:  {'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(diš) šah₂-NE-tur-nita₂-geš-gi-gur₄', '#lem: n; X', '2. 2(diš) uz-tur', '#lem: n; PN', '3. 3(diš) ir₇{mušen}', '#lem: n; irsaŋ[pigeon]', '4. 1(u) 4(diš) tu-gur₄{mušen}', '#lem: n; n; tugur[dove]', '5. ma₂!-an-na-še₃', '#lem: ON', '6. iti-ta u₄ 2(u) 4(diš) ba-ta-zal', '#lem: itud[moon]; ud[sun]; n; n; zal[pass]', '@reverse', '$ blank space', '1. zi-ga a₂-bi₂-la-tum', '#lem: ziga[expenditure]; PN', '2. iti a₂#-ki-ti', '#l

# Analysis

Only 53 of the 212 Queen's Archives transactions fail to match any of the 6 keyword-based labels. This suggests that the Queen's Archives transactions involve a similar, limited set of commodities.

Goal: find features that identify a transaction as belonging to Queen's archives

# Multinomial Naive Bayes Classifier
For classifying Queen's Archives transactions

In [8]:
# Get lemmatized Sumerian text
# NOTE: from here on `labels` is the list of training classes. This rebinds the
# module-level name that Transaction.set_label() reads as its keyword
# dictionary, so set_label() cannot be re-run after this cell without
# restoring that dictionary.
all_lemmas = []
labels = []

for item in queen_training_list:
    # Build the lemma mapping here as well, so this cell does not depend on an
    # earlier cell having called get_lemmatization() on these tablets.
    item.get_lemmatization()
    all_lemmas.append(" ".join(item.get_sumerian_lemma()))
    labels.append("queen")

# The first 200 non-queen tablets are used for training; the rest are held out.
for item in non_queen_list[:200]:
    item.get_lemmatization()
    all_lemmas.append(" ".join(item.get_sumerian_lemma()))
    labels.append("not queen")

assert len(all_lemmas) == len(labels), "every training document needs exactly one class label"

# Vectorize lemmas
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more word characters, so one-letter lemmas (e.g. "u", "e", "a") are
# presumably dropped from the vocabulary - confirm whether that is intended.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(all_lemmas) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

(412, 303)
(412, 303)


In [9]:
# Held-out documents and their true classes: every tablet of the queen test
# set, plus the non-queen tablets from index 200 onward.
docs_new = []
labels_new = []

for test_items, true_class in (
    (queen_test_set, "queen"),
    (non_queen_list[200:], "not queen"),
):
    for item in test_items:
        item.get_lemmatization()
        docs_new.append(" ".join(item.get_sumerian_lemma()))
        labels_new.append(true_class)

# Classifier: multinomial Naive Bayes fitted on the TF-IDF training matrix
clf = MultinomialNB().fit(X_train_tfidf, labels)

# Predict: reuse the already-fitted vectorizer and TF-IDF transformer
# (transform only, no refitting).
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Show every held-out document next to its predicted class.
for doc_and_category in zip(docs_new, predicted):
    print('%r => %s' % doc_and_category)

'irsaŋ tugur itud ud zal irsaŋ a tugur itud ud zal irsaŋ a itud ud lal zal mu.DU dab itud mu ara lal hulu' => queen
'gud niga udu niga sag udu niga maškim udu niga maš maškim e u ziga ki itud akiti mu us en huŋ' => queen
'udu u maš udu u maš sila kuli šabra sila ensik mu.DU dab itud mu ara hulu' => queen
'udu niga u ud ma e ziga ŋiri itud akiti mu us hulu' => queen
'udu u ziga itud mu us bad mada du' => queen
'udu niga itud ud zal gud u udu niga u niga uzud niga emuhaldim udu niga sila siškur ud itud ud zal udu niga šag sila maš zag du udu niga sila siškur ud itud ud zal udu niga u gudeʾusa sila niŋ kizah ud itud ud zal udu niga siškur sila niga niŋgu lugal ud itud ud lal zal e ziga itud mu hulu' => queen
'sila mu.DU dab itud mu hulu' => queen
'u ašgar sadug u itud mu us e du mu us' => queen
'udu gukkal niga uš egal udu niga ezem itud ud lal zal udu niga giranum udu niga u giranum u udu niga siškur sila šag egal sila niŋ kizah maš udu niga mašgal niga šag itud ud zal maš uš egal ziga i

# Multinomial Naive Bayes: Accuracy, Precision, and Recall

<b>Accuracy</b>: 
(# true positives + # true negatives) / total #<br><br>
<b>Recall</b>:
true positives / (true positives + false negatives) <br>
High recall means that an algorithm returned most of the relevant results <br><br>
<b>Precision</b>:
true positives / (true positives + false positives) <br>
High precision means that an algorithm returned substantially more relevant results than irrelevant ones

In [14]:
# Score the predictions against the true classes of the held-out documents.
# `labels` must be passed by keyword: in current scikit-learn every argument
# after y_pred is keyword-only, so the positional form raises a TypeError.
CLASS_NAMES = ["queen", "not queen"]

# Fraction of held-out documents whose predicted class equals the true class.
print("Accuracy: ", np.mean(predicted == np.asarray(labels_new)))
# Macro average: the per-class scores are averaged with equal weight.
print("Recall: ", metrics.recall_score(labels_new, predicted, labels=CLASS_NAMES, average="macro"))
print("Precision: ", metrics.precision_score(labels_new, predicted, labels=CLASS_NAMES, average="macro"))

Accuracy:  0.764957264957
Recall:  0.759429824561
Precision:  0.823076923077
