In [31]:
# Import Libraries
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from collections import OrderedDict, Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score

# Extract Transactions

In [32]:
# ID list files, read with get_drehem_ids
DREHEM_IDS = 'clean_drehem_ids.txt'
QUEEN_ARCHIVES_IDS = 'queen_archives_pids.txt'
QUEEN_OIP_IDS = 'oip_pids.txt'

# Commodity label -> substrings whose presence in a line assigns that label
labels = {
    "domesticated_animal": ["ox", "cow", "sheep", "goat", "lamb"],  # account for plural
    "wild_animal": ["bear", "gazelle", "mountain"],  # account for "mountain animal" and plural
    "dead_animal": ["[die]"],  # find "die" before finding domesticated or wild
    "leather_object": ["boots", "sandals"],
    "precious_object": ["copper", "bronze", "silver", "gold"],
    "wool": ["wool"],
    "queens_archive": [],
}

class Transaction:
    """One tablet's worth of lines plus the data derived from them.

    Attributes are filled in stages: ``lines`` is appended to by the file
    reader, ``lemmas`` by :meth:`get_lemmatization`, ``sumerian_lemmas`` by
    :meth:`get_sumerian_lemma` and ``label`` by :meth:`set_label`.
    """

    # Labels tried against a line, highest priority first.
    # "queens_archive" is intentionally absent: it has no keywords to match.
    _LABEL_PRIORITY = (
        "dead_animal",
        "wild_animal",
        "domesticated_animal",
        "leather_object",
        "precious_object",
        "wool",
    )

    def __init__(self, p_id):
        self.p_id = p_id
        self.lines = list()
        self.lemmas = OrderedDict() # Maps Sumerian text to its lemmatized form
        self.label = {} # Maps label to List of defining text
        self.sumerian_lemmas = []

    # Create mapping of Sumerian text to its lemmatized form
    def get_lemmatization(self):
        """Pair every numbered text line with the ``#lem`` line right after it.

        Scanning starts at the first line that begins with ``"1."`` (or at the
        top of ``self.lines`` when there is none). A line counts as a text
        line when it is non-empty and its first character is numeric.

        Returns:
            The ``self.lemmas`` ordered dict (text line -> ``#lem`` line).
        """
        start = 0
        for idx, text in enumerate(self.lines):
            if text.startswith("1."):
                start = idx
                break

        pos = start
        last = len(self.lines) - 1  # the final line has no successor to pair with
        while pos < last:
            text = self.lines[pos]
            follower = self.lines[pos + 1]
            if text and text[0].isnumeric() and follower.startswith("#lem"):
                self.lemmas[text] = follower
                pos += 2  # skip the #lem line that was just consumed
            else:
                pos += 1

        return self.lemmas

    # Get Sumerian lemmatized text only
    def get_sumerian_lemma(self):
        """Rebuild ``self.sumerian_lemmas`` from the values of ``self.lemmas``.

        For each ``#lem`` line, the span from its first space up to the last
        ``[lowercase]`` bracket is split on ``;``; for every piece containing
        ``[`` the stripped text before the bracket is kept.

        The list is reset on every call so repeated calls do not accumulate
        duplicates.

        Returns:
            The freshly built ``self.sumerian_lemmas`` list.
        """
        # Reset this instance's list. The previous code reset the list of a
        # notebook-global `item`, which is undefined on a fresh kernel and the
        # wrong object whenever `item` is not this transaction.
        self.sumerian_lemmas = []
        for lem_line in self.lemmas.values():
            result = re.findall(r" .*\[[a-z]+\]", lem_line)
            if not result:
                continue
            self.sumerian_lemmas += [
                piece[:piece.index("[")].strip()
                for piece in result[0].split(";")
                if "[" in piece
            ]
        return self.sumerian_lemmas

    # Find the most likely label
    def set_label(self):
        """Label the transaction from the first line that contains a keyword.

        Lines are scanned in order. On each line the labels are tried in
        priority order (dead animal, wild animal, domesticated animal,
        leather object, precious object, wool) against the keyword lists of
        the module-level ``labels`` dict, using substring matching. The first
        hit ends the search.

        Returns:
            ``{label: [matching_line]}``, or ``{"Unknown": [self.lines]}`` when
            no line matches. The same dict is stored in ``self.label``.
        """
        found = {}
        for line in self.lines:
            matched = next(
                (
                    name
                    for name in self._LABEL_PRIORITY
                    if any(keyword in line for keyword in labels[name])
                ),
                None,
            )
            if matched is not None:
                found[matched] = [line]
                break
        # If none match, label as "Unknown"
        if not found:
            found["Unknown"] = [self.lines]
        self.label = found
        return found
            
    
# Read ORACC files to find transactions with p_ids in `ids`
def read_files(subdir, ids, num_files=15):
    """Collect the tablets whose P-number is listed in ``ids``.

    Files ``p001.atf`` .. ``p{num_files:03d}.atf`` under ``subdir`` are read in
    order. A line starting with ``&P`` opens a new tablet; its first
    whitespace-separated token, minus the leading ``&``, is the tablet ID.
    All following lines up to the next ``&P`` line (or the end of the file)
    are appended to that tablet's ``lines`` when the ID was requested.

    Args:
        subdir: Path prefix, concatenated directly in front of the file name
            (so it should end with a path separator).
        ids: Iterable of tablet IDs to collect. It is NOT modified; each ID is
            collected at most once, at its first occurrence.
        num_files: Number of sequentially numbered files to read.

    Returns:
        List of ``Transaction`` objects in the order they were encountered.
    """
    # Work on a private set: membership tests are O(1) and the caller's
    # list is no longer emptied as a side effect.
    wanted = set(ids)
    transactions = list()

    for i in range(1, num_files + 1):
        file_name = "%sp%03d.atf" % (subdir, i)
        curr_transaction = None

        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if line.startswith('&P'):
                    # A new tablet header closes the tablet being collected
                    if curr_transaction:
                        transactions.append(curr_transaction)
                    p_id = line.split()[0][1:]
                    if p_id in wanted:
                        wanted.discard(p_id)
                        curr_transaction = Transaction(p_id)
                    else:
                        curr_transaction = None
                elif curr_transaction:
                    curr_transaction.lines.append(line)

        # Flush the tablet still open when the file ends
        if curr_transaction:
            transactions.append(curr_transaction)

    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    """Read ``file`` (UTF-8) and return its lines, whitespace-stripped, in order.

    Blank lines are kept as empty strings.
    """
    with open(file, encoding="utf8") as handle:
        return [raw.strip() for raw in handle]

In [33]:
# Load the three ID lists: Drehem (non-queen), queen archive, OIP queen
list_drehem_ids, list_queen_ids, list_oip_queen_ids = [
    get_drehem_ids(id_file)
    for id_file in (DREHEM_IDS, QUEEN_ARCHIVES_IDS, QUEEN_OIP_IDS)
]

# Collect the matching transactions for each ID list (one pass over raw-data/ per list)
non_queen_list, queen_training_list, queen_test_set = [
    read_files("raw-data/", id_list)
    for id_list in (list_drehem_ids, list_queen_ids, list_oip_queen_ids)
]

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 256
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 270
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf

In [50]:
# Populate training and test sets

# Number of leading non-queen transactions used for training; the rest are test data
NON_QUEEN_TRAIN_COUNT = 200

# Build the lemma map and the commodity label of every transaction
for group in (queen_training_list, non_queen_list, queen_test_set):
    for item in group:
        item.get_lemmatization()
        item.set_label()

training_data, training_labels = [], []
test_data, test_labels = [], []

# Every queen-archive training transaction is a positive training example
for item in queen_training_list:
    training_data.append(" ".join(item.get_sumerian_lemma()))
    training_labels.append("queen")

# Non-queen transactions: the first NON_QUEEN_TRAIN_COUNT train, the remainder test
for i, transaction in enumerate(non_queen_list):
    text = " ".join(transaction.get_sumerian_lemma())
    if i < NON_QUEEN_TRAIN_COUNT:
        training_data.append(text)
        training_labels.append("not queen")
    else:
        test_data.append(text)
        test_labels.append("not queen")

# The OIP queen transactions are the positive test examples
for item in queen_test_set:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")

# Sanity check: data and label lists must have matching lengths
for collection in (training_data, training_labels, test_data, test_labels):
    print(len(collection))

470
470
176
176


# Multinomial Naive Bayes Classifier
For classifying queen's archives transactions


<b>Accuracy</b>: 
(# true positives + # true negatives) / total #<br><br>
<b>Recall</b>:
true positives / (true positives + false negatives) <br>
High recall means that an algorithm returned most of the relevant results <br><br>
<b>Precision</b>:
true positives / (true positives + false positives) <br>
High precision means that an algorithm returned substantially more relevant results than irrelevant ones

In [51]:
# Bag of Words model (unigram counts -> TF-IDF -> Multinomial Naive Bayes)
count_vect = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             ngram_range=(1, 1),
                             binary=False,
                             strip_accents='unicode')

X_train_counts = count_vect.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
bag_of_words_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict: reuse the vocabulary and IDF weights fitted on the training data
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = bag_of_words_classifier.predict(X_new_tfidf)

# `labels` is passed by keyword: recent scikit-learn releases accept only
# y_true and y_pred positionally.
print("Accuracy: ", np.mean(predicted == np.asarray(test_labels)))
print("Recall: ", str(metrics.recall_score(test_labels, predicted, labels=["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, predicted, labels=["queen", "not queen"], average="macro")))

(470, 320)
(470, 320)
Accuracy:  0.926136363636
Recall:  0.883928571429
Precision:  0.951127819549


In [55]:
# Bigram Model (bigram counts -> TF-IDF -> Multinomial Naive Bayes)
bigram_vectorizer = CountVectorizer(analyzer="word",
                                    tokenizer=None,
                                    preprocessor=None,
                                    ngram_range=(2, 2),
                                    strip_accents='unicode')

# Train
X_train_counts = bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
bigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict: reuse the vocabulary and IDF weights fitted on the training data
X_new_counts = bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

bigram_multinomial_nb_prediction = bigram_classifier.predict(X_new_tfidf)

# `labels` is passed by keyword: recent scikit-learn releases accept only
# y_true and y_pred positionally.
print("Accuracy: ", np.mean(bigram_multinomial_nb_prediction == np.asarray(test_labels)))
print("Recall: ", str(metrics.recall_score(test_labels, bigram_multinomial_nb_prediction, labels=["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, bigram_multinomial_nb_prediction, labels=["queen", "not queen"], average="macro")))

(470, 2126)
(470, 2126)
Accuracy:  0.920454545455
Recall:  0.875
Precision:  0.94776119403


In [64]:
# Trigram Model (trigram counts -> TF-IDF -> Multinomial Naive Bayes)
trigram_vectorizer = CountVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     ngram_range=(3, 3),
                                     strip_accents='unicode')

# Train
X_train_counts = trigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
trigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict: reuse the vocabulary and IDF weights fitted on the training data
X_new_counts = trigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# The result was previously bound to `trigram__prediction` (double underscore)
# while the metrics read `trigram_prediction`, which fails on a fresh kernel.
trigram_prediction = trigram_classifier.predict(X_new_tfidf)

# `labels` is passed by keyword: recent scikit-learn releases accept only
# y_true and y_pred positionally.
print("Accuracy: ", np.mean(trigram_prediction == np.asarray(test_labels)))
print("Recall: ", str(metrics.recall_score(test_labels, trigram_prediction, labels=["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, trigram_prediction, labels=["queen", "not queen"], average="macro")))

(470, 4104)
(470, 4104)
Accuracy:  0.857954545455
Recall:  0.781547619048
Precision:  0.900932400932


In [62]:
# Unigram and Bigram Model (1- and 2-gram counts -> TF-IDF -> Multinomial Naive Bayes)
uni_and_bigram_vectorizer = CountVectorizer(analyzer="word",
                                            tokenizer=None,
                                            preprocessor=None,
                                            binary=False,
                                            ngram_range=(1, 2),
                                            strip_accents='unicode')

# Train
X_train_counts = uni_and_bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
uni_and_bigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict: reuse the vocabulary and IDF weights fitted on the training data
X_new_counts = uni_and_bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

# `labels` is passed by keyword: recent scikit-learn releases accept only
# y_true and y_pred positionally.
print("Accuracy: ", np.mean(uni_and_bigram_prediction == np.asarray(test_labels)))
print("Recall: ", str(metrics.recall_score(test_labels, uni_and_bigram_prediction, labels=["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, uni_and_bigram_prediction, labels=["queen", "not queen"], average="macro")))

(470, 2446)
(470, 2446)
Accuracy:  0.9375
Recall:  0.901785714286
Precision:  0.958015267176


# Clustering

GOAL: find more commodity labels in the set of non-queen data

DBSCAN Model:
Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density

In [80]:
# Build DBSCAN model with Tf-idf vectorizer
tfidfvec = TfidfVectorizer(ngram_range=(1,2), min_df = 0.0, max_df = 1.0, decode_error = "ignore")

# One document per non-queen transaction: the keys of `trans.lemmas` (the
# numbered text lines) joined by spaces, each with its first two characters
# dropped. That removes an "N." prefix for single-digit line numbers only.
trans_list = []

for trans in non_queen_list:
    trans_list.append(" ".join([lemma[2:] for lemma in trans.lemmas]))

X1 = tfidfvec.fit_transform(trans_list).toarray()

# Run DBSCAN
# NOTE(review): this used `len(q_list1)/100`, but `q_list1` is not defined
# anywhere in the visible notebook. The number of documents being clustered is
# assumed here - TODO confirm; recorded cluster counts may differ.
# Rounded up to a whole number because `min_samples` is a neighbour count.
dbscan_min_samples = max(1, int(np.ceil(len(trans_list) / 100)))
db1 = DBSCAN(eps=1.0, min_samples=dbscan_min_samples).fit(X1)  # Higher eps => More leniency to be same cluster
core_samples_mask = np.zeros_like(db1.labels_, dtype=bool)
core_samples_mask[db1.core_sample_indices_] = True

labels1 = db1.labels_
n_clusters_ = len(set(labels1)) - (1 if -1 in labels1 else 0) # Number of clusters in labels
print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 9


In [81]:
# Print clusters
# Group the clustered documents by cluster id; -1 (noise) is skipped
clusters = {}
for doc_index, cluster_id in enumerate(labels1):
    if cluster_id != -1:
        clusters.setdefault(cluster_id, []).append(trans_list[doc_index])

# One list of member documents per cluster, each followed by a blank line
for members in clusters.values():
    print(members)
    print()

# Brief analysis: number of clusters depends on eps and ngram range

[' 6(diš) udu  kišib₃ lu₂-{d}suen  ki ab-ba-kal-la-ta  ba-zi#  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na#  dub-sar#  dumu [...]  ARAD₂-[zu]', ' 2(diš) udu  ki a-ba-{d}en-lil₂-gin₇-ta  ur-ku₃-nun-na  i₃-dab₅  iti ezem-mah  mu e₂ {d}šara₂ ba-du₃  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na  dub-sar  dumu lu₂-{d}nin-gir₂-su [kurušda]  ARAD₂-zu', ' [n] gu₄ [niga]  ki puzur₄-{d}en-lil₂-ta  ur-ku₃-nun-na  i₃-dab₅  iti ezem-mah  mu e₂ {d}šara₂ umma#{ki} ba-du₃  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na  dub-sar  dumu lu₂-{d}nin-gir₂-su kurušda  ARAD₂-zu', ' [x] gu₄  [ki] la-diš-ip-ta  ur-ku₃-nun-na  i₃-dab₅  iti a₂-ki-ti  mu {d}i-bi₂-{d}suen  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na  dub-sar  dumu lu₂-{d}nin-gir₂-su kurušda  ARAD₂-zu', ' 1(diš) sila₄  ki ur-{d}ig-alim-ta  ur-ku₃-nun-na  i₃-dab₅  ša₃ unu{ki}  iti 