In [2]:
# Import Libraries
import re
import random
import numpy as np
import pandas as pd
from collections import OrderedDict, Counter
from sklearn import metrics, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score

# Extract Transactions

In [3]:
# Files listing the P-numbers of the hand-labeled texts
DREHEM_IDS = 'clean_drehem_ids.txt'
QUEEN_ARCHIVES_IDS = 'queen_archives_pids.txt'
QUEEN_OIP_IDS = 'oip_pids.txt'

# Archive label -> bracketed lemma translations that identify it.
# Insertion order is kept as-is; Transaction.set_label looks entries up by key.
labels = {
    # account for plural
    "domesticated_animal": ["[ox]", "[cow]", "[sheep]", "[goat]", "[lamb]", "[~sheep]", "[equid]"],
    # account for "mountain animal" and plural
    "wild_animal": ["[bear]", "[gazelle]", "[mountain]"],
    # find "die" before finding domesticated or wild
    "dead_animal": ["[die]"],
    "leather_object": ["[boots]", "[sandals]"],
    "precious_object": ["[copper]", "[bronze]", "[silver]", "[gold]"],
    "wool": ["[wool]", "[~wool]"],
    # no keywords: this label is never produced by keyword matching
    "queens_archive": [],
}

class Transaction:
    """A single text identified by its P-number, together with its raw ATF lines.

    The object is filled in stages: the file reader appends to ``lines``,
    ``get_lemmatization`` pairs text lines with their ``#lem`` lines,
    ``get_sumerian_lemma`` extracts the lemma words, and ``set_label`` records
    which archive keywords occur in the text.
    """

    def __init__(self, p_id):
        self.p_id = p_id
        self.lines = list()  # Raw lines of the text (filled by the caller)
        self.lemmas = OrderedDict()  # Maps Sumerian text to its lemmatized form
        self.label = {}  # Maps label to List of defining text
        self.sumerian_lemmas = []  # Lemma words from the last get_sumerian_lemma() call

    # Create mapping of Sumerian text to its lemmatized form
    def get_lemmatization(self):
        """Pair every numbered text line with the ``#lem`` line that follows it.

        Scanning starts at the first line that starts with "1." (or at the top
        if there is none). Returns ``self.lemmas``, which is updated in place.
        """
        idx = 0
        for i, s in enumerate(self.lines):
            if s.startswith("1."):
                idx = i
                break
        # Stop one short of the end: a text line needs a following line to pair with.
        while idx < len(self.lines) - 1:
            current, following = self.lines[idx], self.lines[idx + 1]
            if current and current[0].isnumeric() and following.startswith("#lem"):
                # NOTE(review): keyed by line text, so two identical text lines
                # share one entry and the later #lem line overwrites the earlier.
                self.lemmas[current] = following
                idx += 2
            else:
                idx += 1

        return self.lemmas

    # Get Sumerian lemmatized text only
    def get_sumerian_lemma(self):
        """Return the lemma words (the part before ``[translation]``) in order.

        The result is rebuilt from ``self.lemmas`` on every call, so calling
        the method repeatedly does not accumulate duplicates.
        """
        # Reset on *this* object; the original reset a notebook global named `item`.
        self.sumerian_lemmas = []
        for lem_line in self.lemmas.values():
            # Everything from the first space up to the last "[lowercase]" tag.
            matches = re.findall(r" .*\[[a-z]+\]", lem_line)
            if not matches:
                continue
            self.sumerian_lemmas += [
                token[:token.index("[")].strip()
                for token in matches[0].split(";")
                if "[" in token  # untranslated tokens (e.g. "n", "PN") carry no tag
            ]
        return self.sumerian_lemmas

    # Find the most likely label
    def set_label(self):
        """Record, per label, the lines that contain one of its keywords.

        Each line is credited to at most one label, checked in priority order.
        If nothing matches, the result is ``{"Unknown": [self.lines]}``.
        The mapping is stored in ``self.label`` and returned.
        """
        def find_label(label, line, found):
            # Credit `line` to `label` if it contains any keyword of that label.
            for val in labels[label]:
                if val in line:
                    found.setdefault(label, []).append(line)
                    return True
            return False

        found = {}
        for line in self.lines:
            if line == '@object seal':
                found['seal'] = [line]
            # Priority 1: Check for dead animal
            if find_label("dead_animal", line, found): continue
            # Priority 2: Check for wild animal
            if find_label("wild_animal", line, found): continue
            # Priority 3: Check for domesticated animal
            if find_label("domesticated_animal", line, found): continue
            # Priority 4: Check leather, wool, or precious object
            if find_label("leather_object", line, found): continue
            if find_label("precious_object", line, found): continue
            # NOTE(review): unlike the branches above, a wool match stops the scan
            # of the remaining lines. Kept as in the original; confirm it is intended.
            if find_label("wool", line, found): break
        # If none match, label as "Unknown"
        if len(found) == 0:
            found["Unknown"] = [self.lines]
        self.label = found
        return found
            
    
# Read ORACC files to find transactions with p_ids in `ids`
def read_files(subdir, ids, reverse=False, num_files=15, reverse_limit=200):
    """Collect transactions from the ATF files ``p001.atf`` .. ``p<num_files>.atf``.

    A transaction starts at a line beginning with ``&P`` (its P-number is the
    first whitespace-separated token without the leading ``&``) and owns every
    following line up to the next ``&P`` header. The header line itself is not
    stored in ``Transaction.lines``.

    Parameters
    ----------
    subdir : str
        Prefix put in front of each file name (e.g. ``"raw-data/"``).
    ids : iterable of str
        P-numbers to select. The argument is NOT modified; matches are tracked
        in a private copy, so the same list can be passed again later.
    reverse : bool
        If False, keep texts whose P-number is in ``ids`` (each listed
        occurrence matches at most one text). If True, keep texts whose
        P-number is NOT in ``ids``, as long as no more than ``reverse_limit``
        transactions have been collected when the header is seen.
    num_files : int
        Number of consecutively numbered files to read (default 15).
    reverse_limit : int
        Cap used in reverse mode (default 200).

    Returns
    -------
    list of Transaction, in file order.
    """
    # Multiset copy: O(1) lookups, duplicate-aware, and the caller's list stays intact.
    remaining = Counter(ids)
    transactions = list()
    for i in range(1, num_files + 1):
        file_name = "{}p{:03d}.atf".format(subdir, i)

        curr_transaction = None

        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if not line.startswith('&P'):
                    # Body line: belongs to the transaction being collected, if any.
                    if curr_transaction is not None:
                        curr_transaction.lines.append(line)
                    continue

                p_id = line.split()[0][1:]
                # Decide before flushing: the reverse cap counts only the
                # transactions already appended when the header is seen.
                if reverse:
                    wanted = p_id not in remaining and len(transactions) <= reverse_limit
                else:
                    wanted = remaining[p_id] > 0
                    if wanted:
                        remaining[p_id] -= 1

                # A new header always closes the transaction in progress.
                if curr_transaction is not None:
                    transactions.append(curr_transaction)
                curr_transaction = Transaction(p_id) if wanted else None

        # Flush the last transaction of the file.
        if curr_transaction is not None:
            transactions.append(curr_transaction)

    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    """Read `file` and return the stripped lines whose first character is "P".

    All other lines (blank lines, lines with leading whitespace, anything not
    starting with "P") are ignored. Order follows the file.
    """
    with open(file, encoding="utf8") as handle:
        return [raw.strip() for raw in handle if raw.startswith("P")]

In [4]:
# Load the three hand-labeled ID lists (only lines starting with "P" are kept).
list_drehem_ids = get_drehem_ids(DREHEM_IDS)
list_queen_ids = get_drehem_ids(QUEEN_ARCHIVES_IDS)
list_oip_queen_ids = get_drehem_ids(QUEEN_OIP_IDS)
# Combined snapshot of every labeled ID, taken before any list is handed to read_files.
complete_list = list_drehem_ids + list_queen_ids + list_oip_queen_ids

# Pull the transactions for each ID list out of the ATF files under raw-data/.
non_queen_list = read_files("raw-data/", list_drehem_ids)
queen_training_list = read_files("raw-data/", list_queen_ids)
queen_test_set = read_files("raw-data/", list_oip_queen_ids)

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 429
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 275
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf

In [5]:
# Populate training and test sets

# Number of leading transactions from each labeled list used for training;
# the rest of each list is held out for testing.
N_QUEEN_TRAIN = 175
N_NON_QUEEN_TRAIN = 350

# Lemmatize and keyword-label every transaction of the three labeled sets.
for corpus in (queen_training_list, non_queen_list, queen_test_set):
    for item in corpus:
        item.get_lemmatization()
        item.set_label()


training_data = []
training_labels = []
test_data = []
test_labels = []

# Each document is the space-joined sequence of its lemma words.
# The loop variable is always the transaction being processed.
for item in queen_training_list[:N_QUEEN_TRAIN]:
    training_data.append(" ".join(item.get_sumerian_lemma()))
    training_labels.append("queen")

for item in non_queen_list[:N_NON_QUEEN_TRAIN]:
    training_data.append(" ".join(item.get_sumerian_lemma()))
    training_labels.append("not queen")

for item in non_queen_list[N_NON_QUEEN_TRAIN:]:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("not queen")

for item in queen_test_set:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")

for item in queen_training_list[N_QUEEN_TRAIN:]:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")

# Documents and labels must stay aligned one-to-one.
assert len(training_data) == len(training_labels)
assert len(test_data) == len(test_labels)

print(len(training_data))
print(len(training_labels))
print(len(test_data))
print(len(test_labels))

# Pooled views of the labeled data, reused by the cross-validation and final SVM cells.
all_data = training_data + test_data
all_labels = training_labels + test_labels

queen_data = [x for x, y in zip(all_data, all_labels) if y == "queen"]
queen_labels = ["queen"] * len(queen_data)
non_queen_data = [x for x, y in zip(all_data, all_labels) if y == "not queen"]
non_queen_labels = ["not queen"] * len(non_queen_data)

525
525
299
299


# Multinomial Naive Bayes Classifier
First pass at classifying Queen's Archives transactions.

We will use the following measures to determine how well our classifier does.

<b>Accuracy</b>: 
(# true positives + # true negatives) / total #<br><br>
<b>Recall</b>:
true positives / (true positives + false negatives) <br>
High recall means that an algorithm returned most of the relevant results <br><br>
<b>Precision</b>:
true positives / (true positives + false positives) <br>
High precision means that an algorithm returned substantially more relevant results than irrelevant ones

In [6]:
# Bag of Words model (Unigram)
count_vect = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             ngram_range=(1, 1),
                             binary=False,
                             strip_accents='unicode',
                             token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = count_vect.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Classifier
bag_of_words_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict (test documents go through the vectorizer and TF-IDF fitted on the training data)
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = bag_of_words_classifier.predict(X_new_tfidf)

# `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
print("Accuracy: ", np.mean(predicted == test_labels))
print("Recall: ", metrics.recall_score(test_labels, predicted, labels=["queen", "not queen"], average="macro"))
print("Precision: ", metrics.precision_score(test_labels, predicted, labels=["queen", "not queen"], average="macro"))

(525, 368)
Accuracy:  0.8595317725752508
Recall:  0.9004890678941312
Precision:  0.824953314659197


In [7]:
# Bigram Model
bigram_vectorizer = CountVectorizer(analyzer="word",
                                    tokenizer=None,
                                    preprocessor=None,
                                    ngram_range=(2, 2),
                                    strip_accents='unicode',
                                    token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Classifier
bigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict (test documents go through the vectorizer and TF-IDF fitted on the training data)
X_new_counts = bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

bigram_multinomial_nb_prediction = bigram_classifier.predict(X_new_tfidf)

# `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
print("Accuracy: ", np.mean(bigram_multinomial_nb_prediction == test_labels))
print("Recall: ", metrics.recall_score(test_labels, bigram_multinomial_nb_prediction, labels=["queen", "not queen"], average="macro"))
print("Precision: ", metrics.precision_score(test_labels, bigram_multinomial_nb_prediction, labels=["queen", "not queen"], average="macro"))

(525, 2476)
Accuracy:  0.8996655518394648
Recall:  0.9237054085155351
Precision:  0.861512027491409


In [8]:
# Trigram Model
trigram_vectorizer = CountVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     ngram_range=(3, 3),
                                     strip_accents='unicode',
                                     token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = trigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Classifier
trigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict (test documents go through the vectorizer and TF-IDF fitted on the training data)
X_new_counts = trigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

trigram_prediction = trigram_classifier.predict(X_new_tfidf)

# `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
print("Accuracy: ", np.mean(trigram_prediction == test_labels))
print("Recall: ", metrics.recall_score(test_labels, trigram_prediction, labels=["queen", "not queen"], average="macro"))
print("Precision: ", metrics.precision_score(test_labels, trigram_prediction, labels=["queen", "not queen"], average="macro"))

(525, 4711)
Accuracy:  0.8929765886287625
Recall:  0.9151035673187572
Precision:  0.8541728031418754


In [9]:
# Unigram and Bigram Model
uni_and_bigram_vectorizer = CountVectorizer(analyzer="word",
                                            tokenizer=None,
                                            preprocessor=None,
                                            binary=False,
                                            ngram_range=(1, 2),
                                            strip_accents='unicode',
                                            token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = uni_and_bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
# NOTE: `tfidf_transformer`, the vectorizer and the classifier from this cell are
# reused by the later whole-corpus prediction cell.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Classifier (smoothing parameter passed by keyword: `alpha` is keyword-only in current scikit-learn)
uni_and_bigram_classifier = MultinomialNB(alpha=0.5).fit(X_train_tfidf, training_labels)

# Predict (test documents go through the vectorizer and TF-IDF fitted on the training data)
X_new_counts = uni_and_bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

# `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
print("Accuracy: ", np.mean(uni_and_bigram_prediction == test_labels))
print("Recall: ", metrics.recall_score(test_labels, uni_and_bigram_prediction, labels=["queen", "not queen"], average="macro"))
print("Precision: ", metrics.precision_score(test_labels, uni_and_bigram_prediction, labels=["queen", "not queen"], average="macro"))

(525, 2844)
Accuracy:  0.9163879598662207
Recall:  0.9350690448791714
Precision:  0.8799748743718593


From above, we can see the unigram-bigram model does the best. This is the one we will use later on.

# Predict

We will now do a preliminary prediction using just the (unigram-bigram) Naive Bayes Classifier. All files will be read in to get a tentative result for the size of the Queen's Archive.

In [10]:
# Read all files to get all Drehem transactions
def read_files(subdir, ids, reverse=False, num_files=15, reverse_limit=200):
    """Collect transactions from the ATF files ``p001.atf`` .. ``p<num_files>.atf``.

    A transaction starts at a line beginning with ``&P`` (its P-number is the
    first whitespace-separated token without the leading ``&``) and owns every
    following line up to the next ``&P`` header. The header line itself is not
    stored in ``Transaction.lines``.

    Parameters
    ----------
    subdir : str
        Prefix put in front of each file name (e.g. ``"raw-data/"``).
    ids : iterable of str
        P-numbers to select. The argument is NOT modified; matches are tracked
        in a private copy, so the same list can be passed again later.
    reverse : bool
        If False, keep texts whose P-number is in ``ids`` (each listed
        occurrence matches at most one text). If True, keep texts whose
        P-number is NOT in ``ids``, as long as no more than ``reverse_limit``
        transactions have been collected when the header is seen.
    num_files : int
        Number of consecutively numbered files to read (default 15).
    reverse_limit : int
        Cap used in reverse mode (default 200).

    Returns
    -------
    list of Transaction, in file order.
    """
    # Multiset copy: O(1) lookups (this matters for the full ID list),
    # duplicate-aware, and the caller's list stays intact.
    remaining = Counter(ids)
    transactions = list()
    for i in range(1, num_files + 1):
        file_name = "{}p{:03d}.atf".format(subdir, i)

        curr_transaction = None

        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if not line.startswith('&P'):
                    # Body line: belongs to the transaction being collected, if any.
                    if curr_transaction is not None:
                        curr_transaction.lines.append(line)
                    continue

                p_id = line.split()[0][1:]
                # Decide before flushing: the reverse cap counts only the
                # transactions already appended when the header is seen.
                if reverse:
                    wanted = p_id not in remaining and len(transactions) <= reverse_limit
                else:
                    wanted = remaining[p_id] > 0
                    if wanted:
                        remaining[p_id] -= 1

                # A new header always closes the transaction in progress.
                if curr_transaction is not None:
                    transactions.append(curr_transaction)
                curr_transaction = Transaction(p_id) if wanted else None

        # Flush the last transaction of the file.
        if curr_transaction is not None:
            transactions.append(curr_transaction)

    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    """Read one ID per line from `file` and return them as P-numbers.

    Each non-blank line is stripped and given a leading "P" unless it already
    has one, so both bare numbers ("100001") and full P-numbers ("P100001")
    are accepted. Blank lines are skipped instead of yielding a bare "P".
    """
    lst = list()
    with open(file, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            lst.append(line if line.startswith("P") else "P" + line)
    return lst

In [11]:
# IDs to classify. This relies on the get_drehem_ids defined in the cell above
# (the variant that adds the "P" prefix), not the one from the top of the notebook.
all_ids = get_drehem_ids("drehem_p_ids.txt")
# Collect the transaction for each of those IDs from the ATF files under raw-data/.
all_transactions = read_files("raw-data/", all_ids)

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 14594


In [12]:
# Build one lemma document per transaction, remembering its keyword labels
# and which transactions produced each (possibly repeated) document.
data, labs = [], []
mapping = {}

for t in all_transactions:
    t.get_lemmatization()
    labs.append(t.set_label())
    lemma = " ".join(t.get_sumerian_lemma())
    data.append(lemma)
    mapping.setdefault(lemma, []).append(t)

# Predict with the already-fitted unigram+bigram Naive Bayes pipeline
X_new_counts = uni_and_bigram_vectorizer.transform(data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

In [13]:
# Small sample of results
for doc, category in zip(data[:30], uni_and_bigram_prediction[:30]):
    print('%r => %s' % (doc, category))

'udu kišib ki lugal kalag lugal lugal an anubda limmu dubsar dumu arad' => not queen
'udu niga sila ga uš ud ki šu teŋ itud mu u hulu' => not queen
'sila sila mu.DU zabardab maškim u udu maš uš ekišibak ud lal ki itud mu us hulu' => not queen
'mašgal niga udu uš ud šag ki šu teŋ itud mu lugal' => not queen
'udu a sag udu niga mašgal niga udu sila ga kir ga uš ud ki šu teŋ itud mu lugal' => not queen
'udu niga sila niga udu sila ensik sila ensik sila mu.DU itud mu en maš pad ud' => not queen
'mašgal niga egia ensik ragaba maškim itud ud lal zal ki ŋiri dubsar itud mu en huŋ udu' => not queen
'amar mašda mašda amar amar mašda sila amar mašda maš sila ensik mu.DU dab itud gu mu us hulu ud' => not queen
'mašgal ki ensik dab itud mu huŋ udu' => not queen
'sila zabardab sila ensik sila ensik mu.DU dab itud akiti mu u mada ud hulu ud' => not queen
'mašgal niga lu maškim itud ud zal ki itud mu lugal hulu' => not queen
'gud ab mu.DU lugal ki bala zig ensik dab ziga itud mu en maš pad' => not qu

In [14]:
# Percentange of Queen transactions
print(len([i for i in uni_and_bigram_prediction if i == "queen"])/len(uni_and_bigram_prediction))

0.05543373989310676


# SVM

Now we will try to improve our classification by using an SVM, or a Support Vector Machine. We'll use `GridSearchCV` to determine the best hyperparameters for our model.

We will use the following hyperparameters:

**Kernel:** A transformation function; takes in data and transforms it into different forms. 

The kernels we will use are linear and RBF (radial basis function).
RBF kernels are useful when there is no prior knowledge about the data.

**Gamma:** Influence of a single sample.

A low `gamma` value means the influence of a single sample reaches far, giving a smoother decision boundary; a high `gamma` value means its influence is limited to nearby points, giving a more complex boundary that can overfit.

**C:** Regularization parameter - trades off between model simplicity and classification accuracy.

A low `C` value means a very smooth decision surface (easy to explain); a high `C` value fits the training data more closely, which raises training accuracy but risks overfitting.

### Pre-classification Check

Ideally, we want to use all of our training data to train our model for the best accuracy. Because we won't be holding anything back for validation, we should make sure that this is appropriate first; given our data, the model should be able to generalize well to other data.

The following section is very intensive and will take a long time to run (roughly 30 minutes for this dataset) but only needs to be run once. Here we do a k-fold cross-validation with 10 folds. For each of 10 iterations, 9 sections are used as training for a model and one section is held out for validation. For each section, we make sure to include the same proportion of queen and non-queen samples as with the overall data.

After running this, visually inspect the results and make sure that the results for each fold are similar; that is, there is no section where results drop dramatically, as this would indicate that the data does not generalize well to other data.

In [None]:
# Only run this section once

# Stratified 10-fold cross-validation of the TF-IDF + SVM pipeline.
# NOTE: this cell reads `queen_data`, which a later cell reassigns to the predicted
# queen documents, and it overwrites `training_data`/`test_data` and the fitted
# objects, so it only gives the intended result when run in notebook order.

n1 = len(queen_data)
n2 = len(non_queen_data)
# Fold boundaries (not sizes): fold i covers [bounds[i], bounds[i + 1]);
# the last fold absorbs the remainder.
q_fold_size = [i * (n1 // 10) for i in range(10)] + [n1]
nq_fold_size = [i * (n2 // 10) for i in range(10)] + [n2]
all_pred = []
all_true_lab = []
acc = []
rec = []
prec = []

for i in range(10):
    print("Starting fold", i)
    q_start, q_end = q_fold_size[i], q_fold_size[i + 1]
    nq_start, nq_end = nq_fold_size[i], nq_fold_size[i + 1]
    # Hold out the same slice position of each class so every fold keeps the class ratio.
    test_data = queen_data[q_start:q_end] + non_queen_data[nq_start:nq_end]
    test_labels = queen_labels[q_start:q_end] + non_queen_labels[nq_start:nq_end]
    training_data = queen_data[:q_start] + queen_data[q_end:] + non_queen_data[:nq_start] + non_queen_data[nq_end:]
    training_labels = queen_labels[:q_start] + queen_labels[q_end:] + non_queen_labels[:nq_start] + non_queen_labels[nq_end:]

    uni_bi_vect = CountVectorizer(analyzer="word",
                                  ngram_range=(1, 2),
                                  token_pattern='(?u)\\b\\w+\\b')

    # Train
    X_train_counts = uni_bi_vect.fit_transform(training_data)

    # Get TF-IDF
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Classifier
    params = [{'kernel': ['rbf', 'linear'],
               'gamma': [1e-4, 1e-3, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100],
               'C': [1, 5, 10, 50]}]
    uni_bi_clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), params)
    uni_bi_clf.fit(X_train_tfidf, training_labels)
    print("Best parameters found:")
    print(uni_bi_clf.best_params_)

    # Predict once per fold and reuse the result for the pooled and per-fold metrics
    X_new_counts = uni_bi_vect.transform(test_data)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    uni_bi_pred = uni_bi_clf.predict(X_new_tfidf)
    all_pred.extend(uni_bi_pred)
    all_true_lab.extend(test_labels)

    # `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
    acc.append(np.mean(uni_bi_pred == test_labels))
    rec.append(metrics.recall_score(test_labels, uni_bi_pred, labels=["queen", "not queen"], average="macro"))
    prec.append(metrics.precision_score(test_labels, uni_bi_pred, labels=["queen", "not queen"], average="macro"))

# "Total" pools the predictions of all folds; "Average" is the mean of the per-fold scores.
print("Total Accuracy: ", sum(1 for p, t in zip(all_pred, all_true_lab) if p == t) / len(all_pred))
print("Average Accuracy: ", sum(acc) / len(acc))
print("Total Recall: ", metrics.recall_score(all_true_lab, all_pred, labels=["queen", "not queen"], average="macro"))
print("Average Recall: ", sum(rec) / len(rec))
print("Total Precision: ", metrics.precision_score(all_true_lab, all_pred, labels=["queen", "not queen"], average="macro"))
print("Average Precision: ", sum(prec) / len(prec))

Once the generalizability of the data is verified, we will use all of our pre-labeled data to train the model.

In [15]:
# Final SVM: fit on every labeled document, then classify the whole corpus.
uni_bi_vect = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    token_pattern='(?u)\\b\\w+\\b',
)

# Term counts for the labeled data
X_train_counts = uni_bi_vect.fit_transform(all_data)

# TF-IDF weighting fitted on those counts
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Grid search over kernel, gamma and C
params = [
    dict(
        kernel=['rbf', 'linear'],
        gamma=[1e-4, 1e-3, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100],
        C=[1, 5, 10, 50],
    )
]
uni_bi_clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), params)
uni_bi_clf.fit(X_train_tfidf, all_labels)
print("Best parameters found:")
print(uni_bi_clf.best_params_)

# Classify all transactions with the fitted vectorizer, TF-IDF and SVM
X_all_counts = uni_bi_vect.transform(data)
X_all_tfidf = tfidf_transformer.transform(X_all_counts)
uni_bi_pred_v2 = uni_bi_clf.predict(X_all_tfidf)
print("Percentage Queens' Texts Predicted")
# Printed as a fraction between 0 and 1
print(sum(1 for pred in uni_bi_pred_v2 if pred == 'queen') / len(uni_bi_pred_v2))

Best parameters found:
{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}
Percentage Queens' Texts Predicted
0.06838426750719474


In [16]:
# Split the corpus by SVM prediction; keyword labels are kept for the non-queen side only.
# NOTE: this rebinds `queen_data`, which earlier held the labeled queen documents.
queen_data, non_queens, non_queen_labs = [], [], []
for text, keyword_label, predicted_class in zip(data, labs, uni_bi_pred_v2):
    if predicted_class == "queen":
        queen_data.append(text)
    elif predicted_class == "not queen":
        non_queens.append(text)
        non_queen_labs.append(keyword_label)
print(len(non_queens))

13596


## Separating into Archives

Now that we have our classifier, we'll separate the data into archives. We'll also do an inspection of the text that was labeled as "Unknown" to see if there are more labels we can use to classify.

In [20]:
dead_animal_archive = []
domesticated_animal_archive = []
wild_animal_archive = []
leather_object_archive = []
precious_object_archive = []
wool_archive = []
seal_archive = []
Unknown_archive = []
Unknown_labs = []

# Label name -> destination list. Replaces building and exec-ing a statement
# string from the label name; an unexpected label now fails with a KeyError.
archive_by_label = {
    "dead_animal": dead_animal_archive,
    "domesticated_animal": domesticated_animal_archive,
    "wild_animal": wild_animal_archive,
    "leather_object": leather_object_archive,
    "precious_object": precious_object_archive,
    "wool": wool_archive,
    "Unknown": Unknown_archive,
}

for x, y in zip(non_queens, non_queen_labs):
    # A seal marker wins over every keyword label.
    if "seal" in y:
        seal_archive.append(x)
        continue
    # Dominant label = the one with the most matching lines
    # (ties go to the label that was found first).
    lab = max(y, key=lambda name: len(y[name]))
    if lab == "Unknown":
        # Keep the raw lines for the later inspection of unlabeled texts.
        Unknown_labs.append(list(y.values()))
    if "animal" in lab and "dead_animal" in y:
        # Any mention of death moves an animal text to the dead-animal archive.
        dead_animal_archive.append(x)
    else:
        archive_by_label[lab].append(x)

print("Dead Animals archive size:", len(dead_animal_archive))
print("Domesticated Animals archive size:", len(domesticated_animal_archive))
print("Leather Objects archive size:", len(leather_object_archive))
print("Precious Objects archive size:", len(precious_object_archive))
print("Wild Animals archive size:", len(wild_animal_archive))
print("Wool archive size:", len(wool_archive))
print("Seal archive size:", len(seal_archive))
print("Unknown archive size:", len(Unknown_archive))

{'domesticated_animal': ['#lem: n; udu[sheep]']}
{'domesticated_animal': ['#lem: n; udu[sheep]; niga[fattened]', '#lem: n; sila[lamb]; ga[milk]'], 'dead_animal': ['#lem: uš[die]']}
{'domesticated_animal': ['#lem: n; sila[lamb]', '#lem: n; sila[lamb]; FN', '#lem: n; u[ewe]; n; udu[sheep]; n; maš[goat]'], 'dead_animal': ['#lem: uš[die]; ekišibak[storeroom]']}
{'domesticated_animal': ['#lem: n; mašgal[goat]; niga[fattened]; n; udu[sheep]'], 'dead_animal': ['#lem: uš[die]']}
{'domesticated_animal': ['#lem: n; udu[sheep]; a[water]; u; sag[rare]', '#lem: n; udu[sheep]; niga[fattened]', '#lem: n; mašgal[goat]; niga[fattened]', '#lem: n; udu[sheep]', '#lem: n; sila[lamb]; ga[milk]', '#lem: n; kir[lamb]; ga[milk]'], 'dead_animal': ['#lem: uš[die]']}
{'domesticated_animal': ['#lem: n; udu[sheep]; niga[fattened]; n; sila[lamb]; niga[fattened]', '#lem: n; udu[sheep]; n; sila[lamb]', '#lem: n; sila[lamb]', '#lem: n; sila[lamb]; PN', '#lem: mu[year]; en[priest]; DN; maš[goat]; pad[find]']}
{'domesti

{'precious_object': ['#lem: n; n; giŋ[unit]; kugbabbar[silver]'], 'domesticated_animal': ['#lem: mu[year]; en[priest]; DN; maš[goat]; pad[find]']}
{'domesticated_animal': ['#lem: n; kuš[skin]; dusu[equid]']}
{'domesticated_animal': ['#lem: n; kuš[skin]; gud[ox]', '#lem: n; adda[corpse]; gud[ox]', '#lem: n; kuš[skin]; dusu[equid]']}
{'Unknown': [['#project: epsd2/u3adm/p002', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(u) 7(aš) 2(barig) 5(diš) sila₃ še gur', '#lem: n; n; n; n; sila[unit]; še[barley]; gur[unit]', '2. sa₂-du₁₁ ku₅-ra₂', '#lem: sadug[offerings]; kud[cut]', '3. ki na-lu₅-ta', '#lem: ki[place]; PN', '4. mu-kuₓ(DU)', '#lem: mu.DU[delivery]', '@reverse', '1. nu-ur₂-{d}suen', '#lem: PN', '2. šu ba-ti', '#lem: šu[hand]; teŋ[approach]', '$ 1 line blank', '3. iti maš-da₃-gu₇', '#lem: itud[moon]; MN', '4. mu gu-za {d}en-lil₂-la₂ ba-dim₂', '#lem: mu[year]; guza[chair]; PN; dim[create]']]}
{'Unknown': [['#project: epsd2/u3adm/p002', '#atf: use unicode', '#atf:

{'domesticated_animal': ['#lem: n; n; gud[ox]; niga[fattened]; n; lal[small]; n; gud[ox]', '#lem: n; n; n; gud[ox]; niga[fattened]; n; n; gud[ox]', '#lem: n; n; gud[ox]; niga[fattened]; n; n; gud[ox]', '#lem: n; n; gud[ox]; niga[fattened]; n; n; gud[ox]', '#lem: n; n; gud[ox]; niga[fattened]; n; n; gud[ox]', '#lem: šuniŋin[total]; n; n; n; gud[ox]; niga[fattened]', '#lem: šuniŋin[total]; n; n; n; gud[ox]', '#lem: šuniŋin[total]; n; n; gud[ox]; hi[mix]']}
{'domesticated_animal': ['#lem: n; n; gud[ox]', '#lem: mu[year]; en[priest]; DN; maš[goat]; pad[find]']}
{'domesticated_animal': ['#lem: n; udu[sheep]; niga[fattened]', '#lem: n; mašgal[goat]; niga[fattened]']}
{'domesticated_animal': ['#lem: n; udu[sheep]; sadug[offerings]; FN', '#lem: n; n; udu[sheep]; n; u[ewe]', '#lem: n; uzud[goat]'], 'dead_animal': ['#lem: uš[die]; mu[year]; X']}
{'domesticated_animal': ['#lem: n; udu[sheep]', '#lem: n; kir[lamb]', '#lem: n; n; uzud[goat]', '#lem: n; maš[goat]']}
{'domesticated_animal': ['#lem: n

{'domesticated_animal': ['#lem: n; sila[lamb]', '#lem: n; maš[goat]']}
{'domesticated_animal': ['#lem: n; n; n; udu[sheep]', '#lem: n; n; sila[lamb]', '#lem: n; n; mašgal[goat]', '#lem: n; maš[goat]', '#lem: n; maš[goat]; ga[milk]']}
{'domesticated_animal': ['#lem: n; sila[lamb]']}
{'domesticated_animal': ['#lem: n; n; n; sila[lamb]', '#lem: n; maš[goat]']}
{'domesticated_animal': ['#lem: gud[ox]', '#lem: n; udu[sheep]', '#lem: n; sila[lamb]', '#lem: n; mašgal[goat]', '#lem: gud[ox]; n; n; udu[sheep]']}
{'domesticated_animal': ['#lem: n; sila[lamb]', '#lem: n; maš[goat]']}
{'domesticated_animal': ['#lem: n; sila[lamb]']}
{'domesticated_animal': ['#lem: n; sila[lamb]', '#lem: n; sila[lamb]; PN; lu[person]; lumumun[priest]; maškim[administrator]', '#lem: šuniŋin[total]; n; sila[lamb]']}
{'domesticated_animal': ['#lem: n; udu[sheep]', '#lem: n; sila[lamb]']}
{'domesticated_animal': ['#lem: n; udu[sheep]; PN', '#lem: n; udu[sheep]; PN; kuš[official]', '#lem: n; maš[goat]; PN', '#lem: n; gu

{'domesticated_animal': ['#lem: n; n; n; udu[sheep]', '#lem: n; sila[lamb]', '#lem: n; mašgal[goat]']}
{'wild_animal': ['#lem: n; amar[young]; az[bear]']}
{'domesticated_animal': ['#lem: n; sila[lamb]']}
{'domesticated_animal': ['#lem: n; lal[small]; n; udu[sheep]', '#lem: n; sila[lamb]', '#lem: n; maš[goat]']}
{'domesticated_animal': ['#lem: n; sila[lamb]']}
{'Unknown': [['#project: epsd2/u3adm/p006', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 2(diš) {munus}aš₂-gar₃', '#lem: n; ašgar[kid]', '2. u₄ 2(u) 2(diš)-kam', '#lem: ud[sun]; n; n', '3. ki ab-ba-sa₆-ga-ta', '#lem: ki[place]; PN', '4. na-lu₅', '#lem: PN', '@reverse', '1. i₃-dab₅', '#lem: dab[seize]', '$ 1 line blank', '2. iti u₅-bi₂-gu₇', '#lem: itud[moon]; MN', '3. mu en-unu₆-gal {d}inanna unu{ki}-ga ba-hun', '#lem: mu[year]; X; TN; GN; huŋ[hire]', '@left', '1. 2(diš)', '#lem: n']]}
{'domesticated_animal': ['#lem: n; gud[ox]; niga[fattened]', '#lem: n; udu[sheep]', '#lem: n; maš[goat]']}
{'domesticated_anim

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [89]:
# Take the first value of each label dict in non_queen_labs; these are the
# per-text collections of lines we scan for lemma annotations.
lem_lists = [list(x.values())[0] for x in non_queen_labs]

# Keep only the "#lem:" annotation lines.
lines = [line for lem in lem_lists for line in lem if line[:5] == "#lem:"]
line_string = "".join(lines)

# Split the concatenated annotations into individual tokens on ";", "#lem:"
# and "|", keeping only tokens that carry a bracketed gloss (e.g. "udu[sheep]").
# The pattern is a raw string so that "\|" reaches the regex engine untouched
# instead of being an invalid Python string escape.
translations = [
    t.strip()
    for t in re.split(r";|#lem:|\|", line_string)
    if len(t) > 2 and "[" in t
]

In [17]:
# Reverse glossary: bracketed gloss (e.g. "sheep") -> list of the words that
# appear in front of it in the lemma annotations (e.g. "udu").
trans_dict = {}
for tr in translations:
    # Split on the first "[" only, so a token containing more than one "["
    # no longer fails to unpack into exactly two parts.
    word, defn = tr.split("[", 1)
    # endswith() is safe on an empty gloss, unlike indexing defn[-1].
    if defn.endswith("]"):
        defn = defn[:-1]
    trans_dict.setdefault(defn, []).append(word)

In [18]:
# Collect the lemma annotations of the texts that fell under the Unknown category

# Each entry of Unknown_labs is indexed as entry[0][0] to reach its list of lines.
Unknown_lines = [entry[0][0] for entry in Unknown_labs]

# Flatten, keeping only the lines that begin with "#lem:".
Unknown_lemmas = [
    val
    for text_lines in Unknown_lines
    for val in text_lines
    if val[:5] == "#lem:"
]

In [19]:
line_string_unknown = "".join(Unknown_lemmas)

# Unique glossed tokens from the Unknown texts. Sorted so the list (and
# everything derived from it) is identical from one run to the next; plain
# list(set(...)) ordering depends on string hashing. The pattern is a raw string
# so "\|" is not treated as an invalid Python string escape.
translations_unknown = sorted({
    t.strip()
    for t in re.split(r";|#lem:|\|", line_string_unknown)
    if len(t) > 2 and "[" in t
})

# Add the Unknown-text tokens to the existing gloss -> words mapping.
for tr in translations_unknown:
    # Split on the first "[" only so tokens with several brackets still unpack.
    word, defn = tr.split("[", 1)
    # endswith() is safe on an empty gloss, unlike indexing defn[-1].
    if defn.endswith("]"):
        defn = defn[:-1]
    trans_dict.setdefault(defn, []).append(word)

# De-duplicate each word list; sorted for reproducible output.
for k, v in trans_dict.items():
    trans_dict[k] = sorted(set(v))

In [20]:
# Every bracketed gloss found in the Unknown lemma lines, with its frequency.
# Raw string: "\[" and "\]" are regex escapes, not Python string escapes.
non_tagged_words = re.findall(r"\[(.*?)\]", " ".join(Unknown_lemmas))
non_tagged_freq = Counter(non_tagged_words)

# Most frequent first; ties broken alphabetically.
sorted_words_by_freq = sorted(non_tagged_freq, key=lambda x: (-non_tagged_freq[x], x))

# Pair each gloss with the words recorded for it. .get() avoids a KeyError for
# a gloss that was extracted here but never entered in trans_dict (the two are
# built with different splitting/filtering rules).
words_with_sumerian = [[x, trans_dict.get(x, [])] for x in sorted_words_by_freq]

In [21]:
# Write one row per gloss: the gloss, its frequency, and the list of words.
# The context manager guarantees the file is closed even if a write fails.
# NOTE(review): str(defs) is a Python list repr and contains commas whenever a
# gloss has more than one word, so rows are not strictly comma-separable.
with open('unknown_text_words.csv', 'w', encoding='utf-8') as unknown_words:
    unknown_words.write("Word, Frequency, List of Sumerian Words\n")
    for w, defs in words_with_sumerian:
        unknown_words.write(w + ", " + str(non_tagged_freq[w]) + ", " + str(defs) + "\n")

In [22]:
# Map every transaction's p_id to the name of the archive it was assigned to.
id_archive_map = {}

# (collection of texts, archive name) pairs. Order matters: a p_id reached
# through more than one collection keeps the name of the LAST one listed here,
# exactly as with the original sequence of loops.
archive_names = [
    (dead_animal_archive, "dead animal"),
    (wild_animal_archive, "wild animal"),
    (domesticated_animal_archive, "domesticated animal"),
    (leather_object_archive, "leather object"),
    (precious_object_archive, "precious object"),
    (wool_archive, "wool"),
    (Unknown_archive, "unknown"),
    (queen_data, "queen"),
]

for archive_texts, archive_name in archive_names:
    for text in archive_texts:
        # mapping[text] is the list of transactions associated with this text.
        for transaction in mapping[text]:
            id_archive_map[transaction.p_id] = archive_name

14594

In [23]:
# Write results of archive classification to CSV file

# The context manager guarantees the file is closed even if a write fails.
with open('archive_map.csv', 'w', encoding='utf-8') as archive:
    archive.write("PID,Archive\n")
    for k, v in id_archive_map.items():
        archive.write(k + "," + v + "\n")

## Random Sample Creation

In [64]:
# Draw 50 texts that are not part of the training set, and collect their
# p_id, text and archive label in three parallel lists.

# NOTE(review): the first character of every complete_list entry is dropped
# before comparing against the keys of id_archive_map — confirm the two ID
# formats actually line up, otherwise nothing is excluded.
training_ids = [pid[1:] for pid in complete_list]
all_texts_ids = list(id_archive_map.keys())

# Set membership keeps the filter linear instead of scanning a list per ID.
training_id_set = set(training_ids)
non_training_ids = [pid for pid in all_texts_ids if pid not in training_id_set]

# NOTE(review): no seed is set here, so the sample differs on every run.
random_indices = random.sample(range(len(non_training_ids)), 50)
chosen_indices = set(random_indices)
# Selected IDs are kept in their non_training_ids order, not in draw order.
random_ids = [pid for index, pid in enumerate(non_training_ids) if index in chosen_indices]
random_id_set = set(random_ids)

random_sample_ids = []
random_sample_texts = []
random_sample_labels = []

for text, t_list in mapping.items():
    for t in t_list:
        if t.p_id in random_id_set:
            random_sample_ids.append(t.p_id)
            random_sample_texts.append(text)
            random_sample_labels.append(id_archive_map[t.p_id])
            
# print(random_sample_labels)
# print(random_sample_texts)

In [65]:
# Write results of random sample to CSV file
import csv  # stdlib; imported here because this is the only cell that needs it

# csv.writer quotes any field containing a comma, quote or newline, so the
# text column can no longer break the row structure. newline='' is what the
# csv module expects of the file object; lineterminator="\n" keeps the same
# line endings the hand-built rows used.
with open('random_sample.csv', 'w', encoding='utf-8', newline='') as sample:
    writer = csv.writer(sample, lineterminator="\n")
    writer.writerow(["PID", "Text", "Archive"])
    # The three lists are filled in lockstep, so zip yields one row per text.
    writer.writerows(zip(random_sample_ids, random_sample_texts, random_sample_labels))

## Multiple Transactions

In [19]:
# Pick out a single transaction by p_id for inspection (None if absent).
# Other p_ids previously tried here: P125693, P142790, P124036
t = next((tr for tr in all_transactions if tr.p_id == "P430140"), None)
t.lines

['#project: epsd2/u3adm/p013',
 '#atf: use unicode',
 '#atf: lang sux',
 '@object seal',
 '@surface a',
 '1. an-ne₂-ba-ab-du₇',
 '#lem: PN',
 '2. dub-sar',
 '#lem: dubsar[scribe]',
 '3. dumu ka-sa₆',
 '#lem: dumu[child]; X']

In [78]:
# Keep the "#lem:" annotations plus every line that is neither a "#" comment
# nor an "@" structure marker. Empty lines are skipped rather than indexed,
# which previously raised IndexError on line[0].
use = [
    line
    for line in t.lines
    if line.startswith("#lem:") or (line and line[0] not in "#@")
]
use_str = "\n".join(use)

# Find a numbered text line with a leading count, immediately followed by a
# "#lem: n; ...[gloss]" annotation line. Raw string so "\.", "\(" and "\[" are
# regex escapes rather than invalid Python string escapes.
# NOTE(review): the pattern requires a trailing "\n", and use_str has none after
# its last line, so a matching pair at the very end is not reported.
re.findall(r"([1-9]\. [1-9]+\(.*\) .*\n#lem: n; .+\[.+\]\n)", use_str)

['4. 4(diš) udu niga 1(diš) sila₄#\n#lem: n; udu[sheep]; niga[fattened]; n; sila[lamb]\n',
 '5. 3(diš) udu niga 1(diš) sila₄\n#lem: n; udu[sheep]; niga[fattened]; n; sila[lamb]\n']

In [46]:
# for t in all_transactions:
#     if t.p_id in random_sample_ids:
#         print(t.p_id)
#         print(" ".join(t.get_sumerian_lemma()))