In [155]:
# Import Libraries
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from collections import OrderedDict, Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score

# Extract Transactions

In [245]:
DREHEM_IDS = 'clean_drehem_ids.txt'
QUEEN_ARCHIVES_IDS = 'queen_archives_pids.txt'
QUEEN_OIP_IDS = 'oip_pids.txt'

labels = {}
labels["domesticated_animal"] = ["[ox]", "[cow]", "[sheep]", "[goat]", "[lamb]"] # account for plural
labels["wild_animal"] = ["[bear]", "[gazelle]", "[mountain]"] # account for "mountain animal" and plural
labels["dead_animal"] = ["[die]"] # find "die" before finding domesticated or wild
labels["leather_object"] = ["[boots]", "[sandals]"]
labels["precious_object"] = ["[copper]", "[bronze]", "[silver]", "[gold]"]
labels["wool"] = ["[wool]"]
labels["queens_archive"] = []

class Transaction:
    def __init__(self, p_id):
        self.p_id = p_id
        self.lines = list()
        self.lemmas = OrderedDict() # Maps Sumerian text to its lemmatized form
        self.label = {} # Maps label to List of defining text
        self.sumerian_lemmas = []
        
    # Create mapping of Sumerian text to its lemmatized form
    def get_lemmatization(self):
        first_line = 0
        for i, s in enumerate(self.lines):
            if s.startswith("1."):
                  first_line = i
                  break
        while first_line < len(self.lines)-1:
            if self.lines[first_line] and self.lines[first_line][0].isnumeric() and self.lines[first_line+1].startswith("#lem"):
                self.lemmas[self.lines[first_line]] = self.lines[first_line+1]
                first_line += 2
            else:
                first_line += 1
                
        return self.lemmas
    
    # Get Sumerian lemmatized text only
    def get_sumerian_lemma(self):
        #print(item.sumerian_lemmas)
        item.sumerian_lemmas = []
        for k, v in self.lemmas.items():
            #print(v)
            result = re.findall(" .*\[[a-z]+\]", v)
            if len(result) == 0:
                continue
            lemmas = [s[:s.index("[")].strip() for s in result[0].split(";") if re.search("\[", s)]
            self.sumerian_lemmas += lemmas
        return self.sumerian_lemmas
    
    # Find the most likely label
    def set_label(self):
        def find_label(label, line, found) :
            for val in labels[label]:
                if val in line: 
                    if label in found.keys():
                        found[label].append(line)
                    else:
                        found[label] = [line]
                    return True
        found = {}
        for line in self.lines:
            label = None
            # Priority 1: Check for dead animal
            if find_label("dead_animal", line, found): continue
            # Priority 2: Check for wild animal
            if find_label("wild_animal", line, found): continue
            # Priority 3: Check for domesticated animal
            if find_label("domesticated_animal", line, found): continue
            # Priority 4: Check leather, wool, or precious object
            if find_label("leather_object", line, found): continue
            if find_label("precious_object", line, found): continue
            if find_label("wool", line, found): break
        # If none match, label as "Unknown"
        if len(found.keys()) == 0:
            found["Unknown"] = [self.lines]
        self.label = found
        return found
            
    
# Read ORACC files to find transactions with p_ids in `ids`
def read_files(subdir, ids, reverse=False):
    transactions = list()
    for i in range(1, 16):
        file_name = ""
        if i < 10:
            file_name += subdir + "p00" + str(i) + ".atf"
        else:
            file_name += subdir + "p0" + str(i) + ".atf"
        
        curr_transaction = None
        
        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if line.startswith('&P'):
                    p_id = line.split()[0][1:]
                    if (not reverse and p_id in ids):
                        ids.remove(p_id)
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    elif (reverse and p_id not in ids and len(transactions) <= 200):
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    else:
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        curr_transaction = None
                else:
                    if curr_transaction:
                        curr_transaction.lines.append(line)
        
        if curr_transaction:
            transactions.append(curr_transaction)
    
    #print(ids)
    #assert len(ids) == 0
    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    lst = list()
    with open(file, encoding="utf8") as f:
        for line in f:
            if line.startswith("P"):
                line = line.strip()
                lst.append(line)
    return lst

In [246]:
list_drehem_ids = get_drehem_ids(DREHEM_IDS)
list_queen_ids = get_drehem_ids(QUEEN_ARCHIVES_IDS)
list_oip_queen_ids = get_drehem_ids(QUEEN_OIP_IDS)
complete_list = list_drehem_ids + list_queen_ids + list_oip_queen_ids
# list_more_data = get_drehem_ids("more_training_data.txt")
# more_training_data = read_files("raw-data/", list_more_data)
# text = []
# with open('more_training_data2.txt', 'w', encoding="utf8") as f:
#     for item in more_training_data:
#         f.write(item.p_id+"\n")
#         item.get_lemmatization()
#         for i in item.lemmas.keys():
#             f.write(i+"\n")
#         f.write("\n")
#all_transactions = read_files("raw-data/", complete_list)
non_queen_list = read_files("raw-data/", list_drehem_ids)
queen_training_list = read_files("raw-data/", list_queen_ids)
queen_test_set = read_files("raw-data/", list_oip_queen_ids)
#more_training_data = read_files("raw-data/", complete_list, True)

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 429
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 275
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf

In [247]:
# Populate training and data set

for item in queen_training_list:
    item.get_lemmatization()
    item.set_label()
    
for item in non_queen_list:
    item.get_lemmatization()
    item.set_label()
    
for item in queen_test_set:
    item.get_lemmatization()
    item.set_label()
    
            
training_data = []
training_labels = []
test_data = []
test_labels = []

for item in queen_training_list[:175]:
    training_data.append(" ".join(item.get_sumerian_lemma()))
    training_labels.append("queen")
    
for i in range(len(non_queen_list)):
    if i < 350:
        training_data.append(" ".join(non_queen_list[i].get_sumerian_lemma()))
        training_labels.append("not queen")
    else:
        test_data.append(" ".join(non_queen_list[i].get_sumerian_lemma()))
        test_labels.append("not queen")
        
for item in queen_test_set:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")
    
for item in queen_training_list[175:]:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")

print(len(training_data))
print(len(training_labels))
print(len(test_data))
print(len(test_labels))


all_data = training_data + test_data
all_labels = training_labels + test_labels

queen_data = [x for x, y in zip(all_data, all_labels) if y == "queen"]
queen_labels = ["queen"] * len(queen_data)
non_queen_data = [x for x, y in zip(all_data, all_labels) if y == "not queen"]
non_queen_labels = ["not queen"] * len(non_queen_data)

525
525
299
299


# Multinomial Naive Bayes Classifer
For classifying queen's archives transactions


<b>Accuracy</b>: 
(# true positives + # true negatives) / total #<br><br>
<b>Recall</b>:
true positives / (true positives + false positives) <br>
High recall means that an algorithm returned most of the relevant results <br><br>
<b>Precision</b>:
true positives / (true positives + false negatives) <br>
High precision means that an algorithm returned substantially more relevant results than irrelevant ones

In [159]:
# Bag of Words model
count_vect = CountVectorizer(analyzer = "word",
                                          tokenizer = None,    
                                          preprocessor = None,
                                          ngram_range = (1, 1),
                                          binary = False,
                                          strip_accents='unicode',
                                          token_pattern='(?u)\\b\\w+\\b')

X_train_counts = count_vect.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.

for x in enumerate(count_vect.get_feature_names()):
    print(x)
    

(0, 'a')
(1, 'ab')
(2, 'abba')
(3, 'abzu')
(4, 'adda')
(5, 'agaʾus')
(6, 'aguziga')
(7, 'ak')
(8, 'akiti')
(9, 'alan')
(10, 'am')
(11, 'ama')
(12, 'amar')
(13, 'amarsag')
(14, 'an')
(15, 'anna')
(16, 'anse')
(17, 'anubda')
(18, 'apinla')
(19, 'ara')
(20, 'arad')
(21, 'arar')
(22, 'arua')
(23, 'asag')
(24, 'asgab')
(25, 'asgar')
(26, 'aslum')
(27, 'atu')
(28, 'atua')
(29, 'az')
(30, 'aŋarak')
(31, 'ba')
(32, 'babbar')
(33, 'bad')
(34, 'bala')
(35, 'bappir')
(36, 'bar')
(37, 'barag')
(38, 'barakara')
(39, 'bisaŋdubak')
(40, 'da')
(41, 'dab')
(42, 'dabin')
(43, 'dag')
(44, 'dam')
(45, 'damgar')
(46, 'dara')
(47, 'de')
(48, 'deg')
(49, 'dida')
(50, 'didli')
(51, 'dikud')
(52, 'dim')
(53, 'dirig')
(54, 'diŋir')
(55, 'du')
(56, 'dubsar')
(57, 'dug')
(58, 'duga')
(59, 'duh')
(60, 'duksium')
(61, 'dumu')
(62, 'dumumunus')
(63, 'dupsik')
(64, 'dur')
(65, 'durah')
(66, 'dusia')
(67, 'dusu')
(68, 'e')
(69, 'eban')
(70, 'edula')
(71, 'egal')
(72, 'egia')
(73, 'egir')
(74, 'ekisibak')
(75, 'eme')
(

In [72]:
print(training_data[0:5])
print(X_train_counts)

['udunita u kir ur itud uzud kir ur udunita mašgal itud akiti sadug u mu e du', 'gud udu u mu.DU kurušda dab itud mu us hulu', 'sila maš sila mu.DU dab itud mu hulu', 'udu maš mu.DU dab itud mu hulu', 'udu niga sadug kag eš udu niga sadug kag ŋipar udu u itud ud zal ziga šag itud mu us hulu']
  (0, 55)	1
  (0, 68)	1
  (0, 211)	1
  (0, 256)	1
  (0, 8)	1
  (0, 206)	1
  (0, 342)	1
  (0, 148)	2
  (0, 329)	2
  (0, 163)	2
  (0, 315)	2
  (0, 319)	2
  (1, 129)	1
  (1, 333)	1
  (1, 41)	1
  (1, 183)	1
  (1, 318)	1
  (1, 113)	1
  (1, 55)	1
  (1, 211)	2
  (1, 148)	1
  (1, 315)	1
  (2, 202)	1
  (2, 279)	2
  (2, 129)	1
  :	:
  (523, 316)	1
  (523, 228)	3
  (523, 41)	1
  (523, 318)	1
  (523, 211)	1
  (523, 206)	1
  (523, 148)	1
  (523, 315)	1
  (524, 307)	1
  (524, 201)	1
  (524, 100)	1
  (524, 76)	1
  (524, 288)	1
  (524, 189)	5
  (524, 133)	1
  (524, 77)	1
  (524, 291)	1
  (524, 366)	1
  (524, 207)	1
  (524, 157)	1
  (524, 257)	1
  (524, 316)	1
  (524, 318)	7
  (524, 211)	2
  (524, 148)	1


In [160]:
# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
# print(X_train_tfidf)

# Classifier
bag_of_words_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = bag_of_words_classifier.predict(X_new_tfidf)

# for doc, category in zip(docs_new, predicted):
#     print('%r => %s' % (doc, category))
# print(predicted)
    
print("Accuracy: ", np.mean(predicted == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, predicted, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, predicted, ["queen", "not queen"], average="macro")))

(525, 368)
Accuracy:  0.8595317725752508
Recall:  0.9004890678941312
Precision:  0.824953314659197


In [161]:
# Bigram Model
bigram_vectorizer = CountVectorizer(analyzer = "word",
                                    tokenizer = None,
                                    preprocessor = None,
                                    ngram_range = (2, 2),
                                    strip_accents='unicode',
                                    token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
bigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

bigram_multinomial_nb_prediction = bigram_classifier.predict(X_new_tfidf)

print("Accuracy: ", np.mean(bigram_multinomial_nb_prediction == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, bigram_multinomial_nb_prediction, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, bigram_multinomial_nb_prediction, ["queen", "not queen"], average="macro")))

(525, 2476)
(525, 2476)
Accuracy:  0.8996655518394648
Recall:  0.9237054085155351
Precision:  0.861512027491409


In [162]:
# Trigram Model
trigram_vectorizer = CountVectorizer(analyzer = "word",
                                    tokenizer = None,
                                    preprocessor = None,
                                    ngram_range = (3, 3),
                                    strip_accents='unicode',
                                    token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = trigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
trigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = trigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

trigram_prediction = trigram_classifier.predict(X_new_tfidf)

print("Accuracy: ", np.mean(trigram_prediction == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, trigram_prediction, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, trigram_prediction, ["queen", "not queen"], average="macro")))

(525, 4711)
(525, 4711)
Accuracy:  0.8929765886287625
Recall:  0.9151035673187572
Precision:  0.8541728031418754


In [163]:
# Unigram and Bigram Model
uni_and_bigram_vectorizer = CountVectorizer(analyzer = "word",
                                            tokenizer = None,
                                            preprocessor = None,
                                            binary = False,
                                            ngram_range = (1,2),
                                            strip_accents='unicode',
                                            token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = uni_and_bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
uni_and_bigram_classifier = MultinomialNB(0.5).fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = uni_and_bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

print("Accuracy: ", np.mean(uni_and_bigram_prediction == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, uni_and_bigram_prediction, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, uni_and_bigram_prediction, ["queen", "not queen"], average="macro")))

(525, 2844)
(525, 2844)
Accuracy:  0.9163879598662207
Recall:  0.9350690448791714
Precision:  0.8799748743718593


# Clustering

GOAL: find more commodity labels in the set of non-queen data

DBSCAN Model:
Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density

In [78]:
# Build DBSCAN model with Tf-idf vectorizer
tfidfvec = TfidfVectorizer(ngram_range=(1,2), min_df = 0.0, max_df = 1.0, decode_error = "ignore", token_pattern='(?u)\\b\\w+\\b')

# Run DBSCAN
trans_list = []
trans_dict = {}

for trans in non_queen_list:
    trans_dict[" ".join([lemma[2:] for lemma in trans.lemmas])] = trans
    trans_list.append(" ".join([lemma[2:] for lemma in trans.lemmas]))
    
X1 = tfidfvec.fit_transform(trans_list).toarray()
# (1,2),1.0 = 9, (1,2),1.1 = 6, (1,3),1.2 = 7
db1 = DBSCAN(eps=1.0, min_samples=len(trans_list)/100).fit(X1)  # Higher eps => More leniency to be same cluster
core_samples_mask = np.zeros_like(db1.labels_, dtype=bool)
core_samples_mask[db1.core_sample_indices_] = True

labels1 = db1.labels_
n_clusters_ = len(set(labels1)) - (1 if -1 in labels1 else 0) # Number of clusters in labels
print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 9


In [79]:
# Print clusters
clusters = {}
for c, i in enumerate(labels1):
    if i == -1:
        continue
    elif i in clusters:
        clusters[i].append(trans_list[c] )
    else:
        clusters[i] = [trans_list[c]]

i= 0
for c in clusters:
    print("Cluster", i)
    print("=========","\n")
    for trans in clusters[c]:
        print(trans_dict[trans].p_id)
        print(trans)
    print()
    i += 1
    
# Brief analysis: number of clusters depends on eps and ngram range

Cluster 0

P100041
 6(diš) udu  kišib₃ lu₂-{d}suen  ki ab-ba-kal-la-ta  ba-zi#  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na#  dub-sar#  dumu [...]  ARAD₂-[zu]
P101340
 2(diš) udu  ki a-ba-{d}en-lil₂-gin₇-ta  ur-ku₃-nun-na  i₃-dab₅  iti ezem-mah  mu e₂ {d}šara₂ ba-du₃  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na  dub-sar  dumu lu₂-{d}nin-gir₂-su [kurušda]  ARAD₂-zu
P101341
 [n] gu₄ [niga]  ki puzur₄-{d}en-lil₂-ta  ur-ku₃-nun-na  i₃-dab₅  iti ezem-mah  mu e₂ {d}šara₂ umma#{ki} ba-du₃  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na  dub-sar  dumu lu₂-{d}nin-gir₂-su kurušda  ARAD₂-zu
P101344
 [x] gu₄  [ki] la-diš-ip-ta  ur-ku₃-nun-na  i₃-dab₅  iti a₂-ki-ti  mu {d}i-bi₂-{d}suen  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na  dub-sar  dumu lu₂-{d}nin-gir₂-su kurušda  ARAD₂-zu
P101346
 1(diš) sila₄  ki ur-{d}ig-alim-ta  ur-k

# Predict

In [248]:
# Read all files to get all Drehem transactions
def read_files(subdir, ids, reverse=False):
    transactions = list()
    for i in range(1, 16):
        file_name = ""
        if i < 10:
            file_name += subdir + "p00" + str(i) + ".atf"
        else:
            file_name += subdir + "p0" + str(i) + ".atf"
        
        curr_transaction = None
        
        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if line.startswith('&P'):
                    p_id = line.split()[0][1:]
                    #print(p_id)
                    if (not reverse and p_id in ids):
                        ids.remove(p_id)
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    elif (reverse and p_id not in ids and len(transactions) <= 200):
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    else:
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        curr_transaction = None
                else:
                    if curr_transaction:
                        curr_transaction.lines.append(line)
        
        if curr_transaction:
            transactions.append(curr_transaction)
    
    #print(ids)
    #assert len(ids) == 0
    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    lst = list()
    with open(file, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            lst.append("P" + line)
    return lst

In [None]:
all_ids = get_drehem_ids("drehem_p_ids.txt")
print(all_ids[:10])
all_transactions = read_files("raw-data/", all_ids)

['P125693', 'P131063', 'P103742', 'P118642', 'P337724', 'P212008', 'P103154', 'P105823', 'P390986', 'P115492']
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf


In [166]:
data = []
mapping = {}
labs = []

for t in all_transactions:
    t.get_lemmatization()
    labs.append(t.set_label())
    lemma = " ".join(t.get_sumerian_lemma())
    data.append(lemma)
    if lemma in mapping:
        mapping[lemma].append(t)
    else:
        mapping[lemma] = [t]
        
# Predict
X_new_counts = uni_and_bigram_vectorizer.transform(data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

In [83]:
for doc, category in zip(data[:100], uni_and_bigram_prediction[:100]):
    print('%r => %s' % (doc, category))

'udu kišib ki lugal kalag lugal lugal an anubda limmu dubsar dumu arad' => not queen
'udu niga sila ga uš ud ki šu teŋ itud mu u hulu' => not queen
'sila sila mu.DU zabardab maškim u udu maš uš ekišibak ud lal ki itud mu us hulu' => not queen
'mašgal niga udu uš ud šag ki šu teŋ itud mu lugal' => not queen
'udu a sag udu niga mašgal niga udu sila ga kir ga uš ud ki šu teŋ itud mu lugal' => not queen
'udu niga sila niga udu sila ensik sila ensik sila mu.DU itud mu en maš pad ud' => not queen
'mašgal niga egia ensik ragaba maškim itud ud lal zal ki ŋiri dubsar itud mu en huŋ udu' => not queen
'amar mašda mašda amar amar mašda sila amar mašda maš sila ensik mu.DU dab itud gu mu us hulu ud' => not queen
'mašgal ki ensik dab itud mu huŋ udu' => not queen
'sila zabardab sila ensik sila ensik mu.DU dab itud akiti mu u mada ud hulu ud' => not queen
'mašgal niga lu maškim itud ud zal ki itud mu lugal hulu' => not queen
'gud ab mu.DU lugal ki bala zig ensik dab ziga itud mu en maš pad' => not qu

In [167]:
# Percentange of Queen transactions
print(len([i for i in uni_and_bigram_prediction if i == "queen"])/len(uni_and_bigram_prediction))

0.05543373989310676


In [169]:
queens = [data[i] for i in range(len(uni_and_bigram_prediction)) if uni_and_bigram_prediction[i] == "queen"]

# This code is outdated; it relied on a one-to-one mapping of lemmatized lines to PIDs, but there are some duplicate lines.

# with open("predicted_queen_v2.txt", 'w', encoding="utf8") as f:
#     for q in queens:
#         qu = mapping[q]
#         f.write(qu.p_id+"\n")
#         for line in qu.lemmas.keys():
#             f.write(line+"\n")
#         f.write("\n")

# Support Vector Machine
Good for classification and when you have small datasets (<1000 points)

In [170]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer="word", ngram_range=(1, 2),token_pattern='(?u)\\b\\w+\\b')),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42))])

text_clf.fit(training_data, training_labels)
predicted = text_clf.predict(test_data)
np.mean(predicted == test_labels)



0.9297658862876255

In [171]:
predict_v2 = text_clf.predict(data)

In [173]:
print(len([i for i in predict_v2 if i == 'queen'])/len(predict_v2))
print(predict_v2[:10])
#print(all_labels)

0.05961353981088118
['not queen' 'not queen' 'not queen' 'not queen' 'not queen' 'not queen'
 'not queen' 'not queen' 'not queen' 'not queen']


## SVM v2

In [174]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

uni_bi_vect = CountVectorizer(analyzer = "word",
                              ngram_range = (1,2),
                              token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = uni_bi_vect.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [175]:
# Classifier
# gamma = how far single training example influence can reach
# low values = far influence
# high values = close influence
# c = tradeoff of misclassification vs simplicity
# low values = smooth decision surface
# high values = classify everything correctly

params = [{'kernel': ['rbf', 'linear'],
          'gamma': [1e-4, 1e-3, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100],
          'C': [1, 5, 10, 50]}]
uni_bi_clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), params)
# uni_bi_clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), params, cv = 10)
uni_bi_clf.fit(X_train_tfidf, training_labels)
print("Best parameters found:")
print(uni_bi_clf.best_params_)

Best parameters found:
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [176]:
# Predict
X_new_counts = uni_bi_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_bi_pred = uni_bi_clf.predict(X_new_tfidf)

print("Accuracy: ", np.mean(uni_bi_pred == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, uni_bi_pred, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, uni_bi_pred, ["queen", "not queen"], average="macro")))

Accuracy:  0.9331103678929766
Recall:  0.9383199079401611
Precision:  0.9024725274725274


In [177]:
X_all_counts = uni_bi_vect.transform(data) 
X_all_tfidf = tfidf_transformer.transform(X_all_counts)
uni_bi_pred_v2 = uni_bi_clf.predict(X_all_tfidf)
print("Percentage Queens' Texts Predicted")
print(len([i for i in uni_bi_pred_v2 if i == 'queen'])/len(uni_bi_pred_v2))


Percentage Queens' Texts Predicted
0.0650267233109497


# SVM v3

In [178]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

n1 = len(queen_data)
n2 = len(non_queen_data)
q_fold_size = [i * (n1 // 10) for i in range(10)] + [n1] 
nq_fold_size = [i * (n2 // 10) for i in range(10)] + [n2] 
all_pred = []
all_true_lab = []
acc = []
rec = []
prec = []

for i in range(10):
    print("Starting fold", i)
    q_start, q_end = q_fold_size[i], q_fold_size[i + 1]
    nq_start, nq_end = nq_fold_size[i], nq_fold_size[i + 1]
    test_data = queen_data[q_start:q_end] + non_queen_data[nq_start:nq_end]
    test_labels = queen_labels[q_start:q_end] + non_queen_labels[nq_start:nq_end]
    training_data = queen_data[:q_start] + queen_data[q_end:] + non_queen_data[:nq_start] + non_queen_data[nq_end:]
    training_labels = queen_labels[:q_start] + queen_labels[q_end:] + non_queen_labels[:nq_start] + non_queen_labels[nq_end:]

    uni_bi_vect = CountVectorizer(analyzer = "word",
                                  ngram_range = (1,2),
                                  token_pattern='(?u)\\b\\w+\\b')

    # Train
    X_train_counts = uni_bi_vect.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.

    # Get TF-IDF
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Classifier
    params = [{'kernel': ['rbf', 'linear'],
              'gamma': [1e-4, 1e-3, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100],
              'C': [1, 5, 10, 50]}]
    uni_bi_clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), params)
    uni_bi_clf.fit(X_train_tfidf, training_labels)
    print("Best parameters found:")
    print(uni_bi_clf.best_params_)

    # Predict
    X_new_counts = uni_bi_vect.transform(test_data)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    all_pred.extend(uni_bi_clf.predict(X_new_tfidf))
    all_true_lab.extend(test_labels)
    
    uni_bi_pred = uni_bi_clf.predict(X_new_tfidf)

    acc.append(np.mean(uni_bi_pred == test_labels))
    rec.append(metrics.recall_score(test_labels, uni_bi_pred, ["queen", "not queen"], average="macro"))
    prec.append(metrics.precision_score(test_labels, uni_bi_pred, ["queen", "not queen"], average="macro"))

print("Total Accuracy: ", sum([1 if all_pred[i] == all_true_lab[i] else 0 for i in range(len(all_pred))])/len(all_pred))
print("Average Accuracy: ", sum(acc) / len(acc))
print("Total Recall: ", str(metrics.recall_score(all_true_lab, all_pred, ["queen", "not queen"], average="macro")))
print("Average Recall: ", sum(rec) / len(rec))
print("Total Precision: ", str(metrics.precision_score(all_true_lab, all_pred, ["queen", "not queen"], average="macro")))
print("Average Precision: ", sum(prec) / len(prec))

Starting fold 0
Best parameters found:
{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}
Starting fold 1
Best parameters found:
{'C': 5, 'gamma': 0.0001, 'kernel': 'linear'}
Starting fold 2
Best parameters found:
{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}
Starting fold 3
Best parameters found:
{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}
Starting fold 4
Best parameters found:
{'C': 5, 'gamma': 1, 'kernel': 'rbf'}
Starting fold 5
Best parameters found:
{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}
Starting fold 6
Best parameters found:
{'C': 10, 'gamma': 0.05, 'kernel': 'rbf'}
Starting fold 7
Best parameters found:
{'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
Starting fold 8
Best parameters found:
{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}
Starting fold 9
Best parameters found:
{'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
Total Accuracy:  0.9514563106796117
Average Accuracy:  0.9517089018843405
Total Recall:  0.9507715912779204
Average Recall:  0.9510029186499775
Total Precision:  0.9522447795480247
Average Pre

In [179]:
uni_bi_vect = CountVectorizer(analyzer = "word",
                                  ngram_range = (1,2),
                                  token_pattern='(?u)\\b\\w+\\b')

# Train
X_train_counts = uni_bi_vect.fit_transform(all_data) # Learn the vocabulary dictionary and return term-document matrix.

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Classifier
params = [{'kernel': ['rbf', 'linear'],
          'gamma': [1e-4, 1e-3, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100],
          'C': [1, 5, 10, 50]}]
uni_bi_clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), params)
uni_bi_clf.fit(X_train_tfidf, all_labels)
print("Best parameters found:")
print(uni_bi_clf.best_params_)

X_all_counts = uni_bi_vect.transform(data) 
X_all_tfidf = tfidf_transformer.transform(X_all_counts)
uni_bi_pred_v2 = uni_bi_clf.predict(X_all_tfidf)
print("Percentage Queens' Texts Predicted")
print(len([i for i in uni_bi_pred_v2 if i == 'queen'])/len(uni_bi_pred_v2))

Best parameters found:
{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}
Percentage Queens' Texts Predicted
0.06838426750719474


In [195]:
queen_data = [x for x, y in zip(data, uni_bi_pred_v2) if y == "queen"]
non_queens = [x for x, y in zip(data, uni_bi_pred_v2) if y == "not queen"]
non_queen_labs = [x for x, y in zip(labs, uni_bi_pred_v2) if y == "not queen"]

# set([x for x in non_queens if non_queens.count(x) > 1])
len(non_queens)

13596

In [190]:
mult = [x for x in non_queen_labs if len(x) > 1]
len(mult)

2871

In [191]:
len([x for x, y in zip(non_queens, non_queen_labs) if len(y) > 1 and "dead_animal" not in y])

560

In [216]:
dead_animal_archive = []
domesticated_animal_archive = []
wild_animal_archive = []
leather_object_archive = []
precious_object_archive = []
wool_archive = []
Unknown_archive = []

counter = 0
for x, y in zip(non_queens, non_queen_labs):
    lab = max(y, key = lambda x: len(y[x]))
    if lab == "Unknown":
        print(y)
    if "animal" in lab and "dead_animal" in y.keys():
        dead_animal_archive.append(x)
    else:
        exec(lab + "_archive.append(x)")
    counter += 1
      
print("Dead Animals archive size:", len(dead_animal_archive))
print("Domesticated Animals archive size:", len(domesticated_animal_archive))
print("Leather Objects archive size:", len(leather_object_archive))
print("Precious Objects archive size:", len(precious_object_archive))
print("Wild Animals archive size:", len(wild_animal_archive))
print("Wool archive size:", len(wool_archive))
print("Unknown archive size:", len(Unknown_archive))

{'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(diš) lulim-nita₂', '#lem: n; X', '2. šu-gid₂ e₂-muhaldim', '#lem: šugid[~animal]; emuhaldim[kitchen]', '3. mu gar₃-du-ne-še₃', '#lem: mu[year]; gardu[soldier]', '4. dingir-dan sukkal maškim', '#lem: PN; sukkal[secretary]; maškim[administrator]', '5. iti u₄ 1(u) 6(diš) ba-zal', '#lem: itud[moon]; ud[sun]; n; n; zal[pass]', '6. ki lu₂-dingir-ra-ta', '#lem: ki[place]; PN', '@reverse', '1. ba-zi', '#lem: PN', '$ 1 line blank', '2. iti še-sag₁₁-ku₅', '#lem: itud[moon]; MN', '3. mu en eridu{ki} ba-hun', '#lem: mu[year]; en[priest]; GN; huŋ[hire]', '@left', '1. 1(diš)', '#lem: n']]}
{'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 2(ban₂) zi₃-gu', '#lem: n; zidgu[flour]', '2. sa₂-du₁₁-še₃', '#lem: sadug[offerings]', '3. ki lu₂-du₁₀-ga-mu#-ta#', '#lem: ki[place]; PN', '4. ba-zi', '#lem: PN', '@reverse', '1. iti ezem-an-na',

{'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(diš) amar maš-da₃-nita₂', '#lem: n; amar[young]; X', '2. e₂-uz-ga', '#lem: eʾuzga[building]', '3. mu-kuₓ(DU) wa-ta₂-ru-um sanga', '#lem: mu.DU[delivery]; PN; FN', '4. ur-{d}ba-ba₆ muhaldim maškim', '#lem: PN; muhaldim[cook]; maškim[administrator]', '@reverse', '1. u₄ 2(u) 1(diš)-kam', '#lem: ud[sun]; n; n', '2. ki ab-ba-sa₆-ga-ta ba-zi', '#lem: ki[place]; PN; PN', '3. giri₃ da-a-a-ti dub-sar', '#lem: ŋiri[foot]; PN; dubsar[scribe]', '4. iti ezem-mah', '#lem: itud[moon]; MN', '5. mu en eridu{ki} ba-hun', '#lem: mu[year]; en[priest]; GN; huŋ[hire]', '@left', '1. 1(diš)', '#lem: n']]}
{'Unknown': [['#project: epsd2/u3adm/p001', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 8(geš₂) in-nu gur', '#lem: n; X; gur[unit]', '2. gi NE', '#lem: gi[reed]; NE[~silver]', '3. ša₃ e₂-amar e-kal-lum', '#lem: šag[heart]; X; PN', '4. ki ur-{d}ba-ba₆-ta', '#lem: ki[place]; PN'

{'Unknown': [['#project: epsd2/u3adm/p003', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(diš) {kuš}e-sir₂ u₂-hab₂ e₂#-ba#-an', '#lem: n; esir[shoe]; hab[plant]; eban[pair]', '2. a tu₅-a u₄ 1(u) 5(diš)', '#lem: a[water]; tu[cvve]; ud[sun]; n; n', '3. 1(diš) {kuš}e-sir₂ u₂-hab₂ e₂-ba-an', '#lem: n; esir[shoe]; hab[plant]; eban[pair]', '4. a tu₅-a sag u₄-sakar', '#lem: a[water]; tu[cvve]; saŋ[head]; usakar[moon]', '5. giri₃ nu-ur₂-{d}utu', '#lem: ŋiri[foot]; PN', '6. ki ta₂-hi-iš-a-tal-ta#', '#lem: ki[place]; PN', '@reverse', '1. ba-zi#', '#lem: PN', '2. ša₃ nibru#[{ki}]', '#lem: šag[heart]; GN', '3. iti še-sag₁₁-ku₅', '#lem: itud[moon]; MN', '4. mu {d}šu-{d}suen lugal uri₅{ki}-ma-ke₄ na-ru₂-a-mah {d}en-lil₂ {d}nin-lil₂-ra mu-ne-du₃', '#lem: mu[year]; TN; lugal[king]; GN; X; FN; DN; du[build]', '@left', '1. 2(diš)', '#lem: n']]}
{'Unknown': [['#project: epsd2/u3adm/p003', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 3(geš₂) sa {u₂}|ZI&ZI|-še₃', 

{'Unknown': [['#project: epsd2/u3adm/p006', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. mu-kuₓ(DU) nu-ub-tuku', '#lem: mu.DU[delivery]; tuku[acquire]', '2. u₄ 2(u) 6(diš)', '#lem: ud[sun]; n; n', '3. iti ezem-me-ki-gal₂', '#lem: itud[moon]; MN', '@reverse', '1. mu en-unu₆-gal {d}inanna ba-hun', '#lem: mu[year]; X; TN; huŋ[hire]', '$ 1 line blank']]}
{'Unknown': [['#project: epsd2/u3adm/p006', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(u) 3(diš) 1/3(diš)? KUM? šum₂ gaz ša₃-kal', '#lem: n; n; n; X; šum[garlic]; gaz[kill]; PN', '2. 1(u) 3(diš) 1/3(diš)? šum₂ gaz ša₃-kal', '#lem: n; n; n; šum[garlic]; gaz[kill]; PN', '3. ki puzur₄-i₃-li₂-ta', '#lem: ki[place]; X', '@reverse', '1. mu-kuₓ(DU)', '#lem: mu.DU[delivery]', '2. dingir-ba-ni', '#lem: PN', '3. šu ba-ti', '#lem: šu[hand]; teŋ[approach]', '$ blank space', '4. iti u₅-bi₂-gu₇', '#lem: itud[moon]; MN', '5. mu us₂-sa gu-za ba-dim₂', '#lem: mu[year]; us[follow]; guza[chair]; dim[create]']]}
{

{'Unknown': [['#project: epsd2/u3adm/p010', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. [n] dusu₂ munus', '#lem: n; dusu[equid]; munus[woman]', '2. šu-gid₂', '#lem: šugid[~animal]', '3. mu ur-mah-še₃', '#lem: mu[year]; urmah[lion]', '4. iti u₄ 2(u) 4(diš) ba-zal', '#lem: itud[moon]; ud[sun]; n; n; zal[pass]', '@reverse', '1. ki ur-ku₃-nun-na-ta ba-zi', '#lem: ki[place]; PN; PN', '2. giri₃ nu-ur₂-{d}suen dub-sar', '#lem: ŋiri[foot]; PN; dubsar[scribe]', '3. iti ezem-{d}nin-a-zu', '#lem: itud[moon]; MN', '4. mu us₂-sa si-ma-num₂{ki} ba-hun', '#lem: mu[year]; us[follow]; GN; huŋ[hire]', '@left', '1. [...] dusu₂', '#lem: AN; dusu[equid]', '@seal 1', '1. nu-ur₂-{d}suen', '#lem: PN', '2. dub-sar', '#lem: dubsar[scribe]', '3. dumu i-di₃-[er₃-ra]', '#lem: dumu[child]; PN']]}
{'Unknown': [['#project: epsd2/u3adm/p010', '#atf: use unicode', '#atf: lang sux', '@tablet', '@obverse', '1. 1(diš) [...]', '#lem: n; AN', '2. [siskur₂ ...]', '#lem: siškur[prayer]; AN', '3. giri₃ [a

From above, we know that some lines correspond to multiple texts (with distinct PIDs). They should all be classified under the same archive; below, we'll match each PID with an archive. In addition, we'll do a check to make sure that duplicate texts are in the same archive according to the classifier.

In [209]:
id_archive_map = {}
extra_map = {}
    
for text in dead_animal_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "dead animal"])
            id_archive_map[t.p_id] = "dead animal"

for text in wild_animal_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "wild animal"])
            id_archive_map[t.p_id] = "wild animal"
    
for text in domesticated_animal_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "domesticated animal"])
            id_archive_map[t.p_id] = "domesticated animal"
        
for text in leather_object_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "leather object"])
            id_archive_map[t.p_id] = "leather object"
    
for text in precious_object_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "precious object"])
            id_archive_map[t.p_id] = "precious object"

for text in wool_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "wool"])
            id_archive_map[t.p_id] = "wool"
    
for text in Unknown_archive:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "unknown"])
            id_archive_map[t.p_id] = "unknown"

for text in queen_data:
    if text not in extra_map:
        extra_map[text] = []
        t_list = mapping[text]
        for t in t_list:
            extra_map[text].append([t.p_id, "queen"])
            id_archive_map[t.p_id] = "queen"
            
len(id_archive_map)

14594

In [213]:
duplicated_texts = [p for p in extra_map.items() if len(p[1]) > 1]

dup_archive_map = {}

for pair in duplicated_texts:
    archives = [t[1] for t in pair[1]]
    dup_archive_map[pair[0]] = set(archives)
    
conflicts = [p for p in dup_map.items() if len(p[1]) > 1]
len(conflicts)

0

From the above results, we can see that all duplicated texts are classified as belonging to the same archive. There shouldn't be any issues.

In [214]:
# Write results of archive classification to CSV file

archive = open('archive_map.csv', 'w')
archive.write("PID,Archive\n")
for k, v in id_archive_map.items():
    archive.write(k + "," + v + "\n")
archive.close()

In [235]:
import random

training_ids = [id[1:] for id in complete_list]
all_texts_ids = list(id_archive_map.keys())
non_training_ids = [id for id in all_texts_ids if id not in training_ids]

random_indices = random.sample(range(len(non_training_ids)), 50)
random_ids = [non_training_ids[index] for index in range(len(non_training_ids)) if index in random_indices]

random_sample_ids = []
random_sample_texts = []
random_sample_labels = []

for text, t_list in mapping.items():
    for t in t_list:
        if t.p_id in random_ids:
            random_sample_ids.append(t.p_id)
            random_sample_texts.append(text)
            random_sample_labels.append(id_archive_map[t.p_id])
            
print(random_sample_labels)
print(random_sample_texts)

['domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'dead animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'dead animal', 'dead animal', 'queen', 'domesticated animal', 'dead animal', 'domesticated animal', 'dead animal', 'dead animal', 'domesticated animal', 'dead animal', 'unknown', 'domesticated animal', 'domesticated animal', 'domesticated animal', 'wild animal', 'domesticated animal', 'domesticated animal', 'dead animal', 'dead animal', 'domesticated animal', 'dead animal', 'domesticated animal', 'unknown', 'unknown', 'domesticated animal', 'dead animal', 'domesticated animal', 'domesticated animal', 'unknown', 'domesticated animal', 'dead animal', 'domesticated animal', 'unknown', 'dead animal', 'unknown', 'domesticated animal']
['kišib ensik ab udu kišib ab kišib ab kišib', 

In [244]:
for t in all_transactions:
    if t.p_id in random_sample_ids:
        print(t.p_id)
        print(" ".join(t.get_sumerian_lemma()))

P101458
kišib ensik ab udu kišib ab kišib ab kišib kišib ensik ab udu kišib ab kišib ab kišib kišib ensik ab udu kišib ab kišib ab kišib kišib ensik ab udu kišib ab kišib ab kišib
P102321
amar gud ga sila ga kir ga utuda šag ud dab itud mu en huŋ gud udu amar gud ga sila ga kir ga utuda šag ud dab itud mu en huŋ gud udu amar gud ga sila ga kir ga utuda šag ud dab itud mu en huŋ gud udu amar gud ga sila ga kir ga utuda šag ud dab itud mu en huŋ gud udu
P102926
udu maš hi ki lu kiŋgia lugal dab šag itud mu lugal udu maš hi ki lu kiŋgia lugal dab šag itud mu lugal udu maš hi ki lu kiŋgia lugal dab šag itud mu lugal udu maš hi ki lu kiŋgia lugal dab šag itud mu lugal
P103205
udu mašgal ki dab itud mu en en huŋ udu mašgal ki dab itud mu en en huŋ udu mašgal ki dab itud mu en en huŋ udu mašgal ki dab itud mu en en huŋ
P103576
udu mašgal sila uš ud ki šu teŋ itud mu us lugal bad MAR.TU du udu mašgal sila uš ud ki šu teŋ itud mu us lugal bad MAR.TU du udu mašgal sila uš ud ki šu teŋ itud mu us

In [239]:
# Write results of random sample to CSV file
# import unicodecsv as csv

sample = open('random_sample.csv', 'w', encoding='utf-8')
sample.write("PID,Text,Archive\n")
for i in range(len(random_sample_ids)):
# for i in range(1):
    sample.write(random_sample_ids[i] + "," + random_sample_texts[i] + "," + random_sample_labels[i] + "\n")
sample.close()