### Load python file

In [57]:
# %load get_roles_edited.py
import re

ORACC_FILE = 'raw-data/p001.atf'
DREHEM_P_IDS_FILE = 'drehem_p_ids.txt'

NUM_TEXTS = 25
# things to keep:
#	set: (p_index)
#	dictionary: {p_index: transaction}
#	object transaction has p_index, source, receiver, 

complete_transaction_ls = list()
complete_drehem_p_sets = set()
p_sets_of_interest = set()
transaction_ls = list()

class Transaction:
	def __init__(self, p, line):
		self.p_index = p
		# can add date/place/etc.
		self.roles = {}
		# role name: name of person (ex. 'source': 'Turamdatan')
		self.people = set()

		self.line = line;
		# ls of lines

		self.ls_lines_containing_PN = list()
		# ls of lines containing PN

	def __str__(self):
		return 'P' + str(self.p_index) + '\nlines: ' + self.line 
		# + '\n\t' + str(self.people)
	def get_num_people(self):
		return len(self.people)

def get_p_index(line):
	# line of the form '&P100259 = ...': return '100259'
	return line.split(' ')[0][2:]


def get_drehem_p_ids():
	with open(DREHEM_P_IDS_FILE) as read_file:
		for line in read_file:
			complete_drehem_p_sets.add(line[:-1])
	return complete_drehem_p_sets

def collect_p_id_of_interest(file_name):
	get_drehem_p_ids();
	with open(file_name) as input_file:
		count = 0
		for line in input_file:
			line = line.strip()# remove \n
			if line.startswith('&P'):
				p_id = get_p_index(line);
				if p_id in complete_drehem_p_sets:
					p_sets_of_interest.add(p_id);
	# print( p_sets_of_interest)
	return p_sets_of_interest

def get_transactions(file_name, p_id_set_sort=None):
	
	with open(file_name) as input_file:
		currentTransaction = None;
		for line in input_file:
			line = line.strip() # remove \n
			if line.startswith('&P'):
				if currentTransaction is None:		
					p_index = get_p_index(line)
					currentTransaction = Transaction(p_index, line)
				else:
					if p_id_set_sort is None or currentTransaction.p_index in p_id_set_sort:
						transaction_ls.append(currentTransaction) # add the transaction to the list
					

					p_index = get_p_index(line)
					currentTransaction = Transaction(p_index, line) # start a new transaction
			else:
				currentTransaction.line += "\n" + line;

		# add the last transaction to the list
		if p_id_set_sort is None or currentTransaction.p_index in p_id_set_sort:
			transaction_ls.append(currentTransaction)
		
	# print(len(transaction_ls))
	# print(transaction_ls[-1])
	return transaction_ls


# with open(ORACC_FILE) as input_file:
# 	for line in input_file:
# 		line = line.strip()
# 		if not re.match(r'^&P|^#lem|^\d+\.|^@|^#|^\$|^\d+ʾ\.|^=:|^\s+$|^$', line):
# 			print(line);
		# if re.match(r'^&P|^#lem|^\d+\.|^\d+ʾ\.', line):
		# 	print(line)
		# elif re.match(r'|^\s+$|^$', line):
			#ignore
		# elif re.match(r'^@|^#|^\$|')
def clean_transaction(transaction):
	# change the whole transaction.line into
	# a list of important text, discarding unimportant lines
	# transaction.line = ["1. ~~ #lem: ~~" ... ]

	# print(transaction.p_index)
	searchObj = re.findall(r'(\d+ʾ\..*\n#lem:.*|\d+\..*\n#lem:.*)', transaction.line)
	
	transaction.line = searchObj


def get_PN(transaction):
	# get the list of lines containing PN --> transaction.ls_lines_containing_PN
	# get the set of PN --> transaction.people
	txt,translit = None, None
	for line in transaction.line:
		if "PN" in line:
			transaction.ls_lines_containing_PN.append(line)

			
	for line in transaction.ls_lines_containing_PN:
		txt,translit = line.split("\n")
		
		translit = translit.replace("#lem: ","")
		
		translit, txt = translit.split(";"), txt.split(" ")[1:]

		for index, word  in enumerate(translit):
		    if "PN" in word:
		        # print(translit, txt, index)

		        transaction.people.add(txt[index])
	# if len(transaction.people) == 0:
	# 	print(transaction.line,txt, translit, transaction.p_index)

	# print(transaction.people)


def process_files():
	#return a complete list of transactions out of all input files
	global p_sets_of_interest
	global transaction_ls
	global complete_transaction_ls
	i = 1; # oracc file number


	
	while i <= 15:
		p_sets_of_interest = set()
		transaction_ls = list()
		if i < 10:
			ORACC_FILE = 'raw-data/p00'+str(i)+'.atf'
		else:
			ORACC_FILE = 'raw-data/p0'+str(i)+'.atf'
		collect_p_id_of_interest(ORACC_FILE);
		get_transactions(ORACC_FILE);
		for trans in transaction_ls:		
			clean_transaction(trans)
			get_PN(trans)
		complete_transaction_ls += transaction_ls
		# print(len(transaction_ls), len(p_sets_of_interest))
		print("completed "+ORACC_FILE)
		i+=1

	print("***Total of ", len(complete_transaction_ls), " transactions.***")
	return complete_transaction_ls

def main():
	ls = process_files()
	no_PN_count = 0
	contain_ki_count = 0
	contain_subati_count = 0
	contain_ragaba_count = 0
	for trans in ls:
		if trans.get_num_people() == 0:
			no_PN_count+=1
		line = ''.join(trans.line)
		if "ki[place]" in line:
			contain_ki_count += 1
			
		if "šu ba-ti" in line:
			contain_subati_count += 1
		if "ra₂-gaba" in line:
			contain_ragaba_count += 1


	print(no_PN_count, " transactions DO NOT have PN.")
	print(contain_ki_count, " transactions contain a word ki[place].")
	print(contain_subati_count, " transactions contain a word šu ba-ti.")
	print(contain_ragaba_count, " transactions contain a word ra₂-gaba.")

main()



completed raw-data/p001.atf
completed raw-data/p002.atf
completed raw-data/p003.atf
completed raw-data/p004.atf
completed raw-data/p005.atf
completed raw-data/p006.atf
completed raw-data/p007.atf
completed raw-data/p008.atf
completed raw-data/p009.atf
completed raw-data/p010.atf
completed raw-data/p011.atf
completed raw-data/p012.atf
completed raw-data/p013.atf
completed raw-data/p014.atf
completed raw-data/p015.atf
***Total of  67499  transactions.***
3794  transactions DO NOT have PN.
33241  transactions contain a word ki[place].
10993  transactions contain a word šu ba-ti.
590  transactions contain a word ra₂-gaba.


In [73]:
ls1 = list() # list of words in the same line with PN
ls2 = list() # list of transliterated lines containing PN
for trans in complete_transaction_ls:
    for line in trans.ls_lines_containing_PN:
        txt,translit = line.split("\n")
        translit = translit.replace("#lem: ","")
        ls2.append(translit)
        translit, txt = translit.split(";"), txt.split(" ")[1:]
        ls1 += [a for a in translit]

### import libraries

In [59]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### playing with data

In [60]:
s = pd.Series(ls1)

In [62]:
# number of unique words in the lines containing PN
s.value_counts().count()

1699

In [72]:
# 10 most common words in the lines containing PN 
s.value_counts().head(200)

 PN                         203268
PN                           92262
n                            59525
ki[place]                    31367
 n                           31193
dumu[child]                  20411
kišib[seal]                  16666
 dumu[child]                 12144
mu[year]                     12050
ŋiri[foot]                   11291
ugula[overseer]               8753
 sila[unit]                   7052
 lugal[king]                  6846
 X                            6432
 dab[seize]                   5311
 lu[person]                   5086
 u                            4717
 GN                           4439
 maškim[administrator]        3167
 sukkal[secretary]            3164
 gur[unit]                    3044
 sila[lamb]                   2959
 engar[farmer]                2925
 udu[sheep]                   2865
 ugula[overseer]              2786
AN                            2641
 ki[place]                    2555
 giŋ[unit]                    2456
 AN                 

In [70]:
# 10 most uncommon words in the lines containing PN 
s.value_counts().tail(10)

 be[diminish]       1
kabduga[checked]    1
 hum[snap]          1
zagbar[scraps]      1
dida[wort]          1
 atah[helper]       1
iŋeš[oil]           1
hazin[ax]           1
gi[thicket]         1
pakud[~tree]        1
dtype: int64

### Looking at lines containing common words 

In [89]:
def filter_lines(word):
    # return a series of lines containing the word
    return pd.Series(list(filter(lambda x:word in x , ls2)))

###### kišib[seal]

In [92]:
filter_lines("kišib[seal]").value_counts().head(10)

kišib[seal]; PN                              12990
kišib[seal]; PN; dumu[child]; PN               797
kišib[seal]; namšatam[administration]; PN      224
n; kišib[seal]; PN                             212
n; gur[unit]; kišib[seal]; PN                  166
kišib[seal]; PN; PN                            140
gabari[copy]; kišib[seal]; PN                  123
n; n; kišib[seal]; PN                          122
n; udu[sheep]; kišib[seal]; PN                 114
kišib[seal]; PN; šabra[administrator]          111
dtype: int64

###### ŋiri[foot]

In [97]:
filter_lines("ŋiri[foot]").value_counts().head(10)

ŋiri[foot]; PN                             6790
ŋiri[foot]; PN; dubsar[scribe]             1004
ŋiri[foot]; PN; sukkal[secretary]           529
ŋiri[foot]; PN; dumu[child]; PN             432
ŋiri[foot]; PN; šagia[cup-bearer]           167
ŋiri[foot]; PN; šabra[administrator]        166
ŋiri[foot]; PN; šarrabdu[administrator]     123
ŋiri[foot]; PN; ragaba[rider]               120
ŋiri[foot]; PN; PN                           97
n; udu[sheep]; ŋiri[foot]; PN                85
dtype: int64

###### ugula[overseer]

In [98]:
filter_lines("ugula[overseer]").value_counts().head(10)

ugula[overseer]; PN                          7891
PN; ugula[overseer]                           293
n; ŋuruš[male]; ugula[overseer]; PN           291
n; ugula[overseer]; PN                        275
ugula[overseer]; PN; PN                       149
n; n; ŋuruš[male]; ugula[overseer]; PN        144
n; n; ugula[overseer]; PN                     142
ugula[overseer]; PN; dumu[child]; PN          131
ugula[overseer]; PN; kuš[official]            121
ugula[overseer]; PN; šabra[administrator]      90
dtype: int64