# Load python file

In [5]:
# %load get_roles_edited.py

import re

# ORACC_FILE = 'raw-data/p001.atf'
DREHEM_P_IDS_FILE = 'drehem_p_ids.txt'

NUM_TEXTS = 25
# things to keep:
#	set: (p_index)
#	dictionary: {p_index: transaction}
#	object transaction has p_index, source, receiver, 

complete_transaction_ls = list()
drehem_transaction_ls = list()

complete_drehem_p_sets = set()
p_sets_of_interest = set()
transaction_ls = list()

class Transaction:
	def __init__(self, p, line):
		self.p_index = p
		# can add date/place/etc.
		self.roles = {}
		# role name: name of person (ex. 'source': 'Turamdatan')
		self.people = set()

		self.line = line;
		# ls of lines

		self.ls_lines_containing_PN = list()
		# ls of lines containing PN

	def __str__(self):
		return 'P' + str(self.p_index) + '\nlines: ' + self.line 
		# + '\n\t' + str(self.people)
	def get_num_people(self):
		return len(self.people)

def get_p_index(line):
	# line of the form '&P100259 = ...': return '100259'
	return line.split(' ')[0][2:]


def get_drehem_p_ids():
	with open(DREHEM_P_IDS_FILE) as read_file:
		for line in read_file:
			complete_drehem_p_sets.add(line[:-1])
	return complete_drehem_p_sets

def collect_p_id_of_interest(file_name):
	get_drehem_p_ids();
	with open(file_name) as input_file:
		count = 0
		for line in input_file:
			line = line.strip()# remove \n
			if line.startswith('&P'):
				p_id = get_p_index(line);
				if p_id in complete_drehem_p_sets:
					p_sets_of_interest.add(p_id);
	# print( p_sets_of_interest)
	return p_sets_of_interest

def get_transactions(file_name, p_id_set_sort=None):
	
	with open(file_name) as input_file:
		currentTransaction = None;
		for line in input_file:
			line = line.strip() # remove \n
			if line.startswith('&P'):
				if currentTransaction is None:		
					p_index = get_p_index(line)
					currentTransaction = Transaction(p_index, line)
				else:
					if p_id_set_sort is None or currentTransaction.p_index in p_id_set_sort:
						transaction_ls.append(currentTransaction) # add the transaction to the list
					

					p_index = get_p_index(line)
					currentTransaction = Transaction(p_index, line) # start a new transaction
			else:
				currentTransaction.line += "\n" + line;

		# add the last transaction to the list
		if p_id_set_sort is None or currentTransaction.p_index in p_id_set_sort:
			transaction_ls.append(currentTransaction)
		
	# print(len(transaction_ls))
	# print(transaction_ls[-1])
	return transaction_ls


# with open(ORACC_FILE) as input_file:
# 	for line in input_file:
# 		line = line.strip()
# 		if not re.match(r'^&P|^#lem|^\d+\.|^@|^#|^\$|^\d+ʾ\.|^=:|^\s+$|^$', line):
# 			print(line);
		# if re.match(r'^&P|^#lem|^\d+\.|^\d+ʾ\.', line):
		# 	print(line)
		# elif re.match(r'|^\s+$|^$', line):
			#ignore
		# elif re.match(r'^@|^#|^\$|')
def clean_transaction(transaction):
	# change the whole transaction.line into
	# a list of important text, discarding unimportant lines
	# transaction.line = ["1. ~~ #lem: ~~" ... ]

	# print(transaction.p_index)
	searchObj = re.findall(r'(\d+ʾ\..*\n#lem:.*|\d+\..*\n#lem:.*)', transaction.line)
	
	transaction.line = searchObj


def get_PN(transaction):
	# get the list of lines containing PN --> transaction.ls_lines_containing_PN
	# get the set of PN --> transaction.people
	txt,translit = None, None
	for line in transaction.line:
		if "PN" in line:
			transaction.ls_lines_containing_PN.append(line)

			
	for line in transaction.ls_lines_containing_PN:
		txt,translit = line.split("\n")
		
		translit = translit.replace("#lem: ","")
		
		translit, txt = translit.split(";"), txt.split(" ")[1:]

		for index, word  in enumerate(translit):
		    if "PN" in word:
		        # print(translit, txt, index)

		        transaction.people.add(txt[index])
	# if len(transaction.people) == 0:
	# 	print(transaction.line,txt, translit, transaction.p_index)

	# print(transaction.people)


def process_files():
	#return a complete list of transactions out of all input files
	global p_sets_of_interest
	global transaction_ls
	global complete_transaction_ls
	global drehem_transaction_ls
	i = 1; # oracc file number


	
	while i <= 15:
		p_sets_of_interest = set()
		transaction_ls = list()
		if i < 10:
			ORACC_FILE = 'raw-data/p00'+str(i)+'.atf'
		else:
			ORACC_FILE = 'raw-data/p0'+str(i)+'.atf'
		collect_p_id_of_interest(ORACC_FILE);

		#get complete list of transactions
		get_transactions(ORACC_FILE);
		for trans in transaction_ls:		
			clean_transaction(trans)
			get_PN(trans)
		complete_transaction_ls += transaction_ls

		# get drehem list of transactions
		# TO CHANGE LATER (BAD IMPLEMENTATION; READING THE FILE TWICE)
		transaction_ls = list()
		get_transactions(ORACC_FILE,p_sets_of_interest);
		for trans in transaction_ls:		
			clean_transaction(trans)
			get_PN(trans)
		drehem_transaction_ls += transaction_ls


		print("Got transactions from "+ORACC_FILE)
		i+=1

	print("***FINISH***")
	print("***Total of ", len(complete_transaction_ls), " transactions.***")
	print("***Total of ", len(drehem_transaction_ls), " Drehem transactions.***")
	return complete_transaction_ls

def main():
	ls = process_files()
	no_PN_count = 0
	contain_ki_count = 0
	contain_subati_count = 0
	contain_ragaba_count = 0
	for trans in ls:
		if trans.get_num_people() == 0:
			no_PN_count+=1
		line = ''.join(trans.line)
		if "ki[place]" in line:
			contain_ki_count += 1
			
		if "šu ba-ti" in line:
			contain_subati_count += 1
		if "ra₂-gaba" in line:
			contain_ragaba_count += 1


	print(no_PN_count, " transactions DO NOT have PN.")
	print(contain_ki_count, " transactions contain a word ki[place].")
	print(contain_subati_count, " transactions contain a word šu ba-ti.")
	print(contain_ragaba_count, " transactions contain a word ra₂-gaba.")

main()



Scraped raw-data/p001.atf
Scraped raw-data/p002.atf
Scraped raw-data/p003.atf
Scraped raw-data/p004.atf
Scraped raw-data/p005.atf
Scraped raw-data/p006.atf
Scraped raw-data/p007.atf
Scraped raw-data/p008.atf
Scraped raw-data/p009.atf
Scraped raw-data/p010.atf
Scraped raw-data/p011.atf
Scraped raw-data/p012.atf
Scraped raw-data/p013.atf
Scraped raw-data/p014.atf
Scraped raw-data/p015.atf
***FINISH***
***Total of  67499  transactions.***
***Total of  14594  Drehem transactions.***
3794  transactions DO NOT have PN.
33241  transactions contain a word ki[place].
10993  transactions contain a word šu ba-ti.
590  transactions contain a word ra₂-gaba.


In [6]:
wordsSamelineWithPN_ls = list() # list of words in the same line with PN
transLinesContainingPN_ls = list() # list of transliterated lines containing PN
for trans in complete_transaction_ls:
    for line in trans.ls_lines_containing_PN:
        txt,translit = line.split("\n")
        translit = translit.replace("#lem: ","")
        transLinesContainingPN_ls.append(translit)
        translit, txt = translit.split(";"), txt.split(" ")[1:]
        wordsSamelineWithPN_ls += [a for a in translit]

### Useful Variables

* complete_transaction_ls: list of all transactions
* drehem_transaction_ls: list of drehem transactions
* complete_drehem_p_sets: set of drehem p_id
* transLinesContainingPN_ls: list of all lines containing PN
* wordsSamelineWithPN_ls: list of all words in the same line with PN



### Transaction Object
* self.p_index
* self.people
* self.line
* self.ls_lines_containing_PN
* self.roles, self.place, self.year (possible)

# Import Libraries

In [7]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Playing with data

In [8]:
s = pd.Series(wordsSamelineWithPN_ls)

### export to csv

In [24]:
s.value_counts().to_csv('SameLineWordsPN.csv')

In [9]:
# number of unique words in the lines containing PN
s.value_counts().count()

1699

In [11]:
# most common words in the lines containing PN 
s.value_counts()

 PN                         203268
PN                           92262
n                            59525
ki[place]                    31367
 n                           31193
dumu[child]                  20411
kišib[seal]                  16666
 dumu[child]                 12144
mu[year]                     12050
ŋiri[foot]                   11291
ugula[overseer]               8753
 sila[unit]                   7052
 lugal[king]                  6846
 X                            6432
 dab[seize]                   5311
 lu[person]                   5086
 u                            4717
 GN                           4439
 maškim[administrator]        3167
 sukkal[secretary]            3164
 gur[unit]                    3044
 sila[lamb]                   2959
 engar[farmer]                2925
 udu[sheep]                   2865
 ugula[overseer]              2786
AN                            2641
 ki[place]                    2555
 giŋ[unit]                    2456
 AN                 

In [22]:
print("Total number of peronsal names: ", 203268+ 92262, "(may contain duplicates)")

Total number of peronsal names:  295530 (may contain duplicates)


# Filter out lines containing the above common words 
To see the context:

"filter_lines(word).value_counts()"

In [12]:
def filter_lines(word):
    # return a series of lines containing the word
    return pd.Series(list(filter(lambda x:word in x , transLinesContainingPN_ls)))

###  kišib[seal]

In [92]:
filter_lines("kišib[seal]").value_counts().head(10)

kišib[seal]; PN                              12990
kišib[seal]; PN; dumu[child]; PN               797
kišib[seal]; namšatam[administration]; PN      224
n; kišib[seal]; PN                             212
n; gur[unit]; kišib[seal]; PN                  166
kišib[seal]; PN; PN                            140
gabari[copy]; kišib[seal]; PN                  123
n; n; kišib[seal]; PN                          122
n; udu[sheep]; kišib[seal]; PN                 114
kišib[seal]; PN; šabra[administrator]          111
dtype: int64

##### Most Common: 
1. kišib[seal]; PN --> sealed by that person?

### mu[year]

In [13]:
filter_lines("mu[year]").value_counts().head(10)

mu[year]; PN; lugal[king]                         3034
mu[year]; PN; lugal[king]; GN; hulu[bad]          1327
mu[year]; ma[ship]; PN; du[spread]                1287
mu[year]; PN                                       973
mu[year]; PN; en[priest]; DN; huŋ[hire]            721
mu[year]; guza[chair]; PN; dim[create]             493
mu[year]; us[follow]; ma[ship]; PN; du[spread]     392
mu[year]; ON; PN; dim[create]                      279
mu[year]; PN; huŋ[hire]                            275
mu[year]; us[follow]; PN; lugal[king]              227
dtype: int64

##### Most Common:
1. mu[year]; PN; lugal[king] --> year of transaction

### ŋiri[foot]

In [14]:
filter_lines("ŋiri[foot]").value_counts().head(10)

ŋiri[foot]; PN                             6790
ŋiri[foot]; PN; dubsar[scribe]             1004
ŋiri[foot]; PN; sukkal[secretary]           529
ŋiri[foot]; PN; dumu[child]; PN             432
ŋiri[foot]; PN; šagia[cup-bearer]           167
ŋiri[foot]; PN; šabra[administrator]        166
ŋiri[foot]; PN; šarrabdu[administrator]     123
ŋiri[foot]; PN; ragaba[rider]               120
ŋiri[foot]; PN; PN                           97
n; udu[sheep]; ŋiri[foot]; PN                85
dtype: int64

##### Most Common:
1. ŋiri[foot]; PN --> delivered on foot by that person?
2. ŋiri[foot]; PN; dubsar[scribe] --> delivered on foot by a person who is a scribe???
8. ŋiri[foot]; PN; ragaba[rider] --> PN's role is the rider

### ugula[overseer]

In [98]:
filter_lines("ugula[overseer]").value_counts().head(10)

ugula[overseer]; PN                          7891
PN; ugula[overseer]                           293
n; ŋuruš[male]; ugula[overseer]; PN           291
n; ugula[overseer]; PN                        275
ugula[overseer]; PN; PN                       149
n; n; ŋuruš[male]; ugula[overseer]; PN        144
n; n; ugula[overseer]; PN                     142
ugula[overseer]; PN; dumu[child]; PN          131
ugula[overseer]; PN; kuš[official]            121
ugula[overseer]; PN; šabra[administrator]      90
dtype: int64

##### Most common:
1. ugula[overseer]; PN --> that person is the overseer?

### ragaba[rider]

In [25]:
filter_lines("ragaba[rider]").value_counts().head(10)

PN; ragaba[rider]                                                123
ŋiri[foot]; PN; ragaba[rider]                                    120
PN; ragaba[rider]; maškim[administrator]                          68
PN; ragaba[rider]; lu[person]; kiŋgia[messenger]; lugal[king]     27
kišib[seal]; PN; ragaba[rider]                                    13
n; PN; ragaba[rider]                                              10
arua[offering]; PN; ragaba[rider]                                  8
ragaba[rider]; PN                                                  8
mu[year]; PN; ragaba[rider]                                        7
n; sila[lamb]; PN; ragaba[rider]                                   6
dtype: int64

In [27]:
filter_lines("ki[place]").value_counts().head(10)

ki[place]; PN                          27514
ki[place]; PN; PN                       1162
ki[place]; PN; dumu[child]; PN           490
ziga[expenditure]; ki[place]; PN         168
n; ki[place]; PN                         165
ki[place]; X; PN                         136
ki[place]; PN; šabra[administrator]      135
n; udu[sheep]; ki[place]; PN              95
n; gud[ox]; ki[place]; PN                 90
ki[place]; PN; ašgab[leatherworker]       88
dtype: int64

In [28]:
filter_lines("PN; šu[hand]; teŋ[approach]").value_counts().head(10)

PN; šu[hand]; teŋ[approach]                                             1620
PN; dumu[child]; PN; šu[hand]; teŋ[approach]                              35
dumu[child]; PN; šu[hand]; teŋ[approach]                                   8
PN; sipad[shepherd]; PN; šu[hand]; teŋ[approach]                           4
ugula[overseer]; PN; šu[hand]; teŋ[approach]                               4
ki[place]; PN; šu[hand]; teŋ[approach]                                     4
uš[die]; PN; šu[hand]; teŋ[approach]                                       4
n; PN; šu[hand]; teŋ[approach]                                             4
n; gur[unit]; PN; šu[hand]; teŋ[approach]                                  4
še[barley]; erin[people]; engar[farmer]; PN; šu[hand]; teŋ[approach]       3
dtype: int64

# Experiment 

In [1]:
# Assume I know how to sort out: source, receiver, rider
# Test look at transactions having a rider


for trans in complete_transaction_ls:
    for line in trans.ls_lines_containing_PN:
        if re.search(r'ŋiri[foot]; PN; ragaba[rider]', line):
            print("yes")
#     break;
#         if re.search(r'ŋiri[foot]; PN; ragaba[rider]', line):
#             txt,translit = line.split("\n")
#             translit = translit.replace("#lem: ","")
#             translit, txt = translit.split(";"), txt.split(" ")[1:]
            
    
#             name = txt[translit.index("PN")]
#             role = "intermediate"
#             profession = "rider"
#             trans.roles[name] = [role, profession] #adding to the roles
#         elif re.search(r'PN; šu[hand]; teŋ[approach]', line):
#             txt,translit = line.split("\n")
#             translit = translit.replace("#lem: ","")
#             translit, txt = translit.split(";"), txt.split(" ")[1:]
#             name = txt[translit.index("PN")]
#             role = "reciever"
#             profession = "NA"
#             trans.roles[name] = [role, profession] #adding to the roles
#         elif re.search(r'ki .*-ta\s*', line):
#             txt,translit = line.split("\n")
#             translit = translit.replace("#lem: ","")
#             translit, txt = translit.split(";"), txt.split(" ")[1:]
#             name = txt[translit.index("PN")]
#             role = "source"
#             profession = "NA"
#             trans.roles[name] = [role, profession] #adding to the roles
            
    
        

NameError: name 'complete_transaction_ls' is not defined

# To do:
1. look at the next line of PN, too. For example, su ba-ti can be in the next line.
2. what are some important words? Make a list of roles, professions. HELP...
3. How to look for them. see patterns?

# Think about:
1. how to efficiently categorize commodities (animals, dead animals, leather, precious stuff)