In [1]:
import pandas as pd
import zipfile
import json
from tqdm.auto import tqdm
import os
import sys
util_dir = os.path.abspath('../compass/utils')
sys.path.append(util_dir)
import utils

In [2]:
# Make sure the working folders exist (nothing happens when they are already there).
for folder in ('jsonzip', 'output'):
    os.makedirs(folder, exist_ok=True)

In [3]:
# Ask for the project abbreviation(s), normalise case and surrounding whitespace,
# and let the helper module turn the string into a list of projects.
raw_projects = input('Project(s): ')
projects = raw_projects.lower().strip()
project_list = utils.format_project_list(projects)

Project(s):  epsd2/literary, dsst


In [4]:
# Download the JSON archive of each requested project; the log below shows them being
# saved as jsonzip/<project>.zip.
# NOTE(review): `project_list` is overwritten with the helper's return value — presumably
# the projects that were actually downloaded; confirm in utils.oracc_download.
project_list = utils.oracc_download(project_list)

Saving http://oracc.org/dsst/json/dsst.zip as jsonzip/dsst.zip.


dsst: 0.00B [00:00, ?B/s]

Saving http://oracc.org/epsd2/literary/json/epsd2-literary.zip as jsonzip/epsd2-literary.zip.


epsd2/literary: 0.00B [00:00, ?B/s]

In [5]:
def parsejson(text, meta_d):
    """Recursively collect the word (lemma) dictionaries from a nested `cdl` structure.

    Parameters
    ----------
    text : dict
        A node with a "cdl" key holding a list of child nodes. Child nodes that
        have a "cdl" key themselves are processed recursively.
    meta_d : dict
        Running context shared by the whole traversal. Must contain "label" and
        "id_text". It is updated in place: "label" is replaced whenever a node
        carries one, and "field" is set on a "field-start" node and removed on a
        "field-end" node.

    Returns
    -------
    list of dict
        One dictionary per node that has an "f" key: a shallow copy of that "f"
        dictionary extended with "id_word" (the node's "ref"), "label",
        "id_text" and, while inside a field, "field".

    Raises
    ------
    KeyError
        If `text` has no "cdl" key, a word node has no "ref", or a "field-start"
        node has no "subtype".
    """
    lemmas = []
    for node in text["cdl"]:
        if "cdl" in node:
            lemmas.extend(parsejson(node, meta_d))
        if "label" in node:
            # `label` is the line number; it stays constant until the traversal
            # moves on to a new line.
            meta_d["label"] = node["label"]

        node_type = node.get("type")
        if node_type == "field-start":
            # Sign lists mark fields such as sign, pronunciation, translation.
            meta_d["field"] = node["subtype"]
        elif node_type == "field-end":
            # Drop the key so it is not copied to subsequent lemmas
            # (which may not belong to any field).
            meta_d.pop("field", None)

        if "f" in node:
            # Work on a copy so the parsed JSON passed in by the caller is not modified.
            lemma = dict(node["f"])
            lemma["id_word"] = node["ref"]
            lemma["label"] = meta_d["label"]
            lemma["id_text"] = meta_d["id_text"]
            if "field" in meta_d:
                lemma["field"] = meta_d["field"]
            lemmas.append(lemma)
    return lemmas

In [6]:
# Parse every text of every downloaded project into one flat list of word dictionaries.
lemm_l = []
meta_d = {"label": None, "id_text": None}
for project in project_list:
    file = f'jsonzip/{project.replace("/", "-")}.zip'
    try:
        zip_file = zipfile.ZipFile(file)
    except Exception as error:
        # Missing or unreadable archive: report it and go on with the next project.
        # (`Exception` rather than a bare `except` so that Ctrl-C still interrupts.)
        print(file)
        print(type(error))
        print(error)
        continue
    with zip_file:  # closes the archive even when the loop below is interrupted
        # Only the per-text JSON files in the corpusjson folder are of interest.
        files = [name for name in zip_file.namelist()
                 if "corpusjson" in name and name.endswith('.json')]
        for filename in tqdm(files, desc = project):
            # The 8 characters before '.json' are a slash plus the text ID,
            # so id_text is, for instance, blms/P414332.
            id_text = project + filename[-13:-5]
            # Fresh context per text, so that the last line label (or an unclosed field)
            # of the previous text cannot leak into this one.
            meta_d = {"label": None, "id_text": id_text}
            try:
                text_json_string = zip_file.read(filename).decode('utf-8')
                data_json = json.loads(text_json_string)
                lemm_l.extend(parsejson(data_json, meta_d))
            except Exception as error:
                # Best effort: report the offending file and continue with the next text.
                print(filename)
                print(type(error))
                print(error)

dsst:   0%|          | 0/535 [00:00<?, ?it/s]

epsd2/literary:   0%|          | 0/915 [00:00<?, ?it/s]

In [7]:
# One row per word; keys that a word lacks come out as NaN and are replaced by ''.
words_df = pd.DataFrame(lemm_l).fillna('')
words_df

Unnamed: 0,lang,form,delim,gdl,id_word,label,id_text,cf,gw,sense,norm0,pos,epos,base,morph,cont,norm,aform,field,stem
0,sux,edin-še₃,,"[{'v': 'edin', 'id': 'P356733.3.1.0', 'break':...",P356733.3.1,o 1’,dsst/P356733,,,,,,,,,,,,,
1,sux,mu-da,,"[{'v': 'mu', 'id': 'P356733.3.2.0', 'delim': '...",P356733.3.2,o 1’,dsst/P356733,,,,,,,,,,,,,
2,sux,igi,,"[{'v': 'igi', 'id': 'P356733.3.3.0', 'break': ...",P356733.3.3,o 1’,dsst/P356733,,,,,,,,,,,,,
3,sux,hu-mu-de₃-du₈,,"[{'v': 'hu', 'id': 'P356733.3.4.0', 'break': '...",P356733.3.4,o 1’,dsst/P356733,,,,,,,,,,,,,
4,sux,{ŋeš}al-e,,"[{'det': 'semantic', 'pos': 'pre', 'seq': [{'v...",P356733.4.1,o 2’,dsst/P356733,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262727,sux,rib-x,,"[{'v': 'rib', 'id': 'P346641.8.3.0', 'break': ...",P346641.8.3,6',epsd2/literary/P346641,,,,,u,,,,,,,,
262728,sux,ši-in-x,,"[{'v': 'ši', 'id': 'P346641.8.4.0', 'delim': '...",P346641.8.4,6',epsd2/literary/P346641,,,,,u,,,,,,,,
262729,sux,x,,"[{'x': 'ellipsis', 'id': 'P346641.9.1.0', 'bre...",P346641.9.1,7ʾ,epsd2/literary/P346641,,,,,u,,,,,,,,
262730,sux,x,,"[{'v': 'x', 'id': 'P346641.9.2.0', 'break': 'd...",P346641.9.2,7ʾ,epsd2/literary/P346641,,,,,u,,,,,,,,


In [8]:
# In the `gw` and `sense` columns: every space becomes a hyphen, every comma is removed.
findreplace = {' ': '-', ',': ''}
words_df = words_df.replace(
    {column: findreplace for column in ('gw', 'sense')},
    regex=True,
)

In [9]:
# Keep only the words whose language code starts with 'sux'.
is_sux = words_df['lang'].str.startswith('sux')
words_df = words_df[is_sux]

In [10]:
# Columns needed for the sign analysis; everything else is dropped.
keep = ['form', 'cf', 'gw', 'pos', 'norm0', 'base', 'id_text']
words_df = words_df.loc[:, keep]

In [11]:
# Find texts from other projects whose 7-character text ID also occurs in `dsst`.
# Work on the unique id_text values and use sets for the membership tests: the table
# has one row per word, so testing every row against a list is needlessly slow.
all_id_texts = set(words_df['id_text'])
dsst_id = list({idtext[-7:] for idtext in all_id_texts if idtext.startswith('dsst')})
dsst_lookup = set(dsst_id)
duplicates = [idtext for idtext in all_id_texts
              if idtext[-7:] in dsst_lookup and not idtext.startswith('dsst')]

In [12]:
# Drop the non-dsst editions of texts that are also in dsst. `.copy()` makes the result an
# independent frame, so that adding a column to it later does not raise SettingWithCopyWarning.
words_df = words_df[~words_df.id_text.isin(duplicates)].copy()

In [13]:
# Inspect the word table after language filtering, column selection and duplicate removal.
words_df

Unnamed: 0,form,cf,gw,pos,norm0,base,id_text
0,edin-še₃,,,,,,dsst/P356733
1,mu-da,,,,,,dsst/P356733
2,igi,,,,,,dsst/P356733
3,hu-mu-de₃-du₈,,,,,,dsst/P356733
4,{ŋeš}al-e,,,,,,dsst/P356733
...,...,...,...,...,...,...,...
262727,rib-x,,,u,,,epsd2/literary/P346641
262728,ši-in-x,,,u,,,epsd2/literary/P346641
262729,x,,,u,,,epsd2/literary/P346641
262730,x,,,u,,,epsd2/literary/P346641


In [14]:
separators = ['{', '}', '-']       # characters that separate the signs of a word
separators2 = ['.', '+', '|']      # characters that separate/enclose the parts of a compound
operators = ['&', '%', '@', '×']   # compounds containing one of these are kept whole


def _matching_open_paren(sign):
    """Return the index of the '(' that matches the final ')' of `sign`, or None if there is none."""
    depth = 0
    for position in range(len(sign) - 1, -1, -1):
        char = sign[position]
        if char == ')':
            depth += 1
        elif char == '(':
            depth -= 1
            if depth == 0:
                return position
    return None


def split_signs(form):
    """Split one transliterated word form into a list of signs.

    Steps, in order:
    * the form is split on the characters in `separators`;
    * a sign starting with a digit (1(geš₂), 2(DIŠ), etc.) is lower-cased;
    * for a qualified sign, ending in ')', the text inside the final parentheses replaces
      the sign when that text is upper case:
      |GIŠ×(GIŠ%GIŠ)|(LAK277) becomes LAK277,
      LAK277(|GIŠ×(GIŠ%GIŠ)|) becomes |GIŠ×(GIŠ%GIŠ)|;
      a sign whose final ')' has no matching '(' is left unchanged;
    * a compound such as |DU.DU| or |DU+DU| is split into its components, unless it contains
      one of `operators` (|DU&DU| and |DU.DU&DU| stay whole);
    * outside compounds, '+' (marker of a gloss) also splits the sign.
    """
    # 1(šar₂{gal}) would be torn apart by the brace split below (appears in SKL 38),
    # so it is rewritten first.
    form = form.replace('1(šar₂{gal})', '1(šar₂)-gal')
    for separator in separators:
        form = form.replace(separator, ' ')

    word = []
    for sign in form.split():
        if sign[0].isdigit():
            sign = sign.lower()
        elif sign[-1] == ')':
            start = _matching_open_paren(sign)
            if start is not None:
                qualifier = sign[start + 1:-1]
                if qualifier.isupper():   # leave lower-case qualifiers alone
                    sign = qualifier

        if '|' in sign:
            if not any(operator in sign for operator in operators):
                for separator in separators2:
                    sign = sign.replace(separator, ' ')
                word.extend(sign.split())
                continue
        elif '+' in sign:
            word.extend(sign.replace('+', ' ').split())
            continue
        word.append(sign)
    return word


# One list of signs per row of words_df, in row order.
words_l = [split_signs(form) for form in tqdm(words_df.form)]

  0%|          | 0/259788 [00:00<?, ?it/s]

In [15]:
# Attach the sign lists as a new column. `assign` builds a new frame instead of writing into
# `words_df`, which is a filtered slice at this point (in-place assignment on such a slice
# raises SettingWithCopyWarning).
words_df = words_df.assign(signs=words_l)

In [16]:
# Load the sign list from a local pickle; the next cells use its "value", "utf8" and "name" columns.
# NOTE(review): unpickling can execute arbitrary code — only load 'ogsl.p' when it comes
# from a trusted source.
o = pd.read_pickle('ogsl.p', compression = None)

In [17]:
# Pull the three columns of the sign list out as plain Python lists.
val, utf, names = (list(o[column]) for column in ("value", "utf8", "name"))

In [18]:
# d:  sign name -> content of the utf8 column
# d2: value     -> sign name
# When a key occurs more than once, the last occurrence wins.
d = {name: glyph for name, glyph in zip(names, utf)}
d2 = {reading: name for reading, name in zip(val, names)}

In [19]:
# Flatten the table: one [sign, id_text] pair per sign of every word, in row order.
# Zipping the two columns avoids `iterrows`, which builds a Series object for every row.
sign_l = [
    [sign, id_text]
    for signs, id_text in zip(words_df['signs'], words_df['id_text'])
    for sign in signs
]

In [20]:
# One row per sign occurrence.
signs_df = pd.DataFrame(sign_l, columns = ['value', 'id_text'])
# Look the (lower-cased) value up in the sign list; a value that is not found is kept as its own name.
signs_df['sign_name'] = [d2.get(value.lower(), value) for value in signs_df['value']]
# Same fallback for the utf8 column: a name that is not found is kept as is.
signs_df['utf8'] = [d.get(name, name) for name in signs_df['sign_name']]
signs_df = signs_df[['value', 'sign_name', 'utf8', 'id_text']]
# Drop rows whose sign name is 'X'. `.copy()` makes the result an independent frame, because
# later cells assign columns to it (which would otherwise raise SettingWithCopyWarning).
# NOTE(review): the tables further down show the value 'na' with an empty sign_name —
# possibly the sign name 'NA' became a missing value when the sign list was built; verify ogsl.p.
signs_df = signs_df[signs_df['sign_name'] != 'X'].copy()
signs_df

Unnamed: 0,value,sign_name,utf8,id_text
0,edin,EDIN,𒂔,dsst/P356733
1,še₃,EŠ₂,𒂠,dsst/P356733
2,mu,MU,𒈬,dsst/P356733
3,da,DA,𒁕,dsst/P356733
4,igi,IGI,𒅆,dsst/P356733
...,...,...,...,...
575823,gi₄,GI₄,𒄄,epsd2/literary/P346641
575824,gin₇,DIM₂,𒁶,epsd2/literary/P346641
575825,rib,KAL,𒆗,epsd2/literary/P346641
575827,ši,IGI,𒅆,epsd2/literary/P346641


In [21]:
# Read the alias file, one entry per line. The readings contain non-ASCII characters
# (e.g. ŋu₁₀), so the encoding is given explicitly instead of relying on the platform default.
with open('aliases.txt', encoding='utf-8') as a:
    alias = a.read().splitlines()

In [22]:
# Lookup table filled in the next cell: alias -> first token of its line in the alias file.
alias_d = {}

In [23]:
# Every line holds whitespace-separated tokens: map each token after the first
# to the first one. Lines with fewer than two tokens contribute nothing.
for entry in alias:
    tokens = entry.split()
    alias_d.update({variant: tokens[0] for variant in tokens[1:]})

In [24]:
# Replace every value that has an entry in the alias table; all other values stay as they are.
signs_df['value'] = signs_df['value'].map(lambda reading: alias_d.get(reading, reading))
signs_df

Unnamed: 0,value,sign_name,utf8,id_text
0,edin,EDIN,𒂔,dsst/P356733
1,še₃,EŠ₂,𒂠,dsst/P356733
2,ŋu₁₀,MU,𒈬,dsst/P356733
3,da,DA,𒁕,dsst/P356733
4,igi,IGI,𒅆,dsst/P356733
...,...,...,...,...
575823,gi₄,GI₄,𒄄,epsd2/literary/P346641
575824,gin₇,DIM₂,𒁶,epsd2/literary/P346641
575825,rib,KAL,𒆗,epsd2/literary/P346641
575827,ši,IGI,𒅆,epsd2/literary/P346641


In [25]:
# Text ID -> short label of a proverb collection.
SP_collections = {
    'Q000795': 'SP1',
    'Q000796': 'SP2+6',
    'Q000797': 'SP3',
    'Q000798': 'SP4',
    'Q000799': 'SP5',
    'Q000800': 'SP7',
    'Q000801': 'SP8',
    'Q000802': 'SP9',
    'Q000803': 'SP10',
    'Q000804': 'SP11',
    'Q000805': 'SP12',
    'Q000806': 'SP13',
    'Q000807': 'SP14',
    'Q000808': 'SP15',
    'Q000809': 'SP16',
    'Q000810': 'SP17',
    'Q000811': 'SP18',
    'Q000812': 'SP19',
    'Q000813': 'SP21',
    'Q000814': 'SP22',
    'Q000815': 'SP23',
    'Q000816': 'SP24',
    'Q000817': 'SP25',
    'Q000818': 'SP26',
    'Q000819': 'SP27',
    'Q000820': 'SP28',
    'Q000821': 'SP_Nippur',
    'Q000822': 'SP_Susa',
    'Q000823': 'SP_Ur',
    'Q000824': 'SP_Uruk',
    'Q000825': 'SP_Unknown',
}
            

In [26]:
# Label each sign 'prov' when the last 7 characters of its id_text are one of the
# collection IDs in SP_collections, and 'lit' otherwise.
proverb_ids = set(SP_collections)
signs_df['type'] = [
    'prov' if text_id[-7:] in proverb_ids else 'lit'
    for text_id in signs_df['id_text']
]

In [27]:
# Inspect the sign table, now with the 'type' column.
signs_df

Unnamed: 0,value,sign_name,utf8,id_text,type
0,edin,EDIN,𒂔,dsst/P356733,lit
1,še₃,EŠ₂,𒂠,dsst/P356733,lit
2,ŋu₁₀,MU,𒈬,dsst/P356733,lit
3,da,DA,𒁕,dsst/P356733,lit
4,igi,IGI,𒅆,dsst/P356733,lit
...,...,...,...,...,...
575823,gi₄,GI₄,𒄄,epsd2/literary/P346641,lit
575824,gin₇,DIM₂,𒁶,epsd2/literary/P346641,lit
575825,rib,KAL,𒆗,epsd2/literary/P346641,lit
575827,ši,IGI,𒅆,epsd2/literary/P346641,lit


In [28]:
# All signs that were labelled 'prov'.
proverbs = signs_df.loc[signs_df['type'] == 'prov']

In [29]:
# Everything that was not labelled 'prov'.
literary = signs_df.loc[signs_df['type'] != 'prov']

In [30]:
# Frequency of every (value, sign_name) pair among the proverb signs, rarest first.
proverbs[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,hul₃,|GIŠ%GIŠ|,1
1,sag₁₀,|IGI.ERIN₂|,1
2,U₈,|LAGAB×(GUD&GUD)|,1
3,sagi,|SILA₃.ŠU.GABA|,1
4,saga₁₁,KIN,1
...,...,...,...
924,nu,NU,627
925,ba,BA,698
926,ŋu₁₀,MU,816
927,e,E,996


In [31]:
# Frequency of every (value, sign_name) pair among the non-proverb signs, rarest first.
literary[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,ṭa₂,TA,1
1,LU₂.NE,LU₂.NE,1
2,su(2),su(2),1
3,MAN,|U.U|,1
4,si₂₃,GUL,1
...,...,...,...
2478,na,,10990
2479,ba,BA,11004
2480,e,E,12149
2481,ŋu₁₀,MU,16944


In [32]:
# Draw equally sized random samples from both groups so that their frequency tables can be
# compared. The seed is fixed so that re-running the notebook reproduces the same samples.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42
proverbs_s = proverbs.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
literary_s = literary.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [33]:
# Frequency of every (value, sign_name) pair in the proverb sample, rarest first.
proverbs_s[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,1,1,1
1,ziₓ(IGI@g),ziₓ(IGI@g),1
2,gug₂,LU₃,1
3,zur,AMAR,1
4,|KA×A|,|KA×A|,1
...,...,...,...
698,na,,205
699,ba,BA,235
700,ŋu₁₀,MU,287
701,e,E,355


In [34]:
# Frequency of every (value, sign_name) pair in the non-proverb sample, rarest first.
literary_s[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,$NI,$NI,1
1,zalag₂,ERIN₂,1
2,dubsig,IL₂,1
3,siškur₂,|AMAR×ŠE.AMAR×ŠE|,1
4,ziz₂,ZIZ₂,1
...,...,...,...
760,ba,BA,237
761,na,,238
762,e,E,270
763,ŋu₁₀,MU,351


In [35]:
# Signs of text Q000795. The 7-character text ID at the end of id_text is compared exactly,
# as the other cells do, instead of running a regex substring search over the whole id_text.
SP1 = signs_df[signs_df.id_text.str[-7:] == 'Q000795']

In [36]:
# Inspect the signs of text Q000795.
SP1

Unnamed: 0,value,sign_name,utf8,id_text,type
195933,niŋ₂,GAR,𒃻,epsd2/literary/Q000795,prov
195934,gi,GI,𒄀,epsd2/literary/Q000795,prov
195935,na,,𒈾,epsd2/literary/Q000795,prov
195936,da,DA,𒁕,epsd2/literary/Q000795,prov
195937,a,A,𒀀,epsd2/literary/Q000795,prov
...,...,...,...,...,...
198710,ta,TA,𒋫,epsd2/literary/Q000795,prov
198711,šar₂,ŠAR₂,𒊹,epsd2/literary/Q000795,prov
198713,hi₂,GAN,𒃶,epsd2/literary/Q000795,prov
198714,a,A,𒀀,epsd2/literary/Q000795,prov


In [37]:
# Frequency of every (value, sign_name) pair in text Q000795, rarest first.
SP1[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,1,1,1
1,engar,APIN,1
2,er₁₀,DU,1
3,er₂,|A.IGI|,1
4,eŋir,EGIR,1
...,...,...,...
375,ba,BA,67
376,e,E,68
377,ŋu₁₀,MU,70
378,nu,NU,72


In [38]:
# Signs of text Q000482. The 7-character text ID at the end of id_text is compared exactly,
# as the other cells do, instead of running a regex substring search over the whole id_text.
LiEB = signs_df[signs_df.id_text.str[-7:] == "Q000482"]

In [39]:
# Frequency of every (value, sign_name) pair in text Q000482, rarest first.
LiEB[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,1(diš),DIŠ,1
1,išib,ME,1
2,iš,IŠ,1
3,isimu₂,GUL,1
4,ir,IR,1
...,...,...,...
188,ki,KI,18
189,ŋu₁₀,MU,18
190,li,LI,18
191,en,EN,26


In [40]:
# Text ID -> title of the composition.
decad = {
    'Q000395': 'Shulgi A',
    'Q000481': 'Lipit-Eshtar A',
    'Q000778': 'Song of the Hoe',
    'Q000623': 'Inana B',
    'Q000619': 'Enlil A',
    'Q000751': 'Kesh Temple Hymn',
    'Q000335': "Enki's Journey to Nippur",
    'Q000339': 'Inana and Ebih',
    'Q000736': 'Nungal A',
    'Q000365': 'Gilgamesh and Huwawa A',
}


In [41]:
# Signs of the texts whose ID (last 7 characters of id_text) is a key of `decad`.
in_decad = signs_df['id_text'].str[-7:].isin(decad.keys())
signs_decad = signs_df[in_decad]

In [42]:
# Inspect the signs that belong to the compositions listed in `decad`.
signs_decad

Unnamed: 0,value,sign_name,utf8,id_text,type
160736,lugal,LUGAL,𒈗,epsd2/literary/Q000481,lit
160737,munus,SAL,𒊩,epsd2/literary/Q000481,lit
160738,dug₄,KA,𒅗,epsd2/literary/Q000481,lit
160739,ga,GA,𒂵,epsd2/literary/Q000481,lit
160740,šag₄,ŠA₃,𒊮,epsd2/literary/Q000481,lit
...,...,...,...,...,...
567560,d,AN,𒀭,epsd2/literary/Q000335,lit
567561,en,EN,𒂗,epsd2/literary/Q000335,lit
567562,ki,KI,𒆠,epsd2/literary/Q000335,lit
567563,zag,ZAG,𒍠,epsd2/literary/Q000335,lit


In [43]:
# Frequency of every (value, sign_name) pair in the `decad` compositions, rarest first.
signs_decad[['value', 'sign_name']].value_counts(ascending=True).reset_index(name='count')

Unnamed: 0,value,sign_name,count
0,siškur₂,|AMAR×ŠE.AMAR×ŠE|,1
1,sim,NAM,1
2,lulim,|GIR₃×(LU.IGI)|,1
3,dab,DIB,1
4,idigna,|GU₂.GAR₃|,1
...,...,...,...
716,an,AN,380
717,en,EN,418
718,ba,BA,422
719,a,A,549


In [51]:
# For every composition in `decad`: the number of sign tokens, and the number of distinct
# sign names and distinct values as a percentage of that token count.
text_ids = signs_df.id_text.str[-7:]   # computed once instead of once per composition
dec = []
for comp, title in decad.items():
    comp_signs = signs_df[text_ids == comp]
    n_tokens = len(comp_signs)
    if n_tokens == 0:
        # Composition not present in the loaded projects: report an empty row
        # instead of failing with a division by zero.
        dec.append([title, 0, float('nan'), float('nan')])
        continue
    n_sign_names = len(set(comp_signs['sign_name']))
    n_values = len(set(comp_signs['value']))
    dec.append([title, n_tokens,
                round(n_sign_names / n_tokens * 100, 2),
                round(n_values / n_tokens * 100, 2)])

In [54]:
# Show the per-composition figures as a table.
columns = ["composition", "length", "signs (%)", "values (%)"]
decad_stats = pd.DataFrame(dec, columns=columns)
decad_stats

Unnamed: 0,composition,length,signs (%),values (%)
0,Shulgi A,1261,16.26,22.05
1,Lipit-Eshtar A,1086,17.96,22.74
2,Song of the Hoe,1207,15.91,19.8
3,Inana B,1721,12.09,15.63
4,Enlil A,1841,13.04,17.22
5,Kesh Temple Hymn,1740,12.01,14.89
6,Enki's Journey to Nippur,1368,14.62,18.64
7,Inana and Ebih,2111,11.27,14.5
8,Nungal A,1737,12.49,16.52
9,Gilgamesh and Huwawa A,3870,6.33,9.2


In [55]:
# Rows of the per-collection statistics table; filled by the loop in the next cell.
SP = []

In [57]:
# For every collection in SP_collections: the number of sign tokens, and the number of
# distinct sign names and distinct values as a percentage of that token count.
# The list is rebuilt from scratch so that re-running this cell does not append duplicate rows.
SP = []
text_ids = signs_df.id_text.str[-7:]   # computed once instead of once per collection
for comp, title in SP_collections.items():
    comp_signs = signs_df[text_ids == comp]
    n_tokens = len(comp_signs)
    if n_tokens == 0:
        # Collection not present in the loaded projects: report an empty row
        # instead of failing with a division by zero.
        SP.append([title, 0, float('nan'), float('nan')])
        continue
    n_sign_names = len(set(comp_signs['sign_name']))
    n_values = len(set(comp_signs['value']))
    SP.append([title, n_tokens,
               round(n_sign_names / n_tokens * 100, 2),
               round(n_values / n_tokens * 100, 2)])

In [59]:
# Show the per-collection figures as a table.
columns = ["composition", "length", "signs (%)", "values (%)"]
sp_stats = pd.DataFrame(SP, columns=columns)
sp_stats

Unnamed: 0,composition,length,signs (%),values (%)
0,SP1,2564,10.1,14.82
1,SP2+6,2922,9.24,13.48
2,SP3,2817,9.94,15.55
3,SP4,643,25.19,31.88
4,SP5,2296,10.02,14.11
5,SP7,932,20.92,27.9
6,SP8,1047,18.62,24.83
7,SP9,563,25.22,33.21
8,SP10,177,49.72,53.67
9,SP11,678,21.53,27.88
