In [1]:
import pandas as pd
import zipfile
import json
from tqdm.auto import tqdm
import os
import sys
util_dir = os.path.abspath('../compass/utils')
sys.path.append(util_dir)
import utils

In [2]:
# Make sure the working directories exist before anything is written to them.
# 'jsonzip' is where the project ZIP files are read from further down;
# 'output' is presumably used for results saved later on.
for directory in ('jsonzip', 'output'):
    os.makedirs(directory, exist_ok=True)

In [3]:
# Ask for one or more project names (the example below separates them with a comma).
# The answer is lower-cased and stripped of surrounding whitespace.
projects = input('Project(s): ').lower().strip()
# NOTE(review): format_project_list comes from ../compass/utils; presumably it turns
# the typed string into a list of project names -- confirm in that module.
project_list = utils.format_project_list(projects)

Project(s):  epsd2/literary, dsst


In [4]:
# Download the JSON ZIP of every requested project. Judging from the log printed
# below, each archive is saved as jsonzip/<project with '/' replaced by '-'>.zip.
# NOTE(review): the return value replaces project_list; presumably it is the list of
# projects that were actually downloaded -- confirm in utils.oracc_download.
project_list = utils.oracc_download(project_list)

Saving http://oracc.org/epsd2/literary/json/epsd2-literary.zip as jsonzip/epsd2-literary.zip.


epsd2/literary: 0.00B [00:00, ?B/s]

Saving http://oracc.org/dsst/json/dsst.zip as jsonzip/dsst.zip.


dsst: 0.00B [00:00, ?B/s]

In [5]:
def parsejson(text, meta_d):
    """Walk a `cdl` tree recursively and return the lemma dictionaries in it.

    Parameters
    ----------
    text : dict
        A node that has a "cdl" key holding a list of child nodes.
    meta_d : dict
        Running context shared by all recursion levels. It must contain
        "label" and "id_text"; "label" is updated whenever a node carries a
        label, and "field" is added/removed on field-start/field-end nodes.
        The dictionary is modified in place.

    Returns
    -------
    list of dict
        The "f" dictionary of every node that has one, in document order.
        Each is extended in place with "id_word" (the node's "ref"), "label",
        "id_text" and, while inside a field, "field".
    """
    collected = []
    for node in text["cdl"]:
        # Nested content is processed before this node's own keys are looked at.
        if "cdl" in node:
            collected += parsejson(node, meta_d)

        # `label` is the line number; it stays valid until the next label shows up.
        if "label" in node:
            meta_d["label"] = node["label"]

        # Sign lists mark fields (sign, pronunciation, translation, ...).
        node_type = node.get("type")
        if node_type == "field-start":
            meta_d["field"] = node["subtype"]
        elif node_type == "field-end":
            # Drop the key so later lemmas outside a field do not inherit it.
            meta_d.pop("field", None)

        if "f" not in node:
            continue

        entry = node["f"]
        entry["id_word"] = node["ref"]
        entry["label"] = meta_d["label"]
        entry["id_text"] = meta_d["id_text"]
        if "field" in meta_d:
            entry["field"] = meta_d["field"]
        collected.append(entry)
    return collected

In [6]:
# Read every text of every downloaded project and collect its words in lemm_l.
lemm_l = []                                   # one dictionary per word, all projects together
meta_d = {"label": None, "id_text": None}     # running context shared with parsejson()
for project in project_list:
    file = f'jsonzip/{project.replace("/", "-")}.zip'
    try:
        zip_file = zipfile.ZipFile(file)       # create a Zipfile object
    except Exception as err:                   # missing or corrupt archive: report and move on
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt still works
        print(file)
        print(type(err))
        print(err)
        continue
    with zip_file:                             # the archive is closed even if the loop is interrupted
        # keep only the JSON files of individual texts (those under corpusjson)
        files = [name for name in zip_file.namelist()
                 if "corpusjson" in name and name.endswith('.json')]
        for filename in tqdm(files, desc = project):       # iterate over the file names
            # the 8 characters before '.json' are '/' plus the text number,
            # so id_text is, for instance, epsd2/literary/Q000802
            id_text = project + filename[-13:-5]
            # start every text with a clean context, so that a label or an
            # unclosed field (e.g. after an error) cannot leak into the next text
            meta_d["label"] = None
            meta_d.pop("field", None)
            meta_d["id_text"] = id_text
            try:
                text_json_string = zip_file.read(filename).decode('utf-8')   # read and decode one text
                data_json = json.loads(text_json_string)                     # parse it into a dictionary
                lemm_l.extend(parsejson(data_json, meta_d))                  # and collect its words
            except Exception as err:           # a broken text is reported and skipped
                print(filename)
                print(type(err))
                print(err)

epsd2/literary:   0%|          | 0/915 [00:00<?, ?it/s]

dsst:   0%|          | 0/535 [00:00<?, ?it/s]

In [7]:
# One row per word. Keys that a word does not have come out as NaN
# (Not a Number); these are replaced by the empty string.
words_df = pd.DataFrame(lemm_l).fillna('')
words_df

Unnamed: 0,lang,form,delim,gdl,cf,gw,sense,norm0,pos,epos,base,morph,id_word,label,id_text,cont,norm,aform,field,stem
0,sux,nir-ŋal₂-e,,"[{'v': 'nir', 'id': 'Q000802.1.1.0', 'delim': ...",nirŋal,authoritative,authoritative,"nirŋal,e",AJ,AJ,nir-ŋal₂,"~,e",Q000802.1.1,"proverb 9.a1, 1",epsd2/literary/Q000802,,,,,
1,sux,a-na,,"[{'v': 'a', 'id': 'Q000802.1.2.0', 'delim': '-...",ana,what?,what?,ana,QP,QP,a-na,~,Q000802.1.2,"proverb 9.a1, 1",epsd2/literary/Q000802,,,,,
2,sux,bi₂-in-dug₄,,"[{'v': 'bi₂', 'id': 'Q000802.1.3.0', 'delim': ...",dug,speak,"to speak, talk, say",ba.i.n:dug,V/t,V/t,dug₄,ba.i.n:~,Q000802.1.3,"proverb 9.a1, 1",epsd2/literary/Q000802,,,,,
3,sux,nu-sag₉,,"[{'v': 'nu', 'id': 'Q000802.1.4.0', 'delim': '...",sag,good,"(to be) good, sweet, beautiful",nu:sag,V/i,V/i,sag₉,nu:~,Q000802.1.4,"proverb 9.a1, 1",epsd2/literary/Q000802,,,,,
4,sux,nir-ŋal₂-e,,"[{'v': 'nir', 'id': 'Q000802.2.1.0', 'delim': ...",nirŋal,authoritative,authoritative,"nirŋal,e",AJ,AJ,nir-ŋal₂,"~,e",Q000802.2.1,"proverb 9.a2, 2",epsd2/literary/Q000802,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262727,sux,x-enim-ŋu₁₀,,"[{'x': 'ellipsis', 'id': 'P356395.13.1.0', 'br...",,,,,,,,,P356395.13.1,12’,dsst/P356395,,,,,
262728,sux,x,,"[{'x': 'ellipsis', 'id': 'P356395.13.2.0', 'br...",,,,,,,,,P356395.13.2,12’,dsst/P356395,,,,,
262729,sux,x,,"[{'x': 'ellipsis', 'id': 'P356395.14.1.0', 'br...",,,,,,,,,P356395.14.1,13’,dsst/P356395,,,,,
262730,sux,x,,"[{'v': 'x', 'id': 'P356395.14.2.0'}]",,,,,,,,,P356395.14.2,13’,dsst/P356395,,,,,


In [8]:
# In guide word (gw) and sense: every space becomes a hyphen, every comma is dropped.
findreplace = {' ' : '-', ',' : ''}
per_column = {column: findreplace for column in ('gw', 'sense')}
words_df = words_df.replace(to_replace=per_column, regex=True)

In [9]:
# Keep only rows whose form contains neither 'x' nor 'X' and whose
# language code starts with 'sux'.
has_x = words_df['form'].str.contains('x') | words_df['form'].str.contains('X')
is_sux = words_df['lang'].str.startswith('sux')
words_df = words_df[~has_x & is_sux]

In [10]:
# Columns needed in the rest of the notebook; everything else is dropped.
keep = ['form', 'cf', 'gw', 'pos', 'norm0', 'base', 'id_text']
words_df = words_df.loc[:, keep]

In [11]:
# Find texts that are present both in dsst and in another project.
# A text is identified by the last 7 characters of its id_text (e.g. P356395).
# Work on the unique id_texts and test membership against a set, instead of
# scanning a list once for every row of the table.
id_texts = words_df['id_text'].unique()
dsst_numbers = {idtext[-7:] for idtext in id_texts if idtext.startswith('dsst')}
dsst_id = sorted(dsst_numbers)       # text numbers that occur in dsst
# id_texts outside dsst that carry one of those numbers
duplicates = sorted(idtext for idtext in id_texts
                    if idtext[-7:] in dsst_numbers and not idtext.startswith('dsst'))

In [12]:
# Drop the non-dsst copies of texts that are also available in dsst.
is_duplicate = words_df['id_text'].isin(duplicates)
words_df = words_df[~is_duplicate]

In [13]:
# Show the table after filtering (pandas displays only the first and last rows).
words_df

Unnamed: 0,form,cf,gw,pos,norm0,base,id_text
0,nir-ŋal₂-e,nirŋal,authoritative,AJ,"nirŋal,e",nir-ŋal₂,epsd2/literary/Q000802
1,a-na,ana,what?,QP,ana,a-na,epsd2/literary/Q000802
2,bi₂-in-dug₄,dug,speak,V/t,ba.i.n:dug,dug₄,epsd2/literary/Q000802
3,nu-sag₉,sag,good,V/i,nu:sag,sag₉,epsd2/literary/Q000802
4,nir-ŋal₂-e,nirŋal,authoritative,AJ,"nirŋal,e",nir-ŋal₂,epsd2/literary/Q000802
...,...,...,...,...,...,...,...
262718,ib₂-sa₂,,,,,,dsst/P356395
262719,saŋ-du₃,,,,,,dsst/P356395
262721,us₂-us₂-i₃-gu₇,,,,,,dsst/P356395
262724,a-ra-ŋar,,,,,,dsst/P356395


In [14]:
# Split every word form into a list of signs. The result, words_l, has one list
# per entry of words_df.form, in the same order.
# NOTE(review): the loop variable `o` below shares its name with the variable that
# holds the table read from 'ogsl.p' further down; re-running this cell after that
# one overwrites the table with a string.
words_l = []
separators = ['{', '}', '-']        # characters that separate signs within a word
separators2 = ['.', '+', '|']       # characters that separate the parts of a |...| compound
operators = ['&', '%', '@', '×']    # a compound containing one of these is kept whole
for e in tqdm(words_df.form):
    word = []   # the signs of the current word
    if '1(šar₂{gal})' in e: # this is cheating but it seems to work (appears in SKL 38)
            e = e.replace('1(šar₂{gal})', '1(šar₂)-gal')
    for s in separators: # first split word into signs   
        e = e.replace(s, ' ').strip()
    s_l = e.split()
    for sign in s_l:
        if sign[0].isdigit(): # 1(geš₂), 2(DIŠ), etc. are lower-cased as a whole
            sign = sign.lower()
        elif sign[-1] == ')': # qualified sign - get only the qualifier
            stack = []  # |GIŠ×(GIŠ%GIŠ)|(LAK277) becomes LAK277
            ind = {}    # LAK277(|GIŠ×(GIŠ%GIŠ)|) becomes |GIŠ×(GIŠ%GIŠ)|
            # scan from right to left: stack holds the positions of closing parens
            # that are still open; ind maps closing position -> opening position
            for i, c in reversed(list(enumerate(sign))):
                if c == ')':
                    stack.append(i)
                if c == '(':
                    ind[stack.pop()] = i   # find the opening paren that belongs to the closing paren at position -1    
            start = ind[len(sign)-1]   # this line fails on 1(šar₂{gal}) in SKL (hence the replacement above).
            t = sign[start+1:-1]       # the text between the outermost pair of parens
            if t.isupper(): # only an upper-case qualifier replaces the sign
                sign = t
            
        if '|' in sign:  # separate |DU.DU| and |DU+DU| into its components but not |DU&DU|
                        # and also not |DU.DU&DU|
            flag = False   # becomes True if the compound contains an operator
            for o in operators:
                if o in sign:
                    flag = True
            if not flag:
                for s in separators2:
                    sign = sign.replace(s, ' ').strip() 
                sign_l = sign.split()
                word.extend(sign_l)
                continue
            # with an operator the compound falls through and is appended unchanged
        elif "+" in sign:  # + as marker of gloss
            sign = sign.replace('+', ' ').strip()
            sign_l = sign.split()
            word.extend(sign_l)
            continue
        word.append(sign)
    words_l.append(word)      

  0%|          | 0/212755 [00:00<?, ?it/s]

In [15]:
# Attach the sign lists as a new column. words_l was built by iterating over
# words_df.form, so it has one entry per row, in row order.
words_df['signs'] = words_l

In [16]:
# Load a table from the local pickle 'ogsl.p'; its columns "value", "utf8" and
# "name" are used in the following cells.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source. The file is not created in the part of the notebook
# shown here, so it must already be present in the working directory.
o = pd.read_pickle('ogsl.p', compression = None)

In [17]:
# Three parallel lists taken from the same rows of the table:
# sign value, UTF-8 representation, and sign name.
val, utf, names = (list(o[column]) for column in ("value", "utf8", "name"))

In [18]:
# Lookup tables: d maps sign name -> utf8, d2 maps sign value -> sign name.
# When a key occurs more than once, the last occurrence wins.
d = {name: glyph for name, glyph in zip(names, utf)}
d2 = {value: name for value, name in zip(val, names)}

In [19]:
# Flatten the table to one [sign, id_text] pair per sign, in reading order.
sign_l = [
    [sign, id_text]
    for signs, id_text in zip(words_df['signs'], words_df['id_text'])
    for sign in signs
]

In [20]:
# One row per sign. The value is looked up in lower case in d2 to get the sign
# name, and the sign name is looked up in d to get utf8; when a lookup fails
# the looked-up item itself is kept.
signs_df = pd.DataFrame(sign_l, columns = ['value', 'id_text'])
sign_names = [d2.get(reading.lower(), reading) for reading in signs_df['value']]
signs_df['sign_name'] = sign_names
signs_df['utf8'] = [d.get(name, name) for name in signs_df['sign_name']]
signs_df = signs_df.loc[:, ['value', 'sign_name', 'utf8', 'id_text']]
signs_df

Unnamed: 0,value,sign_name,utf8,id_text
0,nir,|NUN&NUN|,𒉪,epsd2/literary/Q000802
1,ŋal₂,IG,𒅅,epsd2/literary/Q000802
2,e,E,𒂊,epsd2/literary/Q000802
3,a,A,𒀀,epsd2/literary/Q000802
4,na,,𒈾,epsd2/literary/Q000802
...,...,...,...,...
500441,gu₇,|KA×GAR|,𒅥,dsst/P356395
500442,a,A,𒀀,dsst/P356395
500443,ra,RA,𒊏,dsst/P356395
500444,ŋar,GAR,𒃻,dsst/P356395
