In [None]:
import pandas as pd
import zipfile
import json
from tqdm.auto import tqdm
import os
import sys
# Make the project-local `utils` module importable: its directory
# (../compass/utils, resolved against the current working directory) is
# appended to the module search path before the import below.
util_dir = os.path.abspath('../compass/utils')
sys.path.append(util_dir)
import utils  # provides format_project_list() and oracc_download(), used below

In [None]:
# Ensure the working directories exist; `exist_ok` makes re-running this cell harmless.
# `jsonzip` is where the project ZIP archives are read from further down.
for directory in ('jsonzip', 'output'):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Ask which project(s) to process; the answer is lower-cased and stripped of
# surrounding whitespace before it is handed to the project helper.
projects = input('Project(s): ').lower().strip()
# NOTE(review): presumably turns the raw string into a list of project names
# (the result is iterated over when the ZIP files are read) — confirm in utils.
project_list = utils.format_project_list(projects)

In [None]:
# NOTE(review): presumably downloads one ZIP per project into `jsonzip/` and returns
# the projects that could be fetched (the reading loop opens `jsonzip/<project>.zip`)
# — confirm against utils.oracc_download.
project_list = utils.oracc_download(project_list)

In [None]:
def parsejson(text, meta_d):
    """Collect the lemma dictionaries found in a (nested) "cdl" structure.

    Walks the list stored under ``text["cdl"]``, recursing into every node that
    has its own "cdl" list, and gathers the "f" dictionary of each node that
    carries one.

    Parameters
    ----------
    text : dict
        A node with a "cdl" key holding a list of child nodes.
    meta_d : dict
        Running context shared across the whole traversal. Must contain the keys
        "label" and "id_text". It is updated in place: "label" is overwritten
        whenever a node has a "label", and "field" is set by a "field-start" node
        and removed again by a "field-end" node.

    Returns
    -------
    list of dict
        The "f" dictionaries in document order. Each one is the same object as in
        the input (it is modified in place) and gains the keys "id_word" (the
        node's "ref"), "label", "id_text" and, while a field is open, "field".
    """
    collected = []
    for node in text["cdl"]:
        # Descend first: children are processed before this node's own keys.
        if "cdl" in node:
            collected += parsejson(node, meta_d)

        # `label` is the line number; it stays in force until the next node
        # that carries a label.
        if "label" in node:
            meta_d["label"] = node['label']

        # Sign lists mark fields (sign, pronunciation, translation, ...).
        node_type = node.get("type")
        if node_type == "field-start":
            meta_d["field"] = node["subtype"]
        elif node_type == "field-end":
            # Drop the key so that later lemmas, which may not belong to any
            # field, do not inherit it.
            meta_d.pop("field", None)

        if "f" not in node:
            continue
        entry = node["f"]
        entry["id_word"] = node["ref"]
        entry['label'] = meta_d["label"]
        entry["id_text"] = meta_d["id_text"]
        if "field" in meta_d:
            entry["field"] = meta_d["field"]
        collected.append(entry)
    return collected

In [None]:
# Read every text of every project from its ZIP archive and collect all lemmas.
#   lemm_l : flat list of lemma dictionaries produced by parsejson()
#   meta_d : running context handed to parsejson() (current line label and text ID)
lemm_l = []
meta_d = {"label": None, "id_text": None}
for project in project_list:
    # Sub-project separators are replaced by hyphens in the archive name.
    file = f'jsonzip/{project.replace("/", "-")}.zip'
    try:
        zip_file = zipfile.ZipFile(file)       # create a ZipFile object
    except Exception as error:
        # Missing or corrupt archive: report it and move on to the next project.
        # (`Exception` rather than a bare `except` so Ctrl-C still interrupts.)
        print(file), print(type(error)), print(error)
        continue
    with zip_file:                             # the archive is closed even if the loop is interrupted
        # Keep only the per-text JSON files in the corpusjson directory
        # (one file per P, Q, or X number).
        files = [name for name in zip_file.namelist()
                 if "corpusjson" in name and name.endswith('.json')]
        for filename in tqdm(files, desc = project):       # iterate over the file names
            # The last 8 characters before ".json" are "/" plus the 7-character
            # text number, so id_text is, for instance, blms/P414332.
            id_text = project + filename[-13:-5]
            meta_d["id_text"] = id_text
            # Start every text with a clean context so that a line label or an
            # unclosed field from the previous text cannot leak into this one.
            meta_d["label"] = None
            meta_d.pop("field", None)
            try:
                text_json_string = zip_file.read(filename).decode('utf-8')   # read and decode the JSON file of one text
                data_json = json.loads(text_json_string)                     # parse it (essentially a dictionary)
                lemm_l.extend(parsejson(data_json, meta_d))                  # and send it to parsejson()
            except Exception as error:
                # A single unreadable text must not abort the whole run: report and continue.
                print(filename), print(type(error)), print(error)

In [None]:
# One row per lemma; attributes a lemma lacks would show up as NaN and are
# replaced by the empty string so that string operations work on every column.
words_df = pd.DataFrame(lemm_l).fillna('')
words_df

In [None]:
# In the `gw` and `sense` columns, turn spaces into hyphens and delete commas.
# `regex=True` makes the keys match anywhere inside a cell; without it
# only cells that equal the key exactly would be replaced.
findreplace = {' ' : '-', ',' : ''}
words_df = words_df.replace({'gw' : findreplace, 'sense' : findreplace}, regex=True)

In [None]:
# Keep a word only if its form contains neither 'x' nor 'X'
# (presumably the markers for illegible signs — confirm) and its language code
# starts with 'sux'.
words_df = words_df[
    ~words_df.form.str.contains('[xX]')
    & words_df.lang.str.startswith('sux')
]

In [None]:
# Reduce the frame to the columns used from here on.
# Selecting with a list raises KeyError if any of these columns is absent from the data.
keep = ['form', 'cf', 'gw', 'pos', 'norm0', 'base', 'id_text']
words_df = words_df[keep]

In [None]:
# Find texts that occur both in a project whose name starts with 'dsst' and in
# another project, identified by the last 7 characters of id_text (the text number).
#   dsst_id    : text numbers that occur under a 'dsst' id_text
#   duplicates : id_texts outside 'dsst' that share one of those numbers
# Sets are used for the look-ups so the work is proportional to the number of
# distinct texts, not to (number of word rows) x (number of dsst texts).
id_texts = set(words_df['id_text'])
dsst_numbers = {idtext[-7:] for idtext in id_texts if idtext.startswith('dsst')}
dsst_id = list(dsst_numbers)
duplicates = [idtext for idtext in id_texts
              if not idtext.startswith('dsst') and idtext[-7:] in dsst_numbers]

In [None]:
# Drop every word whose text is listed in `duplicates`, so that a text found both
# inside and outside 'dsst' is kept only under its 'dsst' id_text.
words_df = words_df[~words_df.id_text.isin(duplicates)]

In [None]:
# Display the filtered word table for inspection.
words_df

In [None]:
# Split every word form into its signs; `words_l` holds one list of signs per row
# of `words_df`, in row order.
separators = ['{', '}', '-']        # boundaries between the signs of a word
separators2 = ['.', '+', '|']       # boundaries inside a simple compound such as |DU.DU|
operators = ['&', '%', '@', '×']    # a compound containing any of these is kept whole


def _matching_open_paren(sign):
    """Return the index of the '(' paired with the ')' that ends `sign`.

    The string is scanned from right to left while tracking the nesting depth;
    the scan stops at the first '(' that brings the depth back to zero.
    Returns None if no such '(' exists (unbalanced parentheses).
    """
    depth = 0
    for position in range(len(sign) - 1, -1, -1):
        char = sign[position]
        if char == ')':
            depth += 1
        elif char == '(':
            depth -= 1
            if depth == 0:
                return position
    return None


def _form_to_signs(form):
    """Split one word form (a string) into a list of sign strings.

    Steps, in order:
    1. The form is cut at the characters in `separators`.
    2. A piece starting with a digit, e.g. 1(geš₂) or 2(DIŠ), is lower-cased.
    3. Otherwise, a piece ending in ')' is a qualified sign; it is replaced by the
       text inside its final parentheses when that text is upper case:
       |GIŠ×(GIŠ%GIŠ)|(LAK277) becomes LAK277 and
       LAK277(|GIŠ×(GIŠ%GIŠ)|) becomes |GIŠ×(GIŠ%GIŠ)|.
       A piece whose final ')' has no matching '(' is left unchanged.
    4. A piece containing '|' but none of `operators` is cut at the characters in
       `separators2` (|DU.DU| and |DU+DU| are split, |DU&DU| and |DU.DU&DU| are not).
    5. Any other piece containing '+' (marker of a gloss) is cut at '+'.
    """
    # Special case (appears in SKL 38): the braces inside the parentheses would be
    # treated as sign boundaries in step 1 and tear the numeral apart.
    form = form.replace('1(šar₂{gal})', '1(šar₂)-gal')
    for separator in separators:
        form = form.replace(separator, ' ')

    signs = []
    for sign in form.split():
        if sign[0].isdigit():
            sign = sign.lower()
        elif sign[-1] == ')':
            start = _matching_open_paren(sign)
            if start is not None:
                qualifier = sign[start + 1:-1]
                if qualifier.isupper():     # a lower-case qualifier is left alone
                    sign = qualifier

        if '|' in sign:
            if not any(operator in sign for operator in operators):
                for separator in separators2:
                    sign = sign.replace(separator, ' ')
                signs.extend(sign.split())
                continue
        elif '+' in sign:
            signs.extend(sign.replace('+', ' ').split())
            continue
        signs.append(sign)
    return signs


words_l = [_form_to_signs(form) for form in tqdm(words_df.form)]

In [None]:
# Attach the sign lists as a new column `signs`. `words_df` is the result of several
# row/column selections, and item assignment on such a frame can raise
# SettingWithCopyWarning; `assign` returns a new frame and avoids that.
words_df = words_df.assign(signs=words_l)

In [None]:
# Load the pickled sign-list table; its "value", "utf8" and "name" columns are read below.
# NOTE(security): unpickling can execute arbitrary code — only load `ogsl.p` from a trusted source.
o = pd.read_pickle('ogsl.p', compression = None)

In [None]:
# Pull the three columns out as plain, position-aligned lists.
val = list(o["value"])
utf = list(o["utf8"])
names = list(o["name"])

In [None]:
# Look-up tables built from the aligned lists:
#   d  : name  -> utf8
#   d2 : value -> name
# If a key occurs more than once, the last occurrence wins.
d = dict(zip(names, utf))
d2 = dict(zip(val,names))

In [None]:
# Flatten the word table to one [sign, id_text] pair per sign, keeping the
# order of the words and of the signs within each word.
sign_l = [
    [sign, id_text]
    for signs, id_text in zip(words_df['signs'], words_df['id_text'])
    for sign in signs
]

In [None]:
# One row per sign occurrence.
signs_df = pd.DataFrame(sign_l, columns = ['value', 'id_text'])
# Look up the sign name by the lower-cased value; a value not found in d2 is kept as is.
signs_df['sign_name'] = [d2.get(s.lower(), s) for s in signs_df.value]
# Look up the utf8 entry by sign name; a name not found in d is kept as is.
signs_df['utf8'] = [d.get(n, n) for n in signs_df.sign_name]
# Fix the column order for display and later use.
signs_df = signs_df[['value', 'sign_name', 'utf8', 'id_text']]
signs_df

In [None]:
# Mapping from Q-number (Q000795 through Q000825) to a short collection label.
# NOTE(review): the labels presumably stand for Sumerian Proverbs collections — confirm.
# As written, collection 6 shares the entry 'SP2+6' and there is no 'SP20' entry.
SP_collections = {'Q000795' : 'SP1', 
            'Q000796' : 'SP2+6',
            'Q000797' : 'SP3',
            'Q000798' : 'SP4',
            'Q000799' : 'SP5',
            'Q000800' : 'SP7',
            'Q000801' : 'SP8',
            'Q000802' : 'SP9',
            'Q000803' : 'SP10', 
            'Q000804' : 'SP11',
            'Q000805' : 'SP12',
            'Q000806' : 'SP13',
            'Q000807' : 'SP14',
            'Q000808' : 'SP15', 
            'Q000809' : 'SP16',
            'Q000810' : 'SP17',
            'Q000811' : 'SP18',
            'Q000812' : 'SP19',
            'Q000813' : 'SP21',
            'Q000814' : 'SP22',
            'Q000815' : 'SP23',
            'Q000816' : 'SP24',
            'Q000817' : 'SP25',
            'Q000818' : 'SP26',
            'Q000819' : 'SP27',
            'Q000820' : 'SP28',
            'Q000821' : 'SP_Nippur',
            'Q000822' : 'SP_Susa',
            'Q000823' : 'SP_Ur',
            'Q000824' : 'SP_Uruk',
            'Q000825' : 'SP_Unknown'}           
            