Create a file in which each row lists all the words (lemmas) of a single text.

In [1]:
import pandas as pd
import zipfile
import json
import tqdm
import os
import sys
import pickle
import re
# Put the sibling ../utils directory on the module search path so that the
# project's helper module `utils` can be imported from this notebook.
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
# Wildcard import: presumably supplies make_dirs, format_project_list and
# oracc_download used in the cells below — TODO confirm against utils.py.
# NOTE(review): `import tqdm` above binds the *module*, yet the parsing loop
# calls tqdm(...) as a function; that only works if this wildcard import
# rebinds `tqdm` to the callable — confirm.
from utils import *

In [2]:
# Working directories used by this notebook; the parsing loop reads the
# downloaded archives from 'jsonzip'.
# make_dirs comes from the wildcard `utils` import; presumably it creates any
# directory that does not exist yet — TODO confirm.
directories = ['jsonzip', 'output', 'corpus']
make_dirs(directories)

In [3]:
# Ask for the ORACC project(s) to process, e.g. "epsd2/admin/lagash2".
# The answer is lower-cased before it is handed to format_project_list.
# NOTE(review): input() blocks an unattended "Restart & Run All".
projects = input('Project(s): ').lower()

Project(s):  epsd2/admin/lagash2


In [4]:
# Turn the raw answer into the collection of project names that the parsing
# loop iterates over (helper from `utils`; exact parsing rules not shown here).
p = format_project_list(projects)
# Download the JSON archive of each project; per the cell output each project
# is saved as jsonzip/<project with "/" replaced by "-">.zip.
oracc_download(p)

Saving http://build-oracc.museum.upenn.edu/json/epsd2-admin-lagash2.zip as jsonzip/epsd2-admin-lagash2.zip.


HBox(children=(FloatProgress(value=0.0, description='epsd2/admin/lagash2', max=3423458.0, style=ProgressStyle(…

['epsd2/admin/lagash2']

In [26]:
def parsejson(text, lemmas=None):
    """Collect the Sumerian lemmas of one ORACC JSON text in document order.

    The "cdl" tree of `text` is walked depth-first. Every word node (a node
    with an "f" key) whose language code starts with "sux" (Sumerian and
    Emesal) contributes one entry:

    * lemmatized words become ``cf[gw]pos`` (e.g. ``tug[textile]N``); spaces
      are turned into hyphens and commas are removed;
    * words without a part of speech are provisionally treated as nouns;
    * unlemmatized words become the placeholder ``"_"``.

    Words inside the fields "sg" (sign) and "pr" (pronunciation) of lexical
    texts are skipped. The current field is remembered from a "field-start"
    node until the matching "field-end" node, also across nested "cdl" lists.

    Parameters
    ----------
    text : dict
        A node with a "cdl" list, normally the whole decoded JSON document.
    lemmas : list, optional
        List to append the lemmas to. When omitted, the notebook-level list
        `l` is used, so existing ``parsejson(data_json)`` calls keep working.

    Returns
    -------
    list
        The list the lemmas were appended to.

    Raises
    ------
    KeyError
        If a node lacks an expected key ("cdl", "lang", or "gw" on a
        lemmatized word).
    """
    if lemmas is None:
        lemmas = l  # notebook-level accumulator, reset by the caller per text
    _collect_lemmas(text, lemmas, {"field": ""})
    return lemmas


def _collect_lemmas(node, lemmas, state):
    """Append the lemmas below `node` to `lemmas`; `state` carries the current field."""
    for JSONobject in node["cdl"]:
        if "cdl" in JSONobject:
            _collect_lemmas(JSONobject, lemmas, state)
        node_type = JSONobject.get("type")
        if node_type == "field-start":
            # The field applies to the word nodes that FOLLOW this node, so it
            # must survive into later iterations (and nested lists).
            state["field"] = JSONobject.get("subtype", "")
        elif node_type == "field-end":
            state["field"] = ""
        if "f" not in JSONobject or state["field"] in ("sg", "pr"):
            continue  # not a word, or a sign/pronunciation field of a lexical text
        word = JSONobject["f"]
        if word["lang"][:3] != "sux":  # only Sumerian and Emesal
            continue
        if "cf" in word:
            # Some words appear without pos; provisionally treated as Noun.
            lemm = word["cf"] + "[" + word["gw"] + "]" + word.get("pos", "N")
            # Normalize: spaces become hyphens, commas are dropped.
            lemm = lemm.replace(" ", "-").replace(",", "")
        else:
            lemm = "_"  # placeholder for an unlemmatized word
        lemmas.append(lemm)

In [27]:
# Walk every downloaded project archive and build two parallel lists:
# ids_[i] is the text ID (e.g. "blms/P414332") and lemm_[i] is the list of
# lemmas of that text. Both lists are only extended once a text has been
# parsed successfully, so they always have the same length.
lemm_ = []
ids_ = []
seen_id_nos = set()     # ID numbers (without project prefix) collected so far
for project in p:
    file = "jsonzip/" + project.replace("/", "-") + ".zip"
    try:
        z = zipfile.ZipFile(file)       # create a Zipfile object
    except (OSError, zipfile.BadZipFile):
        print(file + " does not exist or is not a proper ZIP file")
        continue
    with z:     # make sure the archive is closed when the project is done
        # keep only the JSON files of individual texts
        files = [name for name in z.namelist()
                 if "corpusjson" in name and name.endswith(".json")]
        for filename in tqdm(files, desc = project):    # iterate over the file names
            # the 8 characters before ".json"; for ".../P414332.json" this is "/P414332"
            id_no = filename[-13:-5]
            # a text may appear in multiple projects: keep the first P/Q copy;
            # X numbers are never treated as duplicates
            if id_no in seen_id_nos and "X" not in id_no:
                continue
            id_text = project + id_no   # id_text is, for instance, blms/P414332
            l = []      # parsejson appends the lemmas of this text to l
            try:
                text = z.read(filename).decode('utf-8')     # read and decode the json file of one text
                data_json = json.loads(text)                # make it into a json object (essentially a dictionary)
                parsejson(data_json)
            except Exception as err:
                # best effort: report the text and go on with the next one
                print(id_text + ' is not available or not complete (' + repr(err) + ')')
                continue
            ids_.append(id_text)
            lemm_.append(l)
            seen_id_nos.add(id_no)

HBox(children=(FloatProgress(value=0.0, description='epsd2/admin/lagash2', max=551.0, style=ProgressStyle(desc…




In [28]:
# One row per text: "textid" holds the text ID and "lemm" the list of lemmas of
# that text. ids_ and lemm_ must have the same length, otherwise the
# constructor raises ValueError.
df = pd.DataFrame({"textid":ids_, "lemm" : lemm_})

In [30]:
# Preview the first two texts only; displaying the whole corpus floods the
# notebook with output.
lemm_[:2]

[['_',
  'tug[textile]N',
  'bardul[garment]N',
  'guz[tufted]V/i',
  'sag[rare]V/i',
  '_',
  'tug[textile]N',
  'mudum[garment]N',
  'sag[rare]V/i',
  '_',
  '_',
  'tug[textile]N',
  'mudum[garment]N',
  'us[follow]V/t',
  '_',
  'tug[textile]N',
  'guʾe[textile]N',
  'us[follow]V/t',
  '_',
  'tug[textile]N',
  'gula[garment]N',
  'us[follow]V/t',
  '_',
  'tug[textile]N',
  'bardul[garment]N',
  'PA[00]PN',
  '_',
  'tug[textile]N',
  'niŋlam[garment]N',
  '_',
  'tug[textile]N',
  'bur[bowl]N',
  '_',
  '_',
  '_',
  '_',
  '_',
  'tug[textile]N',
  'munus[woman]N',
  '_',
  '_',
  'tug[textile]N',
  'aktum[garment]N',
  '_',
  '_',
  'lal[small]V/i',
  '_',
  'tug[textile]N',
  'aktum[garment]N',
  'guz[tufted]V/i',
  'elam[highland]N',
  'egal[palace]N',
  'Lugal.inim.du₁₀[00]PN',
  'šu[hand]N',
  'teŋ[near]V/i'],
 ['_',
  'tug[textile]N',
  'bardul[garment]N',
  '_',
  'tug[textile]N',
  'niŋlam[garment]N',
  'us[follow]V/t',
  'Ur.sukkal[00]PN',
  '_',
  'tug[textile]N',
  'b