## 6.1 Data Acquisition for Aleksi's pmi-embedding

In [None]:
import zipfile
import json
import tqdm
import os
import sys
import pickle
import re
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *

In [None]:
# Working directories used by this notebook: downloaded ZIPs are read from
# jsonzip/ and results are written to output/ further down.
# NOTE(review): make_dirs is presumably provided by the star-import from utils
# and presumably creates the directories that do not exist yet — confirm.
directories = ['jsonzip', 'output', 'corpus']
make_dirs(directories)

In [None]:
# Project(s) to process, as a single string; it is handed to
# format_project_list() in the next cell.
projects = "epsd2/admin/ur3"

In [None]:
# `p` is the collection of project names that the parsing loop iterates over.
# NOTE(review): oracc_download presumably stores one ZIP per project under
# jsonzip/ (that is where the parsing loop looks for them) — confirm in utils.
p = format_project_list(projects)
oracc_download(p);  # the trailing semicolon keeps Jupyter from echoing the return value

The `parsejson()` function below works in a way that is similar to the `parsejson()` functions we discussed in Chapter 2. Each .json file to be parsed represents a single text. The list `l` collects lemmatizations in the format CF\[GW\]POS (for instance lugal\[king\]N). When the function has gone through the entire file, the loop that calls it adds the "#" symbol to the end of the list (to indicate the end of a document) and extends the list `lemm_l` with the list `l`. This creates one long list of lemmas, with individual documents separated by the "#" symbol.

In addition to text breaks, we also want to mark logical and physical breaks within a single document (so that text windows do not jump over such breaks). Logical and physical breaks in the text are marked in the JSON with a `state` node. This node has a restricted vocabulary to indicate breaks, traces, illegible lines, horizontal rulings, etc. The vocabulary that marks a logical or physical break is collected in the list `breakage`. When such a node is encountered, the symbol "#" is added to the list `l`. Then the process resumes. 

The rest of the function takes care of special situations:
* Unlemmatized words - words that are damaged or unknown should not be included. They are replaced by an underscore ("_"). Such words do not contribute to our analysis, but should also not be removed because we do not want to create artificial neighbors.
* Damaged personal names are like unlemmatized words and are replaced by an underscore. Such names are lemmatized as PN with a citation form in the format Lu₂.x.
* Numbers are of no interest here and are entirely removed.
* Year names are removed. Year names are important for dating, for political history and for understanding the ideology of the period. They do not contribute meaningful collocates to the transactions in the documents studied here.
* Words that are not in Sumerian are removed. Note that loans from Akkadian are considered to be Sumerian words and are therefore retained. Occasionally Ur III texts may include Akkadian prepositions or fully conjugated Akkadian verbs. Such words are removed.

The process also skips lemmas that derive from the "sign" and "pronunciation" columns of lexical lists. That is not relevant in the current context, but may become relevant if you wish to use this code on a wider set of texts.

In [None]:
# Shared state for parsejson() and the parsing loop:
#   lemm_l - lemmas of all texts, in order
#   l      - lemmas of the text currently being parsed
#   ids_   - IDs of the texts that have been processed
lemm_l, l, ids_ = [], [], []

# Values of a `state` node that mark a logical or physical break in a text.
breakage = [
    'illegible',
    'traces',
    'missing',
    'effaced',
    'other',
    'blank',
    'ruling',
]

In [None]:
def parsejson(text):
    """Append the lemmas found under ``text["cdl"]`` to the global list ``l``.

    The function walks the nodes in ``text["cdl"]`` and calls itself on every
    node that has its own ``"cdl"`` list. For each remaining node:

    * a ``state`` listed in the global ``breakage``, or a ``subtype`` that
      starts with ``"seal "`` or ``"envel"``, adds the break marker ``"#"``
      (never at the start of ``l`` and never twice in a row);
    * the subtypes ``"sg"`` and ``"pr"`` (sign and pronunciation fields of
      lexical texts) and year names (``ftype == "yn"``) are skipped;
    * a word (node with an ``"f"`` dictionary) is kept only if its language
      code starts with ``"sux"`` and its part of speech is not ``"n"``
      (number). A word without a language code is skipped as well.

    A kept word is stored as ``CF[GW]POS`` with spaces turned into hyphens
    and commas removed; a missing ``pos`` is provisionally treated as ``N``.
    Words without a citation form, or whose citation form contains "x"
    (as in damaged names of the type Lu₂.x), are stored as the placeholder
    ``"_"`` so that their neighbours do not become artificially adjacent.

    Args:
        text: dictionary with a ``"cdl"`` key holding a list of nodes.

    Returns:
        The global list ``l`` (the same object on every call).

    Raises:
        KeyError: if ``text`` has no ``"cdl"`` key, or if a word has a
            citation form but no guide word (``"gw"``).
    """
    for node in text["cdl"]:
        if "cdl" in node:
            parsejson(node)
            continue

        is_break = (
            node.get("state", "") in breakage
            # seal 1, seal 2, etc. or envelope
            or node.get("subtype", "")[:5] in ("seal ", "envel")
        )
        if is_break:
            if l and l[-1] != "#":
                l.append("#")
            continue

        if node.get("subtype", "") in ("sg", "pr"):
            continue  # "sign" and "pronunciation" fields in lexical texts
        if node.get("ftype", "") == "yn":
            continue  # year names
        if "f" not in node:
            continue  # not a word node

        word = node["f"]
        # Only Sumerian and Emesal; a word without a language tag cannot be
        # shown to be Sumerian and is skipped instead of raising KeyError.
        if word.get("lang", "")[:3] != "sux":
            continue
        if word.get("pos", "") == "n":  # omit numbers
            continue

        if "cf" not in word or "x" in word["cf"].lower():
            lemm = "_"  # unlemmatized or partly damaged: placeholder
        else:
            # Some words appear without pos; provisionally treated as Noun.
            lemm = f"{word['cf']}[{word['gw']}]{word.get('pos', 'N')}"
            # Spaces become hyphens and commas are dropped.
            lemm = lemm.replace(' ', '-').replace(',', '')
        l.append(lemm)
    return l

In [None]:
# Parse every text of every downloaded project and collect its lemmas in
# lemm_l, with "#" closing each document.

# ids_ stores project + id_no, so a bare id_no can never be found in it.
# Duplicates are therefore tracked in a separate set, keyed on the last eight
# characters of each stored ID (the same slice that id_no is taken from).
seen_ids = {text_id[-8:] for text_id in ids_}

for project in p:
    file = "jsonzip/" + project.replace("/", "-") + ".zip"
    try:
        z = zipfile.ZipFile(file)       # create a Zipfile object
    except (OSError, zipfile.BadZipFile):
        print(file + " does not exist or is not a proper ZIP file")
        continue
    with z:                             # close the archive when this project is done
        # keep only the per-text JSON files in the corpusjson directory
        files = [name for name in z.namelist()
                 if "corpusjson" in name and name.endswith('.json')]
        for filename in tqdm(files, desc=project):   # iterate over the file names
            id_no = filename[-13:-5]    # the eight characters before ".json"
            # A P/Q number may appear in multiple projects; parse it only once.
            # IDs containing "X" are never treated as duplicates.
            if id_no in seen_ids and "X" not in id_no:
                continue
            seen_ids.add(id_no)
            id_text = project + id_no   # id_text is, for instance, blms/P414332
            ids_.append(id_text)
            try:
                text = z.read(filename).decode('utf-8')   # read and decode the json file of one text
                data_json = json.loads(text)              # make it into a json object (essentially a dictionary)
                l = parsejson(data_json)
                if len(l) > 1:
                    if l[-1] != "#":
                        l.append("#")   # end-of-document marker
                    lemm_l.extend(l)
            except Exception:
                # Best effort: report the text and carry on with the next one.
                print(id_text + ' is not available or not complete')
            finally:
                # Always start the next text with an empty list, also after an
                # error or when the text yielded at most one entry.
                l.clear()

The code above produces the list `lemm_l`, which holds the lemmatizations of all the texts in their original order, one lemma per list entry. In addition, the list `ids_` holds all the text IDs. These IDs are not used further, but are collected to prevent duplication, which may be an issue if you derive data from more than one project.

Save in "Word per Line" format for use in pmi-embedding

In [None]:
# One lemma (or "#" / "_" marker) per line, no newline after the last entry.
wpl_content = "\n".join(lemm_l)
with open("output/lemmas.wpl", "w", encoding="utf8") as w:
    w.write(wpl_content)

In [None]:
import os
import sys
# Put the pmi-embeddings source directory on the import path so that
# make_embeddings can be imported in the next cell. The relative path is
# resolved against the current working directory.
# NOTE(review): assumes a pmi-embeddings checkout two directory levels up — confirm.
emb_dir = os.path.abspath('../../pmi-embeddings/src')
sys.path.append(emb_dir)

In [None]:
import make_embeddings as embs

In [None]:
# Input for the embedding step: the word-per-line file written earlier in
# this notebook.
file_name = "output/lemmas.wpl"

In [None]:
# Settings handed to embs.Cooc(): chunk_size as a positional argument,
# everything in `parameters` as keyword arguments.
chunk_size = 400_000
parameters = dict(
    window_size=7,
    min_count=1,
    subsampling_rate=None,
    k_factor=3,
    dynamic_window=True,
    window_scaling=False,
    verbose=True,
)

In [None]:
# Create the Cooc object for the lemma file with the settings defined above,
# then run its co-occurrence counting step.
# NOTE(review): what count_cooc() stores on the object is defined in
# make_embeddings — see that module for details.
embeddings = embs.Cooc(file_name, chunk_size, **parameters)
embeddings.count_cooc()

In [None]:
# Keyword arguments for embeddings.calculate_pmi().
pmi_parameters = dict(
    shift_type=0,
    alpha=None,
    lambda_=None,
    threshold=5,
)

In [None]:
# Compute the PMI scores from the counted co-occurrences, using the settings
# in pmi_parameters.
embeddings.calculate_pmi(**pmi_parameters)

In [None]:
# Factorize the PMI data into vectors.
# NOTE(review): `dimensions` is presumably the length of each resulting
# vector; the effect of eigenvalue_weighting is defined in make_embeddings — confirm.
dimensions = 300
eigenvalue_weighting = 0.0
embeddings.factorize(dimensions, eigenvalue_weighting)

In [None]:
# Write the resulting vectors to the output directory.
vector_file = 'output/sahala.vec'
embeddings.save_vectors(vector_file)