Create file with all words (lemmas) of a single text listed in a row.

In [1]:
import pandas as pd
import zipfile
import json
import tqdm
import os
import sys
import pickle
import re
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *

In [2]:
directories = ['jsonzip', 'output', 'corpus']
make_dirs(directories)

In [3]:
projects = input('Project(s): ').lower()

Project(s):  epsd2/admin/ur3


In [4]:
p = format_project_list(projects)
oracc_download(p)

Saving http://build-oracc.museum.upenn.edu/json/epsd2-admin-ur3.zip as jsonzip/epsd2-admin-ur3.zip.


HBox(children=(FloatProgress(value=0.0, description='epsd2/admin/ur3', max=577877494.0, style=ProgressStyle(de…

['epsd2/admin/ur3']

In [5]:
def parsejson(text):
    for JSONobject in text["cdl"]:
        field = ''
        if "cdl" in JSONobject: 
            parsejson(JSONobject)
        if "type" in JSONobject and JSONobject["type"] == "field-start":
            field = JSONobject["subtype"]
        if "f" in JSONobject and not field in ['sg', 'pr']: # skip the fields "sign" and "pronunciation"
                                # in lexical texts
            if JSONobject["f"]["lang"][:3] == "sux": #only Sumerian and Emesal
                word = JSONobject["f"]
                if "cf" in word:
                    if 'pos' in word:  #for some reason some words appear without pos. Provisionally treated as Noun
                        lemm = word["cf"] + '[' + word["gw"] + "]" + word["pos"]
                    else:
                        lemm = word["cf"] + '[' + word["gw"] + "]N"
                    lemm = lemm.replace(' ', '-') # remove commas and spaces from lemm
                    lemm = lemm.replace(',', '')
                else:
                    lemm = "_" # if word is unlemmatized enter a place holder
                l.append(lemm)
    return

In [6]:
lemm_ = []
ids_ = []
for project in p:
    file = "jsonzip/" + project.replace("/", "-") + ".zip"
    try:
        z = zipfile.ZipFile(file)       # create a Zipfile object
    except:
        print(file + " does not exist or is not a proper ZIP file")
        continue
    files = z.namelist()     # list of all the files in the ZIP
    files = [name for name in files if "corpusjson" in name and name[-5:] == '.json']                                                                                                  #that holds all the P, Q, and X numbers.
    for filename in tqdm(files, desc = project):                            #iterate over the file names
        l = []
        id_no = filename[-13:-5]
        if id_no in ids_ and not "X" in id_no: # Check if P/Q number is already in there
            continue        # a text may appear in multiple projects
        id_text = project + id_no # id_text is, for instance, blms/P414332
        ids_.append(id_text)
        try:
            text = z.read(filename).decode('utf-8')         #read and decode the json file of one particular text
            data_json = json.loads(text)                # make it into a json object (essentially a dictionary)
            #lemm_.append(f"\n{id_text}")     # new text starts on new line with text_id
            parsejson(data_json)
            lemm_.append(l)
        except:
            print(id_text + ' is not available or not complete')

HBox(children=(FloatProgress(value=0.0, description='epsd2/admin/ur3', max=71712.0, style=ProgressStyle(descri…

epsd2/admin/ur3/P143238 is not available or not complete



The above results in the list of lists lemm_, which holds the individual texts. Each individual text is represented by a list of lemmas, with the lemmas in the original order. Currently, breaks etc. are not represented. Secondly, the list ids_ holds all the text IDs; the list ids_ has the same order as the list of lists lemm_.

The following is directly taken from the blog post "simple word vectors with co-occurrence pmi and svd" by Alex Klibisz.

In [7]:
from __future__ import print_function, division
from collections import Counter
from itertools import combinations
from math import log
from pprint import pformat
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from string import punctuation
from time import time
from nltk import ngrams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
print('Ready')

Ready


Since Klibisz is working with *titles* he compares all possible bigrams and does not define a moving window. That probably needs to change. Aleksi creates windows with the following code:
```python
wz = self.windowsize - 1
zip(*[text[i:] for i in range(1+wz*2)])
```
This will zip multiple versions of the same text - the first starts at word 0, the next at word 1, etc. Since the window is symmetric (counting forward and backward) take it twice .
An alternative method is to use NLTK ngrams, which will create windows.
```python
from nltk import ngrams
n = 1+wz*2
windows = ngrams(text, n)
```

In [8]:
# 2a. Compute unigram and bigram counts.
# A unigram is a single word (x). A bigram is a pair of words (x,y).
# Bigrams are counted for any two terms occurring in the same title.
# For example, the title "Foo bar baz" has unigrams [foo, bar, baz]
# and bigrams [(bar, foo), (bar, baz), (baz, foo)]
t0 = time()
cx = Counter()
cxy = Counter()
for text in lemm_:
    cx.update(text)

    # Count all pairs of words, even duplicate pairs.
    windows = ngrams(text, 7) # 7 is window length - needs to be changeable
    for w in windows: # this creates the windows
        z = [tuple(l) for l in map(sorted, combinations(w, 2))]
        cxy.update(z)

#     # Alternative: count only 2-grams.
#     for x, y in zip(text[:-1], text[1:]):
#         cxy[(x, y)] += 1

#     # Alternative: count all pairs of words, but don't double count.
#     for x, y in set(map(tuple, map(sorted, combinations(text, 2)))):
#         cxy[(x,y)] += 1

print('%.3lf seconds (%.5lf / iter)' %
      (time() - t0, (time() - t0) / len(lemm_)))

29.824 seconds (0.00042 / iter)


In [9]:
# 2b. Remove frequent and infrequent unigrams.
# Pick arbitrary occurrence count thresholds to eliminate unigrams occurring
# very frequently or infrequently. This decreases the vocab size substantially.
print('%d tokens before' % len(cx))
t0 = time()
sx = sum(cx.values())
min_count = 4
max_count = sx * .01
for x in list(cx.keys()):
    if cx[x] < min_count or cx[x] > max_count:
        del cx[x]
print('%.3lf seconds (%.5lf / iter)' % (time() - t0, (time() - t0) / len(cx)))
print('%d tokens after' % len(cx))
print('Most common:', cx.most_common()[:25])

21536 tokens before
0.012 seconds (0.00000 / iter)
7167 tokens after
Most common: [('2[00]PN', 30472), ('kišib[seal]N', 27779), ('ninda[bread]N', 27081), ('i[oil]N', 22750), ('gud[ox]N', 22281), ('ŋuruš[male]N', 21149), ('hulu[bad]V/i', 20949), ('dubsar[scribe]N', 20174), ('us[follow]V/t', 19627), ('šu[hand]N', 17869), ('niga[fattened]V/i', 17528), ('ugula[overseer]N', 16442), ('sila[lamb]N', 16036), ('ŋiri[foot]N', 16020), ('naŋa[potash]N', 15631), ('šum[garlic]N', 15391), ('šag[heart]N', 15094), ('teŋ[near]V/i', 14838), ('mana[unit]N', 14377), ('maš[goat]N', 14110), ('ašag[field]N', 14024), ('iku[unit]N', 13330), ('lal[small]V/i', 11255), ('dab[seize]V/t', 11003), ('du[go]V/i', 10747)]


In [10]:
# 2c. Remove frequent and infrequent bigrams.
# Any bigram containing a unigram that was removed must now be removed.
t0 = time()
for x, y in list(cxy.keys()):
    if x not in cx or y not in cx:
        del cxy[(x, y)]
print('%.3lf seconds (%.5lf / iter)' % (time() - t0, (time() - t0) / len(cxy)))

0.342 seconds (0.00000 / iter)


In [11]:
# 3. Build unigram <-> index lookup.
t0 = time()
x2i, i2x = {}, {}
for i, x in enumerate(cx.keys()):
    x2i[x] = i
    i2x[i] = x
print('%.3lf seconds (%.5lf / iter)' % (time() - t0, (time() - t0) / len(cx)))

0.003 seconds (0.00000 / iter)


In [12]:
# 4. Sum unigram and bigram counts for computing probabilities.
# i.e. p(x) = count(x) / sum(all counts).
t0 = time()
sx = sum(cx.values())
sxy = sum(cxy.values())
print('%.3lf seconds (%.5lf / iter)' %
      (time() - t0, (time() - t0) / (len(cx) + len(cxy))))

0.009 seconds (0.00000 / iter)


In [13]:
# 5. Accumulate data, rows, and cols to build sparse PMI matrix
# Recall from the blog post that the PMI value for a bigram with tokens (x, y) is: 
# PMI(x,y) = log(p(x,y) / p(x) / p(y)) = log(p(x,y) / (p(x) * p(y)))
# The probabilities are computed on the fly using the sums from above.
t0 = time()
pmi_samples = Counter()
data, rows, cols = [], [], []
for (x, y), n in cxy.items():
    rows.append(x2i[x])
    cols.append(x2i[y])
    data.append(log((n / sxy) / (cx[x] / sx) / (cx[y] / sx)))
    pmi_samples[(x, y)] = data[-1]
PMI = csc_matrix((data, (rows, cols)))
print('%.3lf seconds (%.5lf / iter)' % (time() - t0, (time() - t0) / len(cxy)))
print('%d non-zero elements' % PMI.count_nonzero())
print('Sample PMI values\n', pformat(pmi_samples.most_common()[:10]))

0.970 seconds (0.00000 / iter)
621175 non-zero elements
Sample PMI values
 [(('en.lil₂.da.mah.di[00]PN', 'šul.gi.he₂.ti[00]PN'), 12.10400060590684),
 (('gubar[nape]N', 'gumur[spine]N'), 12.043375984090407),
 (('Šu.diš[00]PN', 'Šu.diš[00]PN'), 11.755693911638625),
 (('mesbabbar[tree]N', 'mesbabbar[tree]N'), 11.755693911638625),
 (('E.pe.eš[00]PN', 'Na.am.tu.ra[00]PN'), 11.731596360059564),
 (('en[lord]N', 'šaŋar[oppressed]V/i'), 11.706903747469193),
 (('Geš.gi.tur.tur[00]FN', 'šeš.da.edin.na[00]DN'), 11.668682534648996),
 (('anki[universe]N', 'suh[confuse]V/t'), 11.622162519014102),
 (('nin.gir₂.su.gu₂.gal[00]FN', 'nin.gir₂.su.ka.bi₂.du₁₁[00]PN'),
  11.57337235484467),
 (('E₂.lu₂.lagar[00]PN', 'Nin.ba.tuku.šem₅.du₁₀.gal[00]PN'),
  11.483760196154984)]


In [14]:
# 6. Factorize the PMI matrix using sparse SVD aka "learn the unigram/word vectors".
# This part replaces the stochastic gradient descent used by Word2vec
# and other related neural network formulations. We pick an arbitrary vector size k=20.
t0 = time()
U, _, _ = svds(PMI, k=20)
print('%.3lf seconds' % (time() - t0))

0.187 seconds


In [15]:
# 7. Normalize the vectors to enable computing cosine similarity in next cell.
# If confused see: https://en.wikipedia.org/wiki/Cosine_similarity#Definition
t0 = time()
norms = np.sqrt(np.sum(np.square(U), axis=1, keepdims=True))
U /= np.maximum(norms, 1e-7)
print('%.3lf seconds' % (time() - t0))

0.001 seconds


In [18]:
# 8. Show some nearest neighbor samples as a sanity-check.
# The format is <unigram> <count>: (<neighbor unigram>, <similarity>), ...
# From this we can see that the relationships make sense.
k = 5
for x in ['suhur[carp]N', 'gigir[chariot]N', 'šah[pig]N', 'Inanak[1]DN']:
    dd = np.dot(U, U[x2i[x]]) # Cosine similarity for this unigram against all others.
    s = ''
    # Compile the list of nearest neighbor descriptions.
    # Argpartition is faster than argsort and meets our needs.
    for i in np.argpartition(-1 * dd, k + 1)[:k + 1]:
        if i2x[i] == x: continue
        xy = tuple(sorted((x, i2x[i])))
        s += '(%s, %.3lf) ' % (i2x[i], dd[i])
    print('%s, %d\n %s' % (x, cx[x], s))
    print('-' * 10)

suhur[carp]N, 78
 (mušenturtur[bird]N, 0.917) (saŋkur[fish]N, 0.938) (tun[container]N, 0.935) (saŋkešed[fish]N, 0.922) (nunuz[egg]N, 0.907) 
----------
gigir[chariot]N, 299
 (šudu[equipped]AJ, 0.746) (šegin[glue]N, 0.739) (dur[door-socket]N, 0.731) (bariga[unit]N, 0.692) (kaŋeškarak[table]N, 0.687) 
----------
šah[pig]N, 486
 (zeda[piglet]N, 0.919) (saŋkur[fish]N, 0.916) (pešgi[rodent]N, 0.931) (Ša₃.gul.lum[00]PN, 0.907) (Šu.bar[00]PN, 0.907) 
----------
Inanak[1]DN, 394
 (Gud-sisa[1]MN, 0.827) (NE.NE.ŋar[1]MN, 0.802) (Gangane[1]MN, 0.763) (Kin-Inana[1]MN, 0.750) (Apinduʾa[1]MN, 0.738) 
----------


In [None]:
x2i

In [None]:
"suhur[carp]N" in x2i

testing

In [None]:
c = Counter()
for text in lemm_: 
    c.update(text)

In [None]:
c

In [None]:
c = Counter()
text = lemm_[-2]
z  = [tuple(l) for l in map(sorted, combinations(text, 2))]
c.update(z)

In [None]:
c

In [None]:
list(map(sorted, combinations(text, 2)))

In [None]:
wz = 2
w = zip(*[text[i:] for i in range(1+wz*2)])
list(w)

In [None]:
text = lemm_[-2]
[text[i:] for i in range(3)]

In [None]:
list(range(5))

In [None]:
len([text[i:] for i in range(1+wz*2)])

In [None]:
from nltk import ngrams

sentence = 'this is a foo bar sentences and i want to ngramize it'

n = 1+wz*2
window = ngrams(text, n)

for grams in window:
  print(grams)

In [None]:
window