In [1]:
import pandas as pd
import zipfile
import json
from itertools import combinations
import os
import sys
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *

In [2]:
# Project/volume paths whose zipped JSON exports are read below; each one
# is turned into a zip file name and an in-archive glossary path.
volumes = [
    "saao/saa01",
    "saao/saa05",
    "saao/saa15",
]

In [3]:
# Load the proper-noun glossary (gloss-qpn.json) of every volume into json_l,
# in the same order as `volumes`.
json_l = []
for v in volumes:
    # Archive name: the volume path with '/' replaced by '-', under jsonzip/.
    file = f"jsonzip/{v.replace('/', '-')}.zip"
    # Path of the glossary inside the archive.
    filename = f"{v}/gloss-qpn.json"
    # Open the archive in a context manager so the file handle is closed
    # after each volume instead of leaking until garbage collection.
    with zipfile.ZipFile(file) as z:
        qpn = z.read(filename).decode('utf-8')  # read and decode the qpn glossary json file
    data_json = json.loads(qpn)
    json_l.append(data_json)

In [4]:
# Work with the glossary of the first entry in `volumes` only; the other
# glossaries loaded into json_l are not used by the cells visible here.
t = json_l[0]

In [5]:
# Turn the glossary's "entries" list into a DataFrame
# (presumably one row per glossary entry - confirm against the JSON schema).
df_pn = pd.DataFrame(t["entries"])

In [6]:
# Reduce the glossary to the columns needed for the network and keep only
# entries whose part of speech is 'PN' or 'RN'.
keep = ['headword', 'xis', 'pos']
df_pn = df_pn[keep]
# `pos` is only needed for the filter, so it is dropped afterwards.
# drop(columns=...) replaces the positional axis argument (`.drop('pos', 1)`),
# which is deprecated and rejected by pandas >= 2.0.
df_pn = df_pn.loc[df_pn.pos.isin(['PN', 'RN'])].drop(columns='pos')
df_pn

Unnamed: 0,headword,xis
1,Abattu[1]PN,qpn.r000001
3,Abu-lešir[1]PN,qpn.r000002
5,Adad-abuʾa[1]PN,qpn.r000004
6,Adad-ibni[1]PN,qpn.r000005
7,Adad-iriba[1]PN,qpn.r000008
...,...,...
421,Zabbua[1]PN,qpn.r000243
422,Zabina-ili[1]PN,qpn.r000244
425,Zari[1]PN,qpn.r000247
426,Zeru-ibni[1]PN,qpn.r000248


In [7]:
# Reduce every instance reference to its text id and store the ids of each
# entry as a set, so that shared texts can be found by set intersection.
# A reference is cut down to the part after the first ':' and before the
# first '.' (presumably 'project:TEXTID.line.word' - confirm against the data).
# Values that contain no ':' are treated as already normalised, which makes
# this cell safe to re-run: previously a second run raised IndexError on
# `split(':')[1]`.
for i in t["instances"]:
    t["instances"][i] = {
        (i2.split(':')[1] if ':' in i2 else i2).split('.')[0]
        for i2 in t["instances"][i]
    }

In [8]:
# Every unordered pair of `xis` values from the filtered glossary; each pair
# is a candidate edge.
ids = [pair for pair in combinations(df_pn['xis'], 2)]

In [9]:
# Build the weighted edge list: two names are connected when their sets of
# text ids overlap, and the weight is the number of shared texts.
inst = t["instances"]
edges = []
for first, second in ids:
    shared = inst[first].intersection(inst[second])
    n_shared = len(shared)
    if not n_shared:
        # No text in common: no edge between these two names.
        continue
    edges.append([first, second, n_shared, shared])

In [10]:
# Edge list as a DataFrame: 0 = first xis, 1 = second xis, 2 = weight,
# 3 = set of shared text ids.
# The column labels are given explicitly so that the frame keeps the four
# columns even when `edges` is empty; otherwise the merges on column 0 and
# column 1 further down would fail on a column-less frame.
edgelist = pd.DataFrame(edges, columns=[0, 1, 2, 3])
edgelist

Unnamed: 0,0,1,2,3
0,qpn.r000001,qpn.r00001c,1,{P313425}
1,qpn.r000001,qpn.r000022,1,{P313425}
2,qpn.r000001,qpn.r00008b,1,{P313425}
3,qpn.r000001,qpn.r0001c3,1,{P313425}
4,qpn.r000002,qpn.r000009,1,{X900008}
...,...,...,...,...
309,qpn.r000266,qpn.r000271,1,{P334125}
310,qpn.r000207,qpn.r000222,1,{P334820}
311,qpn.r000275,qpn.r000248,1,{P334728}
312,qpn.r000232,qpn.r000233,1,{P224485}


In [11]:
# Replace the first xis id (column 0) by its headword: left-join the glossary
# on xis, then drop both id columns.
test = (
    edgelist
    .merge(df_pn, how='left', left_on=0, right_on='xis')
    .drop(columns=[0, 'xis'])
)

In [12]:
# Same lookup for the second xis id (column 1). Because a `headword` column
# already exists, pandas suffixes the two as headword_x / headword_y.
test = (
    test
    .merge(df_pn, how='left', left_on=1, right_on='xis')
    .drop(columns=[1, 'xis'])
)
test

Unnamed: 0,2,3,headword_x,headword_y
0,1,{P313425},Abattu[1]PN,Amiru[1]PN
1,1,{P313425},Abattu[1]PN,Ammi-leti[1]PN
2,1,{P313425},Abattu[1]PN,Bel-liqbi[1]PN
3,1,{P313425},Abattu[1]PN,Qanni[1]PN
4,1,{X900008},Abu-lešir[1]PN,Adad-isseʾa[1]PN
...,...,...,...,...
309,1,{P334125},Šuzubu[1]PN,Ṭab-šar-Aššur[1]PN
310,1,{P334820},Taklak-ana-Bel[1]PN,Ubru-Nabu[1]PN
311,1,{P334728},Ṭab-ṣil-Ešarra[1]PN,Zeru-ibni[1]PN
312,1,{P224485},Urik[1]PN,Urpalaʾa[1]PN


In [13]:
# Positional rename of the four remaining columns
# (2, 3, headword_x, headword_y) to descriptive edge-list names.
edge_columns = ['weight', 'text_ids', 'source', 'target']
test.columns = edge_columns

In [14]:
# Display the edge list with its final column names
# (weight, text_ids, source, target).
test

Unnamed: 0,weight,text_ids,source,target
0,1,{P313425},Abattu[1]PN,Amiru[1]PN
1,1,{P313425},Abattu[1]PN,Ammi-leti[1]PN
2,1,{P313425},Abattu[1]PN,Bel-liqbi[1]PN
3,1,{P313425},Abattu[1]PN,Qanni[1]PN
4,1,{X900008},Abu-lešir[1]PN,Adad-isseʾa[1]PN
...,...,...,...,...
309,1,{P334125},Šuzubu[1]PN,Ṭab-šar-Aššur[1]PN
310,1,{P334820},Taklak-ana-Bel[1]PN,Ubru-Nabu[1]PN
311,1,{P334728},Ṭab-ṣil-Ešarra[1]PN,Zeru-ibni[1]PN
312,1,{P224485},Urik[1]PN,Urpalaʾa[1]PN
