In [None]:
import pandas as pd
import hvplot
import hvplot.pandas
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import hvplot.networkx as hvnx
import zipfile
import json

Read the catalogs of dcclt and dcclt/signlists.

In [None]:
# Zipped ORACC JSON archives whose catalogues describe the lexical corpus.
files = ["jsonzip/dcclt.zip", "jsonzip/dcclt-signlists.zip"]
catalog_frames = []
for file in files:
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(file) as z:
        namelist = z.namelist()
        # Take the first archive member whose path mentions catalogue.json.
        cat = [name for name in namelist if "catalogue.json" in name][0]
        st = z.read(cat).decode("utf-8")
    j = json.loads(st)
    # Transpose so that each entry of "members" becomes one row
    # (presumably keyed by text ID -- TODO confirm against the JSON).
    df = pd.DataFrame(j["members"]).T
    # Where id_text is missing, fall back to id_composite.
    df["id_text"] = df["id_text"].fillna(df["id_composite"])
    df = df[["id_text", "period", "designation"]]
    catalog_frames.append(df)
# DataFrame.append was removed in pandas 2.0; concatenating once at the end
# gives the same rows and index without re-copying the frame on every pass.
cat_lex_df = pd.concat(catalog_frames)

From the catalog, keep only the composite texts (those with Q numbers) dating to ED IIIa, ED IIIb or the Old Babylonian period.

In [None]:
# Step 1: keep catalog rows whose text ID contains a "Q".
periods = ["Early Dynastic IIIa", "Early Dynastic IIIb", "Old Babylonian"]
is_composite = cat_lex_df["id_text"].str.contains("Q")
cat_lex_df = cat_lex_df[is_composite]
# Step 2: of those, keep only the rows dated to one of the listed periods.
in_period = cat_lex_df["period"].isin(periods)
cat_lex_df = cat_lex_df[in_period]

Read catalog of the literary texts.

In [None]:
# Zipped JSON archive of the literary corpus.
file = "jsonzip/epsd2-literary.zip"
# The context manager closes the archive as soon as the catalogue is read.
with zipfile.ZipFile(file) as z:
    st = z.read("epsd2/literary/catalogue.json").decode("utf-8")
j = json.loads(st)
# Transpose so that each entry of "members" becomes one row.
cat_lit_df = pd.DataFrame(j["members"]).T
# Where id_text is missing, fall back to id_composite.
cat_lit_df["id_text"] = cat_lit_df["id_text"].fillna(cat_lit_df["id_composite"])
cat_lit_df = cat_lit_df[["id_text", "period", "designation"]]

Read pickled version of lexical lists and OB literary texts.

In [None]:
# Load the two pickled word tables (lexical first, literary second).
# NOTE(review): pickle files should only be loaded from a trusted source.
lex, lit = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("output/lex_words.p", "output/lit_words.p")
)

Remove unlemmatized words

In [None]:
# Drop every row whose lemma ends in "[na]na".
lex = lex[~lex["lemma"].str.endswith("[na]na")]
lit = lit[~lit["lemma"].str.endswith("[na]na")]

From the lexical texts, keep the entries that belong to composite texts from the ED IIIa, ED IIIb, or Old Babylonian periods, by using the catalog.

In [None]:
# Reduce each text ID to the part after its last "/".
lex["id_text"] = lex["id_text"].map(lambda textid: textid.split('/')[-1])
# The index labels of the filtered catalog are the IDs to keep.
keep = cat_lex_df.index.values
in_catalog = lex["id_text"].isin(keep)
lexQ = lex[in_catalog]

From the literary texts, keep only composites (those with Q numbers). This removes, for instance, all the exemplars from Ur in UET 6. This may be a bit too rough: there are some longer texts that exist only in a single exemplar and have no Q number - e.g. P357170 (Ludwig/Metcalf in ZA 107), and several texts in CUSAS 37. For now, those texts are added by hand.

In [None]:
# Reduce each text ID to the part after its last "/".
lit["id_text"] = [textid.split("/")[-1] for textid in lit["id_text"]]
# Composite texts: IDs containing a "Q".
litQ = lit.loc[lit['id_text'].str.contains("Q")]
# Texts without a Q number that are added by hand.
added = ["P357170", "P254171", "P252333", "P251713", "P251427", "P252296", "P254175", "X010001"]
lit_add = lit.loc[lit['id_text'].isin(added)]
# Fix: the original appended the list `added` (bare ID strings), so the rows in
# lit_add never reached litQ. Concatenate the selected rows instead; pd.concat
# also replaces DataFrame.append, which was removed in pandas 2.0.
litQ = pd.concat([litQ, lit_add])

Group lemmas by composition; create one long string of lemmas for each composition.

In [None]:
# One row per text: all of its lemmas joined into one space-separated string.
lexQ2 = lexQ.groupby("id_text").agg({"lemma": " ".join}).reset_index()
litQ2 = litQ.groupby("id_text").agg({"lemma": " ".join}).reset_index()
# Text length is the number of distinct lemmas in that string.
lexQ2["text_length"] = [len(set(words)) for words in lexQ2["lemma"].str.split()]
litQ2["text_length"] = [len(set(words)) for words in litQ2["lemma"].str.split()]

Create a dictionary that maps text IDs to text names (designation).

In [None]:
# Map text ID -> designation; lexical entries override literary ones on a clash.
lex_dict = dict(zip(cat_lex_df["id_text"], cat_lex_df["designation"]))
comp_dict = {
    **dict(zip(cat_lit_df["id_text"], cat_lit_df["designation"])),
    **lex_dict,
}

In [None]:
# The text IDs of the two grouped tables serve as the two node sets.
nodes_lexical, nodes_literary = lexQ2["id_text"], litQ2["id_text"]

In [None]:
# Create the graph and tag each node with its side:
# bipartite=0 for lexical texts, bipartite=1 for literary texts.
B = nx.Graph()
for side, members in enumerate((nodes_lexical, nodes_literary)):
    B.add_nodes_from(members, bipartite=side)

In [None]:
# Build each literary lemma set once, instead of once per lexical text.
lit_lemma_sets = [
    (id_lit, set(lemmas.split()))
    for id_lit, lemmas in zip(litQ2["id_text"], litQ2["lemma"])
]

# One weighted edge per (lexical, literary) pair sharing at least one lemma;
# the weight is the number of distinct shared lemmas.
edges = []
for id_lex, lex_lemmas in zip(lexQ2["id_text"], lexQ2["lemma"]):
    lexwords = set(lex_lemmas.split())
    for id_lit, litwords in lit_lemma_sets:
        weight = len(lexwords & litwords)
        if weight > 0:
            edges.append((id_lex, id_lit, weight))

In [None]:
# Add the (lexical, literary, weight) triples as edges; the third element is
# stored in the "weight" edge attribute.
B.add_weighted_edges_from(edges, weight="weight")

Remove edges with low weights. Remove nodes that no longer connect to anything.

In [None]:
def slice_network(G, T):
    """
    Return a copy of G without the edges whose weight is below T.

    G itself is left untouched. Edges with weight equal to T are kept.
    Nodes are never removed, so the result may contain isolated nodes.
    """
    pruned = G.copy()
    light_edges = [
        (u, v) for u, v, weight in G.edges(data="weight") if weight < T
    ]
    pruned.remove_edges_from(light_edges)
    return pruned

In [None]:
# Keep only edges with weight of at least 50, then drop nodes left without edges.
C = slice_network(B, 50)
isolated = list(nx.isolates(C))
C.remove_nodes_from(isolated)
# Restrict both node lists to the nodes that survived the pruning.
nodes_lexical = [text_id for text_id in nodes_lexical if C.has_node(text_id)]
nodes_literary = [text_id for text_id in nodes_literary if C.has_node(text_id)]

compute total weights for all nodes

In [None]:
# Pair each edge weight in the sliced graph C with the edge's first endpoint
# (presumably the lexical text, since edges were added as
# (lexical, literary) -- TODO confirm the order networkx reports).
w = [(s, w) for s, t, w in C.edges(data="weight")]
# Sum the weights per text. The string "sum" replaces the builtin `sum`,
# whose use in aggregate() is deprecated since pandas 2.1.
weights_total = (
    pd.DataFrame(w, columns=['id_text', 'weight'])
    .groupby(by='id_text')
    .aggregate({"weight": "sum"})
    .reset_index()
)
weights_total_d = dict(zip(weights_total['id_text'], weights_total["weight"]))

In [None]:
# Pair each edge weight with the edge's second endpoint (presumably the
# literary text -- TODO confirm the order networkx reports).
# NOTE(review): this reads the unsliced graph B, while the totals keyed by the
# first endpoint are computed from the sliced graph C. If both sides should
# count only the edges kept after slicing, this should iterate C.edges.
w2 = [(t, w) for s, t, w in B.edges(data="weight")]
# Sum the weights per text. The string "sum" replaces the builtin `sum`,
# whose use in aggregate() is deprecated since pandas 2.1.
weights_total2 = (
    pd.DataFrame(w2, columns=['id_text', 'weight'])
    .groupby(by='id_text')
    .aggregate({'weight': "sum"})
    .reset_index()
)
weights_total_d2 = dict(zip(weights_total2['id_text'], weights_total2["weight"]))
# Merge into the combined dictionary; on a key clash these totals win.
weights_total_d.update(weights_total_d2)

Relative weight. Since all computations are done with unique lemmas, text length is also expressed as the number of unique lemmas.

In [None]:
# Text ID -> number of unique lemmas; literary entries override lexical ones on a clash.
text_length_lit = dict(zip(litQ2["id_text"], litQ2["text_length"]))
text_length_d = {
    **dict(zip(lexQ2["id_text"], lexQ2["text_length"])),
    **text_length_lit,
}
# Relative weight: total weight divided by the text's unique-lemma count.
relative_weight_d = {}
for textid, weight in weights_total_d.items():
    relative_weight_d[textid] = weight / text_length_d.get(textid)

Add total weights, relative weights, and text name as node attributes

In [None]:
# Attach total weight, title and relative weight to every node of C.
# Missing values default to 0 (weights) or the string "None" (title).
for node, attrs in C.nodes(data=True):
    attrs.update({
        'total_weight': weights_total_d.get(node, 0),
        'title': comp_dict.get(node, "None"),
        'rel_weight': relative_weight_d.get(node, 0),
    })

order nodes by relative weight

In [None]:
def _rel_weight(text_id):
    """Return the rel_weight attribute stored on node text_id of C."""
    return C.nodes[text_id]['rel_weight']


# Ascending order: the lexical text with the lowest relative weight comes first.
nodes_lexical = sorted(nodes_lexical, key=_rel_weight)

compute positions

In [None]:
# Two-column bipartite layout; the lexical nodes are passed as one of the two node sets.
pos = nx.bipartite_layout(C, nodes_lexical)

In [None]:
# Plot options gathered in one place. The options with_labels, labels=label_dict
# and edge_width='weight' were disabled in this call and are left out here.
# 'weight' and 'rel_weight' presumably refer to the edge and node attributes of
# the same names.
draw_options = dict(
    height=1800,
    width=1000,
    edge_alpha=0.7,
    edge_color='weight',
    node_color='rel_weight',
    node_cmap=plt.cm.plasma,
    edge_cmap=plt.cm.Blues,
    label="lexical and literary bipartite graph",
)
hvnx.draw(C, pos, **draw_options)