In [1]:
import pandas as pd
import rdflib
import plotly.express as px
from notebooks.resources.input_data import AUTHORITY_TABLES_DIR_PATH, PREFIX_DEFINITION_FILE_PATH

In [2]:
# Collect every regular file with a ".rdf" suffix that sits directly inside
# AUTHORITY_TABLES_DIR_PATH (sub-directories are not descended into).
input_data_files = [
    candidate
    for candidate in AUTHORITY_TABLES_DIR_PATH.iterdir()
    if candidate.suffix == ".rdf" and candidate.is_file()
]

In [190]:
# Merge all files listed in input_data_files into a single in-memory graph;
# each file is parsed with rdflib's "xml" (RDF/XML) parser.
g = rdflib.Graph()
for file_path in input_data_files:
    g.parse(source=str(file_path), format="xml")

In [5]:
def extract_namespace_from_uri(uri):
    """
    Return the namespace part of a URI that starts with "http".

    The namespace is everything up to and including the last "/" or "#".
    A value that does not start with "http", or that contains neither
    separator, yields the placeholder string "unknown_namespace".
    """
    if not uri.startswith("http"):
        return "unknown_namespace"

    last_separator = max(uri.rfind("/"), uri.rfind("#"))
    if last_separator < 0:
        return "unknown_namespace"
    if last_separator == len(uri) - 1:
        # The URI already ends with a separator: hand the input back untouched.
        return uri
    return uri[: last_separator + 1]

In [6]:
def filter_blank_nodes_and_literals(triples):
    """
    Lazily yield the triples that involve neither literals nor blank nodes.

    A triple is skipped when its subject is a blank node, or when its object is
    a literal or a blank node. Surviving triples are yielded as (s, p, o) tuples.
    """
    for subject, predicate, obj in triples:
        if isinstance(subject, rdflib.term.BNode):
            continue
        if isinstance(obj, (rdflib.term.Literal, rdflib.term.BNode)):
            continue
        yield subject, predicate, obj

In [None]:
def filter_literals(triples):
    """
    Lazily yield every triple whose object is not a literal.

    Surviving triples are yielded as (s, p, o) tuples; blank nodes are kept.
    """
    for subject, predicate, obj in triples:
        if isinstance(obj, rdflib.term.Literal):
            continue
        yield subject, predicate, obj

In [192]:
def inject_namespace_to_blank_nodes(input_graph: rdflib.Graph)->rdflib.Graph:
    """
    Build a new graph in which blank nodes used as objects are replaced by URIs
    minted in the namespace of the subject that references them.

    For every triple ``(s, p, b)`` whose object ``b`` is a blank node, a URI is
    built by concatenating ``extract_namespace_from_uri(s)`` with the blank node
    identifier. The triple is emitted as ``(s, p, <minted URI>)`` and every triple
    that has ``b`` as its subject is re-emitted with the minted URI as subject.

    Triples that leave a referenced blank node are only emitted in that re-homed
    form. The previous implementation copied them verbatim as well whenever they
    were iterated after the referencing triple, so its result depended on the
    graph's iteration order.

    Blank nodes that are never used as an object are copied unchanged.

    NOTE(review): when the referencing subject is itself a blank node, the minted
    URI is prefixed with the "unknown_namespace" placeholder returned by
    ``extract_namespace_from_uri``, and that triple keeps its blank subject.

    :param input_graph: graph to read from; it is not modified.
    :return: a new graph holding the rewritten triples.
    """
    bnode_type = rdflib.term.BNode

    # Blank nodes that occur as an object somewhere. Their outgoing triples are
    # re-homed under the minted URI below, so they must not be copied as-is.
    referenced_bnodes = {o for _, _, o in input_graph if isinstance(o, bnode_type)}

    output_graph = rdflib.Graph()
    for s, p, o in input_graph:
        if isinstance(o, bnode_type):
            new_object = rdflib.term.URIRef(f"{extract_namespace_from_uri(s)}{o}")
            # Re-home everything the blank node says about itself under the new URI.
            for _, po, oo in input_graph.triples((o, None, None)):
                output_graph.add((new_object, po, oo))
            output_graph.add((s, p, new_object))
        elif s not in referenced_bnodes:
            output_graph.add((s, p, o))
    return output_graph

In [193]:
# Rebind `g` to the graph returned by inject_namespace_to_blank_nodes.
# NOTE(review): the cell overwrites its own input, so re-running it feeds the
# already rewritten graph back into the function.
g = inject_namespace_to_blank_nodes(g)

In [199]:
# One (subject namespace, predicate namespace, predicate local name, object namespace)
# record for every triple that passes filter_blank_nodes_and_literals.
results = []
for s, p, o in filter_blank_nodes_and_literals(g):
    # The local name is whatever follows the last "#", or the last "/" when the
    # predicate contains no "#".
    separator = "#" if "#" in p else "/"
    predicate = p.split(separator)[-1]
    namespace_s = extract_namespace_from_uri(s)
    namespace_p = extract_namespace_from_uri(p)
    namespace_o = extract_namespace_from_uri(o)
    results.append((namespace_s, namespace_p, predicate, namespace_o))

In [209]:
# Count how often each (subject namespace, predicate namespace, predicate,
# object namespace) tuple occurs; reset_index() already returns a DataFrame.
tmp_df = pd.Series(results).value_counts().reset_index()
tmp_df.columns = ["results", "count"]

# Unpack the tuple stored in "results" into one column per component.
tmp_df["subject_base"] = tmp_df["results"].apply(lambda x: x[0])
tmp_df["predicate_base"] = tmp_df["results"].apply(lambda x: x[1])
tmp_df["predicate"] = tmp_df["results"].apply(lambda x: x[2])
tmp_df["object_base"] = tmp_df["results"].apply(lambda x: x[3])

# .copy() makes `df` an independent frame: later cells add columns to it, and
# doing that on a bare column selection of tmp_df raises SettingWithCopyWarning.
df = tmp_df[["subject_base", "predicate_base", "predicate", "object_base", "count"]].copy()

In [201]:
# Load the prefix definitions; later cells read its "base URI", "prefix" and
# "type" columns.
prefixes_df = pd.read_csv(PREFIX_DEFINITION_FILE_PATH)

In [210]:
# Resolve every base URI to its prefix and asset type through dictionaries that
# are built once from prefixes_df, instead of re-filtering prefixes_df per row.
# keep="first" reproduces the first-match-wins behaviour of the row-wise lookup.
prefix_rows_by_base_uri = (
    prefixes_df
    .drop_duplicates(subset="base URI", keep="first")
    .set_index("base URI")
)
prefix_by_base_uri = prefix_rows_by_base_uri["prefix"].to_dict()
asset_type_by_base_uri = prefix_rows_by_base_uri["type"].to_dict()


def lookup_base_uris(base_uris, lookup):
    """Return ``base_uris`` mapped through ``lookup``; missing keys become "unknown"."""
    return base_uris.map(lambda base_uri: lookup.get(base_uri, "unknown"))


# assign() returns a new frame, so nothing is written into a slice of another
# DataFrame and the cell can be re-run safely. Keyword order fixes column order;
# "p" is computed from the freshly added "ns_p" column.
df = df.assign(
    ns_s=lookup_base_uris(df["subject_base"], prefix_by_base_uri),
    ns_p=lookup_base_uris(df["predicate_base"], prefix_by_base_uri),
    ns_o=lookup_base_uris(df["object_base"], prefix_by_base_uri),
    p=lambda frame: frame["ns_p"] + ":" + frame["predicate"],
    asset_type_s=lookup_base_uris(df["subject_base"], asset_type_by_base_uri),
    asset_type_p=lookup_base_uris(df["predicate_base"], asset_type_by_base_uri),
    asset_type_o=lookup_base_uris(df["object_base"], asset_type_by_base_uri),
)

In [211]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,subject_base,predicate_base,predicate,object_base,count,ns_s,ns_p,ns_o,p,asset_type_s,asset_type_p,asset_type_o
0,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://publications.europa.eu/ontology/euvoc#,10615,place,rdf,euvoc,rdf:type,at,language,ontology
1,http://publications.europa.eu/resource/authori...,http://purl.org/dc/terms/,type,http://publications.europa.eu/resource/authori...,10615,place,dct,notation-type,dct:type,at,ontology,at
2,http://publications.europa.eu/resource/authori...,http://www.w3.org/2004/02/skos/core#,notation,http://publications.europa.eu/resource/authori...,10615,place,skos,place,skos:notation,at,ontology,at
3,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://publications.europa.eu/ontology/authority/,9767,language,rdf,at-ont,rdf:type,at,language,ontology
4,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/ontology/authority/,op-mapped-code,http://publications.europa.eu/resource/authori...,9767,language,at-ont,language,at-ont:op-mapped-code,at,ontology,at
5,http://publications.europa.eu/resource/authori...,http://www.w3.org/2004/02/skos/core#,inScheme,http://publications.europa.eu/resource/authority/,8179,language,skos,at-base,skos:inScheme,at,ontology,at
6,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://www.w3.org/2004/02/skos/core#,8179,language,rdf,skos,rdf:type,at,language,ontology
7,http://publications.europa.eu/resource/authori...,http://www.w3.org/2004/02/skos/core#,inScheme,http://publications.europa.eu/resource/authority/,4363,place,skos,at-base,skos:inScheme,at,ontology,at
8,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://www.w3.org/2004/02/skos/core#,4363,place,rdf,skos,rdf:type,at,language,ontology
9,http://publications.europa.eu/resource/authori...,http://www.w3.org/2004/02/skos/core#,topConceptOf,http://publications.europa.eu/resource/authority/,4363,place,skos,at-base,skos:topConceptOf,at,ontology,at


In [212]:
# Keep the prefixed namespace, predicate and asset-type columns plus the count;
# the raw base-URI columns are left out.
ns_df = df[
    [
        "ns_s", "ns_p", "p", "ns_o",
        "asset_type_s", "asset_type_p", "asset_type_o",
        "count",
    ]
]

In [213]:
# Write ns_df to "ns_s_p_o.csv" (relative path) without the index column.
ns_df.to_csv("ns_s_p_o.csv", index=False)

In [229]:
# Two edge lists with the same (source, target, count) schema:
# subject namespace -> predicate namespace, and subject namespace -> object namespace.
ns_sp = ns_df[["ns_s", "ns_p", "count"]].rename(columns={"ns_s": "source", "ns_p": "target"})
ns_so = ns_df[["ns_s", "ns_o", "count"]].rename(columns={"ns_s": "source", "ns_o": "target"})

In [230]:
# Stack both edge lists and add up the counts of identical (source, target) pairs.
ns_sp_so = (
    pd.concat([ns_sp, ns_so])
    .groupby(by=["source", "target"])
    .sum(numeric_only=True)
    .reset_index()
)

In [232]:
# Write the aggregated edge list to "ns_sp_so.csv" (relative path) without the index column.
ns_sp_so.to_csv("ns_sp_so.csv", index=False)

In [None]:
# For each subject namespace: show the summed counts per object namespace as a
# table, then as a pie chart titled with the subject namespace.
at_ns_df = ns_df.groupby(by="ns_s")
for ns_s, ns_p_o in at_ns_df:
    ns_s_to_ns_o = (
        ns_p_o
        .groupby(by="ns_o")
        .sum(numeric_only=True)
        .reset_index()
        .assign(ns_s=ns_s)
    )
    display(ns_s_to_ns_o)
    figure = px.pie(
        ns_s_to_ns_o,
        names="ns_o",
        values="count",
        title=ns_s,
        height=500,
        width=500,
    )
    figure.show()

In [240]:
import networkx as nx

In [234]:
# Work on a copy so that columns added for the network view do not end up in ns_sp_so.
net_df = ns_sp_so.copy()

In [235]:
# Scale the raw triple counts down into edge weights with one vectorised division.
# NOTE(review): the divisor 2000 has no documented rationale here; presumably it
# keeps edge widths readable in the rendered network — confirm before reuse.
EDGE_WEIGHT_DIVISOR = 2000
net_df["weight"] = net_df["count"] / EDGE_WEIGHT_DIVISOR

In [237]:
# Preview only the first rows instead of rendering the whole frame.
net_df.head(10)

Unnamed: 0,source,target,count,weight
0,at-base,rdf,8,0.004
1,at-base,skos,8,0.004
2,cob,at-base,1511,0.7555
3,cob,at-ont,5020,2.51
4,cob,cob,5012,2.506
5,cob,rdf,4021,2.0105
6,cob,skos,5524,2.762
7,cob-clas,at-base,48,0.024
8,cob-clas,at-ont,26,0.013
9,cob-clas,cob-clas,115,0.0575


In [253]:
# Directed graph with one edge per (source, target) row; the "weight" column is
# carried over as an edge attribute.
graph = nx.from_pandas_edgelist(
    net_df,
    source="source",
    target="target",
    edge_attr="weight",
    create_using=nx.DiGraph,
)

In [255]:
from pyvis.network import Network

# Load the networkx graph into a directed pyvis network and write it to
# "result_graph.html" (relative path).
net = Network(
    height="1000px",
    width="100%",
    directed=True,
    notebook=True,
    neighborhood_highlight=True,
    cdn_resources="remote",
)
net.from_nx(graph)
# NOTE(review): `frame` holds whatever write_html returns, which may be nothing;
# if an inline IFrame was intended, `Network.show` may be the call to use —
# confirm against the pyvis documentation.
frame = net.write_html("result_graph.html", notebook=True)