In [3]:
import pandas as pd
import rdflib
from notebooks.resources.input_data import AUTHORITY_TABLES_DIR_PATH, PREFIX_DEFINITION_FILE_PATH

In [4]:
# Keep only the regular files of the authority-tables directory that carry the ".rdf" extension.
input_data_files = [
    candidate
    for candidate in AUTHORITY_TABLES_DIR_PATH.iterdir()
    if candidate.suffix == ".rdf" and candidate.is_file()
]

In [5]:
# Merge every input file (parsed as RDF/XML) into one graph.
g = rdflib.Graph()
for rdf_source in map(str, input_data_files):
    g.parse(rdf_source, format="xml")

In [6]:
def extract_namespace_from_uri(uri):
    """
    Return the namespace part of a URI: everything up to and including its last "/" or "#".

    Values that do not start with "http", or that contain neither separator, yield the
    placeholder string "unknown_namespace".
    """
    if not uri.startswith("http"):
        return "unknown_namespace"
    last_separator = max(uri.rfind("/"), uri.rfind("#"))
    if last_separator < 0:
        return "unknown_namespace"
    # Slice through the separator itself so the namespace keeps its trailing "/" or "#".
    return str(uri[:last_separator + 1])

In [7]:
def filter_blank_nodes_and_literals(triples):
    """
    Lazily yield the (s, p, o) triples whose object is neither a literal nor a blank node
    and whose subject is not a blank node.
    """
    for subject, predicate, obj in triples:
        if isinstance(obj, (rdflib.term.Literal, rdflib.term.BNode)):
            continue
        if isinstance(subject, rdflib.term.BNode):
            continue
        yield subject, predicate, obj

In [8]:
def filter_literals(triples):
    """Lazily yield the (s, p, o) triples whose object is not a literal."""
    for subject, predicate, obj in triples:
        if isinstance(obj, rdflib.term.Literal):
            continue
        yield subject, predicate, obj

In [9]:
def inject_namespace_to_blank_nodes(input_graph: rdflib.Graph)->rdflib.Graph:
    """
    Build a new graph in which blank nodes in object position are replaced by URIs.

    For every triple ``(s, p, o)`` whose object is a blank node, a URI is minted by
    concatenating the namespace of ``s`` (as returned by ``extract_namespace_from_uri``)
    with the blank node identifier. The triple is re-emitted pointing at that URI, and
    every triple that has the blank node as subject is re-emitted with the URI as subject.

    The blank-node-subject originals are dropped in a final pass, after the whole input
    has been copied. Removing them while the output is still being filled only has an
    effect if they were copied earlier in the iteration, so they could otherwise survive
    in the result depending on iteration order.

    Objects of the re-emitted blank-node statements are copied unchanged.

    :param input_graph: graph to read from; this function only reads it.
    :return: a new graph containing the rewritten triples.
    """
    output_graph = rdflib.Graph()
    replaced_blank_nodes = set()
    for s, p, o in input_graph:
        if isinstance(o, rdflib.term.BNode):
            new_object = rdflib.term.URIRef(f"{extract_namespace_from_uri(s)}{o}")
            # Re-home the blank node's own statements under the minted URI.
            for _, po, oo in input_graph.triples((o, None, None)):
                output_graph.add((new_object, po, oo))
            output_graph.add((s, p, new_object))
            replaced_blank_nodes.add(o)
        else:
            output_graph.add((s, p, o))
    # Only now is it certain that every blank-node-subject triple has been copied, so the
    # removal cannot be undone by a later add.
    for blank_node in replaced_blank_nodes:
        output_graph.remove((blank_node, None, None))
    return output_graph

In [10]:
# Replace blank-node objects with URIs built from their subject's namespace.
# NOTE: this rebinds `g`, so re-running the cell feeds the already rewritten graph back in.
g = inject_namespace_to_blank_nodes(g)

In [11]:
# Predicate local names that are skipped when the namespace statistics are collected.
remove_predicates = {
    "hasTelephone",
    "hasAddress",
    "homepage",
    "hasEmail",
}

In [12]:
# For every URI-to-URI triple record (subject namespace, predicate namespace, predicate
# local name, object namespace) and collect the set of all namespaces encountered.
results = []
unique_namespaces = set()
for s, p, o in filter_blank_nodes_and_literals(g):
    # The local name is the text after the last "#" when the predicate contains one,
    # otherwise the text after the last "/".
    separator = "#" if "#" in p else "/"
    predicate = p.split(separator)[-1]
    if predicate in remove_predicates:
        continue
    namespace_s, namespace_p, namespace_o = (extract_namespace_from_uri(term) for term in (s, p, o))
    unique_namespaces.update((namespace_s, namespace_p, namespace_o))
    results.append((namespace_s, namespace_p, predicate, namespace_o))

In [13]:
# Number of distinct namespaces found across subjects, predicates and objects.
len(unique_namespaces)

40

In [14]:
# A set has no defined order, so sort it first: the CSV then has the same row order on every run.
pd.DataFrame(sorted(unique_namespaces), columns=["namespace"]).to_csv("unique_namespaces.csv", index=False)

In [15]:
# Count how often each (subject namespace, predicate namespace, predicate, object namespace)
# tuple occurs in `results`.
tmp_df = pd.Series(results).value_counts().reset_index()
tmp_df.columns = ["results", "count"]
# Unpack the 4-tuples into one column per component.
tmp_df["subject_base"] = tmp_df["results"].apply(lambda x: x[0])
tmp_df["predicate_base"] = tmp_df["results"].apply(lambda x: x[1])
tmp_df["predicate"] = tmp_df["results"].apply(lambda x: x[2])
tmp_df["object_base"] = tmp_df["results"].apply(lambda x: x[3])
# .copy() makes `df` an independent frame. Later cells add columns to it, and doing that on
# a plain column selection of `tmp_df` triggers pandas' SettingWithCopyWarning.
df = tmp_df[["subject_base", "predicate_base","predicate", "object_base", "count"]].copy()

In [50]:
# Load the table of known namespace prefixes; the lookup helpers below read its
# "base URI", "prefix" and "type" columns.
prefixes_df = pd.read_csv(PREFIX_DEFINITION_FILE_PATH)

In [51]:
# Display the loaded prefix table.
prefixes_df

Unnamed: 0,base URI,prefix,label,type
0,http://publications.europa.eu/ontology/authority/,at-ont,Authority Table (legacy),ontology
1,http://publications.europa.eu/ontology/euvoc#,euvoc,EU Vocabulary Ontology,ontology
2,http://publications.europa.eu/resource/authority/,at-base,Base AT namespace,at
3,http://publications.europa.eu/resource/authori...,cob-clas,Corporate Body Classification,at
4,http://publications.europa.eu/resource/authori...,cob,Corporate Body,at
5,http://publications.europa.eu/resource/authori...,country,Country,at
6,http://publications.europa.eu/resource/authori...,currency,Currency,at
7,http://publications.europa.eu/resource/authori...,language,Language,at
8,http://publications.europa.eu/resource/authori...,membership-clas,Membership Classification,at
9,http://publications.europa.eu/resource/authori...,notation-type,Notation Type,at


In [29]:
def get_prefix_by_namespace(namespace: str, prefixes_df: pd.DataFrame, new_prefixes: dict)->str:
    """
    Look up the prefix for a namespace, generating one if the namespace is not known.

    :param namespace: namespace URI to resolve.
    :param prefixes_df: table of known prefixes with "base URI" and "prefix" columns.
    :param new_prefixes: mapping of namespace -> generated prefix; it is updated in place
        when an unknown namespace is seen for the first time ("ns1", "ns2", ...).
    :return: the prefix from ``prefixes_df`` when the namespace is listed there, otherwise
        the generated prefix stored in ``new_prefixes``.
    """
    known_prefixes = prefixes_df.loc[prefixes_df["base URI"] == namespace, "prefix"]
    if not known_prefixes.empty:
        return known_prefixes.values[0]
    if namespace not in new_prefixes:
        new_prefixes[namespace] = f"ns{len(new_prefixes)+1}"
    return new_prefixes[namespace]

def get_prefix_type_by_namespace(namespace: str, prefixes_df: pd.DataFrame)->str:
    """
    Look up the "type" recorded for a namespace in the prefix table.

    :param namespace: namespace URI to resolve.
    :param prefixes_df: table of known prefixes with "base URI" and "type" columns.
    :return: the "type" value of the first matching row, or "unknown" when the namespace
        is not listed.
    """
    matching_types = prefixes_df.loc[prefixes_df["base URI"] == namespace, "type"]
    if matching_types.empty:
        return "unknown"
    return matching_types.values[0]


In [52]:
# Namespace -> generated prefix, filled in by get_prefix_by_namespace for unknown namespaces.
new_prefixes = {}

In [53]:
# Resolve subject, predicate and object namespaces to prefixes, in that order; the order
# decides which generated "nsN" prefix an unknown namespace receives.
for prefix_column, base_column in (
    ("ns_s", "subject_base"),
    ("ns_p", "predicate_base"),
    ("ns_o", "object_base"),
):
    df[prefix_column] = df[base_column].apply(get_prefix_by_namespace, args=(prefixes_df, new_prefixes))
# Prefixed predicate name, e.g. "<prefix>:<local name>".
df["p"] = df["ns_p"] + ":" + df["predicate"]

In [54]:
# Attach the prefix-table "type" of the subject, predicate and object namespaces
# ("unknown" when a namespace is not listed).
for type_column, base_column in (
    ("asset_type_s", "subject_base"),
    ("asset_type_p", "predicate_base"),
    ("asset_type_o", "object_base"),
):
    df[type_column] = df[base_column].apply(get_prefix_type_by_namespace, args=(prefixes_df,))

In [55]:
# One row per generated prefix. Building the frame from (namespace, prefix) pairs with
# explicit column names also works when `new_prefixes` is empty; the from_dict/reset_index
# route produces a single column in that case, so naming two columns raises a ValueError.
new_prefixes_df = pd.DataFrame(list(new_prefixes.items()), columns=["base URI", "prefix"])

In [56]:
# Display the namespaces that received a generated prefix.
new_prefixes_df

Unnamed: 0,base URI,prefix
0,http://publications.europa.eu/resource/authori...,ns1
1,http://publications.europa.eu/resource/authori...,ns2
2,http://publications.europa.eu/resource/authori...,ns3
3,http://publications.europa.eu/resource/authori...,ns4
4,http://publications.europa.eu/resource/authori...,ns5
5,http://publications.europa.eu/resource/authori...,ns6
6,http://publications.europa.eu/resource/authori...,ns7
7,unknown_namespace,ns8
8,http://publications.europa.eu/resource/authori...,ns9
9,http://eurovoc.europa.eu/,ns10


In [57]:
# Write the generated prefixes to "new_prefixes.csv" without the index column.
new_prefixes_df.to_csv("new_prefixes.csv", index=False)

In [33]:
# Preview the first 50 rows of the enriched statistics table.
df.head(50)

Unnamed: 0,subject_base,predicate_base,predicate,object_base,count,ns_s,ns_p,ns_o,p,asset_type_s,asset_type_p,asset_type_o
0,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/ontology/euvoc#,status,http://publications.europa.eu/resource/authori...,65733,cob,euvoc,ns3,euvoc:status,at,ontology,unknown
1,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://www.w3.org/2008/05/skos-xl#,59401,cob,rdf,ns1,rdf:type,at,language,unknown
2,http://publications.europa.eu/resource/authori...,http://purl.org/dc/terms/,type,http://publications.europa.eu/resource/authori...,59401,cob,dct,ns4,dct:type,at,ontology,unknown
3,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/ontology/euvoc#,status,http://publications.europa.eu/resource/authori...,53388,place,euvoc,ns3,euvoc:status,at,ontology,unknown
4,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/ontology/euvoc#,status,http://publications.europa.eu/resource/authori...,39018,country,euvoc,ns3,euvoc:status,at,ontology,unknown
5,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://www.w3.org/2008/05/skos-xl#,38410,place,rdf,ns1,rdf:type,at,language,unknown
6,http://publications.europa.eu/resource/authori...,http://purl.org/dc/terms/,type,http://publications.europa.eu/resource/authori...,38409,place,dct,ns4,dct:type,at,ontology,unknown
7,http://publications.europa.eu/resource/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#,type,http://www.w3.org/2008/05/skos-xl#,37421,country,rdf,ns1,rdf:type,at,language,unknown
8,http://publications.europa.eu/resource/authori...,http://www.w3.org/2008/05/skos-xl#,prefLabel,http://publications.europa.eu/resource/authori...,34859,cob,ns1,cob,ns1:prefLabel,at,unknown,at
9,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/ontology/euvoc#,status,http://publications.europa.eu/resource/authori...,34453,language,euvoc,ns3,euvoc:status,at,ontology,unknown


In [58]:
# Keep only the prefix-level columns: prefixes, prefixed predicate, asset types and count.
ns_df = df[[
    "ns_s",
    "ns_p",
    "p",
    "ns_o",
    "asset_type_s",
    "asset_type_p",
    "asset_type_o",
    "count",
]]

In [59]:
# Write the prefix-level statistics to "ns_s_p_o.csv" without the index column.
ns_df.to_csv("ns_s_p_o.csv", index=False)

In [60]:
# Two edge lists sharing the same schema (source, target, count):
# subject prefix -> predicate prefix, and subject prefix -> object prefix.
ns_sp = ns_df[["ns_s", "ns_p", "count"]].rename(columns={"ns_s": "source", "ns_p": "target"})
ns_so = ns_df[["ns_s", "ns_o", "count"]].rename(columns={"ns_s": "source", "ns_o": "target"})

In [61]:
# Stack both edge lists and add up the counts of identical (source, target) pairs.
ns_sp_so = (
    pd.concat([ns_sp, ns_so])
    .groupby(by=["source", "target"])
    .sum(numeric_only=True)
    .reset_index()
)

In [62]:
# Write the aggregated edge list to "ns_sp_so.csv" without the index column.
ns_sp_so.to_csv("ns_sp_so.csv", index=False)

In [63]:
import networkx as nx

In [64]:
# Work on a copy so the weight column added next does not end up in ns_sp_so.
net_df = ns_sp_so.copy()

In [65]:
# Edge weight = count relative to the mean count. The mean is computed once and the
# division is vectorised, instead of recomputing the mean inside a per-row lambda.
net_df["weight"] = net_df["count"] / net_df["count"].mean()

In [66]:
# Display the weighted edge list.
net_df

Unnamed: 0,source,target,count,weight
0,at-base,at-base,4,0.000335
1,at-base,cob,1,0.000084
2,at-base,cob-clas,2,0.000168
3,at-base,country,1,0.000084
4,at-base,currency,2,0.000168
...,...,...,...,...
168,site,rdf,3455,0.289421
169,site,site,2262,0.189485
170,site,skos,1212,0.101528
171,site,skoxl,1584,0.132690


In [67]:
# Build a directed graph from the edge list, keeping "weight" as the edge attribute.
graph = nx.from_pandas_edgelist(net_df, source="source", target="target", edge_attr="weight", create_using=nx.DiGraph)

In [68]:
# Render the prefix graph with pyvis and write it to "result_graph.html".
from pyvis.network import Network
net = Network('1000px', '100%', directed =True, notebook=True, cdn_resources="remote", neighborhood_highlight=True)
# Load nodes and edges from the networkx graph.
net.from_nx(graph)
# Turn the physics simulation off.
net.toggle_physics(False)
# NOTE(review): confirm what write_html returns; if it returns nothing, `frame` is None
# and the notebook will not display the graph inline from this cell.
frame = net.write_html("result_graph.html", notebook=True)