In [1]:
import pandas as pd
import rdflib
import plotly.express as px
from notebooks.resources.input_data import AUTHORITY_TABLES_DIR_PATH, PREFIX_DEFINITION_FILE_PATH

In [2]:
input_data_files = [file_path for file_path in AUTHORITY_TABLES_DIR_PATH.iterdir() if
                    file_path.is_file() and file_path.suffix == ".rdf"]

In [4]:
# read rdf graph from all files from input_data_files
g = rdflib.Graph()
for file_path in input_data_files:
    g.parse(str(file_path), format="xml")

In [5]:
def extract_namespace_from_uri(uri):
    if uri.startswith("http"):
        counter = 0
        for char in uri[::-1]:
            if char == "/" or char == "#":
                return uri[:-counter] if counter else uri
            counter += 1
    return "unknown_namespace"

In [6]:
def filter_blank_nodes_and_literals(triples):
    for s, p, o in triples:
        if not isinstance(o, rdflib.term.Literal) and not isinstance(o, rdflib.term.BNode) and not isinstance(s,
                                                                                                              rdflib.term.BNode):
            yield s, p, o

In [7]:
unique_namespaces = set()
triples = []
for s, p, o in filter_blank_nodes_and_literals(g):
    namespace_s = extract_namespace_from_uri(s)
    namespace_p = extract_namespace_from_uri(p)
    namespace_o = extract_namespace_from_uri(o)
    triples.append((namespace_s, namespace_p, namespace_o))
    unique_namespaces.add(extract_namespace_from_uri(s))
    unique_namespaces.add(extract_namespace_from_uri(p))
    unique_namespaces.add(extract_namespace_from_uri(o))

In [54]:
tmp_df = pd.DataFrame(pd.Series(triples).value_counts().reset_index())
tmp_df.columns = ["triple", "count"]
tmp_df["subject_base"] = tmp_df["triple"].apply(lambda x: x[0])
tmp_df["predicate_base"] = tmp_df["triple"].apply(lambda x: x[1])
tmp_df["object_base"] = tmp_df["triple"].apply(lambda x: x[2])
df = tmp_df[["subject_base", "predicate_base", "object_base", "count"]]

In [27]:
prefixes_df = pd.read_csv(PREFIX_DEFINITION_FILE_PATH)

In [55]:
df["ns_s"] = df["subject_base"].apply(
    lambda x: prefixes_df[prefixes_df["base URI"] == x]["prefix"].values[0] if x in prefixes_df[
        "base URI"].values else "unknown")
df["ns_p"] = df["predicate_base"].apply(
    lambda x: prefixes_df[prefixes_df["base URI"] == x]["prefix"].values[0] if x in prefixes_df[
        "base URI"].values else "unknown")
df["ns_o"] = df["object_base"].apply(
    lambda x: prefixes_df[prefixes_df["base URI"] == x]["prefix"].values[0] if x in prefixes_df[
        "base URI"].values else "unknown")

In [166]:
ns_df = df[["ns_s", "ns_p", "ns_o", "count"]]

In [152]:
ns_df.to_csv("ns_s_p_o.csv", index=False)

In [153]:
ns_s_o = ns_df.groupby(by=["ns_s", "ns_o"]).sum(numeric_only=True).reset_index()
ns_s_o.to_csv("ns_s_o.csv", index=False)

In [147]:
ns_df = ns_df.groupby(by=["ns_s", "ns_p"]).sum(numeric_only=True).reset_index()
ns_df

Unnamed: 0,ns_s,ns_p,count
0,at-base,rdf,8
1,cob,rdf,1511
2,cob,skos,4013
3,cob-clas,dct,14
4,cob-clas,rdf,60
5,cob-clas,skos,150
6,country,rdf,284
7,country,skos,336
8,currency,dct,468
9,currency,rdf,757


In [None]:
# write in csv ns_s and ns_o with their count

# write in csv ns_s , ns_p . ns_o with their count

In [145]:
at_ns_df = ns_df.groupby(by="ns_s")
for ns_s, ns_p_o in at_ns_df:
    ns_s_to_ns_o = (ns_p_o.groupby(by="ns_o").sum(numeric_only=True)).reset_index()
    ns_s_to_ns_o["ns_s"] = ns_s
    display(ns_s_to_ns_o)
    figure = px.pie(ns_s_to_ns_o, values="count", names="ns_o", title=ns_s,
					width=500,
                    height=500)
    figure.show()

Unnamed: 0,ns_o,count,ns_s
0,skos,8,at-base


Unnamed: 0,ns_o,count,ns_s
0,at-base,1511,cob
1,cob,2502,cob
2,skos,1511,cob


Unnamed: 0,ns_o,count,ns_s
0,at-base,48,cob-clas
1,cob-clas,102,cob-clas
2,euvoc,14,cob-clas
3,notation-type,14,cob-clas
4,skos,46,cob-clas


Unnamed: 0,ns_o,count,ns_s
0,at-base,284,country
1,country,52,country
2,skos,284,country


Unnamed: 0,ns_o,count,ns_s
0,at-base,578,currency
1,currency,468,currency
2,euvoc,468,currency
3,notation-type,468,currency
4,skos,289,currency


Unnamed: 0,ns_o,count,ns_s
0,at-base,8179,language
1,skos,8179,language


Unnamed: 0,ns_o,count,ns_s
0,at-base,16,membership-clas
1,cob,4,membership-clas
2,skos,8,membership-clas


Unnamed: 0,ns_o,count,ns_s
0,at-base,8726,place
1,euvoc,10615,place
2,notation-type,10615,place
3,place,10615,place
4,skos,4363,place


Unnamed: 0,ns_o,count,ns_s
0,at-base,404,site
1,skos,404,site


In [154]:
import networkx as nx

In [177]:
ns_df["weight"] = ns_df["count"].apply(lambda x: x / 2000)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [178]:
graph = nx.from_pandas_edgelist(ns_df, source="ns_s", target="ns_o", edge_attr="weight")

In [179]:
from pyvis.network import Network
net = Network('1000px', '100%', directed =True, notebook=True, cdn_resources="remote", neighborhood_highlight=True)
net.from_nx(graph)
frame = net.write_html("result_graph.html", notebook=True)