In [199]:
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import math
import tqdm

colors = ['magenta', 'springgreen', 'orange', 'cyan', 'black', 'grey', 'blue', 'yellow', 'orange']

In [7]:
df = pd.read_csv('../../datasets/sdm_lov_df.csv')
df

Unnamed: 0,property,num_results,prefix_name,score,type,uri,vocab_prefix,reused_by
0,id,2948,mv:id,0.555556,property,http://schema.mobivoc.org/id,mv,0
1,id,2948,biopax:id,0.534883,property,http://www.biopax.org/release/biopax-level3.ow...,biopax,1
2,id,2948,sioc:id,0.520599,property,http://rdfs.org/sioc/ns#id,sioc,1
3,id,2948,dcterms:identifier,0.504435,property,http://purl.org/dc/terms/identifier,dcterms,58
4,id,2948,npg:id,0.501353,property,http://ns.nature.com/terms/id,npg,0
...,...,...,...,...,...,...,...,...
47294,country,518,roh:patentCountry,0.204413,property,http://w3id.org/roh#patentCountry,roh,0
47295,country,518,pext:CountryCapital,0.204413,class,http://www.ontotext.com/proton/protonext#Count...,pext,0
47296,country,518,dicom:CountryOfResidence,0.204343,property,http://purl.org/healthcarevocab/v1#CountryOfRe...,dicom,0
47297,country,518,uri4uri:TopLevelDomain-CountryCode,0.204343,class,http://uri4uri.net/vocab#TopLevelDomain-Countr...,uri4uri,0


In [49]:
len(df[df['property'] == 'id'])

149

In [56]:
len(df.vocab_prefix.unique())

763

In [45]:
G_subset_df = df.loc[df['property'] == 'id'][['property', 'prefix_name', 'score', 'num_results']]
G_subset_df.head(2)

Unnamed: 0,property,prefix_name,score,num_results
0,id,mv:id,0.555556,2948
1,id,biopax:id,0.534883,2948


In [57]:
df.groupby(['property','vocab_prefix']).size().reset_index().rename(columns={0:'count'}).sort_values('count', ascending=False)

Unnamed: 0,property,vocab_prefix,count
10008,p6,rdau,826
9978,p1,rdaw,297
9983,p10,rdaw,297
10005,p5,rdaa,270
9997,p3,rdam,259
...,...,...,...
7197,ki,m3lite,1
7198,ki,maso,1
7199,ki,mexalgo,1
7200,ki,nno,1


In [286]:
G = nx.Graph()
G.clear()

# Add unique properties nodes
for p in df.property.unique():
    if p not in G:
        l = len(df[df['property'] == p])
        G.add_node(p, num_results = l, type='sdm')
# Add unique prefixes nodes
for p in df.vocab_prefix.unique():
    if p not in G:
        l = len(df[df['vocab_prefix'] == p])
        G.add_node(p, num_results = l, type='lov')
# Add edges
for index, row in df.iterrows():
    if (row["property"], row["vocab_prefix"]) not in G.edges():
        G.add_edge(row["property"], row["vocab_prefix"], score = row['score'] / 10, weight = 0)
    else:
        G[row["property"]][row["vocab_prefix"]]['weight'] += 1


In [289]:
df = pd.DataFrame(G.edges(data=True))
df['weight'] = df[2].apply(lambda x: x['weight'])
df.sort_values('weight', ascending=False)

Unnamed: 0,0,1,2,weight
13489,p6,rdau,"{'score': 0.14923278, 'weight': 825}",825
13478,p10,rdaw,"{'score': 0.16626205, 'weight': 296}",296
12135,p1,rdaw,"{'score': 0.032721972, 'weight': 296}",296
13487,p5,rdaa,"{'score': 0.049976212, 'weight': 269}",269
12147,p3,rdam,"{'score': 0.044326496, 'weight': 258}",258
...,...,...,...,...
7135,email,akt,"{'score': 0.067942053, 'weight': 0}",0
7138,email,og,"{'score': 0.049284542, 'weight': 0}",0
7139,email,swrc,"{'score': 0.049284542, 'weight': 0}",0
7140,email,mads,"{'score': 0.049284542, 'weight': 0}",0


In [145]:
nx.write_graphml(G, '../../graphs/sdm_lov_graph_v2.graphml')

# Graph 2

In [254]:
from pysmartdatamodels import pysmartdatamodels as sdm

sdm_datamodels = sdm.load_all_datamodels()
sdm_attributes = sdm.load_all_attributes()

In [255]:
sdm_datamodels_df = pd.DataFrame(sdm_datamodels)
sdm_attributes_df = pd.DataFrame(sdm_attributes)

In [274]:
domain_mapper = {}
for p in tqdm.tqdm(df.property.unique()):
    domains = []
    for r in sdm_attributes_df.loc[(sdm_attributes_df['property'] == p)]['repoName']:
        domains.append(sdm_datamodels_df[sdm_datamodels_df['repoName'] == r]['domains'].values[0])
    
    domain_mapper[p] = list(set([item for sublist in domains for item in sublist]))

100%|██████████| 1615/1615 [00:02<00:00, 651.49it/s] 


In [275]:
from collections import Counter
from itertools import chain

unique_domains = dict(Counter([item for sublist in domain_mapper.values() for item in sublist]))
unique_domains

{'SmartLogistics': 34,
 'CrossSector': 320,
 'SmartDestination': 94,
 'SmartWater': 121,
 'SmartRobotics': 42,
 'SmartAgrifood': 87,
 'SmartEnergy': 531,
 'SmartHealth': 121,
 'SmartManufacturing': 40,
 'SmartAeronautics': 47,
 'SmartEnvironment': 139,
 'SmartCities': 511,
 'Smart-Sensoring': 200}

In [283]:
G = nx.Graph()
G.clear()

# Add unique properties nodes
for k, v in unique_domains.items():
    if k not in G:
        G.add_node(k, num_results = v, type='domain')
# Add unique prefixes nodes
for p in df.vocab_prefix.unique():
    if p not in G:
        l = len(df[df['vocab_prefix'] == p])
        G.add_node(p, num_results = l, type='lov')
# Add edges
for property, domains in domain_mapper.items():
    vocabs = df[df['property'] == property]['vocab_prefix'].values
    for d in domains:
        for v in vocabs:
            if (d, v) not in G.edges():
                G.add_edge(d, v, weight = 0)
            else:
                G[d][v]['weight'] += 1

In [285]:
nx.write_graphml(G, '../../graphs/domain_lov.graphml')