In [1019]:
import json, re, sys, os, time
from operator import itemgetter
import numpy as np
import pandas as pd
import networkx as nx
from copy import deepcopy
from utils import HTTPUtils, MatrixIO, FileUtils

# --------------------
#### Establish variables, functions, and prefixes
# --------------------

In [1067]:
final_folder = "final/raw_data_folder/"

In [1068]:
ad = pd.read_csv(final_folder + "util_files/LDProjects.tsv", sep="\t").set_index("index")
ad = ad.to_dict(orient="index")
for k in ad: ad[k] = ad[k]['0']
print ("Linked Data Projects", len(ad))

('Linked Data Projects', 19)


In [1069]:
ndbc = pd.read_csv(final_folder + "util_files/NBDC_prefix.tsv", sep="\t").set_index("index")
ndbc = ndbc.to_dict(orient="index")
print ("NBDC Graphs", len(ndbc))

('NBDC Graphs', 25)


In [1070]:
disAllPrefs = pd.read_csv(final_folder + "util_files/ld_onto_prefixes.tsv", sep="\t").set_index("index")
disAllPrefs = disAllPrefs.to_dict(orient="index")
for k in disAllPrefs: disAllPrefs[k] = disAllPrefs[k]['0']
print("Linked data ontology prefixes", len(disAllPrefs))

('Linked data ontology prefixes', 13)


In [1071]:
ontoLabels = pd.read_csv(final_folder + "util_files/ontology_labels.tsv", sep="\t").set_index("index")
ontoLabels = ontoLabels.to_dict(orient="index")
print("Ontology Labels", len(ontoLabels))

('Ontology Labels', 55)


In [1072]:
old_dset_mappings = pd.read_csv(final_folder + "util_files/old_dataset_mappings.tsv", sep="\t").set_index("index")
old_dset_mappings = old_dset_mappings.to_dict(orient="index")
for k in old_dset_mappings: old_dset_mappings[k] = old_dset_mappings[k]['0']
old_dset_mappings[''] = ''
print ("Old Dataset Mappings", len(old_dset_mappings))

('Old Dataset Mappings', 75)


In [1073]:
first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
ap = mfio.load_matrix(final_folder + "network_data/bio2rdf_classdsets.dat")

def parse_camel_case(name):
    s1 = first_cap_re.sub(r'\1_\2', name)
    return all_cap_re.sub(r'\1_\2', s1).lower()

def parse_uri_token(token):
    prop = parse_camel_case(token)
    pparts = re.split("[-_]", prop)
    name = " ".join([x.title() for x in pparts])
    return name

def parseURI(uri):
    nparts = re.split("[#:/]", uri)
    name = parse_uri_token(nparts[len(nparts)-1])
    return name

def model_proc(k):
    if "resource" in k:
        a = parseURI(k).split()
        if len(a) > 1: return "ModelOrganism-" + a[0]
    return "ModelOrganism"

def nbdc_proc(node):
    nparts = re.split("[#:/]", node)
    prefix = node[:len(node)-len(nparts[len(nparts)-1])]
    if prefix in ndbc: return ndbc[prefix]["id"]
    else: return "NBDC"

def bio2rdf_proc(k):
    dsets = []
    if k in ap:
        for m in ap[k]:
            if "bio2rdf" in m:
                a = m.split(".")
                dset = "Bio2Rdf-" + a[len(a)-2].title()
                dsets.append(dset)
            elif "disgenet" in m:
                dset = "DisGenet"
                dsets.append(dset)
        return ":-:".join(dsets)
    else: return "Bio2Rdf"

def pref_dsets_cst(dset_l):
    ndsets = []
    dsets = dset_l.split(":-:")
    for m in dsets:
        if m == 'Pdb': ndsets.append("NBDC")
        elif m == 'BioSamples': ndsets.append('EBI-BioSamples')
        elif m == 'tcga': ndsets.append("LinkedTCGA")
        else: ndsets.append(m)
    return ":-:".join(ndsets)

def det_dset(ami):
    (a, m, i) = ami
    dsets = []
    inst_count = []
    for xi in range(len(m)): 
        x = m[xi]
        ic = i[xi]
        nist = False
        if "linkedlifedata" in x.lower(): 
            dsets.append(x.split("_")[2].split(".")[0].title())
        elif "pubchem" in x.lower(): 
            dsets.append("Pubchem")
        elif "modelorganism" in x.lower(): 
            dsets.append(model_proc(a))
        elif "bio2rdf-" in x.lower(): 
            dsets.append(x.split("_")[2].split(".")[0].title())
        elif "bio2rdf" in x.lower(): 
            b2r_dsets = bio2rdf_proc(a)
            dsets.append(b2r_dsets)
            nist = True
            for k in range(len(b2r_dsets.split(":-:"))):
                inst_count.append(ic)
        elif "nbdc" in x.lower(): 
            dsets.append(nbdc_proc(a))
        else: 
            dsets.append(x.split("_")[2].split(".")[0])
        if not nist:
            inst_count.append(ic)
    dsets = ":-:".join([str(k) for k in dsets])
    return dsets, inst_count

def get_sparql_endpoint(dset):
    if dset in ad: 
        return ad[dset]
    elif dset.split("-")[0] in ad: 
        return ad[dset.split("-")[0]]
    else: 
        return ""

In [1074]:
mfio = MatrixIO()
fu = FileUtils()

# --------------------
#### Read in Class Files and CODP-Combined-Network
# --------------------

In [1076]:
combG = nx.read_gpickle(final_folder + "network_data/combG_c123.gpickle")

In [1077]:
class_list_folder = final_folder + "class_lists/"
class_set_files = fu.get_reqd_fileset(class_list_folder, lambda x: False if "classlist" in x.lower() else True)
class_sets = {}

In [1078]:
ccount = 0
for k in class_set_files:
    a = mfio.load_matrix(class_list_folder + k)
    for m in a: 
        if not m in class_sets: class_sets[m] = {"files": [], "instance_count": []}
        class_sets[m]["files"].append(k)
        class_sets[m]["instance_count"].append(a[m])
        ccount += 1

In [1079]:
print ("Total Class Count", ccount)

('Total Class Count', 66768)


In [1080]:
print ("Unique Class Count", len(class_sets))

('Unique Class Count', 57315)


In [1081]:
for k in class_sets:
    a, b = det_dset((k, class_sets[k]["files"], class_sets[k]["instance_count"]))
    class_sets[k]["dsets"] = a
    class_sets[k]["instance_count"] = b

In [1082]:
for k in class_sets:
    class_sets[k]["total_instance_count"] = np.sum(class_sets[k]["instance_count"])

# --------------------
#### Create Class Labels
# --------------------

In [1083]:
uc = []
undsets = {}
for k in combG.nodes():
    if not combG.node[k]["type"] == "class": continue
    if not k in class_sets:
        uc.append((k, combG.node[k]))
        if "dsets" in combG.node[k]: 
            spcdsets = combG.node[k]["dsets"].split(":-:")
            for m in spcdsets:
                if not m in undsets: undsets[m] = []
                undsets[m].append(k)

In [1084]:
for k in undsets:
    print (k, len(undsets[k]))

(u'Pdb', 6)
('linkeddrugs', 168)
(u'BioSamples', 244)
(u'LinkedSPL', 1)
('tcga', 14)


In [1085]:
for k in undsets:
    for m in undsets[k]:
        class_sets[m] = {"files": [], 
                         "dsets": pref_dsets_cst(combG.node[m]["dsets"]), 
                         "instance_count": [combG.node[m]["count"]] if "count" in combG.node[m] else [0], 
                         "total_instance_count": combG.node[m]["count"] if "count" in combG.node[m] else 0}

In [1086]:
found_lab = 0
found_sec_lab = 0
for k in class_sets:
    if combG.has_node(k):
        if "refL1" in combG.node[k]: 
            class_sets[k]["refL1"] = combG.node[k]["refL1"]
            found_lab += 1
        if "sec_label" in combG.node[k]:
            class_sets[k]["sec_label"] = combG.node[k]["sec_label"]
            found_sec_lab += 1
    if not "refL1" in class_sets[k]: class_sets[k]["refL1"] = parseURI(k)
    if not "sec_label" in class_sets[k]: class_sets[k]["sec_label"] = ""

In [1087]:
print ("Found parsed labels", found_lab)
print ("Found external labels", found_sec_lab)

('Found parsed labels', 9588)
('Found external labels', 1146)


# --------------------
#### Read in prefix lists and external ontology descriptions (pre-generated)
# --------------------

In [1089]:
ref_preflist = mfio.load_matrix(final_folder + "network_data/ref-preflist.dat")
ref_obolist = mfio.load_matrix(final_folder + "network_data/ref-oboprefset.dat")

In [1090]:
sort_obolist = sorted({k: len(ref_obolist[k]) for k in ref_obolist}.items(), key=itemgetter(1), reverse=True)
for k in sort_obolist: disAllPrefs[k[0]] = parseURI(k[0]).upper()

In [1091]:
for k in disAllPrefs: print k, disAllPrefs[k]

http://purl.obolibrary.org/obo/INO INO
http://www.hipaaspace.com/Medical_Billing/Coding/Logical.Observation.Identifiers.Names.and.Codes/ LOINC
http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# NCIT
http://purl.bioontology.org/ontology/NDFRT/ NDFRT
http://purl.obolibrary.org/obo/UO UO
http://purl.obolibrary.org/obo/ECO ECO
http://purl.obolibrary.org/obo/ERO ERO
http://www.ebi.ac.uk/efo/ EFO
http://purl.obolibrary.org/obo/comment COMMENT
http://purl.obolibrary.org/obo/CLO CLO
http://purl.obolibrary.org/obo/default-relationship-id-prefix DEFAULT RELATIONSHIP ID PREFIX
http://purl.obolibrary.org/obo/has-role HAS ROLE
http://www.orpha.net/ORDO/ ORDO
http://purl.obolibrary.org/obo/idspace IDSPACE
http://purl.obolibrary.org/obo/BFO BFO
http://purl.obolibrary.org/obo/DRON DRON
http://purl.obolibrary.org/obo/TEMP TEMP
http://bmrbpub.protein.osaka-u.ac.jp/schema/mmcif_nmr-star.owl# BMRB-OWL
http://purl.obolibrary.org/obo/DIDEO DIDEO
http://purl.obolibrary.org/obo/CMO CMO
http://purl.bioontolog

In [1092]:
onto_nodes = {}
for k in disAllPrefs:
    if k in ref_obolist: ic_n = ref_obolist[k]
    else: ic_n = ref_preflist[k]
    for m in ic_n:
        if m not in class_sets: continue
        onto_nodes[m] = ontoLabels[k]['name']
print ("Ontology related nodes (only classes)", len(onto_nodes))

('Ontology related nodes (only classes)', 54639)


In [1093]:
for k in list(onto_nodes.keys())[0:10]: print (k, onto_nodes[k])

('http://rdf.wwpdb.org/schema/pdbx-v40.owl#symmetryCategory', 'Protein Data Bank Ontology')
('http://purl.bioontology.org/ontology/SNOMEDCT/108677001', 'Systematized Nomenclature of Medicine - Clinical Terms')
('http://purl.obolibrary.org/obo/CHEBI_8194', 'Chemical Entities of Biological Interest Ontology')
('http://purl.obolibrary.org/obo/CHEBI_8196', 'Chemical Entities of Biological Interest Ontology')
('http://purl.obolibrary.org/obo/CHEBI_8191', 'Chemical Entities of Biological Interest Ontology')
('http://purl.obolibrary.org/obo/CHEBI_116591', 'Chemical Entities of Biological Interest Ontology')
('http://purl.obolibrary.org/obo/CHEBI_8199', 'Chemical Entities of Biological Interest Ontology')
('http://purl.obolibrary.org/obo/CHEBI_8198', 'Chemical Entities of Biological Interest Ontology')
('http://purl.obolibrary.org/obo/CHEBI_116599', 'Chemical Entities of Biological Interest Ontology')
('http://purl.bioontology.org/ontology/NDFRT/N0000007050', 'National Drug File - Reference Te

In [1094]:
for k in class_sets:
    if k in onto_nodes: class_sets[k]["FromOntology"] = onto_nodes[k]
    else: class_sets[k]["FromOntology"] = ""

# --------------------
#### Read in prefix lists and external vocabulary descriptions (pre-generated)
# --------------------

In [1095]:
vocab_elem = mfio.load_matrix(final_folder + "network_data/vocab_elem.dat")

In [1096]:
vocab_index = {}
for k in vocab_elem:
    for m in vocab_elem[k]:
        if m not in vocab_index: vocab_index[m] = set([])
        vocab_index[m].add(k)
print ("Terms from vocabularies", len(vocab_index))

('Terms from vocabularies', 9735)


In [1098]:
sc_nomen = mfio.load_matrix(final_folder + "network_data/sc_nomen.dat")
print ("Vocabularies from LOV", len(sc_nomen))

('Vocabularies from LOV', 501)


In [1099]:
vocab_nodes = {}
for k in vocab_index:
    st = list(vocab_index[k])[0]
    vocabiden = sc_nomen[st]["vocab_name"].title()
    if not k in onto_nodes:
        if k in class_sets: vocab_nodes[k] = vocabiden
print ("Vocabulary related nodes (only classes)", len(vocab_nodes))

('Vocabulary related nodes (only classes)', 228)


In [1100]:
for k in class_sets:
    if k in vocab_nodes: class_sets[k]["FromVocabulary"] = vocab_nodes[k]
    else: class_sets[k]["FromVocabulary"] = ""

# --------------------
#### Prepare Class-List TSV files
# --------------------

In [1101]:
uniq_classes = []
for k in class_sets:
    if class_sets[k]["FromOntology"] == "" and class_sets[k]["FromVocabulary"] == "":
        uniq_classes.append(k)
        
print ("Unique non-reused classes", len(uniq_classes))

('Unique non-reused classes', 2881)


In [1102]:
class_sets_df = pd.DataFrame.from_dict(class_sets, orient="index")
class_sets_df = class_sets_df.reset_index()
del class_sets_df["files"]
class_sets_df = pd.concat([class_sets_df[["index", "refL1", "sec_label", "FromOntology", "FromVocabulary", "dsets"]], 
                           class_sets_df["dsets"].apply(lambda x: len(x.split(":-:"))).to_frame(name="dcount"), 
                           class_sets_df["instance_count"].apply(lambda x: ":-:".join([str(k) for k in x])), 
                           class_sets_df["total_instance_count"]], axis=1)
class_sets_df = class_sets_df.sort_values(["total_instance_count"], ascending=False)
class_sets_df.columns = ["URI", "ParsedLabel", "ExternalLabel", "FromBioOntology", "FromLOVVocabulary", "Datasets", 
                         "DatasetCount", "InstanceCountByDataset", "TotalInstanceCount"]

In [1103]:
print ("Total Classes", class_sets_df.shape[0])

('Total Classes', 57748)


In [1104]:
class_sets_df.to_csv("final/Extracted_Classes.tsv", sep="\t", index=None)

In [1105]:
class_sets_df.sort_values(["FromLOVVocabulary", "TotalInstanceCount"], ascending=False).head(100)

Unnamed: 0,URI,ParsedLabel,ExternalLabel,FromBioOntology,FromLOVVocabulary,Datasets,DatasetCount,InstanceCountByDataset,TotalInstanceCount
42160,http://www.w3.org/ns/prov#Organization,Organization,,,W3C Provenance Interchange,:-:DisGenet,2,9:-:41,50
29129,http://www.w3.org/ns/prov#SoftwareAgent,Software Agent,,,W3C Provenance Interchange,:-:DisGenet,2,3:-:9,12
43499,http://www.w3.org/ns/prov#Agent,Agent,,,W3C Provenance Interchange,:-:DisGenet,2,1:-:6,7
29128,http://www.w3.org/ns/prov#Person,Person,,,W3C Provenance Interchange,NBDC,1,6,6
28238,http://vivoweb.org/ontology/core#University,University,,,Vivo Core Ontology,linkeddrugs,1,8,8
36073,http://purl.uniprot.org/core/Structured_Name,Structured Name,,,Uniprot Core Ontology,EBI-UniProt:-:Linkedlifedata-Uniprot,2,486920183:-:2061023,488981206
27658,http://purl.uniprot.org/core/Gene,Gene,,,Uniprot Core Ontology,EBI-UniProt:-:Linkedlifedata-Uniprot,2,426809488:-:23269980,450079468
46557,http://purl.uniprot.org/core/Protein,Protein,,,Uniprot Core Ontology,EBI-UniProt:-:Linkedlifedata-Uniprot,2,154494281:-:6955988,161450269
27683,http://purl.uniprot.org/core/Resource,Resource,,,Uniprot Core Ontology,EBI-UniProt,1,93596295,93596295
36072,http://purl.uniprot.org/core/Strain,Strain,,,Uniprot Core Ontology,EBI-UniProt:-:NBDC,2,81149055:-:371,81149426


# --------------------
#### Read Instances
# --------------------

In [1109]:
def get_folder_path(dset):
    folder_path = final_folder + "instance_data/"
    if "bio2rdf" in dset.lower():
        subdir = os.listdir(folder_path + "Bio2Rdf/")
        if dset in subdir: return folder_path + "Bio2Rdf/" + dset + "/"
        else: return folder_path + "Bio2Rdf/Bio2Rdf/"
    elif "EBI" in dset:
        return folder_path + "EBI/" + dset + "/"
    elif "sbg" in dset.lower():
        return folder_path + "SBG/" + dset + "/"
    elif "nbdc" in dset.lower():
        return folder_path + "NBDC/"
    elif "linkedlifedata" in dset.lower():
        return folder_path + "Linkedlifedata/" + dset + "/"
    elif "modelorganism" in dset.lower():
        return folder_path + "ModelOrganism/"
    else:
        return folder_path + dset + "/"
    
def get_inst_data(dset, verbose=True):
    str_inst_val = True
    num_inst_val = True
    if len(dset) == 0: 
        return None, None, False, False
    folder_path = get_folder_path(dset)
    if verbose: print ("---------------------------------")
    if verbose: print (dset, folder_path)
    if os.path.isdir(folder_path):
        str_inst = pd.read_csv(folder_path + "str_inst_df.tsv", sep="\t")
        num_inst = pd.read_csv(folder_path + "num_inst_df.tsv", sep="\t")
        if verbose: print ("Dset sizes", str_inst.shape, num_inst.shape)
        if str_inst.shape[0] == 0: str_inst_val = False
        if num_inst.shape[0] == 0: num_inst_val = False
        return str_inst, num_inst, str_inst_val, num_inst_val
    else: 
        print ("No folder path found -  XXX")
        return None, None, None, None

In [1110]:
str_inst, num_inst, _, _ = get_inst_data("Bio2Rdf-Drugbank")
str_inst[str_inst['ftype'] == 'entity'].sample(10)

---------------------------------
('Bio2Rdf-Drugbank', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf/')
('Dset sizes', (8919, 13), (436, 13))


Unnamed: 0,index,is_categorical,dtype,lenmedian,uniq_set,file,inst_count,origdtype,namespace,ftype,categories,samp_insts,lenstd
606,http://bio2rdf.org/clinicaltrials_vocabulary:G...,False,iri,75.0,1279,inst_5247.tsv,1279,object,set(['http://bio2rdf.org/clinicaltrials_resour...,entity,set([]),['http://bio2rdf.org/clinicaltrials_resource:d...,0.0
124,http://bio2rdf.org/apo:0000187,False,iri,64.0,2000,inst_3441.tsv,2000,object,set(['http://bio2rdf.org/sgd_resource:']),entity,set([]),['http://bio2rdf.org/sgd_resource:ce1f627a35c4...,0.0
7009,http://bio2rdf.org/pharmgkb_vocabulary:variant...,False,iri,41.5,2000,inst_2340.tsv,2000,object,set(['http://bio2rdf.org/pharmgkb_resource:']),entity,set([]),"['http://bio2rdf.org/pharmgkb_resource:19197',...",0.5
8186,http://bio2rdf.org/vz_vocabulary:Resource,False,iri,25.5,105,inst_739.tsv,105,object,set(['http://bio2rdf.org/vz:']),entity,set([]),"['http://bio2rdf.org/vz:804', 'http://bio2rdf....",0.5
7083,http://bio2rdf.org/pseudocap_vocabulary:Resource,False,iri,35.0,2000,inst_8981.tsv,2000,object,set(['http://bio2rdf.org/pseudocap:']),entity,set([]),"['http://bio2rdf.org/pseudocap:PA0225', 'http:...",0.0
5312,http://bio2rdf.org/mirbase_vocabulary:Resource,False,iri,38.5,2000,inst_3249.tsv,2000,object,set(['http://bio2rdf.org/mirbase:']),entity,set([]),"['http://bio2rdf.org/mirbase:MI0016863', 'http...",1.707825
7171,http://bio2rdf.org/rgap_vocabulary:Resource,False,iri,41.0,2000,inst_7268.tsv,2000,object,set(['http://bio2rdf.org/rgap:']),entity,set([]),"['http://bio2rdf.org/rgap:LOC_Os05g23610.1', '...",2.160247
4233,http://bio2rdf.org/mgi_vocabulary:Gene,False,iri,29.0,2000,inst_393.tsv,2000,object,set(['http://bio2rdf.org/mgi:']),entity,set([]),"['http://bio2rdf.org/mgi:5434473', 'http://bio...",0.816497
628,http://bio2rdf.org/clinicaltrials_vocabulary:I...,False,iri,75.0,2000,inst_1375.tsv,2000,object,set(['http://bio2rdf.org/clinicaltrials_resour...,entity,set([]),['http://bio2rdf.org/clinicaltrials_resource:1...,0.0
2557,http://bio2rdf.org/genbank_vocabulary:Resource,False,iri,37.5,2000,inst_1456.tsv,2000,object,set(['http://bio2rdf.org/genbank:']),entity,set([]),"['http://bio2rdf.org/genbank:AAHY01001988.1', ...",2.165064


# --------------------
#### Extract Property Data from CODP-Combined Network
# --------------------

In [1166]:
def get_new_dsets(node):
    src_dsets = combG.node[node]["dsets"] if "dsets" in combG.node[node] else ""
    src_dsets = [old_dset_mappings[k] for k in src_dsets.split(":-:") if k in old_dset_mappings]
    return src_dsets

def get_assoc_properties(source, dset):
    data_properties = []
    obj_properties = []
    def_l_set = 'http://www.w3.org/2001/XMLSchema#string'
    if not combG.has_node(source): return ([], [])
    src_dsets = get_new_dsets(source)
    #print (src_dsets)
    if "NBDC" in dset: dset = "NBDC"
    elif "ModelOrganism" in dset: dset = "ModelOrganism"
    else: dset = dset
    for k in combG[source]:
        dprop_material = set([])
        oprop_material = set([])
        if combG.node[k]["type"] == "literal" or combG.node[k]["type"] == "objectProp":
            targ_dsets = get_new_dsets(k)
            dset_intersection = set(src_dsets).intersection(set(targ_dsets))
            if dset in dset_intersection:
                for n in combG[source][k]:
                    mat = combG[source][k][n]
                    _type = mat['type'] if 'type' in mat else def_l_set
                    _count = mat['count'] if 'count' in mat else 0
                    if len(_type.split("->")) == 1: 
                        dprop_material.add((_count, _type))
                    else:
                        oprop_material.add((_count, _type))
                if len(dprop_material) > 0: 
                    data_properties.append((k, dprop_material))
                if len(oprop_material) > 0:
                    obj_properties.append((k, oprop_material))
    return (data_properties, obj_properties)

def get_labels(node):
    parsed_label = combG.node[node]["refL1"] if "refL1" in combG.node[node] else parseURI(node)
    sec_label = combG.node[node]["sec_label"] if "sec_label" in combG.node[node] else ""
    return parsed_label, sec_label

def prepare_property_dtype_dict(dset):
    curop = {}
    curdp = {}
    dtypes = set([])
    opn_c = 0
    dpn_c = 0
    for m in all_dsets[dset]['Classes']:
        dprop, oprop = get_assoc_properties(m, dset)
        for item in oprop:
            n = item[0]
            mn = item[1]
            if not n in curop: curop[n] = {"Realizations": {}}
            parsed_label, sec_label = get_labels(n)
            curop[n]["ParsedLabel"] = parsed_label
            curop[n]["ExternalLabel"] = sec_label
            curop[n]["Shared Datasets"] = get_new_dsets(n)
            if n in oproponto_nodes: curop[n]["FromBioOntology"] = oproponto_nodes[n]
            else: curop[n]["FromBioOntology"] = ""
            if n in opropvocab_nodes: curop[n]["FromLOVVocabulary"] = opropvocab_nodes[n]
            else: curop[n]["FromLOVVocabulary"] = ""
            for op in mn:
                opd = op[1].split("->")
                _domain = m
                _range = opd[2] if len(opd) > 2 else ""
                _count = op[0]
                pdict = {"Domain": _domain, "Range": _range, "Assertion Count": _count}
                curop[n]["Realizations"][op[1]] = pdict
                opn_c += 1
        for item in dprop:
            n = item[0]
            mn = item[1]
            if not n in curdp: curdp[n] = {"Realizations": {}}
            parsed_label, sec_label = get_labels(n)
            curdp[n]["ParsedLabel"] = parsed_label
            curdp[n]["ExternalLabel"] = sec_label
            curdp[n]["Shared Datasets"] = get_new_dsets(n)
            if n in dproponto_nodes: curdp[n]["FromBioOntology"] = dproponto_nodes[n]
            else: curdp[n]["FromBioOntology"] = ""
            if n in dpropvocab_nodes: curdp[n]["FromLOVVocabulary"] = dpropvocab_nodes[n]
            else: curdp[n]["FromLOVVocabulary"] = ""
            for dp in mn:
                _range = dp[1]
                _count = dp[0]
                dtypes.add(_range)
                try:
                    realization = m + "->" + n + "->" + _range
                    pdict = {"Domain": m, "Range": _range, "Assertion Count": _count}
                    curdp[n]["Realizations"][realization] = pdict
                    dpn_c += 1
                except: 
                    print ("Error generating realization for", dp, n, m)
    dtypes = list(dtypes)
    return curop, curdp, dtypes, opn_c, dpn_c

In [1167]:
dproperty_sets = {}
oproperty_sets = {}
for k in combG.nodes():
    if not "type" in combG.node[k]: continue
    if combG.node[k]["type"] == 'literal': dproperty_sets[k] = {}
    if combG.node[k]["type"] == 'objectProp': oproperty_sets[k] = {}

print ("Object Properties", len(oproperty_sets))
print ("Data Properties", len(dproperty_sets))

('Object Properties', 4397)
('Data Properties', 8447)


In [1168]:
oproponto_nodes = {}
dproponto_nodes = {}
for k in disAllPrefs:
    if k in ref_obolist: ic_n = ref_obolist[k]
    else: ic_n = ref_preflist[k]
    for m in ic_n:
        if m in oproperty_sets:
            oproponto_nodes[m] = ontoLabels[k]['name']
        elif m in dproperty_sets:
            dproponto_nodes[m] = ontoLabels[k]['name']
print ("Ontology related nodes (only object properties)", len(oproponto_nodes))
print ("Ontology related nodes (only data properties)", len(dproponto_nodes))

('Ontology related nodes (only object properties)', 1514)
('Ontology related nodes (only data properties)', 6005)


In [1169]:
for k in list(oproponto_nodes.keys())[0:10]: print (k, oproponto_nodes[k])

(u'http://bmrbpub.protein.osaka-u.ac.jp/schema/mmcif_nmr-star.owl#has_cross_correlation_d_csa_experimentCategory', 'Biological Magnetic Resonance Bank Ontology')
(u'http://rdf.wwpdb.org/schema/pdbx-v42.owl#has_pdbx_nmr_spectrometerCategory', 'Protein Data Bank Ontology')
(u'http://bmrbpub.protein.osaka-u.ac.jp/schema/mmcif_nmr-star.owl#has_entity_purity_list', 'Biological Magnetic Resonance Bank Ontology')
(u'http://bmrbpub.protein.osaka-u.ac.jp/schema/mmcif_nmr-star.owl#has_chem_shift_anisotropy', 'Biological Magnetic Resonance Bank Ontology')
(u'http://rdf.wwpdb.org/schema/pdbx-v42.owl#has_computing', 'Protein Data Bank Ontology')
(u'http://rdf.wwpdb.org/schema/pdbx-v42.owl#has_em_virus_entity', 'Protein Data Bank Ontology')
(u'http://bmrbpub.protein.osaka-u.ac.jp/schema/mmcif_nmr-star.owl#has_entity_natural_srcCategory', 'Biological Magnetic Resonance Bank Ontology')
(u'http://rdf.wwpdb.org/schema/pdbx-v42.owl#has_atom_typeCategory', 'Protein Data Bank Ontology')
(u'http://bmrbpub.p

In [1170]:
opropvocab_nodes = {}
dpropvocab_nodes = {}
for k in vocab_index:
    st = list(vocab_index[k])[0]
    vocabiden = sc_nomen[st]["vocab_name"].title()
    if not k in oproponto_nodes:
        if k in oproperty_sets: opropvocab_nodes[k] = vocabiden
    if not k in dproponto_nodes:
        if k in dproperty_sets: dpropvocab_nodes[k] = vocabiden
print ("Vocabulary related nodes (only object properties)", len(opropvocab_nodes))
print ("Vocabulary related nodes (only data properties)", len(dpropvocab_nodes))

('Vocabulary related nodes (only object properties)', 345)
('Vocabulary related nodes (only data properties)', 716)


In [1171]:
res_n = 'http://rdf.wwpdb.org/schema/pdbx-v40.owl#atom_sites'
dprop2, oprop2 = get_assoc_properties(res_n, "NBDC-PDB")
res_n = 'http://bio2rdf.org/drugbank_vocabulary:Drug'
dprop1, oprop1 = get_assoc_properties(res_n, "Bio2Rdf-Drugbank")

for k in oprop2:
    print ("--------------------")
    print (k[0])
    for m in k[1]: print (m)

--------------------
http://rdf.wwpdb.org/schema/pdbx-v40.owl#reference_to_entry
(108526, u'http://rdf.wwpdb.org/schema/pdbx-v40.owl#atom_sites->http://rdf.wwpdb.org/schema/pdbx-v40.owl#reference_to_entry->http://rdf.wwpdb.org/schema/pdbx-v40.owl#entry')
--------------------
http://rdf.wwpdb.org/schema/pdbx-v40.owl#of_datablock
(108526, u'http://rdf.wwpdb.org/schema/pdbx-v40.owl#atom_sites->http://rdf.wwpdb.org/schema/pdbx-v40.owl#of_datablock->http://rdf.wwpdb.org/schema/pdbx-v40.owl#datablock')


# --------------------
#### Prepare Final Dataset
# --------------------

In [1174]:
all_dsets = {}
for k in set(class_sets_df["Datasets"]):
    ap = k.split(":-:")
    for m in ap: 
        if not m in all_dsets: 
            all_dsets[m] = {"SPARQL Endpoint": get_sparql_endpoint(m), 
                            "Counts": {
                                "Class Count": 0,
                                "Object Property Count": 0,
                                "Data Property Count": 0,
                                "Datatypes Count": 0,
                                "Object Property Realization Count": 0,
                                "Data Property Realization Count": 0,
                                "Class-with-Instance Count": 0,
                                "Object Property-with-Assertion Count": 0,
                                "Data Property-with-Assertion Count": 0
                            },
                            "Classes": {},
                            "Properties": {
                                "Object Properties": {},
                                "Data Properties": {}
                            },
                            "Datatypes": []
                           }
    
print ("Total Datasets", len(all_dsets))

('Total Datasets', 100)


In [1175]:
for k in class_sets:
    dsets_c = class_sets[k]['dsets'].split(":-:")
    instances = class_sets[k]['instance_count']
    for m in range(len(dsets_c)):
        class_dict = {"URI": k, 
                      "Parsed Label": class_sets[k]["refL1"], 
                      "External Label": class_sets[k]["sec_label"], 
                      "Shared Datasets": class_sets[k]["dsets"].split(":-:"), 
                      "Source BioPortal Ontology": class_sets[k]["FromOntology"],
                      "Source LOV Vocabulary": class_sets[k]["FromVocabulary"],
                      "Instance Count": instances[m], 
                      "Instance Characteristics": {}}
        all_dsets[dsets_c[m]]['Classes'][k] = class_dict
        all_dsets[dsets_c[m]]["Counts"]['Class Count'] += 1

In [1176]:
for k in all_dsets:
    str_inst, num_inst, str_valid, num_valid = get_inst_data(k)
    data_count = 0
    if str_valid:
        entity_data = str_inst[str_inst['ftype'] == 'entity']
        entity_data = entity_data.set_index("index").to_dict(orient="index")
        for m in entity_data:
            if m in all_dsets[k]['Classes']:
                instance_dict = {"Sample Instances (3)": eval(entity_data[m]['samp_insts']), 
                                 "Namespaces": list(eval(entity_data[m]['namespace'])), 
                                 "Is Categorical": entity_data[m]['is_categorical'], 
                                 "Sample Instances Total": entity_data[m]['inst_count'], 
                                 'Categories': list(eval(entity_data[m]['categories'])), 
                                 'Datatype': entity_data[m]['dtype'], 
                                 'Instance Length (Median)': entity_data[m]['lenmedian'], 
                                 'Instance Length (Standard Deviation)': entity_data[m]['lenstd'], 
                                 'Is String': True}
                all_dsets[k]['Classes'][m]["Instance Characteristics"] = instance_dict
                all_dsets[k]["Counts"]["Class-with-Instance Count"] += 1

---------------------------------
('ModelOrganism-Flymine', 'final/raw_data_folder/instance_data/ModelOrganism/')
('Dset sizes', (6317, 13), (91, 13))
---------------------------------
('Pubchem', 'final/raw_data_folder/instance_data/Pubchem/')
('Dset sizes', (42837, 13), (35, 13))
---------------------------------
('Bio2Rdf-Mesh', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf/')
('Dset sizes', (8919, 13), (436, 13))
---------------------------------
('NBDC-BMRB', 'final/raw_data_folder/instance_data/NBDC/')
('Dset sizes', (33143, 13), (4277, 13))
---------------------------------
('Bio2Rdf-Interpro', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf-Interpro/')
('Dset sizes', (393, 13), (2, 13))
---------------------------------
('Bio2Rdf-Sider', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf/')
('Dset sizes', (8919, 13), (436, 13))
---------------------------------
('Linkedlifedata-Mint', 'final/raw_data_folder/instance_data/Linkedlifedata/Linkedlifedata-Mint/')
('Dse

In [1177]:
for k in sorted(all_dsets.keys()):
    odict, ddict, datatypes, opn, dpn = prepare_property_dtype_dict(k)
    print (k, len(odict), len(ddict), len(datatypes))
    all_dsets[k]["Properties"]["Object Properties"] = odict
    all_dsets[k]["Properties"]["Data Properties"] = ddict
    all_dsets[k]["Datatypes"] = datatypes
    all_dsets[k]["Counts"]["Object Property Count"] = len(odict)
    all_dsets[k]["Counts"]["Data Property Count"] = len(ddict)
    all_dsets[k]["Counts"]["Datatypes Count"] = len(datatypes)
    all_dsets[k]["Counts"]["Object Property Realization Count"] = opn
    all_dsets[k]["Counts"]["Data Property Realization Count"] = dpn

('', 1, 0, 0)
('Bio2Rdf-Affymetrix', 74, 75, 8)
('Bio2Rdf-Bioportal', 69, 48, 5)
('Bio2Rdf-Clinicaltrials', 59, 109, 9)
('Bio2Rdf-Ctd', 63, 36, 9)
('Bio2Rdf-Dbsnp', 13, 24, 5)
('Bio2Rdf-Drugbank', 68, 42, 8)
('Bio2Rdf-Genage', 10, 23, 5)
('Bio2Rdf-Goa', 61, 34, 7)
('Bio2Rdf-Hgnc', 34, 51, 9)
('Bio2Rdf-Homologene', 13, 13, 5)
('Bio2Rdf-Interpro', 25, 12, 4)
('Bio2Rdf-Irefindex', 36, 20, 6)
('Bio2Rdf-Kegg', 127, 89, 7)
('Bio2Rdf-Lsr', 38, 74, 10)
('Bio2Rdf-Medline', 21, 46, 6)
('Bio2Rdf-Mesh', 0, 0, 0)
('Bio2Rdf-Mgi', 21, 26, 6)
('Bio2Rdf-Ncbigene', 81, 44, 8)
('Bio2Rdf-Ndc', 11, 17, 5)
('Bio2Rdf-Omim', 68, 77, 7)
('Bio2Rdf-Pharmgkb', 66, 55, 9)
('Bio2Rdf-Refseq', 9, 62, 5)
('Bio2Rdf-Sgd', 93, 68, 10)
('Bio2Rdf-Sider', 7, 9, 3)
('Bio2Rdf-Taxonomy', 8, 34, 4)
('Bio2Rdf-Wormbase', 67, 34, 5)
('DisGenet', 128, 396, 19)
('EBI-BioModels', 52, 77, 13)
('EBI-BioSamples', 80, 168, 11)
('EBI-Chembl', 96, 171, 14)
('EBI-Ensembl', 87, 138, 14)
('EBI-ExpressionAtlas', 111, 157, 13)
('EBI-Reactome', 

In [1178]:
found_op_s = 0
found_op_n = 0
found_dp_s = 0
found_dp_n = 0

for k in all_dsets:
    op = all_dsets[k]["Properties"]['Object Properties']
    dp = all_dsets[k]["Properties"]['Data Properties']
    str_inst, num_inst, str_valid, num_valid = get_inst_data(k)
    data_count = 0
    str_relation_data = {}
    num_relation_data = {}
    if str_valid:
        str_relation_data = str_inst[str_inst['ftype'] == 'relation']
        str_relation_data = str_relation_data.set_index("index").to_dict(orient="index")
    if num_valid:
        num_relation_data = num_inst[num_inst['ftype'] == 'relation']
        num_relation_data = num_relation_data.set_index("index").to_dict(orient="index")
    for rel in op:
        has_realization = False
        for dreal in op[rel]['Realizations']:
            dreal_sp = "->".join(dreal.split("->")[0:2])
            if dreal_sp in str_relation_data:
                try:
                    instance_dict = {"Sample Instances (3)": eval(str_relation_data[dreal_sp]['samp_insts']), 
                                     "Namespaces": list(eval(str_relation_data[dreal_sp]['namespace'])), 
                                     "Is Categorical": str_relation_data[dreal_sp]['is_categorical'], 
                                     "Sample Instances Total": str_relation_data[dreal_sp]['inst_count'],
                                     'Categories': list(eval(str_relation_data[dreal_sp]['categories'])), 
                                     'Datatype': str_relation_data[dreal_sp]['dtype'], 
                                     'Instance Length (Median)': str_relation_data[dreal_sp]['lenmedian'],
                                     'Instance Length (Standard Deviation)': str_relation_data[dreal_sp]['lenstd'], 
                                     'Is String': True}
                    op[rel]['Realizations'][dreal]["Assertion Characteristics"] = instance_dict
                    found_op_s += 1
                except: continue
            elif dreal_sp in num_relation_data:
                try:
                    instance_dict = {"Sample Instances (3)": eval(num_relation_data[dreal_sp]['samp_insts']), 
                                     "Is Categorical": num_relation_data[dreal_sp]['is_categorical'], 
                                     "Sample Instances Total": num_relation_data[dreal_sp]['inst_count'],
                                     'Categories': list(eval(num_relation_data[dreal_sp]['categories'])), 
                                     'Datatype': num_relation_data[dreal_sp]['dtype'], 
                                     'Value Set (Median)': num_relation_data[dreal_sp]['valmedian'],
                                     'Value Set (Standard Deviation)': num_relation_data[dreal_sp]['valstd'], 
                                     'Normal Distribution Test (P-Value)': num_relation_data[dreal_sp]['normp'],
                                     'Normal Distribution Test (Statistic)': num_relation_data[dreal_sp]['normsk'],
                                     'Is String': False}
                    op[rel]['Realizations'][dreal]["Assertion Characteristics"] = instance_dict
                    found_op_n += 1
                except: continue
            op[rel]['Realizations'][dreal]['Assertion Count'] = int(op[rel]['Realizations'][dreal]['Assertion Count'])
            if op[rel]['Realizations'][dreal]['Assertion Count'] > 0: has_realization = True
        if has_realization:
            all_dsets[k]["Counts"]["Object Property-with-Assertion Count"] += 1
    for rel in dp:
        has_realization = False
        for dreal in dp[rel]['Realizations']:
            dreal_sp = "->".join(dreal.split("->")[0:2])
            if dreal_sp in str_relation_data:
                try:
                    instance_dict = {"Sample Instances (3)": eval(str_relation_data[dreal_sp]['samp_insts']), 
                                     "Namespaces": list(eval(str_relation_data[dreal_sp]['namespace'])), 
                                     "Is Categorical": str_relation_data[dreal_sp]['is_categorical'], 
                                     "Sample Instances Total": str_relation_data[dreal_sp]['inst_count'],
                                     'Categories': list(eval(str_relation_data[dreal_sp]['categories'])), 
                                     'Datatype': str_relation_data[dreal_sp]['dtype'], 
                                     'Instance Length (Median)': str_relation_data[dreal_sp]['lenmedian'],
                                     'Instance Length (Standard Deviation)': str_relation_data[dreal_sp]['lenstd'], 
                                     'Is String': True}
                    dp[rel]['Realizations'][dreal]["Assertion Characteristics"] = instance_dict
                    found_dp_s += 1
                except: continue
            elif dreal_sp in num_relation_data:
                try:
                    instance_dict = {"Sample Instances (3)": eval(num_relation_data[dreal_sp]['samp_insts']), 
                                     "Is Categorical": num_relation_data[dreal_sp]['is_categorical'], 
                                     "Sample Instances Total": num_relation_data[dreal_sp]['inst_count'],
                                     'Categories': list(eval(num_relation_data[dreal_sp]['categories'])), 
                                     'Datatype': num_relation_data[dreal_sp]['dtype'], 
                                     'Value Set (Median)': num_relation_data[dreal_sp]['valmedian'],
                                     'Value Set (Standard Deviation)': num_relation_data[dreal_sp]['valstd'], 
                                     'Normal Distribution Test (P-Value)': num_relation_data[dreal_sp]['normp'],
                                     'Normal Distribution Test (Statistic)': num_relation_data[dreal_sp]['normsk'],
                                     'Is String': False}
                    dp[rel]['Realizations'][dreal]["Assertion Characteristics"] = instance_dict
                    found_dp_n += 1
                except: continue
            dp[rel]['Realizations'][dreal]['Assertion Count'] = int(dp[rel]['Realizations'][dreal]['Assertion Count'])
            if dp[rel]['Realizations'][dreal]['Assertion Count'] > 0: has_realization = True
        if has_realization:
            all_dsets[k]["Counts"]["Data Property-with-Assertion Count"] += 1

---------------------------------
('ModelOrganism-Flymine', 'final/raw_data_folder/instance_data/ModelOrganism/')
('Dset sizes', (6317, 13), (91, 13))
---------------------------------
('Pubchem', 'final/raw_data_folder/instance_data/Pubchem/')
('Dset sizes', (42837, 13), (35, 13))
---------------------------------
('Bio2Rdf-Mesh', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf/')
('Dset sizes', (8919, 13), (436, 13))
---------------------------------
('NBDC-BMRB', 'final/raw_data_folder/instance_data/NBDC/')
('Dset sizes', (33143, 13), (4277, 13))
---------------------------------
('Bio2Rdf-Interpro', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf-Interpro/')
('Dset sizes', (393, 13), (2, 13))
---------------------------------
('Bio2Rdf-Sider', 'final/raw_data_folder/instance_data/Bio2Rdf/Bio2Rdf/')
('Dset sizes', (8919, 13), (436, 13))
---------------------------------
('Linkedlifedata-Mint', 'final/raw_data_folder/instance_data/Linkedlifedata/Linkedlifedata-Mint/')
('Dse

# --------------------
#### Prepare Properties file
# --------------------

In [1180]:
def get_property_sets_refine(psets, ponto_nodes, pvocab_nodes, ptype):
    property_sets = deepcopy(psets)
    for k in property_sets:
        parsed_label, sec_label = get_labels(k)
        #print (parsed_label, sec_label)
        if k in ponto_nodes: oin_onto = ponto_nodes[k]
        else: oin_onto = ""
        if k in pvocab_nodes: oin_vocab = pvocab_nodes[k]
        else: oin_vocab = ""
        shared_dsets = []
        realization_count = []
        assertion_count = []
        for m in all_dsets:
            if k in all_dsets[m]["Properties"][ptype]:
                shared_dsets.append(m)
                real = all_dsets[m]["Properties"][ptype][k]['Realizations']
                realization_count.append(int(len(real)))
                assertions = int(np.sum([real[n]['Assertion Count'] for n in real]))
                assertion_count.append(assertions)
        property_sets[k] = {"ParsedLabel": parsed_label, "ExternalLabel": sec_label, "FromBioOntology": oin_onto,
                            "FromLOVVocabulary": oin_vocab, "Datasets": ":-:".join(shared_dsets), 
                            "DatasetCount": len(shared_dsets), 
                            "RealizationCount": ":-:".join([str(n) for n in realization_count]), 
                            "TotalRealizationCount": np.sum(realization_count), 
                            "AssertionCount": ":-:".join([str(n) for n in assertion_count]), 
                            "TotalAssertionCount": np.sum(assertion_count)}
    property_sets_df = pd.DataFrame.from_dict(property_sets, orient="index")
    property_sets_df = property_sets_df.reset_index()
    property_sets_df = property_sets_df.sort_values("TotalAssertionCount", ascending=False)
    property_sets_df = property_sets_df[[u'index', u'ParsedLabel', u'ExternalLabel', u'FromBioOntology', 
                                u'FromLOVVocabulary', u'Datasets', u'DatasetCount', 
                                u'RealizationCount', u'TotalRealizationCount', u'AssertionCount', 
                                u'TotalAssertionCount']]
    return property_sets_df

In [1181]:
oproperty_sets_df = get_property_sets_refine(oproperty_sets, oproponto_nodes, opropvocab_nodes, "Object Properties")
dproperty_sets_df = get_property_sets_refine(dproperty_sets, dproponto_nodes, dpropvocab_nodes, "Data Properties")

In [1182]:
dproperty_sets_df.to_csv("final/Extracted_Data_Properties.tsv", sep="\t", index=None)
oproperty_sets_df.to_csv("final/Extracted_Object_Properties.tsv", sep="\t", index=None)

# --------------------
#### Prepare Datatypes file
# --------------------

In [1183]:
all_datatypes = {}
for k in all_dsets:
    dtypes = all_dsets[k]["Datatypes"]
    for m in dtypes:
        if len(m) == 0: continue
        if not m in all_datatypes: all_datatypes[m] = {"Datasets": set([])}
        all_datatypes[m]["Datasets"].add(k)

print (len(all_datatypes))

24


In [1184]:
all_datatypes_df = pd.DataFrame.from_dict(all_datatypes, orient="index")
all_datatypes_df = all_datatypes_df.reset_index()
all_datatypes_df = pd.concat([all_datatypes_df["index"], all_datatypes_df["Datasets"].apply(lambda x: ":-:".join(list(x))), 
                              all_datatypes_df["Datasets"].apply(lambda x: len(x)).to_frame(name="DatasetCount")], axis=1)
all_datatypes_df = all_datatypes_df.sort_values("DatasetCount", ascending=False)
all_datatypes_df.to_csv("final/Extracted_Datatypes.tsv", sep="\t", index=None)
all_datatypes_df.head()

Unnamed: 0,index,Datasets,DatasetCount
21,http://www.w3.org/2001/XMLSchema#string,ModelOrganism-Flymine:-:Pubchem:-:NBDC-BMRB:-:...,96
4,http://www.w3.org/1999/02/22-rdf-syntax-ns#lan...,Bio2Rdf-Homologene:-:Linkedlifedata-Drugbank:-...,63
17,http://www.w3.org/2001/XMLSchema#integer,NextProt:-:Linkedlifedata-Intact:-:ModelOrgani...,56
6,http://www.w3.org/2001/XMLSchema#anyURI,Bio2Rdf-Drugbank:-:ModelOrganism:-:NextProt:-:...,45
3,http://www.w3.org/1999/02/22-rdf-syntax-ns#XML...,ModelOrganism:-:Linkedlifedata-Biogrid:-:Linke...,35


# --------------------
#### Prepare DSets file
# --------------------

In [1189]:
del all_dsets[""]
print ("Removing non-source", len(all_dsets))

('Removing non-source', 99)


In [1190]:
dset_print = {}
print_attr = ["SPARQL Endpoint", "Class Count", "Object Property Count", 
              "Data Property Count", "Object Property Realization Count", "Data Property Realization Count", 
              "Datatypes Count", "Class-with-Instance Count", "Data Property-with-Assertion Count", 
              "Object Property-with-Assertion Count"]
for k in all_dsets:
    if len(k) == 0: continue
    dset_print[k] = {}
    for m in print_attr:
        if not "count" in m.lower(): dset_print[k][m] = all_dsets[k][m]
        else: dset_print[k][m] = all_dsets[k]["Counts"][m]
    dset_print[k]["Class Instance Coverage"] = float(all_dsets[k]["Counts"]["Class-with-Instance Count"])/all_dsets[k]["Counts"]["Class Count"]

dset_print = pd.DataFrame.from_dict(dset_print, orient="index")
dset_print = dset_print.reset_index()
dset_print = dset_print[["index", "SPARQL Endpoint", "Class Count", "Object Property Count", 
              "Data Property Count", "Object Property Realization Count", "Data Property Realization Count", 
              "Datatypes Count", "Class-with-Instance Count", 
              "Object Property-with-Assertion Count", "Data Property-with-Assertion Count", "Class Instance Coverage"]]
dset_print.to_csv("final/Linked_Dataset_Graphs.tsv", sep="\t", index=None)

In [1191]:
dset_print.head(30)

Unnamed: 0,index,SPARQL Endpoint,Class Count,Object Property Count,Data Property Count,Object Property Realization Count,Data Property Realization Count,Datatypes Count,Class-with-Instance Count,Object Property-with-Assertion Count,Data Property-with-Assertion Count,Class Instance Coverage
0,Bio2Rdf-Affymetrix,http://sparql.openlifedata.org/,75,74,75,654,907,8,75,73,75,1.0
1,Bio2Rdf-Bioportal,http://sparql.openlifedata.org/,57,69,48,451,687,5,57,68,48,1.0
2,Bio2Rdf-Clinicaltrials,http://sparql.openlifedata.org/,62,59,109,591,828,9,62,58,109,1.0
3,Bio2Rdf-Ctd,http://sparql.openlifedata.org/,49,63,36,702,772,9,49,62,36,1.0
4,Bio2Rdf-Dbsnp,http://sparql.openlifedata.org/,16,13,24,193,449,5,16,12,24,1.0
5,Bio2Rdf-Drugbank,http://sparql.openlifedata.org/,107,68,42,884,1066,8,107,67,42,1.0
6,Bio2Rdf-Genage,http://sparql.openlifedata.org/,12,10,23,145,382,5,12,9,23,1.0
7,Bio2Rdf-Goa,http://sparql.openlifedata.org/,17,61,34,356,432,7,17,60,34,1.0
8,Bio2Rdf-Hgnc,http://sparql.openlifedata.org/,43,34,51,625,779,9,43,32,51,1.0
9,Bio2Rdf-Homologene,http://sparql.openlifedata.org/,24,13,13,197,413,5,10,12,13,0.416667


In [1196]:
mfio.save_matrix(all_dsets, "final/LSLOD_Schema_Graph.json.pickle")

In [None]:
with open("final/LSLOD_Schema_Graph.json", "w") as f:
    json.dump(all_dsets, f, indent=4, sort_keys=True, encoding='utf-8', ensure_ascii=False)