# CFDE C2M2 Model ETL

### Imports

In [1]:
import json
import os

from bento_meta.objects import Node, Edge, Property
from bento_meta.model import Model
from bento_meta.mdb.loaders import load_model
from bento_meta.mdb.mdb_tools import ToolsMDB
from bento_meta.mdb import make_nanoid

### C2M2 model JSON to bento-meta model object

##### Import model JSON

Source: [C2M2 master JSON Schema](https://osf.io/c63aw/)

In [2]:
# Relative path of the C2M2 datapackage JSON file that is opened and parsed below.
C2M2_JSON = "data_model/2022-11-22_C2M2_datapackage.json"
# Tag stored as `_commit` on every node, edge and property created in this
# notebook, and passed to load_model().
COMMIT = "2022-11-29_c2m2_model"

In [3]:
# Parse the C2M2 datapackage into a plain dict; its 'resources' list is what
# the rest of the notebook iterates over.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open(C2M2_JSON, "r", encoding="utf-8") as read_file:
    c2m2_data = json.load(read_file)

##### Instantiate model object

In [4]:
# Model object (handle "C2M2") that the nodes, edges and properties parsed
# from the datapackage are added to in the cells below.
c2m2_model = Model(handle="C2M2")

##### Instantiate Dev-MDB

In [5]:
# Connection to the development MDB, used below to mint nanoids and as the
# target of load_model().
# Connection settings are read from the environment so that credentials never
# have to be typed into (and accidentally committed with) the notebook. When a
# variable is unset, the value falls back to what was previously hard-coded.
dev_mdb = ToolsMDB(
    uri=os.environ.get("MDB_URI", "bolt://mdb.ctos-data-team.org:8687"),
    user=os.environ.get("MDB_USER", ""),
    password=os.environ.get("MDB_PASSWORD", ""),
)

##### Separate nodes, edges, and other (ternary/trinary association, for further curation)

Edges come from association tables in C2M2, which codify relationships between entities

In [6]:
# Buckets for the three kinds of C2M2 resources, filled by the loop below.
json_nodes, json_edges, json_other = [], [], []
# Cache of (property handle, property description) -> nanoid, shared by the
# node and edge cells so identical properties reuse one nanoid.
prop_nanos = {}
# Description prefixes that mark a resource as a binary association table.
link_tbl_desc = ("Association", "(Shallow) association")

for resource in c2m2_data['resources']:
    description = resource['description']
    if description.startswith(link_tbl_desc):
        bucket = json_edges        # binary association table -> edge
    elif description.startswith("Trinary association"):
        bucket = json_other        # set aside for manual curation
    else:
        bucket = json_nodes        # everything else is treated as a node
    bucket.append(resource)

In [7]:
# Sanity check: how many resources landed in each bucket.
print(f"Nodes: {len(json_nodes)}, Edges: {len(json_edges)}, Other: {len(json_other)}")

Nodes: 21, Edges: 27, Other: 1


In [8]:
# List the names of the resources classified as nodes, one per line.
for node_name in (entry['name'] for entry in json_nodes):
    print(node_name)

file
biosample
subject
dcc
project
collection
subject_race
assay_type
analysis_type
ncbi_taxonomy
anatomy
file_format
data_type
disease
phenotype
compound
substance
gene
protein
sample_prep_method
id_namespace


In [9]:
# List the names of the resources classified as edges, one per line.
for edge_name in (entry['name'] for entry in json_edges):
    print(edge_name)

project_in_project
collection_in_collection
file_describes_collection
collection_defined_by_project
file_in_collection
biosample_in_collection
subject_in_collection
file_describes_biosample
file_describes_subject
biosample_from_subject
biosample_disease
subject_disease
collection_disease
collection_phenotype
collection_gene
collection_compound
collection_substance
collection_taxonomy
collection_anatomy
collection_protein
subject_phenotype
biosample_substance
subject_substance
biosample_gene
phenotype_gene
phenotype_disease
protein_gene


In [10]:
# List the names of the resources set aside for further curation, one per line.
for other_name in (entry['name'] for entry in json_other):
    print(other_name)

subject_role_taxonomy


##### Add nodes and their properties to the model object

In [11]:
# Turn every node resource into a bento-meta Node and every schema field of
# that resource into a Property attached to the node.
for node in json_nodes:
    # Skip resources without a usable schema. The previous guard,
    # `not 'schema' in node and node['schema']`, parsed as
    # `('schema' not in node) and node['schema']`: it raised KeyError when the
    # key was missing and never fired when the schema was present but empty.
    if not ('schema' in node and node['schema']):
        print('no schema')
        continue
    if not ('fields' in node['schema'] and node['schema']['fields']):
        print('no schema fields')
        continue

    # Build the Node only after the guards pass so that no nanoid is requested
    # from the MDB for a resource that ends up being skipped.
    init = Node({
        'handle': node['name'],
        'model': "C2M2",
        'desc': node['description'],
        '_commit': COMMIT,
        'nanoid': dev_mdb.make_nano()
    })
    c2m2_model.add_node(init)

    for prop in node['schema']['fields']:
        init_p = Property({
            'handle': prop['name'],
            'model': "C2M2",
            'desc': prop['description'],
            'value_domain': prop['type'],
            '_commit': COMMIT
        })
        # Only set is_required when the field carries a truthy 'required'
        # constraint; otherwise the Property keeps its default.
        if 'constraints' in prop and prop['constraints']:
            if 'required' in prop['constraints'] and prop['constraints']['required']:
                init_p.is_required = prop['constraints']['required']

        # create new nanoid for property if not unique
        # (i.e., combo of prop handle and desc not already used by another node/edge)
        nano_key = (init_p.handle, init_p.desc)
        if nano_key not in prop_nanos:
            prop_nanos[nano_key] = dev_mdb.make_nano()
        init_p.nanoid = prop_nanos[nano_key]
        c2m2_model.add_prop(init, init_p)

In [12]:
# Spot check: show the nanoid of the property stored under the
# ('file', 'id_namespace') key of the model's props.
c2m2_model.props[('file', 'id_namespace')].nanoid

'mwpeAV'

In [13]:
# Number of nodes and of properties in the model after the node cell has run
# (no edges have been added yet at this point in the notebook).
print(f"Model Nodes: {len(c2m2_model.nodes)}, Model Node Props: {len(c2m2_model.props)}")

Model Nodes: 21, Model Node Props: 125


All nodes successfully added to model object along with their properties

##### Add edges and their properties to the model object

In [14]:
# Turn every association resource into a bento-meta Edge between two nodes that
# are already in the model, and every schema field into a Property on the edge.
for edge in json_edges:
    # Association tables are named "<src>[_<relationship>]_<dst>": the first and
    # last underscore-separated tokens are looked up as node handles.
    split_handle = edge['name'].split("_")
    src_handle, dst_handle = split_handle[0], split_handle[-1]

    # print warning and move on if src or dst node not found in model
    if not (src_handle in c2m2_model.nodes and c2m2_model.nodes[src_handle]):
        print(f"src node {src_handle} not found for edge {edge['name']}")
        # Set aside for curation; the membership test keeps a re-run of this
        # cell from adding the same resource twice.
        if edge not in json_other:
            json_other.append(edge)
        continue
    if not (dst_handle in c2m2_model.nodes and c2m2_model.nodes[dst_handle]):
        print(f"dst node {dst_handle} not found for edge {edge['name']}")
        if edge not in json_other:
            json_other.append(edge)
        continue

    # Skip resources without a usable schema. The previous guard,
    # `not 'schema' in edge and edge['schema']`, parsed as
    # `('schema' not in edge) and edge['schema']`: it raised KeyError when the
    # key was missing and never fired when the schema was present but empty.
    if not ('schema' in edge and edge['schema']):
        print('no schema')
        continue
    if not ('fields' in edge['schema'] and edge['schema']['fields']):
        print('no schema fields')
        continue

    # Build the Edge only after the guards pass so that no nanoid is requested
    # from the MDB for a resource that ends up being skipped.
    init = Edge({
        'handle': "associated_with", # generic handle for unspecified relationships
        'src': c2m2_model.nodes[src_handle],
        'dst': c2m2_model.nodes[dst_handle],
        'model': "C2M2",
        'desc': edge['description'],
        '_commit': COMMIT,
        'nanoid': dev_mdb.make_nano()
    })
    # When the table name has tokens between src and dst, they name the
    # relationship (e.g. "file_describes_collection" -> "describes").
    if len(split_handle) > 2:
        init.handle = "_".join(split_handle[1:-1])
    c2m2_model.add_edge(init)

    for prop in edge['schema']['fields']:
        init_p = Property({
            'handle': prop['name'],
            'model': "C2M2",
            'desc': prop['description'],
            'value_domain': prop['type'],
            '_commit': COMMIT
        })
        # Only set is_required when the field carries a truthy 'required'
        # constraint; otherwise the Property keeps its default.
        if 'constraints' in prop and prop['constraints']:
            if 'required' in prop['constraints'] and prop['constraints']['required']:
                init_p.is_required = prop['constraints']['required']
        # create new nanoid for property if not unique
        # (i.e., combo of prop handle and desc not already used by another node/edge)
        nano_key = (init_p.handle, init_p.desc)
        if nano_key not in prop_nanos:
            prop_nanos[nano_key] = dev_mdb.make_nano()
        init_p.nanoid = prop_nanos[nano_key]
        c2m2_model.add_prop(init, init_p)

dst node taxonomy not found for edge collection_taxonomy


In [16]:
# Number of edges in the model and number of properties added by the edge cell.
# 125 is the "Model Node Props" count printed by the earlier node-count cell;
# subtracting it from the total leaves the properties added for edges.
# NOTE(review): the 125 is hard-coded, so this figure is wrong if the node
# property count ever changes — update it together with that cell's output.
print(f"Model Edges: {len(c2m2_model.edges)}, Model Edge Props: {len(c2m2_model.props) - 125}")

Model Edges: 26, Model Edge Props: 89


All but 1 edge (collection_taxonomy) added to model object along with their properties

### Add C2M2 model object to MDB

##### Load C2M2 model

In [17]:
# Hand the assembled model to bento-meta's loader with the dev MDB as target,
# passing COMMIT as the `_commit` tag.
# NOTE(review): this presumably writes to the live dev database — confirm the
# connection target before re-running.
load_model(model=c2m2_model, mdb=dev_mdb, _commit=COMMIT)

100%|██████████| 527/527 [00:49<00:00, 10.56it/s]


### Curation for other nodes/edges

In [18]:
# Resources still awaiting curation: the trinary associations plus any edge
# whose src or dst node could not be found by the edge cell.
print(len(json_other))

2


In [19]:
# Show name and description of every resource that still needs curation,
# separated by blank lines.
for other in json_other:
    print(f"{other['name']}: {other['description']}\n")

subject_role_taxonomy: Trinary association linking IDs representing (1) a subject, (2) a subject_role (a named organism-level constituent component of a subject, like 'host', 'pathogen', 'endosymbiont', 'taxon detected inside a microbiome subject', etc.) and (3) a taxonomic label (which is hereby assigned to this particular subject_role within this particular subject)

collection_taxonomy: Association between a taxon and a C2M2 collection containing experimental resources directly related to the study of that taxon



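TBD - is the "taxonomy" table referenced by the `collection_taxonomy` association the same as the `ncbi_taxonomy` node? If so, the edge can be added with `ncbi_taxonomy` as its destination.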