# CFDE C2M2 Model Mapping

### Imports

In [27]:
import os

import pandas as pd
from bento_meta.mdb.mdb_tools import ToolsMDB

### DSS Mapping

##### Import clean DSS mappings

In [11]:
# Load the cleaned DST cross-model mapping table (path is relative to the working directory).
dss_mapping_df = pd.read_csv("Mappings/CRDC DST Cross-Model Mapping_clean.csv")

In [12]:
# Sanity check: (rows, columns) of the loaded mapping table.
dss_mapping_df.shape

(303, 6)

In [13]:
# Preview the first rows to confirm the expected ent_1_* / ent_2_* columns are present.
dss_mapping_df.head()

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DST,Identifier,['Subject'],CTDC,case_id,['case']
1,DST,Identifier,['Subject'],GDC,id,['case']
2,DST,Identifier,['Subject'],ICDC,case_id,['case']
3,DST,Identifier,['Subject'],IDC,Patient ID,['file']
4,DST,Identifier,['Subject'],PDC,case_id,['case']


##### Filter for only C2M2 model

In [14]:
# Keep only the rows whose second entity (ent_2) belongs to the C2M2 model.
c2m2_mapping_df = dss_mapping_df.loc[dss_mapping_df["ent_2_model"].eq("C2M2")]

In [15]:
# Renumber the filtered rows from 0; drop=True discards the old index labels
# instead of keeping them as a new column.
c2m2_mapping_df = c2m2_mapping_df.reset_index(drop=True)

In [16]:
# Sanity check: (rows, columns) of the C2M2-only subset.
c2m2_mapping_df.shape

(19, 6)

In [17]:
# Preview the filtered rows; the index should now start at 0.
c2m2_mapping_df.head()

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DST,Identifier,['Subject'],C2M2,local_id,['biosample']
1,DST,Type,['Specimen'],C2M2,assay_type,['biosample']
2,DST,Ethnicity,['Subject'],C2M2,ethnicity,['subject']
3,DST,Race,['Subject'],C2M2,race,['subject_race']
4,DST,Format,['File'],C2M2,id,['file_format']


##### Check original mapping spreadsheet and add any missing ents/mappings

In [18]:
# Show the C2M2 subset so it can be compared against the original mapping spreadsheet.
c2m2_mapping_df

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DST,Identifier,['Subject'],C2M2,local_id,['biosample']
1,DST,Type,['Specimen'],C2M2,assay_type,['biosample']
2,DST,Ethnicity,['Subject'],C2M2,ethnicity,['subject']
3,DST,Race,['Subject'],C2M2,race,['subject_race']
4,DST,Format,['File'],C2M2,id,['file_format']
5,DST,Identifier,['File'],C2M2,local_id,['file']
6,DST,Name,['Program'],C2M2,name,['project']
7,DST,Taxon,['Subject'],C2M2,taxonomy_id,['subject_role_taxonomy']
8,DST,Type,['File'],C2M2,data_type,['file']
9,DST,Name,['Study'],C2M2,name,['project']


In [19]:
# Extra mappings to add: they are mapped in the spreadsheet's comments but were
# never given an explicit row of their own.
# Each tuple is ordered as
# (ent_1_model, ent_1_handle, ent_1_extra_handles,
#  ent_2_model, ent_2_handle, ent_2_extra_handles).
extra_rows = [
    ("DST", "Type", "File", "C2M2", "mime_type", "file"),
    ("DST", "symbol", "Gene", "C2M2", "name", "gene"),
    ("DST", "Identifier", "Project", "C2M2", "local_id", "collection"),
]

# Transpose the rows into one list per column.
(
    ent_1_model,
    ent_1_handle,
    ent_1_extra_handles,
    ent_2_model,
    ent_2_handle,
    ent_2_extra_handles,
) = (list(column) for column in zip(*extra_rows))

extra_dict = dict(
    ent_1_model=ent_1_model,
    ent_1_handle=ent_1_handle,
    ent_1_extra_handles=ent_1_extra_handles,
    ent_2_model=ent_2_model,
    ent_2_handle=ent_2_handle,
    ent_2_extra_handles=ent_2_extra_handles,
)

c2m2_extra = pd.DataFrame(extra_dict)

c2m2_extra.head()

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DST,Type,File,C2M2,mime_type,file
1,DST,symbol,Gene,C2M2,name,gene
2,DST,Identifier,Project,C2M2,local_id,collection


In [20]:
# Wrap each extra handle in a single-element list. Values that are already
# lists are left untouched, so re-running this cell does not nest them a
# second time (e.g. [['File']]).
for handles_col in ("ent_1_extra_handles", "ent_2_extra_handles"):
    c2m2_extra[handles_col] = [
        handle if isinstance(handle, list) else [handle]
        for handle in c2m2_extra[handles_col]
    ]

In [21]:
# Confirm both extra-handle columns now hold single-element lists.
c2m2_extra

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DST,Type,[File],C2M2,mime_type,[file]
1,DST,symbol,[Gene],C2M2,name,[gene]
2,DST,Identifier,[Project],C2M2,local_id,[collection]


In [22]:
# Stack the hand-written extra mappings under the spreadsheet-derived ones.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# ignore_index=True is wanted here: both frames are indexed from 0, so without
# it the combined frame would carry duplicate index labels.
c2m2_mapping_df_all = pd.concat([c2m2_mapping_df, c2m2_extra], ignore_index=True)

In [23]:
# Row count after combining; should equal the C2M2 subset rows plus the extra rows.
len(c2m2_mapping_df_all)

22

In [24]:
# Inspect the combined mapping table before exporting it.
c2m2_mapping_df_all

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DST,Identifier,['Subject'],C2M2,local_id,['biosample']
1,DST,Type,['Specimen'],C2M2,assay_type,['biosample']
2,DST,Ethnicity,['Subject'],C2M2,ethnicity,['subject']
3,DST,Race,['Subject'],C2M2,race,['subject_race']
4,DST,Format,['File'],C2M2,id,['file_format']
5,DST,Identifier,['File'],C2M2,local_id,['file']
6,DST,Name,['Program'],C2M2,name,['project']
7,DST,Taxon,['Subject'],C2M2,taxonomy_id,['subject_role_taxonomy']
8,DST,Type,['File'],C2M2,data_type,['file']
9,DST,Name,['Study'],C2M2,name,['project']


##### Export clean csv with C2M2 mappings

In [25]:
# Write the combined mappings to the CSV consumed by the linking script below;
# index=False keeps the row index out of the file.
c2m2_mapping_df_all.to_csv("Mappings/c2m2_dss_mappings_clean.csv", index=False)

### Run script to add mappings to MDB

In [55]:
!python "Script/link_synonym_ents_csv.py" --csv_filepath="Mappings/c2m2_dss_mappings_clean.csv" --mdb_uri="bolt://localhost:7687" --mdb_user="neo4j" --mdb_pass="blank1" --entity_type="property" --add_missing_ent_1="True" --add_missing_ent_2="False" --_commit="2022-11-30_c2m2-dss-map"

Creating new property node with properties: {handle: 'Identifier', model: 'DST', nanoid: 'atumqd', _commit: '2022-11-30_c2m2-dss-map'}
Creating new node node with properties: {handle: 'Subject', model: 'DST', nanoid: 'UA0baG', _commit: '2022-11-30_c2m2-dss-map'}
Adding has_property relationship between src node with properties: {handle: 'Subject', model: 'DST', nanoid: 'UA0baG', _commit: '2022-11-30_c2m2-dss-map'} to dst property with properties: {handle: 'Identifier', model: 'DST', nanoid: 'atumqd', _commit: '2022-11-30_c2m2-dss-map'}
Creating new concept node with properties: {nanoid: '6f944Q', _commit: '2022-11-30_c2m2-dss-map'}
Adding has_concept relationship between src property with properties: {handle: 'Identifier', model: 'DST', nanoid: 'atumqd', _commit: '2022-11-30_c2m2-dss-map'} to dst concept with properties: {nanoid: '6f944Q', _commit: '2022-11-30_c2m2-dss-map'}
Adding has_concept relationship between src property with properties: {handle: 'local_id', model: 'C2M2', nanoid