# DSS Model Mapping

### Imports

In [175]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
from datetime import date

### Load DSS Model from Excel

In [54]:
RAW_DSS_MAP_EXCEL = Path("../data/input/CRDC DST Cross-Model Mapping.xlsx")

In [55]:
raw_df = pd.read_excel(
    RAW_DSS_MAP_EXCEL,
    sheet_name="CRDC DST Cross-Model Mapping",
    header=2,
    nrows=1529,
    usecols=["DST Data Element Name", "Compiled Data Element Names"]
)

  warn(msg)


In [56]:
raw_df

Unnamed: 0,DST Data Element Name,Compiled Data Element Names
0,Subject Identifier,CTDC.case.case_id\nGDC.Case.id\nICDC.case.case...
1,Specimen Type,GDC.Sample.sample_type\nICDC.sample.physical_s...
2,Subject Ethnicity*,CTDC.case.ethnicity\nGDC.Demographic.ethnicity...
3,Subject Race,CTDC.case.race\nGDC.Demographic.race\nPDC.Demo...
4,,GDC.Diagnosis.site_of_resection_or_biopsy\nIDC...
...,...,...
1524,Tumor Grade,PDC.Diagnosis.who_nte_grade
1525,Tumor Grade,PDC.Diagnosis.gleason_grade_tertiary
1526,,GDC.Diagnosis.tumor_depth\nPDC.Diagnosis.tumor...
1527,Disease Clinical Stage,PDC.Diagnosis.figo_staging_edition_year


### Separate DSS & Other Models

In [92]:
# model for dst nodes
df_dss = pd.DataFrame(np.repeat("DSS", len(raw_df)))

df_dss = pd.concat(
    [df_dss, raw_df["DST Data Element Name"].str.split(" ", 1, expand=True)],
    axis=1
)
df_dss.columns = ["ent_1_model", "ent_1_extra_handles", "ent_1_handle"]
df_dss = df_dss[["ent_1_model", "ent_1_handle", "ent_1_extra_handles"]]
# df_dst["ent_1_extra_handles"] = [[l] for l in df_dst["ent_1_extra_handles"]] # listify extra handles

In [93]:
df_dss

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles
0,DSS,Identifier,Subject
1,DSS,Type,Specimen
2,DSS,Ethnicity*,Subject
3,DSS,Race,Subject
4,DSS,,
...,...,...,...
1524,DSS,Grade,Tumor
1525,DSS,Grade,Tumor
1526,DSS,,
1527,DSS,Clinical Stage,Disease


In [94]:
df_models = raw_df["Compiled Data Element Names"].str.strip().str.split(
    '\n', 
    expand=True
).add_prefix("Model_Data_Element_Name_")

df_models

Unnamed: 0,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
0,CTDC.case.case_id,GDC.Case.id,ICDC.case.case_id,IDC.DICOM.Patient Module.Patient ID,PDC.Case.case_id,CDS.Participant.subject_id,CDA.Patient.dct:identifier,,C2M2.biosample.local_id,mCODE.Cancer Patient Profile.Identifier
1,GDC.Sample.sample_type,ICDC.sample.physical_sample_type,PDC.Sample.sample_type,CDS.Sample Information.sample_type,CDA.Specimen.source_material_type,HTAN.Molecular Test.biospecimen_type,C2M2.biosample.assay_type,mCODE.Genomic Specimen Profile.Type,,
2,CTDC.case.ethnicity,GDC.Demographic.ethnicity,IDC.DICOM.Patient Module.Patient's Ethnic Group,PDC.Demographic.ethnicity,CDS.Participant.ethnicity,CDA.Patient.ethnicity,HTAN.Demographics.Ethnicity,C2M2.subject.ethnicity,,
3,CTDC.case.race,GDC.Demographic.race,PDC.Demographic.race,CDS.Participant.race,CDA.Patient.race,HTAN.Demographics.Race,C2M2.subject_race.race,,,
4,GDC.Diagnosis.site_of_resection_or_biopsy,IDC.TCIA.tcia_tumorLocation,PDC.Diagnosis.site_of_resection_or_biopsy,CDS.Additional Diagnosis Information.site_of_r...,HTAN.Diagnosis.site_of_resection_or_biopsy,,,,,
...,...,...,...,...,...,...,...,...,...,...
1524,PDC.Diagnosis.who_nte_grade,,,,,,,,,
1525,PDC.Diagnosis.gleason_grade_tertiary,,,,,,,,,
1526,GDC.Diagnosis.tumor_depth,PDC.Diagnosis.tumor_depth,,,,,,,,
1527,PDC.Diagnosis.figo_staging_edition_year,,,,,,,,,


In [95]:
df_dss.join(df_models)

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
0,DSS,Identifier,Subject,CTDC.case.case_id,GDC.Case.id,ICDC.case.case_id,IDC.DICOM.Patient Module.Patient ID,PDC.Case.case_id,CDS.Participant.subject_id,CDA.Patient.dct:identifier,,C2M2.biosample.local_id,mCODE.Cancer Patient Profile.Identifier
1,DSS,Type,Specimen,GDC.Sample.sample_type,ICDC.sample.physical_sample_type,PDC.Sample.sample_type,CDS.Sample Information.sample_type,CDA.Specimen.source_material_type,HTAN.Molecular Test.biospecimen_type,C2M2.biosample.assay_type,mCODE.Genomic Specimen Profile.Type,,
2,DSS,Ethnicity*,Subject,CTDC.case.ethnicity,GDC.Demographic.ethnicity,IDC.DICOM.Patient Module.Patient's Ethnic Group,PDC.Demographic.ethnicity,CDS.Participant.ethnicity,CDA.Patient.ethnicity,HTAN.Demographics.Ethnicity,C2M2.subject.ethnicity,,
3,DSS,Race,Subject,CTDC.case.race,GDC.Demographic.race,PDC.Demographic.race,CDS.Participant.race,CDA.Patient.race,HTAN.Demographics.Race,C2M2.subject_race.race,,,
4,DSS,,,GDC.Diagnosis.site_of_resection_or_biopsy,IDC.TCIA.tcia_tumorLocation,PDC.Diagnosis.site_of_resection_or_biopsy,CDS.Additional Diagnosis Information.site_of_r...,HTAN.Diagnosis.site_of_resection_or_biopsy,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,DSS,Grade,Tumor,PDC.Diagnosis.who_nte_grade,,,,,,,,,
1525,DSS,Grade,Tumor,PDC.Diagnosis.gleason_grade_tertiary,,,,,,,,,
1526,DSS,,,GDC.Diagnosis.tumor_depth,PDC.Diagnosis.tumor_depth,,,,,,,,
1527,DSS,Clinical Stage,Disease,PDC.Diagnosis.figo_staging_edition_year,,,,,,,,,


### Drop duplicates in whole set of mappings

In [96]:
# check for duplicates in whole df, drop 2nd occurance
df_both = df_dss.join(df_models)
df_both.drop_duplicates(keep="first", inplace=True)
dss_df = df_both[["ent_1_model", "ent_1_handle", "ent_1_extra_handles"]]
df_models = df_both.drop(["ent_1_model", "ent_1_handle", "ent_1_extra_handles"], axis=1)
print(len(dss_df), len(df_models))

1522 1522


In [97]:
df_models.count()

Model_Data_Element_Name_0    1520
Model_Data_Element_Name_1     394
Model_Data_Element_Name_2     243
Model_Data_Element_Name_3      66
Model_Data_Element_Name_4      32
Model_Data_Element_Name_5      18
Model_Data_Element_Name_6      10
Model_Data_Element_Name_7       4
Model_Data_Element_Name_8       2
Model_Data_Element_Name_9       1
dtype: int64

In [98]:
# look at row with 9 "mappings"
for i, x in enumerate(df_models.count(axis=1)):
    if x >= 9:
        print(i, x)

0 10
48 9


In [99]:
df_models.iloc[[48]]

Unnamed: 0,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
48,GDC.Sample.biospecimen_anatomic_site,ICDC.sample.sample_site,IDC.DICOM.General Series Module.BodyPartExamined,PDC.Sample.biospecimen_anatomic_site,CDS.Additional Sample Information.sample_anato...,CDA.Specimen.anatomical_site,,C2M2.biosample.anatomy,mCODE.Genomic Specimen Profile.Collection > Bo...,


### Stack rows of matches into one column

In [100]:
df_models_stack = pd.DataFrame(df_models.stack())

In [101]:
df_models_stack.head()

Unnamed: 0,Unnamed: 1,0
0,Model_Data_Element_Name_0,CTDC.case.case_id
0,Model_Data_Element_Name_1,GDC.Case.id
0,Model_Data_Element_Name_2,ICDC.case.case_id
0,Model_Data_Element_Name_3,IDC.DICOM.Patient Module.Patient ID
0,Model_Data_Element_Name_4,PDC.Case.case_id


In [102]:
# get rid of "model data element name" column
df_models_stack = df_models_stack.droplevel(-1)

In [103]:
df_models_stack.head()

Unnamed: 0,0
0,CTDC.case.case_id
0,GDC.Case.id
0,ICDC.case.case_id
0,IDC.DICOM.Patient Module.Patient ID
0,PDC.Case.case_id


Note: index here aligns with index of DSS property

In [104]:
len(df_models_stack)

2290

### Clean stacked models df


##### Drop any missing rows

In [105]:
# drop any missing rows
df_models_stack.dropna(subset=[0], inplace=True)

In [106]:
len(df_models_stack)

2290

No missing values found here, but some incorrectly coded

##### Replace non-breaking spaces and empty strings with null and drop nulls again

In [107]:
df_models_stack[0].replace(u"\xa0", np.nan, inplace=True)
df_models_stack[0].replace(u"", np.nan, inplace=True)

In [108]:
df_models_stack.dropna(subset=[0], inplace=True)

In [109]:
len(df_models_stack)

2257

33 empty rows dropped this time

In [110]:
df_models_stack.head()

Unnamed: 0,0
0,CTDC.case.case_id
0,GDC.Case.id
0,ICDC.case.case_id
0,IDC.DICOM.Patient Module.Patient ID
0,PDC.Case.case_id


##### Separate stacked models df into model, node, & prop handles

In [111]:
df_models_split = df_models_stack[0].str.split(".", expand=True)

In [112]:
df_models_split.count()

0    2257
1    2257
2    2227
3      15
dtype: int64

In [113]:
# df_models rows with a third option (what should be node and what should be property?)
df_models_other_1 = df_models_split.loc[df_models_split[3].notnull()]
df_models_other_1

Unnamed: 0,0,1,2,3
0,IDC,DICOM,Patient Module,Patient ID
2,IDC,DICOM,Patient Module,Patient's Ethnic Group
10,IDC,DICOM,Patient Module,Patient's Ethnic Group
29,IDC,DICOM,Patient Module,Patient's Sex
48,IDC,DICOM,General Series Module,BodyPartExamined
271,IDC,DICOM,Patient Study Module,Patient Age
517,IDC,DICOM,Patient Study Module,Patient's Size
518,IDC,DICOM,Patient Study Module,Patient's Weight
519,IDC,DICOM,General Series Module,StudyDate
571,GDC,SlideImage,ref:GDC,data_file_properties


In [114]:
# df_models without a 2nd option
df_models_other_2 = df_models_split.loc[df_models_split[1].notnull() & df_models_split[2].isnull()]
df_models_other_2

Unnamed: 0,0,1,2,3
50,ICDC,case,,
51,ICDC,diagnosis,,
53,ICDC,sample,,
55,ICDC,demographic,,
56,ICDC,program,,
57,ICDC,study,,
75,ICDC,file,,
79,ICDC,principal_investigator,,
134,ICDC,enrollment,,
212,ICDC,sample,,


##### Identify duplicates in split models df

In [115]:
df_models_split[df_models_split.duplicated(keep=False)]

Unnamed: 0,0,1,2,3
1,mCODE,Genomic Specimen Profile,Type,
2,CTDC,case,ethnicity,
2,IDC,DICOM,Patient Module,Patient's Ethnic Group
2,PDC,Demographic,ethnicity,
2,CDA,Patient,ethnicity,
...,...,...,...,...
1425,mCODE,Primary Cancer Condition Profile,Code,
1426,mCODE,Primary Cancer Condition Profile,Body Site,
1430,mCODE,Primary Cancer Condition Profile,Stage,
1502,mCODE,Tumor Size Profile,Component > Tumor Longest Dimension,


##### Change model df handles to match mapping script

In [116]:
df_models_split.columns = ["ent_2_model", "ent_2_extra_handles", "ent_2_handle", "ent_2_handles_other"]
df_models_split = df_models_split[["ent_2_model", "ent_2_handle", "ent_2_handles_other", "ent_2_extra_handles"]]

### Combine DST and other model mappings into one df

In [117]:
df_mappings = df_dss.join(df_models_split)

In [118]:
df_mappings

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
0,DSS,Identifier,Subject,CTDC,case_id,,case
0,DSS,Identifier,Subject,GDC,id,,Case
0,DSS,Identifier,Subject,ICDC,case_id,,case
0,DSS,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
0,DSS,Identifier,Subject,PDC,case_id,,Case
...,...,...,...,...,...,...,...
1525,DSS,Grade,Tumor,PDC,gleason_grade_tertiary,,Diagnosis
1526,DSS,,,GDC,tumor_depth,,Diagnosis
1526,DSS,,,PDC,tumor_depth,,Diagnosis
1527,DSS,Clinical Stage,Disease,PDC,figo_staging_edition_year,,Diagnosis


##### identify duplicate rows in combined df

In [119]:
df_mappings[df_mappings.duplicated(keep=False)]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
2,DSS,Ethnicity*,Subject,CTDC,ethnicity,,case
2,DSS,Ethnicity*,Subject,IDC,Patient Module,Patient's Ethnic Group,DICOM
2,DSS,Ethnicity*,Subject,PDC,ethnicity,,Demographic
2,DSS,Ethnicity*,Subject,CDA,ethnicity,,Patient
10,DSS,Ethnicity*,Subject,CTDC,ethnicity,,case
...,...,...,...,...,...,...,...
961,DSS,,,ICDC,of_case(case),,sample
962,DSS,,,,,,
963,DSS,,,,,,
1069,DSS,,,,,,


In [120]:
len(df_mappings)

2266

##### drop duplicate rows in combined df

In [121]:
df_mappings.drop_duplicates(keep="first", inplace=True)

In [122]:
len(df_mappings)

2216

50 duplicate rows dropped

In [123]:
df_mappings

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
0,DSS,Identifier,Subject,CTDC,case_id,,case
0,DSS,Identifier,Subject,GDC,id,,Case
0,DSS,Identifier,Subject,ICDC,case_id,,case
0,DSS,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
0,DSS,Identifier,Subject,PDC,case_id,,Case
...,...,...,...,...,...,...,...
1525,DSS,Grade,Tumor,PDC,gleason_grade_tertiary,,Diagnosis
1526,DSS,,,GDC,tumor_depth,,Diagnosis
1526,DSS,,,PDC,tumor_depth,,Diagnosis
1527,DSS,Clinical Stage,Disease,PDC,figo_staging_edition_year,,Diagnosis


### Separate rows with missing DSS property handle

In [124]:
dss_mappings_other = df_mappings[df_mappings["ent_1_handle"].isna()]

In [125]:
dss_mappings_other

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
4,DSS,,,GDC,site_of_resection_or_biopsy,,Diagnosis
4,DSS,,,IDC,tcia_tumorLocation,,TCIA
4,DSS,,,PDC,site_of_resection_or_biopsy,,Diagnosis
4,DSS,,,CDS,site_of_resection_or_biopsy,,Additional Diagnosis Information
4,DSS,,,HTAN,site_of_resection_or_biopsy,,Diagnosis
...,...,...,...,...,...,...,...
1520,DSS,,,mCODE,Location,,Tumor Profile
1521,DSS,,,mCODE,Location Qualifier,,Tumor Profile
1522,DSS,,,mCODE,Patient,,Tumor Profile
1526,DSS,,,GDC,tumor_depth,,Diagnosis


In [126]:
dss_mappings = df_mappings[df_mappings["ent_1_handle"].notna()]

In [127]:
dss_mappings

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
0,DSS,Identifier,Subject,CTDC,case_id,,case
0,DSS,Identifier,Subject,GDC,id,,Case
0,DSS,Identifier,Subject,ICDC,case_id,,case
0,DSS,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
0,DSS,Identifier,Subject,PDC,case_id,,Case
...,...,...,...,...,...,...,...
1523,DSS,Grade,Tumor,PDC,who_cns_grade,,Diagnosis
1524,DSS,Grade,Tumor,PDC,who_nte_grade,,Diagnosis
1525,DSS,Grade,Tumor,PDC,gleason_grade_tertiary,,Diagnosis
1527,DSS,Clinical Stage,Disease,PDC,figo_staging_edition_year,,Diagnosis


### Fix identified issues with mappings

##### Clean IDC node handles; matching to CDA definition, TBD exactly how to do in MDB as of Dec 2022

In [128]:
# giving IDC nodes the "file" node for now to match existing (from CDA?)
# could change to use tables from https://learn.canceridc.dev/data/organization-of-data/files-and-metadata as nodes?

dss_mappings[dss_mappings["ent_2_model"] == "IDC"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
0,DSS,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
2,DSS,Ethnicity*,Subject,IDC,Patient Module,Patient's Ethnic Group,DICOM
14,DSS,Identifier,File,IDC,source_DOI,,TCIA
15,DSS,Name,Program,IDC,Program,,TCIA
21,DSS,Taxon,Subject,IDC,tcia_species (not yet available in IDC product...,,TCIA
29,DSS,Gender (Gender Identity),Subject,IDC,Patient Module,Patient's Sex,DICOM
48,DSS,Anatomic Site,Specimen,IDC,General Series Module,BodyPartExamined,DICOM
259,DSS,Identifier,Project,IDC,collection_id,,TCIA
277,DSS,Modality,Imaging,IDC,Modality,,DICOM


In [129]:
dss_mappings[dss_mappings["ent_2_handles_other"].notnull()]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
0,DSS,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
2,DSS,Ethnicity*,Subject,IDC,Patient Module,Patient's Ethnic Group,DICOM
29,DSS,Gender (Gender Identity),Subject,IDC,Patient Module,Patient's Sex,DICOM
48,DSS,Anatomic Site,Specimen,IDC,General Series Module,BodyPartExamined,DICOM


In [130]:
idc_other_handles_mask = dss_mappings["ent_2_handles_other"].notnull()
idc_other_handles = dss_mappings[dss_mappings["ent_2_handles_other"].notnull()]["ent_2_handles_other"]
dss_mappings.loc[idc_other_handles_mask, 'ent_2_handle'] = idc_other_handles

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [131]:
dss_mappings[dss_mappings["ent_2_model"] == "IDC"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_handles_other,ent_2_extra_handles
0,DSS,Identifier,Subject,IDC,Patient ID,Patient ID,DICOM
2,DSS,Ethnicity*,Subject,IDC,Patient's Ethnic Group,Patient's Ethnic Group,DICOM
14,DSS,Identifier,File,IDC,source_DOI,,TCIA
15,DSS,Name,Program,IDC,Program,,TCIA
21,DSS,Taxon,Subject,IDC,tcia_species (not yet available in IDC product...,,TCIA
29,DSS,Gender (Gender Identity),Subject,IDC,Patient's Sex,Patient's Sex,DICOM
48,DSS,Anatomic Site,Specimen,IDC,BodyPartExamined,BodyPartExamined,DICOM
259,DSS,Identifier,Project,IDC,collection_id,,TCIA
277,DSS,Modality,Imaging,IDC,Modality,,DICOM


In [132]:
dss_mappings = dss_mappings.drop(columns="ent_2_handles_other")

In [133]:
dss_mappings[dss_mappings["ent_2_model"] == "IDC"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DSS,Identifier,Subject,IDC,Patient ID,DICOM
2,DSS,Ethnicity*,Subject,IDC,Patient's Ethnic Group,DICOM
14,DSS,Identifier,File,IDC,source_DOI,TCIA
15,DSS,Name,Program,IDC,Program,TCIA
21,DSS,Taxon,Subject,IDC,tcia_species (not yet available in IDC product...,TCIA
29,DSS,Gender (Gender Identity),Subject,IDC,Patient's Sex,DICOM
48,DSS,Anatomic Site,Specimen,IDC,BodyPartExamined,DICOM
259,DSS,Identifier,Project,IDC,collection_id,TCIA
277,DSS,Modality,Imaging,IDC,Modality,DICOM


In [134]:
dss_mappings.loc[dss_mappings["ent_2_model"] == "IDC", ["ent_2_extra_handles"]] = "file"

In [135]:
dss_mappings[dss_mappings["ent_2_model"] == "IDC"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DSS,Identifier,Subject,IDC,Patient ID,file
2,DSS,Ethnicity*,Subject,IDC,Patient's Ethnic Group,file
14,DSS,Identifier,File,IDC,source_DOI,file
15,DSS,Name,Program,IDC,Program,file
21,DSS,Taxon,Subject,IDC,tcia_species (not yet available in IDC product...,file
29,DSS,Gender (Gender Identity),Subject,IDC,Patient's Sex,file
48,DSS,Anatomic Site,Specimen,IDC,BodyPartExamined,file
259,DSS,Identifier,Project,IDC,collection_id,file
277,DSS,Modality,Imaging,IDC,Modality,file


##### Clean up other notes, etc. added to handles in raw spreadsheet

In [136]:
# remove extraneous notes, etc. from ent 1 handles
ent_1_pattern = "|".join([
    re.escape("*"),
    re.escape(" (same as name?)"),
    re.escape(" (?)")
])
dss_mappings["ent_1_handle"] = dss_mappings["ent_1_handle"].str.replace(ent_1_pattern, "")

  dss_mappings["ent_1_handle"] = dss_mappings["ent_1_handle"].str.replace(ent_1_pattern, "")


In [137]:
# remove extraneous notes, etc. from ent 2 handles
ent_2_pattern = "|".join([
    re.escape(" (not yet available in IDC production release)"),
    re.escape(" (Consider PhenotypicSex)"),
    re.escape("dct:"),
    re.escape("Component > "),
    re.escape("Collection > "),
    re.escape("'")
])
dss_mappings["ent_2_handle"] = dss_mappings["ent_2_handle"].str.replace(ent_2_pattern, "")

  dss_mappings["ent_2_handle"] = dss_mappings["ent_2_handle"].str.replace(ent_2_pattern, "")


In [139]:
# reset index
dss_mappings.reset_index(inplace=True, drop=True)

In [140]:
dss_mappings.head(20)

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DSS,Identifier,Subject,CTDC,case_id,case
1,DSS,Identifier,Subject,GDC,id,Case
2,DSS,Identifier,Subject,ICDC,case_id,case
3,DSS,Identifier,Subject,IDC,Patient ID,file
4,DSS,Identifier,Subject,PDC,case_id,Case
5,DSS,Identifier,Subject,CDS,subject_id,Participant
6,DSS,Identifier,Subject,CDA,identifier,Patient
7,DSS,Identifier,Subject,C2M2,local_id,biosample
8,DSS,Identifier,Subject,mCODE,Identifier,Cancer Patient Profile
9,DSS,Type,Specimen,GDC,sample_type,Sample


##### Clean up mCODE handles

In [141]:
dss_mappings[dss_mappings["ent_2_model"] == "mCODE"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
8,DSS,Identifier,Subject,mCODE,Identifier,Cancer Patient Profile
16,DSS,Type,Specimen,mCODE,Type,Genomic Specimen Profile
37,DSS,Grade,Tumor,mCODE,Stage,Primary Cancer Condition Profile
48,DSS,general morphology,Tissue,mCODE,Histology Morphology Behavior,Primary Cancer Condition Profile
60,DSS,Identifier,Specimen,mCODE,Identifier,Tumor Specimen Profile
83,DSS,Gender (Gender Identity),Subject,mCODE,Gender,Cancer Patient Profile
96,DSS,Clinical Stage,Disease,mCODE,Code,Primary Cancer Condition Profile
109,DSS,Anatomic Site,Specimen,mCODE,Body Site,Genomic Specimen Profile
128,DSS,Site,Primary,mCODE,Body Site,Primary Cancer Condition Profile
144,DSS,chromosome,Gene,mCODE,Cytogenetic Location,Genomic Variant Profile


In [142]:
# drop "Profile" from ent 2 node handles
ent_2_extra_pattern = "|".join([
    re.escape(" Profile"),
])
dss_mappings["ent_2_extra_handles"] = dss_mappings["ent_2_extra_handles"].str.replace(ent_2_extra_pattern, "")

  dss_mappings["ent_2_extra_handles"] = dss_mappings["ent_2_extra_handles"].str.replace(ent_2_extra_pattern, "")


In [143]:
dss_mappings[dss_mappings["ent_2_model"] == "mCODE"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
8,DSS,Identifier,Subject,mCODE,Identifier,Cancer Patient
16,DSS,Type,Specimen,mCODE,Type,Genomic Specimen
37,DSS,Grade,Tumor,mCODE,Stage,Primary Cancer Condition
48,DSS,general morphology,Tissue,mCODE,Histology Morphology Behavior,Primary Cancer Condition
60,DSS,Identifier,Specimen,mCODE,Identifier,Tumor Specimen
83,DSS,Gender (Gender Identity),Subject,mCODE,Gender,Cancer Patient
96,DSS,Clinical Stage,Disease,mCODE,Code,Primary Cancer Condition
109,DSS,Anatomic Site,Specimen,mCODE,Body Site,Genomic Specimen
128,DSS,Site,Primary,mCODE,Body Site,Primary Cancer Condition
144,DSS,chromosome,Gene,mCODE,Cytogenetic Location,Genomic Variant


##### Clean up CDA handles (to match those actually provided by CDA)

In [144]:
dss_mappings[dss_mappings["ent_2_model"] == "CDA"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
6,DSS,Identifier,Subject,CDA,identifier,Patient
13,DSS,Type,Specimen,CDA,source_material_type,Specimen
22,DSS,Ethnicity,Subject,CDA,ethnicity,Patient
29,DSS,Race,Subject,CDA,race,Patient
41,DSS,Age at Diagnosis,Subject,CDA,age_at_diagnosis,Diagnosis
46,DSS,general morphology,Tissue,CDA,general_tissue_morphology,Specimen
51,DSS,Format,File,CDA,file_format,File
59,DSS,Identifier,Specimen,CDA,identifier,Specimen
65,DSS,Identifier,File,CDA,identifier,File
75,DSS,Taxon,Subject,CDA,taxon,Patient


In [145]:
# CDA "Patient" node to "Subject"
cda_other_handles_mask_1 = (dss_mappings["ent_2_model"] == "CDA") & (dss_mappings["ent_2_extra_handles"] == "Patient")
dss_mappings.loc[cda_other_handles_mask_1, "ent_2_extra_handles"] = "Subject"

In [146]:
# CDA "taxon" property to "species"
cda_other_handles_mask_2 = (dss_mappings["ent_2_model"] == "CDA") & (dss_mappings["ent_2_handle"] == "taxon")
dss_mappings.loc[cda_other_handles_mask_2, "ent_2_handle"] = "species"

In [147]:
# CDA "hasDiseaseStaging" property to "stage"
cda_other_handles_mask_3 = (dss_mappings["ent_2_model"] == "CDA") & (dss_mappings["ent_2_handle"] == "hasDiseaseStaging")
dss_mappings.loc[cda_other_handles_mask_3, "ent_2_handle"] = "stage"

In [148]:
# CDA "identifier" property of "Project" node to "member_of_research_project" of "ResearchSubjeect"
cda_other_handles_mask_4 = (dss_mappings["ent_2_model"] == "CDA") & (dss_mappings["ent_2_extra_handles"] == "Project") & (dss_mappings["ent_2_handle"] == "identifier")
dss_mappings.loc[cda_other_handles_mask_4, "ent_2_handle"] = "member_of_research_project"
dss_mappings.loc[cda_other_handles_mask_4, "ent_2_extra_handles"] = "ResearchSubject"

In [149]:
dss_mappings[dss_mappings["ent_2_model"] == "CDA"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
6,DSS,Identifier,Subject,CDA,identifier,Subject
13,DSS,Type,Specimen,CDA,source_material_type,Specimen
22,DSS,Ethnicity,Subject,CDA,ethnicity,Subject
29,DSS,Race,Subject,CDA,race,Subject
41,DSS,Age at Diagnosis,Subject,CDA,age_at_diagnosis,Diagnosis
46,DSS,general morphology,Tissue,CDA,general_tissue_morphology,Specimen
51,DSS,Format,File,CDA,file_format,File
59,DSS,Identifier,Specimen,CDA,identifier,Specimen
65,DSS,Identifier,File,CDA,identifier,File
75,DSS,Taxon,Subject,CDA,species,Subject


In [150]:
dss_mappings[dss_mappings["ent_2_model"] == "CDA"].shape

(21, 6)

In [151]:
# drop rows not found in official CDA mappings
cda_drop_handles =  [
    "hasGenotypicSex",
    "hasDisease?     [OntologyReference]", 
    "isPrincipalInvestigator",
    "general_tissue_morphology", 
    "specific_tissue_morphology"
]

for handle in cda_drop_handles:
    cda_drop_index = dss_mappings.loc[(dss_mappings["ent_2_model"] == "CDA") & (dss_mappings["ent_2_handle"] == handle)].index
    dss_mappings.drop(labels=cda_drop_index, inplace=True)

In [152]:
dss_mappings[dss_mappings["ent_2_model"] == "CDA"]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
6,DSS,Identifier,Subject,CDA,identifier,Subject
13,DSS,Type,Specimen,CDA,source_material_type,Specimen
22,DSS,Ethnicity,Subject,CDA,ethnicity,Subject
29,DSS,Race,Subject,CDA,race,Subject
41,DSS,Age at Diagnosis,Subject,CDA,age_at_diagnosis,Diagnosis
51,DSS,Format,File,CDA,file_format,File
59,DSS,Identifier,Specimen,CDA,identifier,Specimen
65,DSS,Identifier,File,CDA,identifier,File
75,DSS,Taxon,Subject,CDA,species,Subject
87,DSS,Type,File,CDA,data_type,File


In [153]:
dss_mappings[dss_mappings["ent_2_model"] == "CDA"].shape

(16, 6)

5 rows dropped b/c not in official CDA mappings

##### Fix GDC, PDC, ICDC node handles to match those already in MDB

In [154]:
dss_mappings.ent_2_model.unique()

array(['CTDC', 'GDC', 'ICDC', 'IDC', 'PDC', 'CDS', 'CDA', 'C2M2', 'mCODE',
       'HTAN', nan], dtype=object)

In [155]:
# convert GDC node handles to lowercase to match format in MDB
gdc_node_handles = dss_mappings[dss_mappings["ent_2_model"] == "GDC"]["ent_2_extra_handles"]
dss_mappings.loc[dss_mappings["ent_2_model"] == "GDC", "ent_2_extra_handles"] = gdc_node_handles.str.lower()

In [156]:
# convert ICDC node handles to lowercase to match format in MDB
icdc_node_handles = dss_mappings[dss_mappings["ent_2_model"] == "ICDC"]["ent_2_extra_handles"]
dss_mappings.loc[dss_mappings["ent_2_model"] == "ICDC", "ent_2_extra_handles"] = icdc_node_handles.str.lower()

In [157]:
# convert PDC node handles to lowercase to match format in MDB
pdc_node_handles = dss_mappings[dss_mappings["ent_2_model"] == "PDC"]["ent_2_extra_handles"]
dss_mappings.loc[dss_mappings["ent_2_model"] == "PDC", "ent_2_extra_handles"] = pdc_node_handles.str.lower()

### Drop any rows with nulls or duplicates in cleaned df

In [158]:
dss_mappings.shape

(307, 6)

##### Drop rows with a null

In [159]:
# drop any rows with remaining NA values
null_index = dss_mappings[dss_mappings.isnull().any(axis=1)].index
dss_mappings.drop(labels=null_index, inplace=True)

dss_mappings.shape

(305, 6)

2 rows with null values dropped

##### Drop duplicate rows

In [160]:
# drop any remaining duplicates
dss_mappings.drop_duplicates(keep="first", inplace=True)
dss_mappings.shape

(303, 6)

2 duplicate rows dropped

### Convert node handles to list for mapping script

In [161]:
dss_mappings["ent_1_extra_handles"] = [[l] for l in dss_mappings["ent_1_extra_handles"]] # listify extra handles
dss_mappings["ent_2_extra_handles"] = [[l] for l in dss_mappings["ent_2_extra_handles"]] # listify extra handles

In [162]:
dss_mappings.head(20)

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handle,ent_2_extra_handles
0,DSS,Identifier,[Subject],CTDC,case_id,[case]
1,DSS,Identifier,[Subject],GDC,id,[case]
2,DSS,Identifier,[Subject],ICDC,case_id,[case]
3,DSS,Identifier,[Subject],IDC,Patient ID,[file]
4,DSS,Identifier,[Subject],PDC,case_id,[case]
5,DSS,Identifier,[Subject],CDS,subject_id,[Participant]
6,DSS,Identifier,[Subject],CDA,identifier,[Subject]
7,DSS,Identifier,[Subject],C2M2,local_id,[biosample]
8,DSS,Identifier,[Subject],mCODE,Identifier,[Cancer Patient]
9,DSS,Type,[Specimen],GDC,sample_type,[sample]


### Output cleaned DSS mappings (& others that couldn't be mapped properly)

In [178]:
output_path = Path("../data/output/")

In [179]:
today_date = date.today().strftime('%Y%m%d')

In [181]:
clean_output_path = Path(output_path, f"{today_date}_{input_filename}_clean.csv")
other_output_path = Path(output_path, f"{today_date}_{input_filename}_other.csv")

In [184]:
dss_mappings.to_csv(clean_output_path, index=False)
dss_mappings_other.to_csv(other_output_path, index=False)