# bento-meta cypher

## entities

### node

In [86]:
from bento_meta.util.cypher.entities import (
    N, N0, R, R0, P, T, G,
    _pattern, _as, _condition, _return,
    _plain, _anon, _var, _plain_var
    )
from bento_meta.util.cypher.functions import (
    Func, count, exists, group, And, Or, Not,
)
from bento_meta.util.cypher.clauses import (
    Clause, Match, Where, Return, Set, Create, Merge,
    OnMatchSet, OnCreateSet, Remove, Statement,
)

In [87]:
# N() entity is node
n = N(label="node", props={"model": "ICDC", "handle": "diagnosis"})

In [88]:
# label for each new instantiation of an entity counts up unless _reset_counter() called
n._reset_counter()

In [89]:
# pattern() renders entity as a match pattern
n.pattern()

"(n0:node {model:'ICDC',handle:'diagnosis'})"

In [90]:
# condition() renders entity as a condition (e.g. for WHERE clause)
n.condition()

["n0.model = 'ICDC'", "n0.handle = 'diagnosis'"]

In [91]:
# label has entity label
n.label

'node'

In [92]:
# Return() "renders entity as a return value", maybe Neo4j variable assigned to entity?
n.Return()

'n0'

In [93]:
# _add_props() can add properties to a node or relationship entity
n._add_props({"nanoid": "mnmzW6"})
n.pattern()

"(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})"

In [94]:
# As param lets a node be aliased?
x = N(label="thing", As="dude")

In [95]:
x.label

'thing'

In [96]:
x.As

'dude'

In [97]:
x.Return()

'n0 as dude'

In [98]:
# anonymous node (R0 also exists); probably used for just matching a pattern where you don't care 
# which node/relationship something is connected to as long as it is connected
N0().pattern()

'()'

### relationship

In [99]:
# R() entity is relationship/edge (not relationship node, but actual relationship in Neo4 (I think))
r = R(Type="has_property")

In [100]:
r.pattern()

'-[r0:has_property]-'

In [101]:
r.condition()

[]

In [102]:
r.Type

'has_property'

In [103]:
r.Return()

'r0'

In [104]:
R0().pattern()

'--'

### property

In [105]:
# P() represent properties of a node and can be used to hold (just one?) handle/value pair repping a property
p = P(handle="handle", value="stage_of_disease")

In [106]:
p.pattern()

"handle:'stage_of_disease'"

In [107]:
# these all return None since p.entity is None?
print(p.entity, p.condition(), p.Return())

None None None


In [108]:
m = N(label="property", props=p)

In [109]:
q = P("model", "ICDC")
l = P("nanoid", "xaF4my")

In [110]:
m._add_props([q, l])

True

In [111]:
m.pattern()

"(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})"

### triple

In [112]:
# R.relate() returns a triple relating two nodes by a relationship
t = r.relate(n, m)

In [113]:
# pattern() & condition() give same result for T() triple
print(t.pattern())
print(t.condition())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})
(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})


In [114]:
# can also relate_to() from a N() node rather than from the R() relationship
t2 = n.relate_to(r, m)

In [115]:
t2.pattern()

"(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})"

### paths

In [116]:
# setup for paths
nodes = [N(label="case"), N(label="sample"), N(label="aliquot"),
            N(label="file")]
edges = [R(Type="of_case"), R(Type="of_sample"), R(Type="of_aliquot")]

t1 = edges[0].relate(nodes[1], nodes[0])  # (sample)-[:of_case]->(case)
t2 = edges[1].relate(nodes[2], nodes[1])  # (aliquot)-[:of_sample]->(sample)
t3 = edges[2].relate(nodes[3], nodes[2])  # (file)-[:of_aliquot]->(aliquot)

In [117]:
# G() represents a path, or an ordered set of partially overlapping triples
pth0 = G(nodes[1], edges[0], nodes[0])
pth0.pattern()

'(n4:sample)-[r2:of_case]->(n3:case)'

In [118]:
pth1 = G(t1)
pth1.pattern()

'(n4:sample)-[r2:of_case]->(n3:case)'

In [119]:
pth2 = G(t1, t2)
pth2.pattern()

'(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [120]:
pth3 = G(t2, t1)
pth3.pattern()

'(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [121]:
pth4 = G(t3, pth3) # switching these args around doesn't work though?
pth4.pattern()

'(n6:file)-[r4:of_aliquot]->(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [122]:
pth5 = G(t1, t2, t3)
pth5.pattern()

'(n6:file)-[r4:of_aliquot]->(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [123]:
for x in pth5.nodes():
    print(x.pattern())

(n5:aliquot)
(n3:case)
(n6:file)
(n4:sample)


In [124]:
for x in pth5.edges():
    print(x.pattern())

-[r3:of_sample]-
-[r2:of_case]-
-[r4:of_aliquot]-


### modifiers

In [125]:
# _as returns copy of ent with As alias set
print(n.Return()) 
print(_as(n, "alias_n").Return())

n0
n0 as alias_n


In [126]:
# _plain returns ent w/o properties
print(n.pattern())
print(_plain(n).pattern())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
(n0:node)


In [127]:
# _var returns ent w/o label or type
print(n.pattern())
print( _var(n).pattern())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
(n0 {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})


In [128]:
# _plain_var returns ent w/o label or properties
print(n.pattern())
print( _plain_var(n).pattern())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
(n0)


## Clauses

In [129]:
# Match() represents a cypher match clause
print(str(Match(t)))
print(str(Match(n)))
print(str(Match(n, m)))

MATCH (n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})
MATCH (n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
MATCH (n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'}), (n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})


In [130]:
# Where() represents a cypher where clause, args are 
print(str(Where(*t.nodes())))
print(str(Where(n)))

# With(), Create(), Merge(), Remove() are similar

WHERE n0.model = 'ICDC' AND n0.handle = 'diagnosis' AND n0.nanoid = 'mnmzW6' AND n2.handle = 'stage_of_disease' AND n2.model = 'ICDC' AND n2.nanoid = 'xaF4my'
WHERE n0.model = 'ICDC' AND n0.handle = 'diagnosis' AND n0.nanoid = 'mnmzW6'


In [131]:
# Set() creates a SET clause, only property args matter for this
print(str(Set(*n.props.values())))

SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6'


In [132]:
# OnCreateSet() and OnMatchSet() similar
print(str(OnCreateSet(*n.props.values())))
print(str(OnMatchSet(*n.props.values())))

ON CREATE SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6'
ON MATCH SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6'


In [133]:
# Statement() used to create a Neo4j statement w/ clauses & strings in order
str(
    Statement(
        Match(_var(_plain(t))),
        Where(exists(m.props['handle']), n),
        Return(count(n))
        )
    )

"MATCH (n0)-[r0:has_property]->(n2) WHERE exists(n2.handle) AND n0.model = 'ICDC' AND n0.handle = 'diagnosis' AND n0.nanoid = 'mnmzW6' RETURN count(n0)"

In [134]:
str(
    Statement(
        Match(_var(_plain(t))),
        Where(group(And(exists(m.props['handle']), n.props['model'])),
            Not(n.props['handle'])),
        Return(p),
        'LIMIT 10'
        )
    )

"MATCH (n0)-[r0:has_property]->(n2) WHERE (exists(n2.handle) AND n0.model = 'ICDC') AND NOT n0.handle = 'diagnosis' RETURN n2.handle LIMIT 10"

In [135]:
str(
    Statement(
        Create(_plain(n)),
        Set(*n.props.values()),
        Return(n)
        )
    )

"CREATE (n0:node) SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6' RETURN n0"

## functions

functions include:
- exists()
- count()
- labels()
- Not()
- And()
- Or()
- group()
- is_null()
- is_not_null()

This could replace some of the cypher pattern/clause generation in MDB tools ()

Could potentially be integrated w/ bento-meta entities (which have some overlap with N()?)

# DSS mapping

In [223]:
from pathlib import Path
import numpy as np
import pandas as pd
import re as re

In [166]:
RAW_DSS_MAP_EXCEL = "C:/Users/nelson/Documents/GitHub/HIDS-Capstone/data/CRDC DST Cross-Model Mapping.xlsx"
df = pd.read_excel(RAW_DSS_MAP_EXCEL, sheet_name="CRDC DST Cross-Model Mapping", header=2, nrows=1529, usecols=["DST Data Element Name", "Compiled Data Element Names"])

  warn(msg)


In [167]:
# model for dst nodes
df_dst = pd.DataFrame(np.repeat("DST", len(df)))

# 
df_dst = pd.concat([df_dst, df["DST Data Element Name"].str.split(" ", 1, expand=True)], axis=1)
df_dst.columns = ["ent_1_model", "ent_1_extra_handles", "ent_1_handle"]
df_dst = df_dst[["ent_1_model", "ent_1_handle", "ent_1_extra_handles"]]
# df_dst["ent_1_extra_handles"] = [[l] for l in df_dst["ent_1_extra_handles"]] # listify extra handles

In [184]:
df_dst

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles
0,DST,Identifier,Subject
1,DST,Type,Specimen
2,DST,Ethnicity*,Subject
3,DST,Race,Subject
4,DST,,
...,...,...,...
1524,DST,Grade,Tumor
1525,DST,Grade,Tumor
1526,DST,,
1527,DST,Clinical Stage,Disease


In [178]:
df_models = df["Compiled Data Element Names"].str.strip().str.split('\n', expand=True).add_prefix("Model_Data_Element_Name_")
df_models

Unnamed: 0,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
0,CTDC.case.case_id,GDC.Case.id,ICDC.case.case_id,IDC.DICOM.Patient Module.Patient ID,PDC.Case.case_id,CDS.Participant.subject_id,CDA.Patient.dct:identifier,,C2M2.biosample.local_id,mCODE.Cancer Patient Profile.Identifier
1,GDC.Sample.sample_type,ICDC.sample.physical_sample_type,PDC.Sample.sample_type,CDS.Sample Information.sample_type,CDA.Specimen.source_material_type,HTAN.Molecular Test.biospecimen_type,C2M2.biosample.assay_type,mCODE.Genomic Specimen Profile.Type,,
2,CTDC.case.ethnicity,GDC.Demographic.ethnicity,IDC.DICOM.Patient Module.Patient's Ethnic Group,PDC.Demographic.ethnicity,CDS.Participant.ethnicity,CDA.Patient.ethnicity,HTAN.Demographics.Ethnicity,C2M2.subject.ethnicity,,
3,CTDC.case.race,GDC.Demographic.race,PDC.Demographic.race,CDS.Participant.race,CDA.Patient.race,HTAN.Demographics.Race,C2M2.subject_race.race,,,
4,GDC.Diagnosis.site_of_resection_or_biopsy,IDC.TCIA.tcia_tumorLocation,PDC.Diagnosis.site_of_resection_or_biopsy,CDS.Additional Diagnosis Information.site_of_r...,HTAN.Diagnosis.site_of_resection_or_biopsy,,,,,
...,...,...,...,...,...,...,...,...,...,...
1524,PDC.Diagnosis.who_nte_grade,,,,,,,,,
1525,PDC.Diagnosis.gleason_grade_tertiary,,,,,,,,,
1526,GDC.Diagnosis.tumor_depth,PDC.Diagnosis.tumor_depth,,,,,,,,
1527,PDC.Diagnosis.figo_staging_edition_year,,,,,,,,,


In [188]:
df_dst.join(df_models)

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
0,DST,Identifier,Subject,CTDC.case.case_id,GDC.Case.id,ICDC.case.case_id,IDC.DICOM.Patient Module.Patient ID,PDC.Case.case_id,CDS.Participant.subject_id,CDA.Patient.dct:identifier,,C2M2.biosample.local_id,mCODE.Cancer Patient Profile.Identifier
1,DST,Type,Specimen,GDC.Sample.sample_type,ICDC.sample.physical_sample_type,PDC.Sample.sample_type,CDS.Sample Information.sample_type,CDA.Specimen.source_material_type,HTAN.Molecular Test.biospecimen_type,C2M2.biosample.assay_type,mCODE.Genomic Specimen Profile.Type,,
2,DST,Ethnicity*,Subject,CTDC.case.ethnicity,GDC.Demographic.ethnicity,IDC.DICOM.Patient Module.Patient's Ethnic Group,PDC.Demographic.ethnicity,CDS.Participant.ethnicity,CDA.Patient.ethnicity,HTAN.Demographics.Ethnicity,C2M2.subject.ethnicity,,
3,DST,Race,Subject,CTDC.case.race,GDC.Demographic.race,PDC.Demographic.race,CDS.Participant.race,CDA.Patient.race,HTAN.Demographics.Race,C2M2.subject_race.race,,,
4,DST,,,GDC.Diagnosis.site_of_resection_or_biopsy,IDC.TCIA.tcia_tumorLocation,PDC.Diagnosis.site_of_resection_or_biopsy,CDS.Additional Diagnosis Information.site_of_r...,HTAN.Diagnosis.site_of_resection_or_biopsy,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,DST,Grade,Tumor,PDC.Diagnosis.who_nte_grade,,,,,,,,,
1525,DST,Grade,Tumor,PDC.Diagnosis.gleason_grade_tertiary,,,,,,,,,
1526,DST,,,GDC.Diagnosis.tumor_depth,PDC.Diagnosis.tumor_depth,,,,,,,,
1527,DST,Clinical Stage,Disease,PDC.Diagnosis.figo_staging_edition_year,,,,,,,,,


In [192]:
# check for duplicates in whole df
df_both = df_dst.join(df_models)
df_both.drop_duplicates(keep="first", inplace=True)
df_dst = df_both[["ent_1_model", "ent_1_handle", "ent_1_extra_handles"]]
df_models = df_both.drop(["ent_1_model", "ent_1_handle", "ent_1_extra_handles"], axis=1)
print(len(df_dst), len(df_models))

1522 1522


In [193]:
df_models.count()

Model_Data_Element_Name_0    1520
Model_Data_Element_Name_1     394
Model_Data_Element_Name_2     243
Model_Data_Element_Name_3      66
Model_Data_Element_Name_4      32
Model_Data_Element_Name_5      18
Model_Data_Element_Name_6      10
Model_Data_Element_Name_7       4
Model_Data_Element_Name_8       2
Model_Data_Element_Name_9       1
dtype: int64

In [194]:
for i, x in enumerate(df_models.count(axis=1)):
    if x >= 9:
        print(i, x)

0 10
48 9


In [195]:
df_models.iloc[[48]]

Unnamed: 0,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
48,GDC.Sample.biospecimen_anatomic_site,ICDC.sample.sample_site,IDC.DICOM.General Series Module.BodyPartExamined,PDC.Sample.biospecimen_anatomic_site,CDS.Additional Sample Information.sample_anato...,CDA.Specimen.anatomical_site,,C2M2.biosample.anatomy,mCODE.Genomic Specimen Profile.Collection > Bo...,


In [196]:
df_models_stack = pd.DataFrame(df_models.stack())

In [197]:
len(df_models_stack)

2290

In [198]:
df_models_stack = df_models_stack.droplevel(-1)

In [199]:
df_models_stack.head(10)

Unnamed: 0,0
0,CTDC.case.case_id
0,GDC.Case.id
0,ICDC.case.case_id
0,IDC.DICOM.Patient Module.Patient ID
0,PDC.Case.case_id
0,CDS.Participant.subject_id
0,CDA.Patient.dct:identifier
0,
0,C2M2.biosample.local_id
0,mCODE.Cancer Patient Profile.Identifier


In [200]:
df_models_stack[0].replace(u"\xa0", np.nan, inplace=True)
df_models_stack[0].replace(u"", np.nan, inplace=True)

In [201]:
df_models_stack.head(10)

Unnamed: 0,0
0,CTDC.case.case_id
0,GDC.Case.id
0,ICDC.case.case_id
0,IDC.DICOM.Patient Module.Patient ID
0,PDC.Case.case_id
0,CDS.Participant.subject_id
0,CDA.Patient.dct:identifier
0,
0,C2M2.biosample.local_id
0,mCODE.Cancer Patient Profile.Identifier


In [202]:
df_models_stack.dropna(subset=[0], inplace=True)

In [203]:
df_models_stack.head(10)

Unnamed: 0,0
0,CTDC.case.case_id
0,GDC.Case.id
0,ICDC.case.case_id
0,IDC.DICOM.Patient Module.Patient ID
0,PDC.Case.case_id
0,CDS.Participant.subject_id
0,CDA.Patient.dct:identifier
0,C2M2.biosample.local_id
0,mCODE.Cancer Patient Profile.Identifier
1,GDC.Sample.sample_type


In [204]:
len(df_models_stack)

2257

In [205]:
df_models_split = df_models_stack[0].str.split(".", expand=True)

In [206]:
df_models_split.count()

0    2257
1    2257
2    2227
3      15
dtype: int64

In [207]:
# df_models rows with a third option (what should be node and what should be property?)
df_models_other_1 = df_models_split.loc[df_models_split[3].notnull()]
df_models_other_1

Unnamed: 0,0,1,2,3
0,IDC,DICOM,Patient Module,Patient ID
2,IDC,DICOM,Patient Module,Patient's Ethnic Group
10,IDC,DICOM,Patient Module,Patient's Ethnic Group
29,IDC,DICOM,Patient Module,Patient's Sex
48,IDC,DICOM,General Series Module,BodyPartExamined
271,IDC,DICOM,Patient Study Module,Patient Age
517,IDC,DICOM,Patient Study Module,Patient's Size
518,IDC,DICOM,Patient Study Module,Patient's Weight
519,IDC,DICOM,General Series Module,StudyDate
571,GDC,SlideImage,ref:GDC,data_file_properties


In [208]:
# df_models without a 2nd option
df_models_other_2 = df_models_split.loc[df_models_split[1].notnull() & df_models_split[2].isnull()]
df_models_other_2

Unnamed: 0,0,1,2,3
50,ICDC,case,,
51,ICDC,diagnosis,,
53,ICDC,sample,,
55,ICDC,demographic,,
56,ICDC,program,,
57,ICDC,study,,
75,ICDC,file,,
79,ICDC,principal_investigator,,
134,ICDC,enrollment,,
212,ICDC,sample,,


In [209]:
# default is 60?
pd.set_option('display.max_rows', 60)

In [210]:
df_models_split[df_models_split.duplicated(keep=False)]

Unnamed: 0,0,1,2,3
1,mCODE,Genomic Specimen Profile,Type,
2,CTDC,case,ethnicity,
2,IDC,DICOM,Patient Module,Patient's Ethnic Group
2,PDC,Demographic,ethnicity,
2,CDA,Patient,ethnicity,
...,...,...,...,...
1425,mCODE,Primary Cancer Condition Profile,Code,
1426,mCODE,Primary Cancer Condition Profile,Body Site,
1430,mCODE,Primary Cancer Condition Profile,Stage,
1502,mCODE,Tumor Size Profile,Component > Tumor Longest Dimension,


In [211]:
df_models_split.columns = ["ent_2_model", "ent_2_extra_handles", "ent_2_handles", "ent_2_handles_other"]
df_models_split = df_models_split[["ent_2_model", "ent_2_handles", "ent_2_handles_other", "ent_2_extra_handles"]]
#df_models_split["ent_2_extra_handles"] = [[l] for l in df_models_split["ent_2_extra_handles"]] # listify extra handles

In [212]:
df_mappings = df_dst.join(df_models_split)

In [213]:
df_mappings

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handles,ent_2_handles_other,ent_2_extra_handles
0,DST,Identifier,Subject,CTDC,case_id,,case
0,DST,Identifier,Subject,GDC,id,,Case
0,DST,Identifier,Subject,ICDC,case_id,,case
0,DST,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
0,DST,Identifier,Subject,PDC,case_id,,Case
...,...,...,...,...,...,...,...
1525,DST,Grade,Tumor,PDC,gleason_grade_tertiary,,Diagnosis
1526,DST,,,GDC,tumor_depth,,Diagnosis
1526,DST,,,PDC,tumor_depth,,Diagnosis
1527,DST,Clinical Stage,Disease,PDC,figo_staging_edition_year,,Diagnosis


In [215]:
# default is 60?
pd.set_option('display.max_rows', 60)
df_mappings[df_mappings.duplicated(keep=False)]

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handles,ent_2_handles_other,ent_2_extra_handles
2,DST,Ethnicity*,Subject,CTDC,ethnicity,,case
2,DST,Ethnicity*,Subject,IDC,Patient Module,Patient's Ethnic Group,DICOM
2,DST,Ethnicity*,Subject,PDC,ethnicity,,Demographic
2,DST,Ethnicity*,Subject,CDA,ethnicity,,Patient
10,DST,Ethnicity*,Subject,CTDC,ethnicity,,case
...,...,...,...,...,...,...,...
774,DST,chromosome,Gene,mCODE,Component > Genomic Reference Sequence Id,,Genomic Region Studied Profile
775,DST,chromosome,Gene,mCODE,Component > Genomic Reference Sequence Id,,Genomic Region Studied Profile
776,DST,chromosome,Gene,mCODE,Component > Genomic Reference Sequence Id,,Genomic Region Studied Profile
793,DST,symbol (same as name?),Gene,mCODE,Component > Cytogenetic Nomenclature,,Genomic Variant Profile


In [216]:
len(df_mappings)

2259

In [217]:
df_mappings.drop_duplicates(keep="first", inplace=True)

In [218]:
len(df_mappings)

2216

In [219]:
df_mappings

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles,ent_2_model,ent_2_handles,ent_2_handles_other,ent_2_extra_handles
0,DST,Identifier,Subject,CTDC,case_id,,case
0,DST,Identifier,Subject,GDC,id,,Case
0,DST,Identifier,Subject,ICDC,case_id,,case
0,DST,Identifier,Subject,IDC,Patient Module,Patient ID,DICOM
0,DST,Identifier,Subject,PDC,case_id,,Case
...,...,...,...,...,...,...,...
1525,DST,Grade,Tumor,PDC,gleason_grade_tertiary,,Diagnosis
1526,DST,,,GDC,tumor_depth,,Diagnosis
1526,DST,,,PDC,tumor_depth,,Diagnosis
1527,DST,Clinical Stage,Disease,PDC,figo_staging_edition_year,,Diagnosis


In [251]:
df_mappings['ent_1_handle'].str.findall(r'[^a-zA-Z0-9 ]').str.len().sum()

151.0

In [247]:
df_mappings['ent_2_handles_other'].str.findall(r'[^a-zA-Z0-9 ]').str.len().sum()

16.0

In [263]:
df_mappings[['ent_2_handles_other']][df_mappings['ent_2_handles_other'].str.findall(r'[^a-zA-Z0-9 ]').notnull()]

Unnamed: 0,ent_2_handles_other
0,Patient ID
2,Patient's Ethnic Group
29,Patient's Sex
48,BodyPartExamined
271,Patient Age
517,Patient's Size
518,Patient's Weight
519,StudyDate
571,data_file_properties
668,data_file_properties


In [246]:
df_mappings['ent_2_handles'].str.findall(r'[^a-zA-Z0-9 ]').str.len().sum()

3334.0

In [245]:
df_mappings['ent_2_extra_handles'].str.findall(r'[^a-zA-Z0-9 ]').str.len().sum()

254.0