# bento-meta cypher

## entities

### node

In [1]:
from bento_meta.util.cypher.entities import (
    N, N0, R, R0, P, T, G,
    _pattern, _as, _condition, _return,
    _plain, _anon, _var, _plain_var
    )
from bento_meta.util.cypher.functions import (
    Func, count, exists, group, And, Or, Not,
)
from bento_meta.util.cypher.clauses import (
    Clause, Match, Where, Return, Set, Create, Merge,
    OnMatchSet, OnCreateSet, Remove, Statement,
)

In [2]:
# N() entity is node
n = N(label="node", props={"model": "ICDC", "handle": "diagnosis"})

In [3]:
# label for each new instantiation of an entity counts up unless _reset_counter() called
n._reset_counter()

In [4]:
# pattern() renders entity as a match pattern
n.pattern()

"(n0:node {model:'ICDC',handle:'diagnosis'})"

In [5]:
# condition() renders entity as a condition (e.g. for WHERE clause)
n.condition()

["n0.model = 'ICDC'", "n0.handle = 'diagnosis'"]

In [6]:
# label has entity label
n.label

'node'

In [7]:
# Return() "renders entity as a return value", maybe Neo4j variable assigned to entity?
n.Return()

'n0'

In [8]:
# _add_props() can add properties to a node or relationship entity
n._add_props({"nanoid": "mnmzW6"})
n.pattern()

"(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})"

In [9]:
# As param lets a node be aliased?
x = N(label="thing", As="dude")

In [10]:
x.label

'thing'

In [11]:
x.As

'dude'

In [12]:
x.Return()

'n0 as dude'

In [13]:
# anonymous node (R0 also exists); probably used for just matching a pattern where you don't care 
# which node/relationship something is connected to as long as it is connected
N0().pattern()

'()'

### relationship

In [14]:
# R() entity is relationship/edge (not relationship node, but actual relationship in Neo4 (I think))
r = R(Type="has_property")

In [15]:
r.pattern()

'-[r0:has_property]-'

In [16]:
r.condition()

[]

In [17]:
r.Type

'has_property'

In [18]:
r.Return()

'r0'

In [19]:
R0().pattern()

'--'

### property

In [20]:
# P() represent properties of a node and can be used to hold (just one?) handle/value pair repping a property
p = P(handle="handle", value="stage_of_disease")

In [21]:
p.pattern()

"handle:'stage_of_disease'"

In [22]:
# these all return None since p.entity is None?
print(p.entity, p.condition(), p.Return())

None None None


In [23]:
m = N(label="property", props=p)

In [24]:
q = P("model", "ICDC")
l = P("nanoid", "xaF4my")

In [25]:
m._add_props([q, l])

True

In [26]:
m.pattern()

"(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})"

### triple

In [27]:
# R.relate() returns a triple relating two nodes by a relationship
t = r.relate(n, m)

In [28]:
# pattern() & condition() give same result for T() triple
print(t.pattern())
print(t.condition())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})
(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})


In [29]:
# can also relate_to() from a N() node rather than from the R() relationship
t2 = n.relate_to(r, m)

In [30]:
t2.pattern()

"(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})"

### paths

In [31]:
# setup for paths
nodes = [N(label="case"), N(label="sample"), N(label="aliquot"),
            N(label="file")]
edges = [R(Type="of_case"), R(Type="of_sample"), R(Type="of_aliquot")]

t1 = edges[0].relate(nodes[1], nodes[0])  # (sample)-[:of_case]->(case)
t2 = edges[1].relate(nodes[2], nodes[1])  # (aliquot)-[:of_sample]->(sample)
t3 = edges[2].relate(nodes[3], nodes[2])  # (file)-[:of_aliquot]->(aliquot)

In [32]:
# G() represents a path, or an ordered set of partially overlapping triples
pth0 = G(nodes[1], edges[0], nodes[0])
pth0.pattern()

'(n4:sample)-[r2:of_case]->(n3:case)'

In [33]:
pth1 = G(t1)
pth1.pattern()

'(n4:sample)-[r2:of_case]->(n3:case)'

In [34]:
pth2 = G(t1, t2)
pth2.pattern()

'(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [35]:
pth3 = G(t2, t1)
pth3.pattern()

'(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [36]:
pth4 = G(t3, pth3) # switching these args around doesn't work though?
pth4.pattern()

'(n6:file)-[r4:of_aliquot]->(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [37]:
pth5 = G(t1, t2, t3)
pth5.pattern()

'(n6:file)-[r4:of_aliquot]->(n5:aliquot)-[r3:of_sample]->(n4:sample)-[r2:of_case]->(n3:case)'

In [38]:
for x in pth5.nodes():
    print(x.pattern())

(n6:file)
(n3:case)
(n5:aliquot)
(n4:sample)


In [39]:
for x in pth5.edges():
    print(x.pattern())

-[r3:of_sample]-
-[r4:of_aliquot]-
-[r2:of_case]-


### modifiers

In [40]:
# _as returns copy of ent with As alias set
print(n.Return()) 
print(_as(n, "alias_n").Return())

n0
n0 as alias_n


In [41]:
# _plain returns ent w/o properties
print(n.pattern())
print(_plain(n).pattern())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
(n0:node)


In [42]:
# _var returns ent w/o label or type
print(n.pattern())
print( _var(n).pattern())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
(n0 {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})


In [43]:
# _plain_var returns ent w/o label or properties
print(n.pattern())
print( _plain_var(n).pattern())

(n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
(n0)


## Clauses

In [44]:
# Match() represents a cypher match clause
print(str(Match(t)))
print(str(Match(n)))
print(str(Match(n, m)))

MATCH (n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})-[r0:has_property]->(n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})
MATCH (n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'})
MATCH (n0:node {model:'ICDC',handle:'diagnosis',nanoid:'mnmzW6'}), (n2:property {handle:'stage_of_disease',model:'ICDC',nanoid:'xaF4my'})


In [45]:
# Where() represents a cypher where clause, args are 
print(str(Where(*t.nodes())))
print(str(Where(n)))

# With(), Create(), Merge(), Remove() are similar

WHERE n0.model = 'ICDC' AND n0.handle = 'diagnosis' AND n0.nanoid = 'mnmzW6' AND n2.handle = 'stage_of_disease' AND n2.model = 'ICDC' AND n2.nanoid = 'xaF4my'
WHERE n0.model = 'ICDC' AND n0.handle = 'diagnosis' AND n0.nanoid = 'mnmzW6'


In [46]:
# Set() creates a SET clause, only property args matter for this
print(str(Set(*n.props.values())))

SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6'


In [47]:
# OnCreateSet() and OnMatchSet() similar
print(str(OnCreateSet(*n.props.values())))
print(str(OnMatchSet(*n.props.values())))

ON CREATE SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6'
ON MATCH SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6'


In [51]:
# Statement() used to create a Neo4j statement w/ clauses & strings in order
str(
    Statement(
        Match(_var(_plain(t))),
        Where(exists(m.props['handle']), n),
        Return(count(n))
        )
    )

"MATCH (n0)-[r0:has_property]->(n2) WHERE exists(n2.handle) AND n0.model = 'ICDC' AND n0.handle = 'diagnosis' AND n0.nanoid = 'mnmzW6' RETURN count(n0)"

In [52]:
str(
    Statement(
        Match(_var(_plain(t))),
        Where(group(And(exists(m.props['handle']), n.props['model'])),
            Not(n.props['handle'])),
        Return(p),
        'LIMIT 10'
        )
    )

"MATCH (n0)-[r0:has_property]->(n2) WHERE (exists(n2.handle) AND n0.model = 'ICDC') AND NOT n0.handle = 'diagnosis' RETURN n2.handle LIMIT 10"

In [54]:
str(
    Statement(
        Create(_plain(n)),
        Set(*n.props.values()),
        Return(n)
        )
    )

"CREATE (n0:node) SET n0.model = 'ICDC', n0.handle = 'diagnosis', n0.nanoid = 'mnmzW6' RETURN n0"

## functions

functions include:
- exists()
- count()
- labels()
- Not()
- And()
- Or()
- group()
- is_null()
- is_not_null()

This could replace some of the cypher pattern/clause generation in MDB tools ()

Could potentially be integrated w/ bento-meta entities (which have some overlap with N()?)

DSS mapping

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
RAW_DSS_MAP_EXCEL = "C:/Users/nelso/Documents/GitHub/HIDS-Capstone/data/CRDC DST Cross-Model Mapping.xlsx"
df = pd.read_excel(RAW_DSS_MAP_EXCEL, sheet_name="CRDC DST Cross-Model Mapping", header=2, nrows=1529, usecols=["DST Data Element Name", "Compiled Data Element Names"])

  warn(msg)


In [19]:
df_dst = pd.DataFrame(np.repeat("DST", len(df)))
df_dst = pd.concat([df_dst, df["DST Data Element Name"].str.split(" ", 1, expand=True)], axis=1)
df_dst.columns = ["ent_1_model", "ent_1_extra_handles", "ent_1_handle"]
df_dst = df_dst[["ent_1_model", "ent_1_handle", "ent_1_extra_handles"]]
df_dst["ent_1_extra_handles"] = [[l] for l in df_dst["ent_1_extra_handles"]]

In [20]:
df_dst

Unnamed: 0,ent_1_model,ent_1_handle,ent_1_extra_handles
0,DST,Identifier,[Subject]
1,DST,Type,[Specimen]
2,DST,Ethnicity*,[Subject]
3,DST,Race,[Subject]
4,DST,,[nan]
...,...,...,...
1524,DST,Grade,[Tumor]
1525,DST,Grade,[Tumor]
1526,DST,,[nan]
1527,DST,Clinical Stage,[Disease]


In [88]:
df_models = df["Compiled Data Element Names"].str.strip().str.split('\n', expand=True).add_prefix("Model_Data_Element_Name_")

In [90]:
df_dst_models = df_dst.join(df_models)

In [91]:
df_dst_models = df_dst_models.insert(0, np.repeat("DST", len(df_dst_models)), )

TypeError: unhashable type: 'numpy.ndarray'

In [92]:
pd.DataFrame(np.repeat("DST", len(df_dst_models))).join(df_dst_models)

Unnamed: 0,0,DST_Data_Element_Name_0,DST_Data_Element_Name_1,Model_Data_Element_Name_0,Model_Data_Element_Name_1,Model_Data_Element_Name_2,Model_Data_Element_Name_3,Model_Data_Element_Name_4,Model_Data_Element_Name_5,Model_Data_Element_Name_6,Model_Data_Element_Name_7,Model_Data_Element_Name_8,Model_Data_Element_Name_9
0,DST,Subject,Identifier,CTDC.case.case_id,GDC.Case.id,ICDC.case.case_id,IDC.DICOM.Patient Module.Patient ID,PDC.Case.case_id,CDS.Participant.subject_id,CDA.Patient.dct:identifier,,C2M2.biosample.local_id,mCODE.Cancer Patient Profile.Identifier
1,DST,Specimen,Type,GDC.Sample.sample_type,ICDC.sample.physical_sample_type,PDC.Sample.sample_type,CDS.Sample Information.sample_type,CDA.Specimen.source_material_type,HTAN.Molecular Test.biospecimen_type,C2M2.biosample.assay_type,mCODE.Genomic Specimen Profile.Type,,
2,DST,Subject,Ethnicity*,CTDC.case.ethnicity,GDC.Demographic.ethnicity,IDC.DICOM.Patient Module.Patient's Ethnic Group,PDC.Demographic.ethnicity,CDS.Participant.ethnicity,CDA.Patient.ethnicity,HTAN.Demographics.Ethnicity,C2M2.subject.ethnicity,,
3,DST,Subject,Race,CTDC.case.race,GDC.Demographic.race,PDC.Demographic.race,CDS.Participant.race,CDA.Patient.race,HTAN.Demographics.Race,C2M2.subject_race.race,,,
4,DST,,,GDC.Diagnosis.site_of_resection_or_biopsy,IDC.TCIA.tcia_tumorLocation,PDC.Diagnosis.site_of_resection_or_biopsy,CDS.Additional Diagnosis Information.site_of_r...,HTAN.Diagnosis.site_of_resection_or_biopsy,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,DST,Tumor,Grade,PDC.Diagnosis.who_nte_grade,,,,,,,,,
1525,DST,Tumor,Grade,PDC.Diagnosis.gleason_grade_tertiary,,,,,,,,,
1526,DST,,,GDC.Diagnosis.tumor_depth,PDC.Diagnosis.tumor_depth,,,,,,,,
1527,DST,Disease,Clinical Stage,PDC.Diagnosis.figo_staging_edition_year,,,,,,,,,
