# Setup

Import our usual suspects

In [None]:
import os
import pandas as pd
import numpy as np
from graphdatascience import GraphDataScience
from neo4j import GraphDatabase, RoutingControl

Register at https://sandbox.neo4j.com and create an empty sandbox (or run a local Neo4j instance), then adjust the connection settings below to match.

In [None]:
# Capture connection string and auth info.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# local development defaults.
connectionUrl = os.environ.get('NEO4J_URI', 'neo4j://localhost:7687')
username = os.environ.get('NEO4J_USERNAME', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'test1234')
database = os.environ.get('NEO4J_DATABASE', 'patient')

In [None]:
# Create the driver shared by every cell below and check immediately that the
# server can be reached with the configured URL and credentials.
driver = GraphDatabase.driver(
    connectionUrl, 
    auth=(username, password)
)
driver.verify_connectivity()

In [None]:
## Utility
def split_dataframe(df, chunk_size=50_000):
    """Split ``df`` into consecutive row chunks of at most ``chunk_size`` rows.

    Args:
        df: DataFrame to split; rows are taken by position, in order.
        chunk_size: Maximum number of rows per chunk (must be >= 1).

    Returns:
        A list of DataFrame slices that together cover every row exactly once.
        The list is empty when ``df`` has no rows, and no empty trailing chunk
        is produced when ``len(df)`` is an exact multiple of ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping by chunk_size yields exactly ceil(len(df) / chunk_size) slices.
    return [
        df.iloc[start:start + chunk_size]
        for start in range(0, len(df), chunk_size)
    ]

# Graph creation

### Schema

In [None]:
synthea_data_dir = "~/import/synthea_data_csv/csv1000/"

In [None]:
# One node-key constraint per node label that is merged by the load cells
# below. Every statement uses `if not exists`, so re-running the cell is safe.
schema_statements = [
    'create constraint patientNumber if not exists for (n:Patient) require (n.id) is node key',
    'create constraint payerId if not exists for (n:Payer) require (n.id) is node key',
    'create constraint encounterId if not exists for (n:Encounter) require (n.id) is node key',
    'create constraint conditionCode if not exists for (n:Condition) require (n.code) is node key',
    'create constraint providerId if not exists for (n:Provider) require (n.id) is node key',
    'create constraint observation if not exists for (n:Observation) require (n.code) is node key',
    'create constraint organization if not exists for (n:Organization) require (n.id) is node key',
    'create constraint drug if not exists for (n:Drug) require (n.code) is node key',
    'create constraint careplan if not exists for (n:CarePlan) require (n.id) is node key',
    'create constraint reaction if not exists for (n:Reaction) require (n.id) is node key',
    'create constraint device if not exists for (n:Device) require (n.code) is node key',
    'create constraint speciality if not exists for (n:Speciality) require (n.name) is node key',
    'create constraint conditionDescription if not exists for (n:ConditionDescription) require (n.text) is node key'
]
# Each constraint is created in its own query.
for statement in schema_statements:
    driver.execute_query(
        statement,
        database_=database,
        routing_=RoutingControl.WRITE
    )

# Fetch all constraints as a DataFrame to verify what now exists
schema_result_df  = driver.execute_query(
    'show constraints',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
schema_result_df.head(100)


### Load data

In [None]:
df_patients = pd.read_csv(synthea_data_dir + 'patients.csv', delimiter=',').replace({np.nan: None})
#df_patient.head()

In [None]:
# Load patients: one Patient node per row, merged on the CSV `Id` column.
# The marital status is stored as `p.marital` (the property name was previously
# misspelled as `martial`).
# NOTE(review): any saved query or dashboard that reads `p.martial` must be
# updated to the corrected property name.
for chunk in split_dataframe(df_patients):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge(p:Patient{id:row['Id']})
            set 
            p.first = row['FIRST'],
            p.last = row['LAST'],
            p.birthdate = Date(row['BIRTHDATE']),
            p.birthplace = row['BIRTHPLACE'],
            p.deathdate = Date(row['DEATHDATE']),
            p.ethnicity = row['ETHNICITY'],
            p.gender = row['GENDER'],
            p.prefix = row['PREFIX'],
            p.race = row['RACE'],
            p.address = row['ADDRESS'],
            p.state = row['STATE'],
            p.city = row['CITY'],
            p.county = row['COUNTY'],
            p.drivers = row['DRIVERS'],
            p.healthcare_coverage = toFloat(row['HEALTHCARE_COVERAGE']),
            p.healthcare_expenses = toFloat(row['HEALTHCARE_EXPENSES']),
            p.latitude = toFloat(row['LAT']),
            p.longitude = toFloat(row['LON']),
            p.location = point({latitude:toFloat(row['LAT']),longitude: toFloat(row['LON'])}),
            p.marital = row['MARITAL']
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed (nodes created, properties set, ...).
    print(summary.counters)

In [None]:
df_payers = pd.read_csv(synthea_data_dir + 'payers.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load payers: one Payer node per row, merged on the CSV `Id` column, with
# name and address details; `state` is taken from STATE_HEADQUARTERED.
for chunk in split_dataframe(df_payers):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (p:Payer{id:row['Id']})
            set p.name = row['NAME'],
                p.address = row['ADDRESS'],
                p.city = row['CITY'],
                p.zip = row['ZIP'],
                p.state = row['STATE_HEADQUARTERED']
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_encounters = pd.read_csv(synthea_data_dir + 'encounters.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load encounters: one Encounter node per row, linked to its patient, provider
# and organization (all merged, so missing ones are created as stubs keyed by
# id). Every encounter starts with isEnd = false.
# The payer is looked up with `match`, not `merge`: a row whose payer has not
# been loaded gets no HAS_PAYER relationship and is not counted in
# rows_processed.
for chunk in split_dataframe(df_encounters):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (e:Encounter {id:row['Id']})
            set e.code = row['CODE'],
            e.description = row['DESCRIPTION'],
            e.class = row['ENCOUNTERCLASS'],
            e.start = datetime(row['START']),
            e.baseCost = toFloat(row['BASE_ENCOUNTER_COST']),
            e.claimCost = toFloat(row['TOTAL_CLAIM_COST']),
            e.coveredAmount = toFloat(row['PAYER_COVERAGE']),
            e.isEnd = false,
            e.end = datetime(row['STOP'])
            merge (p:Patient {id:row['PATIENT']})
            merge (p)-[:HAS_ENCOUNTER]->(e)
            merge (pr:Provider {id: row['PROVIDER']})
            merge (e)-[:HAS_PROVIDER]->(pr)
            merge (o:Organization {id:row['ORGANIZATION']})
            merge (e)-[:AT_ORGANIZATION]->(o)
            with e,row
            match (pa:Payer {id:row['PAYER']})
            merge (e)-[:HAS_PAYER]->(pa)
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_providers = pd.read_csv(synthea_data_dir + 'providers.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load providers: merge each Provider on `Id`, set its name, address and point
# location, and link it to its Speciality (merged by name) and to the
# Organization it belongs to (merged by id).
for chunk in split_dataframe(df_providers):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (p:Provider {id: row['Id']})
            set p.name = row['NAME'],
                p.address = row['ADDRESS'],
                p.location = point({latitude:toFloat(row['LAT']), longitude:toFloat(row['LON'])})
            merge (s:Speciality {name: row['SPECIALITY']})
            merge (p)-[:HAS_SPECIALITY]->(s)
            merge (o:Organization {id: row['ORGANIZATION']})
            merge (p)-[:BELONGS_TO]->(o)
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_payer_transitions = pd.read_csv(synthea_data_dir + 'payer_transitions.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load payer transitions: connect each already-loaded patient to a payer with
# INSURANCE_START / INSURANCE_END relationships carrying the year.
# NOTE(review): the relationships are merged without properties, so a
# patient/payer pair has at most one of each type and its `year` is overwritten
# by every later row for the same pair -- confirm that this is intended.
for chunk in split_dataframe(df_payer_transitions):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (p:Patient {id:row['PATIENT']})
            merge (payer:Payer {id:row['PAYER']})
            merge (p)-[s:INSURANCE_START]->(payer)
                set s.year=toInteger(row['START_YEAR'])
            merge (p)-[e:INSURANCE_END]->(payer)
                set e.year=toInteger(row['END_YEAR'])
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_allergies = pd.read_csv(synthea_data_dir + 'allergies.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load allergies: link each already-loaded patient and the encounter to an
# Allergy node (merged by code), then attach up to two reactions and the
# optional stop date.
# The optional parts use `foreach` over a one-element or empty list as a
# conditional. The earlier chained `with ... where` filters removed the row
# from the pipeline, so REACTION2 was skipped whenever REACTION1 was empty and
# STOP was skipped unless both reactions were present.
for chunk in split_dataframe(df_allergies):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (p:Patient {id:row['PATIENT']})
            merge (a:Allergy {code: row['CODE']})
                set a.description = row['DESCRIPTION'],
                    a.type = row['TYPE'],
                    a.category = row['CATEGORY'],
                    a.system = row['SYSTEM']
            merge (e:Encounter {id:row['ENCOUNTER']})
            merge (p)-[:HAS_ENCOUNTER]->(e)
            merge (p)-[:HAS_ALLERGY]->(a)
            merge (e)-[r:ALLERGY_DETECTED]->(a)
                set r.start = datetime(row['START'])
            foreach (ignored in case when row['REACTION1'] is not null and row['REACTION1'] <> '' then [1] else [] end |
                merge (r1:Reaction {id: row['REACTION1']})
                    set r1.description = row['DESCRIPTION1']
                merge (p)-[rr:HAS_REACTION]->(r1)
                    set rr.severity = row['SEVERITY1']
                merge (a)-[:CAUSES_REACTION]->(r1)
            )
            foreach (ignored in case when row['REACTION2'] is not null and row['REACTION2'] <> '' then [1] else [] end |
                merge (r2:Reaction {id: row['REACTION2']})
                    set r2.description = row['DESCRIPTION2']
                merge (p)-[rrr:HAS_REACTION]->(r2)
                    set rrr.severity = row['SEVERITY2']
                merge (a)-[:CAUSES_REACTION]->(r2)
            )
            foreach (ignored in case when row['STOP'] is not null and row['STOP'] <> '' then [1] else [] end |
                set r.isEnd = True,
                    r.stop = datetime(row['STOP'])
            )
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_conditions = pd.read_csv(synthea_data_dir + 'conditions.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load conditions: merge a Condition node per code and link it to the
# encounter (and the encounter to the already-loaded patient).
# `c.stop` is now stored as a datetime, like `c.start`, instead of the raw CSV
# string; the redundant re-assignment of the merge key `c.code` was removed.
# NOTE(review): Condition nodes are keyed by code, so start/stop/isEnd are
# shared by all patients with that condition and overwritten by every row.
for chunk in split_dataframe(df_conditions):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (p:Patient {id:row['PATIENT']})
            merge (c:Condition {code:row['CODE']})
                set c.description  = row['DESCRIPTION'],
                    c.start = datetime(row['START']),
                    c.isEnd = false
            merge (e:Encounter {id:row['ENCOUNTER']})
            merge (p)-[:HAS_ENCOUNTER]->(e)
            merge (e)-[:HAS_CONDITION]->(c)
            with p,c,e,row
                where row['STOP'] is not null and row['STOP'] <> ''
                    set c.stop = datetime(row['STOP']), c.isEnd = true
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_medications = pd.read_csv(synthea_data_dir + 'medications.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load medications: merge patient, encounter and a Drug node per code, then
# link patient -> encounter -> drug. Rows with a STOP value additionally set
# the drug's stop date and flag it as ended.
# NOTE(review): Drug nodes are keyed by code, so description, costs, start,
# stop and isEnd are overwritten by every row with the same code.
for chunk in split_dataframe(df_medications):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (p:Patient {id:row['PATIENT']})
            merge (e:Encounter {id:row['ENCOUNTER']})
            merge (d:Drug {code:row['CODE']})
                set d.description = row['DESCRIPTION'],
                    d.basecost = row['BASE_COST'],
                    d.totalcost = row['TOTALCOST'],
                    d.isEnd = false,
                    d.start = datetime(row['START'])
            merge (p)-[:HAS_ENCOUNTER]->(e)
            merge (e)-[:HAS_DRUG]->(d)
            with p,d,e,row
            where row['STOP'] is not null and row['STOP'] <> ''
                set d.stop = datetime(row['STOP']), d.isEnd = true
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_procedures = pd.read_csv(synthea_data_dir + 'procedures.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load procedures: merge a Procedure node per code and link it to the
# encounter (and the encounter to the patient).
# The encounter is merged on its id only. Merging on {id, isEnd: false} tried
# to create a second Encounter with the same id (violating the node-key
# constraint) whenever the existing encounter already had isEnd = true, e.g.
# when this cell is re-run after the enrichment step. isEnd = false is still
# applied when the encounter has to be created here.
# NOTE(review): `on match` overwrites the encounter's own code with the
# procedure code -- confirm that this is intended.
for chunk in split_dataframe(df_procedures):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (p:Patient {id:row['PATIENT']})
            merge (r:Procedure {code:row['CODE']})
                set r.description=row['DESCRIPTION']
            merge (pe:Encounter {id:row['ENCOUNTER']})
                on create
                set pe.isEnd=false, pe.date=datetime(row['START']), pe.code=row['CODE']
                on match
                set pe.code=row['CODE']
            merge (p)-[:HAS_ENCOUNTER]->(pe)
            merge (pe)-[:HAS_PROCEDURE]->(r)
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_observations = pd.read_csv(synthea_data_dir + 'observations.csv', delimiter=',').replace({np.nan: None})

In [None]:
df_observations[df_observations['ENCOUNTER'].isnull()].head()

In [None]:
# Only drop rows that lack one of the keys the load query matches/merges on.
# A bare dropna() also discarded every observation with any other empty
# column (for example observations without units).
df_observations = df_observations.dropna(subset=['PATIENT', 'ENCOUNTER', 'CODE'])

In [None]:
# Load observations: merge an Observation node per code and attach it to the
# encounter through HAS_OBSERVATION, which carries the measured value, date
# and unit. Rows whose patient has not been loaded are skipped by the `match`.
# `ob.units` now reads the UNITS column (it previously read the misspelled key
# 'UNTIS' and was therefore always null); the duplicated `ob.type` assignment
# was removed.
for chunk in split_dataframe(df_observations):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (p:Patient {id:row['PATIENT']})
            merge (oe:Encounter {id:row['ENCOUNTER']})
            merge (ob:Observation{code:row['CODE']})
            set ob.description = row['DESCRIPTION'],
                ob.type = row['TYPE'],
                ob.units = row['UNITS'],
                ob.category = row['CATEGORY']
            merge (oe)-[r:HAS_OBSERVATION]->(ob)
            set r.value = row['VALUE'], 
                r.date = datetime(row['DATE']),
                r.unit = row['UNITS']
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_careplans = pd.read_csv(synthea_data_dir + 'careplans.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load care plans: merge a CarePlan node per `Id`, link it to the encounter
# (and the encounter to the already-loaded patient). Rows with a STOP value
# additionally set the plan's end date and flag it as ended.
# Because the STOP filter is applied before the final `return`, rows_processed
# only counts rows that have a STOP value.
for chunk in split_dataframe(df_careplans):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (p:Patient {id:row['PATIENT']})
            merge (c:CarePlan {id:row['Id']})
            set c.description = row['DESCRIPTION'],
                c.reasoncode = row['REASONCODE'],
                c.code = row['CODE'],
                c.start = datetime(row['START']),
                c.isEnd = false
            merge (e:Encounter {id:row['ENCOUNTER']})
            merge (p)-[:HAS_ENCOUNTER]->(e)
            merge (e)-[:HAS_CARE_PLAN]->(c)
            with p,c, row
            where row['STOP'] is not null and row['STOP'] <> '' 
            set c.end = datetime(row['STOP']),
                c.isEnd = true
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_organizations = pd.read_csv(synthea_data_dir + 'organizations.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load organizations: merge each Organization on `Id` (this also completes the
# stub nodes created by the encounter/provider loads) and set its name,
# address, city and point location.
for chunk in split_dataframe(df_organizations):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (o:Organization {id:row['Id']})
            set o.name = row['NAME'],
                o.address = row['ADDRESS'],
                o.city = row['CITY'],
                o.location = point({latitude:toFloat(row['LAT']), longitude:toFloat(row['LON'])})
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

In [None]:
df_devices = pd.read_csv(synthea_data_dir + 'devices.csv', delimiter=',').replace({np.nan: None})

In [None]:
# Load devices: merge a Device node per code and connect it to the encounter
# in which it was used (the encounter is merged by id as well).
for chunk in split_dataframe(df_devices):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (d:Device {code:row['CODE']})
                set d.description = row['DESCRIPTION']
            merge (e:Encounter {id:row['ENCOUNTER']})
            merge (e)-[:DEVICE_USED]->(d)
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )
    # Show what each chunk changed.
    print(summary.counters)

### Enrich graph

In [None]:
# Create a linked list (NEXT) between each patient's encounters in
# chronological order and mark the first and last encounter.
# session.run() is used instead of driver.execute_query() because
# `call { ... } in transactions` must run in an auto-commit transaction.
# The `with` block closes the session, so no explicit close() is needed.
with driver.session(database=database) as session:
    session.run(
        ''' 
            match (p:Patient)
            call { 
                with p
                match (p)-[:HAS_ENCOUNTER]->(e:Encounter)
                with p,e ORDER BY e.start
                with p, collect(e) as encounters
                call apoc.nodes.link( encounters , "NEXT")
                with p, head(encounters) as first, last(encounters) as last
                merge (p)-[:FIRST]->(first)
                merge (p)-[:LAST]->(last)
                set last.isEnd = True,
                first.isStart = True
            } in transactions of 5_000 rows
        '''
    ).consume()

In [None]:
# Drug paired with condition: for every Condition, count the
# (encounter, drug) combinations recorded in encounters that have the
# condition and store the count as `total_drug_pairings`.
# The `with` block closes the session, so no explicit close() is needed; the
# stray trailing `;` in the statement was removed.
with driver.session(database=database) as session:
    session.run(
        ''' 
            match (c:Condition)<-[:HAS_CONDITION]-(e:Encounter)-[:HAS_DRUG]->(d:Drug)
            with c, count(*) as total_pairs
            set c.total_drug_pairings = total_pairs
        '''
    ).consume()

In [None]:
# Create condition descriptions: one ConditionDescription node per distinct
# Condition.description text.
# The `with` block closes the session, so no explicit close() is needed.
with driver.session(database=database) as session:
    session.run(
        ''' 
            match (c:Condition)
            with distinct c.description as text
            merge (:ConditionDescription {text: text})
        '''
    ).consume()

In [None]:
# Consecutive conditions network: count how often a condition description is
# followed by another one in the same or the directly following encounter
# (NEXT*0..1) and store the count on a NEXT relationship between the two
# ConditionDescription nodes.
# The `with` block closes the session, so no explicit close() is needed.
with driver.session(database=database) as session:
    session.run(
        ''' 
            match (c:Condition)<--(e:Encounter)-[:NEXT*0..1]->(e2:Encounter)-->(c2:Condition)
            with c.description as desc1, c2.description as desc2, count(*) as count
            match (n1:ConditionDescription{text: desc1}), (n2:ConditionDescription{text: desc2})
            merge (n1)-[r:NEXT]->(n2)
                set r.amount = count
        '''
    ).consume()

### Add additional indexes


In [None]:
# Additional lookup indexes. Point indexes cover the spatial `location`
# properties and text indexes cover string properties.
# Encounter.date (a datetime) and Encounter.isEnd (a boolean) use range
# indexes: text indexes only cover string values, so the previous text indexes
# on these two properties could never be used.
# NOTE(review): `if not exists` also matches on the index name, so text indexes
# already created under these two names must be dropped before the range
# indexes can be created.
index_statements = [
    'create point index patientLocation if not exists for (n:Patient) on (n.location)',
    'create point index providerLocation if not exists for (n:Provider) on (n.location)',
    'create point index organizationLocation if not exists for (n:Organization) on (n.location)',
    'create text  index patient_index_name if not exists for (n:Patient) on (n.id)',
    'create range index encounterDate_name if not exists for (n:Encounter) on (n.date)',
    'create range index encounterIsEnd_name if not exists for (n:Encounter) on (n.isEnd)',
    'create text  index organization_name if not exists for (n:Organization) on (n.id)',
    'create text  index drug_name if not exists for (n:Drug) on (n.code)',
    'create text  index carePlan_name if not exists for (n:CarePlan) on (n.id)',
    'create text  index speciality_name if not exists for (n:Speciality) on (n.name)',
    'create text  index allergy_name if not exists for (n:Allergy) on (n.code)',
    'create text  index procedure_name if not exists for (n:Procedure) on (n.code)',
    'create text  index observation_name if not exists for (n:Observation) on (n.code)',
    'create text  index device_name if not exists for (n:Device) on (n.code)'
]
# Each index is created in its own query.
for statement in index_statements:
    driver.execute_query(
        statement,
        database_=database,
        routing_=RoutingControl.WRITE
    )

# Basic stats

In [None]:
# Node counting: number of nodes per distinct label combination, returned as a
# DataFrame (first 20 rows shown).
driver.execute_query(
    ''' 
    match (n)
    return labels(n) as labels, count(*) as count
    ''',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
).head(20)

In [None]:
# Relationship counting: number of relationships per type, returned as a
# DataFrame (first 30 rows shown).
driver.execute_query(
    ''' 
    match ()-[r]->()
    return type(r) as type, count(*) as count
    ''',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
).head(30)

# Graph Data Science

Let's get this party started

In [107]:
gds = GraphDataScience(connectionUrl, auth=(username, password))
gds.set_database(database)
gds.version()

'2.6.5'

In [108]:
G, res = gds.graph.project(
    "patient_allergies",  # Graph name
    ["Patient", "Allergy","Reaction"],         #  Node projection
    ["HAS_REACTION", "HAS_ALLERGY"]                    #  Relationship projection
)


In [109]:
res


nodeProjection            {'Allergy': {'label': 'Allergy', 'properties':...
relationshipProjection    {'HAS_ALLERGY': {'aggregation': 'DEFAULT', 'or...
graphName                                                 patient_allergies
nodeCount                                                              1172
relationshipCount                                                      1516
projectMillis                                                            78
Name: 0, dtype: object

In [111]:
gds.nodeSimilarity.stats(G)['similarityDistribution']


{'min': 0.07142829895019531,
 'p5': 0.2500014305114746,
 'max': 1.000007152557373,
 'p99': 1.000007152557373,
 'p1': 0.09090900421142578,
 'p10': 0.26666784286499023,
 'p90': 0.7000002861022949,
 'p50': 0.5500025749206543,
 'p25': 0.3750014305114746,
 'p75': 0.6315798759460449,
 'p95': 0.7500033378601074,
 'mean': 0.5146050188276503,
 'p100': 1.000007152557373,
 'stdDev': 0.17778499575228443}

In [113]:
# Write node-similarity results with a score of at least 0.7 back to the
# database as relationships carrying the score in `sim_score`.
# NOTE(review): the relationship type is spelled 'SIMLAR_...' (missing 'I');
# the clean-up query further down matches the same spelling, so both must be
# renamed together if this is ever corrected.
gds.nodeSimilarity.write(G,
                         writeRelationshipType='SIMLAR_ALLERGIC_REACTION',
                         writeProperty='sim_score',
                         similarityCutoff=0.7)

preProcessingMillis                                                       0
computeMillis                                                            22
writeMillis                                                              22
postProcessingMillis                                                      0
nodesCompared                                                           180
relationshipsWritten                                                    184
similarityDistribution    {'min': 0.6999969482421875, 'p5': 0.6999969482...
configuration             {'writeProperty': 'sim_score', 'writeRelations...
Name: 0, dtype: object

In [114]:
G.drop()

graphName                                                patient_allergies
database                                                           patient
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                             1172
relationshipCount                                                     1516
configuration            {'relationshipProjection': {'HAS_ALLERGY': {'a...
density                                                           0.001105
creationTime                           2024-05-03T12:26:09.434993000+00:00
modificationTime                       2024-05-03T12:26:09.514207000+00:00
schema                   {'graphProperties': {}, 'nodes': {'Allergy': {...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Allergy': {...
Name: 0, dtype: object

In [116]:
# Remove symmetric relationships: where two patients are connected in both
# directions, delete the relationship that starts at the node with the lower
# internal id so that only one direction remains.
gds.run_cypher('''
  match (a:Patient)-[r:SIMLAR_ALLERGIC_REACTION]->(b:Patient) 
    where (b)-[:SIMLAR_ALLERGIC_REACTION]->(a) 
    and   id(a)<id(b)
  delete r
''')

# Neodash

In [None]:
import json

In [119]:
# Backup to file: fetch the properties of one saved NeoDash dashboard node as
# a plain dict (strict=True raises unless exactly one record is returned).
dashboard_json = driver.execute_query(
    '''
    match (n:`_Neodash_Dashboard`) 
    return n{.*} as dashboard limit 1
    ''',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.single(strict=True).get('dashboard')
)
# Drop the `date` property before serializing (presumably because its temporal
# value cannot be written as JSON; the restore step sets a fresh one).
# pop() with a default does not fail when the property is missing.
dashboard_json.pop('date', None)



In [120]:
with open('patient-journey-dashboard.json', 'w') as file:
    json.dump(dashboard_json, file)

In [117]:
# Restore from file

dashboard_from_file = {}
with open('patient-journey-dashboard.json') as file:
    dashboard_from_file = json.load(file)

In [118]:
# Restore the dashboard: merge the dashboard node on its uuid, overwrite its
# properties with the saved ones and stamp it with the current time.
# The query returns the uuid under the key the transformer reads; previously
# it returned `true` while the transformer asked for the non-existent key
# 'dashboard', so the cell always evaluated to None.
driver.execute_query(
    '''
    merge (n:`_Neodash_Dashboard`{uuid:$data.uuid})
    set n += $data,
        n.date = datetime() 
    return n.uuid as uuid
    ''',
    data = dashboard_from_file,
    database_=database,
    routing_=RoutingControl.WRITE,
    result_transformer_= lambda r: r.single(strict=True).get('uuid')
)

# Bloom

// Search phrase: Prediabetes to diabetes
```
match path=(pd:Condition{description:"Prediabetes"})<-[:HAS_CONDITION]-(e1)(
    (f)-[n:NEXT]->(t)
    ){1,10}
(e2)-[:HAS_CONDITION]->(d:Condition{description:"Diabetes"})
return path limit 6
```

// Scene action: Expand patient journey
```

```

# Save for later

match pattern= (p1:Patient)-[:HAS_ALLERGY]-(a)<-[:HAS_ALLERGY]-(p2:Patient), (p1)-[:HAS_REACTION]->(r)<-[:HAS_REACTION]-(p2)
return p1,p2, count(distinct r) as shared_reactions, count(distinct a) as shared_allergies limit 1
