In [1]:

import os  # imported here because the notebook's import cell comes after this one

# Neo4j connection settings.
# Each value can be overridden with an environment variable so that real
# credentials never have to be saved in the notebook; the fallbacks are the
# original local-development values.
# NOTE: the constant is spelled DB_ULR (sic) because later cells refer to it
# under that name.
DB_ULR = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "symptom")

# Create driver
This also sets `DB_NAME` as the database used by the GDS client.

In [2]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from neo4j import GraphDatabase # Python database driver
from graphdatascience import GraphDataScience # Python GDS client

# Some ceremony: build the two client objects used by the rest of the
# notebook (the Neo4j driver and the GDS client), both with the same
# URL and credentials, then point the GDS client at our database.
credentials = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=credentials)
gds = GraphDataScience(DB_ULR, auth=credentials)
gds.set_database(DB_NAME)

# Create database and schema

In [5]:
# (Re)create the working database. The statement is sent to the "system"
# database; OR REPLACE means an existing database of that name is replaced.
def _recreate_database(tx):
    """Run CREATE OR REPLACE DATABASE for DB_NAME and return the result rows."""
    statement = "CREATE OR REPLACE DATABASE {dbname}".format(dbname=DB_NAME)
    return tx.run(statement).data()


with driver.session(database="system") as session:
    result = session.write_transaction(_recreate_database)

Empty DataFrame
Columns: []
Index: []


In [6]:
# Schema: a node-key constraint on `name` for Symptom and for Disease.
# Each statement runs in its own write transaction, Symptom first.
constraint_statements = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Symptom) REQUIRE (n.name) IS NODE KEY",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Disease) REQUIRE (n.name) IS NODE KEY",
)


def _apply_constraint(tx, statement):
    """Execute one schema statement and consume its result."""
    return tx.run(statement).consume()


with driver.session(database=DB_NAME) as session:
    for statement in constraint_statements:
        session.write_transaction(_apply_constraint, statement)

#  Load data

In [7]:
# Load the disease/symptom edge list (columns Source, Target, Weight) from GitHub
# Source https://github.com/deshanadesai/Symptom-X-/blob/master/dataset_clean1.csv
dataset_url = 'https://raw.githubusercontent.com/deshanadesai/Symptom-X-/master/dataset_clean1.csv'
csv = pd.read_csv(dataset_url)
print(csv)

                  Source          Target  Weight
0              influenza  uncoordination      68
1              influenza           fever      68
2              influenza  pleuritic pain      68
3              influenza         snuffle      68
4              influenza     throat sore      68
...                  ...             ...     ...
2124  migraine disorders       dizziness      61
2125  migraine disorders        numbness      61
2126  migraine disorders          nausea      61
2127  migraine disorders           fever      61
2128  migraine disorders    splenomegaly      61

[2129 rows x 3 columns]


In [8]:
# Distinct, non-null symptom names: these are the values of the Target column
target_column = csv['Target']
symptoms = target_column.drop_duplicates().dropna()
print(symptoms)

0       uncoordination
1                fever
2       pleuritic pain
3              snuffle
4          throat sore
             ...      
1929      poor feeding
1938              ache
1939    macerated skin
1940     heavy feeling
2087        gravida 10
Name: Target, Length: 404, dtype: object


In [9]:
# Create Symptom nodes, sending the names in two batches; every batch gets
# its own session and write transaction.
def _merge_symptoms(tx, names):
    """MERGE one Symptom node per name and return the row count.

    The count is reported as `nodesCreated`, but count(*) counts the rows
    processed, so names whose node already existed are included as well.
    """
    cypher = """
                UNWIND $symptoms as symptomName
                MERGE (:Symptom{name: symptomName})
                RETURN count(*) as nodesCreated
                """
    return tx.run(cypher, symptoms=names).data()


for chunk in np.array_split(symptoms, 2):
    with driver.session(database=DB_NAME) as session:
        result = session.write_transaction(_merge_symptoms, chunk.to_list())
        df = pd.DataFrame(result)
        print(df)

   nodesCreated
0           202
   nodesCreated
0           202


In [10]:
# Distinct, non-null disease names: these are the values of the Source column
source_column = csv['Source']
diseases = source_column.drop_duplicates().dropna()
print(diseases)

0                  influenza
21                 gastritis
33      hypercholesterolemia
48                  epilepsy
61       insufficiency renal
                ...         
2061      systemic infection
2075     malignant neoplasms
2100              neuropathy
2110          overload fluid
2122      migraine disorders
Name: Source, Length: 148, dtype: object


In [11]:
# Create Disease nodes, all names in a single write transaction.
def _merge_diseases(tx, names):
    """MERGE one Disease node per name and return the row count.

    The count is reported as `nodesCreated`, but count(*) counts the rows
    processed, so names whose node already existed are included as well.
    """
    cypher = """
            UNWIND $diseases as diseaseName
            MERGE (:Disease{name: diseaseName})
            RETURN count(*) as nodesCreated
            """
    return tx.run(cypher, diseases=names).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(_merge_diseases, diseases.to_list())
    df = pd.DataFrame(result)
    print(df)

   nodesCreated
0           148


In [12]:
# Rows for the SYMPTOM_OF relationships: disease (Source), symptom (Target)
# and the relationship weight.
relationship_columns = ['Source', 'Target', 'Weight']
symptom_of_rels = csv[relationship_columns]
print(symptom_of_rels)

                  Source          Target  Weight
0              influenza  uncoordination      68
1              influenza           fever      68
2              influenza  pleuritic pain      68
3              influenza         snuffle      68
4              influenza     throat sore      68
...                  ...             ...     ...
2124  migraine disorders       dizziness      61
2125  migraine disorders        numbness      61
2126  migraine disorders          nausea      61
2127  migraine disorders           fever      61
2128  migraine disorders    splenomegaly      61

[2129 rows x 3 columns]


In [15]:
# Create SYMPTOM_OF relationships (Symptom -> Disease) in two batches; every
# batch gets its own session and write transaction.
def _merge_symptom_of(tx, rows):
    """MERGE a weighted SYMPTOM_OF relationship for each input row.

    Rows whose Source or Target has no matching node are dropped by the
    MATCH, so `relsCreated` can be smaller than the number of rows sent.
    The weight is part of the MERGE pattern.
    """
    cypher = """
                UNWIND $data as rel
                MATCH (d:Disease{name: rel.Source}), (s:Symptom{name: rel.Target})
                MERGE (s)-[:SYMPTOM_OF{weight:toInteger(rel.Weight)}]->(d)
                RETURN count(*) as relsCreated
                """
    return tx.run(cypher, data=rows).data()


for chunk in np.array_split(symptom_of_rels, 2):
    with driver.session(database=DB_NAME) as session:
        result = session.write_transaction(_merge_symptom_of, chunk.to_dict('records'))
        df = pd.DataFrame(result)
        print(df)

   relsCreated
0         1064
   relsCreated
0         1062


# Some basic queries

In [16]:
# Symptoms with the highest degree (most diseases attached)
# Question: What does it mean?
def _most_connected_symptoms(tx, limit):
    """Return up to `limit` symptoms with their diseases, most diseases first."""
    cypher = """
            MATCH (s:Symptom)-[r:SYMPTOM_OF]->(d:Disease)
            RETURN 
                s.name as symptom, 
                collect(d.name) as diseases, 
                count(d) as number_of_diseases
            ORDER BY number_of_diseases DESC LIMIT $limit
            """
    return tx.run(cypher, limit=limit).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_most_connected_symptoms, 5)
    df = pd.DataFrame(result)
    print(df)

               symptom                                           diseases  \
0  shortness of breath  [adhesion, anemia, arthritis, asthma, bronchit...   
1                 pain  [adenocarcinoma, adhesion, anemia, anxiety sta...   
2                fever  [Alzheimer's disease, HIV, acquired immuno-def...   
3             diarrhea  [HIV, Pneumocystis carinii pneumonia, acquired...   
4       pain abdominal  [adenocarcinoma, biliary calculus, carcinoma c...   

   number_of_diseases  
0                  49  
1                  44  
2                  38  
3                  30  
4                  29  


In [17]:
# Symptoms with the lowest degree (fewest diseases attached)
# Question: What does it mean?
def _least_connected_symptoms(tx, limit):
    """Return up to `limit` symptoms with their diseases, fewest diseases first."""
    cypher = """
            MATCH (s:Symptom)-[r:SYMPTOM_OF]->(d:Disease)
            RETURN 
                s.name as symptom, 
                collect(d.name) as diseases, 
                count(d) as number_of_diseases
            ORDER BY number_of_diseases ASC LIMIT $limit
            """
    return tx.run(cypher, limit=limit).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_least_connected_symptoms, 5)
    df = pd.DataFrame(result)
    print(df)

            symptom                          diseases  number_of_diseases
0            floppy  [Pneumocystis carinii pneumonia]                   1
1             drool             [Alzheimer's disease]                   1
2  pin-point pupils             [Alzheimer's disease]                   1
3    tremor resting             [Alzheimer's disease]                   1
4       dyspareunia  [Pneumocystis carinii pneumonia]                   1


In [18]:
# Candidate diseases for a patient showing both "fever" and "swelling":
# every Disease that both Symptom nodes point to via SYMPTOM_OF.
def _diseases_with_fever_and_swelling(tx):
    """Return the names of diseases linked to both fever and swelling."""
    cypher = """
            MATCH (s1:Symptom{name:"fever"})-[:SYMPTOM_OF]->(d:Disease)<-[:SYMPTOM_OF]-(s2:Symptom{name:"swelling"})
            RETURN d.name as disease
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_diseases_with_fever_and_swelling)
    df = pd.DataFrame(result)
    print(df)

                  disease
0           osteomyelitis
1               exanthema
2              cellulitis
3               infection
4      lymphatic diseases
5  chronic kidney failure


In [19]:
# Same question as before (diseases shared by "fever" and "swelling"), now
# ranked by the sum of the two SYMPTOM_OF weights, highest first.
def _weighted_diseases_with_fever_and_swelling(tx):
    """Return diseases linked to both symptoms with their combined weight."""
    cypher = """
            MATCH (s1:Symptom{name:"fever"})-[r1:SYMPTOM_OF]->(d:Disease)<-[r2:SYMPTOM_OF]-(s2:Symptom{name:"swelling"})
            RETURN d.name as disease, 
                   r1.weight + r2.weight as weight 
            ORDER BY weight DESC 
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_weighted_diseases_with_fever_and_swelling)
    df = pd.DataFrame(result)
    print(df)

                  disease  weight
0               infection    1260
1              cellulitis     682
2  chronic kidney failure     560
3               exanthema     416
4      lymphatic diseases     320
5           osteomyelitis     284


# Graph data science
Can we arrange symptoms based on the diseases they share?

In [58]:
# Project the Disease and Symptom nodes together with their SYMPTOM_OF
# relationships into a GDS graph named 'p1', then show its size.
node_labels = ['Disease', 'Symptom']
relationship_types = ['SYMPTOM_OF']
G1 = gds.graph.project('p1', node_labels, relationship_types)
"Nodes: {node_count}, Relationships {relationship_count}".format(
    node_count=G1.node_count(),
    relationship_count=G1.relationship_count(),
)

In [48]:
# Node similarity on G1 in stats mode, using the OVERLAP metric and keeping
# only pairs with similarity of at least 0.6. Used to inspect the numbers
# before writing anything.
similarity_settings = {
    'similarityMetric': 'OVERLAP',
    'similarityCutoff': 0.6,
}
gds.nodeSimilarity.stats(G1, **similarity_settings)

{'preProcessingMillis': 0,
 'computeMillis': 22,
 'postProcessingMillis': 9,
 'nodesCompared': 404,
 'similarityPairs': 3362,
 'similarityDistribution': {'p1': 0.5999984741210938,
  'max': 1.0000038146972656,
  'p5': 0.6666641235351562,
  'p90': 1.0000038146972656,
  'p50': 1.0000038146972656,
  'p95': 1.0000038146972656,
  'p10': 0.75,
  'p75': 1.0000038146972656,
  'p99': 1.0000038146972656,
  'p25': 1.0000038146972656,
  'p100': 1.0000038146972656,
  'min': 0.5999984741210938,
  'mean': 0.9582757115860484,
  'stdDev': 0.10784796756708696},
 'configuration': {'topK': 10,
  'similarityMetric': 'OVERLAP',
  'bottomK': 10,
  'bottomN': 0,
  'relationshipWeightProperty': None,
  'topN': 0,
  'concurrency': 4,
  'degreeCutoff': 1,
  'similarityCutoff': 0.6,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'username': None}}

In [49]:
# Same similarity settings as the stats run, now in write mode: each similar
# pair is stored as a NARROWER_THAN relationship with a `similarity` property.
similarity_write_settings = {
    'similarityMetric': 'OVERLAP',
    'similarityCutoff': 0.6,
    'writeRelationshipType': 'NARROWER_THAN',
    'writeProperty': 'similarity',
}
gds.nodeSimilarity.write(G1, **similarity_write_settings)

{'preProcessingMillis': 0,
 'computeMillis': 21,
 'writeMillis': 49,
 'postProcessingMillis': -1,
 'nodesCompared': 404,
 'relationshipsWritten': 3362,
 'similarityDistribution': {'p1': 0.5999984741210938,
  'max': 1.0000038146972656,
  'p5': 0.6666641235351562,
  'p90': 1.0000038146972656,
  'p50': 1.0000038146972656,
  'p95': 1.0000038146972656,
  'p10': 0.75,
  'p75': 1.0000038146972656,
  'p99': 1.0000038146972656,
  'p25': 1.0000038146972656,
  'p100': 1.0000038146972656,
  'min': 0.5999984741210938,
  'mean': 0.9582757115860484,
  'stdDev': 0.10784796756708696},
 'configuration': {'topK': 10,
  'writeConcurrency': 4,
  'similarityMetric': 'OVERLAP',
  'bottomK': 10,
  'bottomN': 0,
  'relationshipWeightProperty': None,
  'topN': 0,
  'concurrency': 4,
  'writeProperty': 'similarity',
  'degreeCutoff': 1,
  'writeRelationshipType': 'NARROWER_THAN',
  'similarityCutoff': 0.6,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'username': None}}

In [63]:
# Drop the 'p1' graph projection (G1) now that the similarity relationships
# have been written.
G1.drop()

In [51]:
# Touch-up: Remove symmetric NARROWER_THAN relationships.
# A relationship is deleted when the reverse relationship also exists; note
# that both members of such a mutual pair satisfy the WHERE clause.
def _delete_mutual_narrower_than(tx):
    """Delete NARROWER_THAN relationships that have a reverse counterpart.

    Returns the result rows, i.e. a single `relationships_deleted` count.
    The query takes no parameters; the unrelated `limit`, `itemid` and
    `visitorid` values that used to be passed were never referenced by it.
    """
    cypher = """
            MATCH (s1:Symptom)-[r:NARROWER_THAN]->(s2:Symptom)
            WHERE (s2)-[:NARROWER_THAN]->(s1)
            DELETE r
            RETURN count(*) as relationships_deleted
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(_delete_mutual_narrower_than)
    df = pd.DataFrame(result)
    print(df)

   relationships_deleted
0                   2184


In [52]:
# Touch-up: Make NARROWER_THAN point towards the node with the bigger set of adjacent Diseases.
# When the start symptom has more SYMPTOM_OF relationships than the end
# symptom, the relationship is recreated in the opposite direction (keeping
# its similarity value) and the original is deleted.
def _reverse_misdirected_narrower_than(tx):
    """Flip NARROWER_THAN relationships that point at the smaller symptom.

    Returns the result rows, i.e. a single `relationships_reversed` count.
    The degree is computed with size([pattern | 1]) instead of
    size(pattern): pattern expressions inside size() are no longer accepted
    by Neo4j 5, whereas the pattern-comprehension form works on 4.x and 5.x.
    """
    cypher = """
            MATCH (s1:Symptom)-[r:NARROWER_THAN]->(s2:Symptom)
            WHERE size([ (s1)-[:SYMPTOM_OF]->() | 1 ]) > size([ (s2)-[:SYMPTOM_OF]->() | 1 ])
            CREATE (s2)-[:NARROWER_THAN{similarity: r.similarity}]->(s1)
            DELETE r
            RETURN count(*) as relationships_reversed
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(_reverse_misdirected_narrower_than)
    df = pd.DataFrame(result)
    print(df)

   relationships_reversed
0                     485


In [53]:
# Touch-up: Remove transitive dependencies.
# A direct NARROWER_THAN relationship is deleted when the same two symptoms
# are also connected by a NARROWER_THAN path of 2 to 20 hops.
def _delete_transitive_narrower_than(tx):
    """Delete direct relationships that are implied by a longer path."""
    cypher = """
            MATCH (s:Symptom)-[r:NARROWER_THAN]->(s2:Symptom)
            WHERE (s)-[:NARROWER_THAN*2..20]->(s2)
            DELETE r
            RETURN count(*) as relationships_transitive
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(_delete_transitive_narrower_than)
    df = pd.DataFrame(result)
    print(df)

   relationships_transitive
0                       242


In [57]:
# Let's look at the new NARROWER_THAN relationships: every two-step chain
# that ends at the symptom "pain", rendered as 'l2->l1->pain'.
def _two_step_paths_to_pain(tx):
    """Return the two-hop NARROWER_THAN paths ending at "pain", sorted by name."""
    cypher = """
            MATCH (l2:Symptom)-[:NARROWER_THAN]->(l1:Symptom)-[:NARROWER_THAN]->(root:Symptom{name:"pain"})
            RETURN l2.name + '->' + l1.name + '->' + root.name as path
            ORDER BY l2.name, l1.name
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_two_step_paths_to_pain)
    df = pd.DataFrame(result)
    print(df)

                                                 path
0                Murphy's sign->colic abdominal->pain
1                     abdomen acute->flatulence->pain
2                  abnormal sensation->fremitus->pain
3                             ache->lung nodule->pain
4                   air fluid level->flatulence->pain
5           barking cough->emphysematous change->pain
6   behavior showing increased motor activity->art...
7   behavior showing increased motor activity->ten...
8   bowel sounds decreased->decreased stool calibe...
9   bowel sounds decreased->nausea and vomiting->pain
10                               bruit->redness->pain
11                  catching breath->flatulence->pain
12                            choke->arthralgia->pain
13                              choke->tenesmus->pain
14            coordination abnormal->arthralgia->pain
15              coordination abnormal->tenesmus->pain
16                       estrogen use->erythema->pain
17             feces in rect

## Conclusions
We have now learned the Taxonomy of our data :)

In [111]:
# Let's check what communities we have.
# What type of graph is this?
# Projection 'p2' contains only Symptom nodes and NARROWER_THAN relationships.
symptom_labels = ['Symptom']
narrower_types = ['NARROWER_THAN']
G2 = gds.graph.project('p2', symptom_labels, narrower_types)
"Nodes: {node_count}, Relationships {relationship_count}".format(
    node_count=G2.node_count(),
    relationship_count=G2.relationship_count(),
)

'Nodes: 404, Relationships 936'

In [112]:
# WCC on the G2 projection in stats mode: only summary figures such as the
# component count and size distribution are returned.
gds.wcc.stats(G2)

{'componentCount': 26,
 'componentDistribution': {'p99': 378,
  'min': 1,
  'max': 378,
  'mean': 15.538461538461538,
  'p90': 1,
  'p50': 1,
  'p999': 378,
  'p95': 2,
  'p75': 1},
 'postProcessingMillis': 6,
 'preProcessingMillis': 0,
 'computeMillis': 29,
 'configuration': {'seedProperty': None,
  'consecutiveIds': False,
  'threshold': 0.0,
  'relationshipWeightProperty': None,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'username': None,
  'concurrency': 4}}

In [113]:
# Run WCC on G2 in mutate mode, storing the component id of every node
# under the property name 'wccId'.
gds.wcc.mutate(G2, mutateProperty='wccId')

{'mutateMillis': 0,
 'nodePropertiesWritten': 404,
 'componentCount': 26,
 'componentDistribution': {'p99': 378,
  'min': 1,
  'max': 378,
  'mean': 15.538461538461538,
  'p90': 1,
  'p50': 1,
  'p999': 378,
  'p95': 2,
  'p75': 1},
 'postProcessingMillis': 6,
 'preProcessingMillis': 0,
 'computeMillis': 6,
 'configuration': {'seedProperty': None,
  'consecutiveIds': False,
  'threshold': 0.0,
  'relationshipWeightProperty': None,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'mutateProperty': 'wccId',
  'username': None,
  'concurrency': 4}}

In [114]:
# Write the 'wccId' node property from the G2 projection; the Cypher queries
# in the following cells read it as s.wccId.
gds.graph.writeNodeProperties(G2, ['wccId'])

{'writeMillis': 6,
 'graphName': 'p2',
 'nodeProperties': ['wccId'],
 'propertiesWritten': 404}

In [115]:
# Let's check the small islands: components (by wccId) with fewer than five
# symptoms, listed with their symptoms and the diseases those point to.
# NOTE(review): the `diseases` column is computed per symptom and is not an
# aggregate, so it takes part in the grouping; the printed output shows one
# row per symptom rather than one per community. Confirm this is intended.
def _small_components(tx):
    """Return symptoms and diseases for every component smaller than 5."""
    cypher = """
            MATCH (s:Symptom)
            WITH s.wccId as communityId, count(*) as communitySize where communitySize < 5
            MATCH (s:Symptom{wccId:communityId})
            RETURN communityId, 
                   collect(s.name) as symptoms, 
                   communitySize, 
                   [ (s)-[:SYMPTOM_OF]->(d) | d.name ] as diseases
            ORDER BY communityId
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_small_components)
    df = pd.DataFrame(result)
    # Show every row on one line each, limited to five columns.
    display_options = (
        'display.max_rows', None,
        'display.max_columns', 5,
        'display.expand_frame_repr', False,
    )
    with pd.option_context(*display_options):
        print(df)

    communityId                     symptoms  communitySize                                           diseases
0             0             [uncoordination]              1                        [encephalopathy, influenza]
1             4                [throat sore]              1  [upper respiratory infection, spasm bronchial,...
2             9            [scleral icterus]              1                       [gastroenteritis, influenza]
3            24             [disequilibrium]              2                                        [gastritis]
4            24                  [dizziness]              2  [hypertensive disease, hyperlipidemia, hemorrh...
5            34                      [sweat]              1  [ischemia, hypertensive disease, hyperlipidemi...
6            36                  [nonsmoker]              1  [hypercholesterolemia, degenerative polyarthri...
7            55                     [gurgle]              1  [pneumonia aspiration, carcinoma of lung, keto...
8

In [116]:
# Let's break down the largest community: first list the five biggest
# components by size. The next cell takes the top row from `df`.
def _largest_components(tx):
    """Return the five largest wccId components with their sizes."""
    cypher = """
            MATCH (s:Symptom)
            WITH s.wccId as communityId, count(*) as communitySize
            RETURN communityId,communitySize order by communitySize desc limit 5
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(_largest_components)
    df = pd.DataFrame(result)
    print(df)

   communityId  communitySize
0            1            378
1           24              2
2            4              1
3            9              1
4            0              1


In [117]:
# Component id of the largest community: the first row of the size-sorted
# frame produced by the previous cell.
wccId = df.loc[0, "communityId"]
print(wccId)


1


In [118]:
# Derive projection 'p3' from G2, keeping only the nodes of the largest
# component (all relationship types allowed), then show its size.
node_filter = 'n.wccId={id}'.format(id=wccId)
G3 = gds.beta.graph.project.subgraph('p3', G2, node_filter, '*')
"Nodes: {node_count}, Relationships {relationship_count}".format(
    node_count=G3.node_count(),
    relationship_count=G3.relationship_count(),
)

'Nodes: 378, Relationships 935'

In [132]:
# Louvain on the G3 subgraph in stats mode: returns summary figures such as
# modularity and community count without writing anything.
gds.louvain.stats(G3)

{'modularity': 0.2970276530641425,
 'modularities': [0.2970276530641425],
 'ranLevels': 1,
 'communityCount': 108,
 'communityDistribution': {'p99': 19,
  'min': 1,
  'max': 19,
  'mean': 3.5,
  'p90': 7,
  'p50': 2,
  'p999': 19,
  'p95': 9,
  'p75': 4},
 'postProcessingMillis': 0,
 'preProcessingMillis': 0,
 'computeMillis': 57,
 'configuration': {'maxIterations': 10,
  'seedProperty': None,
  'consecutiveIds': False,
  'maxLevels': 10,
  'relationshipWeightProperty': None,
  'includeIntermediateCommunities': False,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'tolerance': 0.0001,
  'concurrency': 4,
  'username': None}}

In [133]:
# Run Louvain on G3 in write mode, storing each node's community id in the
# 'louvainId' property.
gds.louvain.write(G3, writeProperty='louvainId')

{'writeMillis': 1,
 'nodePropertiesWritten': 378,
 'modularity': 0.2970276530641425,
 'modularities': [0.2970276530641425],
 'ranLevels': 1,
 'communityCount': 108,
 'communityDistribution': {'p99': 19,
  'min': 1,
  'max': 19,
  'mean': 3.5,
  'p90': 7,
  'p50': 2,
  'p999': 19,
  'p95': 9,
  'p75': 4},
 'postProcessingMillis': 0,
 'preProcessingMillis': 0,
 'computeMillis': 68,
 'configuration': {'maxIterations': 10,
  'writeConcurrency': 4,
  'seedProperty': None,
  'consecutiveIds': False,
  'maxLevels': 10,
  'relationshipWeightProperty': None,
  'concurrency': 4,
  'writeProperty': 'louvainId',
  'includeIntermediateCommunities': False,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'tolerance': 0.0001,
  'username': None}}

In [134]:
# Drop the 'p3' subgraph projection (G3); the Louvain results have been written.
G3.drop()

In [135]:
# Drop the 'p2' projection (G2) as well.
G2.drop()

## Conclusions
Well, sometimes we fail. More than a hundred communities are not useful (see Bloom). But then again, our NARROWER_THAN is a tree structure, so maybe we are better off just looking at the different tree roots.

In [139]:
# Add the Root label and also return the roots.
# A root is a Symptom with no outgoing NARROWER_THAN relationship that is
# reachable from at least one other symptom within 1 to 10 hops; treeSize is
# the number of distinct symptoms that reach it.
def _label_roots(tx):
    """Label root symptoms, store their treeSize, and return them largest first."""
    cypher = """
            MATCH (root:Symptom)<-[r:NARROWER_THAN*1..10]-(s:Symptom)
            WHERE NOT (root)-[:NARROWER_THAN]->()
            WITH root, count(distinct(s)) as treeSize
            SET root:Root, 
            root.treeSize = treeSize
            RETURN root.name as symptom, treeSize
            ORDER BY treeSize desc
            """
    return tx.run(cypher).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(_label_roots)
    df = pd.DataFrame(result)
    print(df)

                   symptom  treeSize
0      shortness of breath        78
1                     pain        75
2                    fever        47
3    decreased body weight        44
4                 diarrhea        38
..                     ...       ...
104                pustule         1
105            scar tissue         1
106  st segment depression         1
107                unhappy         1
108          unsteady gait         1

[109 rows x 2 columns]


## Important
We got 109 roots, but to me they look more useful. Then again, it depends on the question you want to answer.

We should, of course, have studied the tree structure to get familiar with our data before we even tried to formulate or answer any questions.