In [None]:

import os

# Connection settings for the Neo4j instance used throughout the notebook.
# Every value can be overridden through an environment variable so that real
# credentials never need to be committed with the notebook; the fallbacks are
# the original local-development values.
# NOTE: "DB_ULR" is a misspelling of "DB_URL"; the name is kept because the
# cells below reference it.
DB_ULR = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "symptom")

# Create driver
Also select the database (`DB_NAME`) that the GDS client should use.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px 
from neo4j import GraphDatabase # Python database driver
from graphdatascience import GraphDataScience # Python GDS client

# A little ceremony: build the plain driver and the GDS client from the same
# URL and credentials, then point the GDS client at our database.
credentials = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=credentials)
gds = GraphDataScience(DB_ULR, auth=credentials)
gds.set_database(DB_NAME)

# Create database and schema

In [None]:
# (Re)create the working database. The statement runs against the "system"
# database; OR REPLACE means an existing database of that name is replaced.
create_db_statement = f"CREATE OR REPLACE DATABASE {DB_NAME}"


def recreate_database(tx):
    """Run the CREATE OR REPLACE statement and return its rows as a list of dicts."""
    return tx.run(create_db_statement).data()


with driver.session(database="system") as session:
    result = session.write_transaction(recreate_database)

In [None]:
# Schema: `name` is a node key on both Symptom and Disease nodes.
def add_name_node_key(tx, label):
    """Create the NODE KEY constraint on `name` for `label` unless it already exists."""
    tx.run(
        f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:{label}) REQUIRE (n.name) IS NODE KEY"
    ).consume()


with driver.session(database=DB_NAME) as session:
    # One write transaction per label, Symptom first and Disease second.
    for node_label in ("Symptom", "Disease"):
        session.write_transaction(add_name_node_key, node_label)

#  Load data

In [None]:
# Load the raw edge list directly from GitHub
# Source https://github.com/deshanadesai/Symptom-X-/blob/master/dataset_clean1.csv
dataset_url = 'https://raw.githubusercontent.com/deshanadesai/Symptom-X-/master/dataset_clean1.csv'
csv = pd.read_csv(dataset_url)
csv.head()

In [None]:
# Distinct, non-null symptom names (the 'Target' column of the edge list)
target_column = csv['Target']
symptoms = target_column.dropna().drop_duplicates()
symptoms.head()

In [None]:
# Create Symptom nodes in two batches, one write transaction per batch.
# The batches are cut with positional slicing rather than np.array_split,
# which relies on the deprecated pandas `swapaxes` when given a pandas object.
merge_symptoms_query = """
                UNWIND $symptoms as symptomName
                MERGE (:Symptom{name: symptomName})
                RETURN count(*) as nodesCreated
                """


def merge_symptoms(tx, names):
    """MERGE one Symptom node per entry of `names`; return the query rows as a list of dicts."""
    return tx.run(merge_symptoms_query, symptoms=names).data()


symptom_batches = 2
# Ceiling division; max(1, ...) keeps range() valid when there are no symptoms.
symptom_batch_size = max(1, -(-len(symptoms) // symptom_batches))

symptom_rows = []
with driver.session(database=DB_NAME) as session:
    for start in range(0, len(symptoms), symptom_batch_size):
        names = symptoms.iloc[start:start + symptom_batch_size].to_list()
        symptom_rows.extend(session.write_transaction(merge_symptoms, names))

# One row per batch, so the summary covers every batch and not only the last.
# `nodesCreated` is count(*) after MERGE, i.e. the number of names processed.
df = pd.DataFrame(symptom_rows)
df.head()

In [None]:
# Distinct, non-null disease names (the 'Source' column of the edge list)
source_column = csv['Source']
diseases = source_column.dropna().drop_duplicates()
diseases.head()

In [None]:
# Create one Disease node per distinct disease name, in a single transaction
merge_diseases_query = """
            UNWIND $diseases as diseaseName
            MERGE (:Disease{name: diseaseName})
            RETURN count(*) as nodesCreated
            """


def merge_diseases(tx):
    """MERGE a Disease node for every name in `diseases`; return the rows as a list of dicts."""
    return tx.run(merge_diseases_query, diseases=diseases.to_list()).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(merge_diseases)
df = pd.DataFrame(result)
df.head()

In [None]:
# Select the columns describing the SYMPTOM_OF relationships:
# disease ('Source'), symptom ('Target') and the relationship weight.
symptom_of_rels = csv[['Source','Target', 'Weight']]
# Preview the frame just built (previously this displayed the stale `df`).
symptom_of_rels.head()

In [None]:
# Create SYMPTOM_OF relationships in two batches, one write transaction per batch.
# The batches are cut with positional slicing rather than np.array_split,
# which relies on the deprecated pandas `swapaxes` when given a pandas object.
merge_rels_query = """
                UNWIND $data as rel
                MATCH (d:Disease{name: rel.Source}), (s:Symptom{name: rel.Target})
                MERGE (s)-[:SYMPTOM_OF{weight:toInteger(rel.Weight)}]->(d)
                RETURN count(*) as relsCreated
                """


def merge_symptom_of(tx, records):
    """MERGE a weighted SYMPTOM_OF relationship per record; return the rows as a list of dicts."""
    return tx.run(merge_rels_query, data=records).data()


rel_batches = 2
# Ceiling division; max(1, ...) keeps range() valid for an empty frame.
rel_batch_size = max(1, -(-len(symptom_of_rels) // rel_batches))

rel_rows = []
with driver.session(database=DB_NAME) as session:
    for start in range(0, len(symptom_of_rels), rel_batch_size):
        records = symptom_of_rels.iloc[start:start + rel_batch_size].to_dict('records')
        rel_rows.extend(session.write_transaction(merge_symptom_of, records))

# One row per batch, so the summary covers every batch and not only the last.
df = pd.DataFrame(rel_rows)
df.head()

# Some basic queries

In [None]:
# Symptoms with the highest degree (linked to the most diseases)
# Question: what does a high degree mean here?
highest_degree_query = """
            MATCH (s:Symptom)-[r:SYMPTOM_OF]->(d:Disease)
            RETURN 
                s.name as symptom, 
                collect(d.name) as diseases, 
                count(d) as number_of_diseases
            ORDER BY number_of_diseases DESC LIMIT $limit
            """


def fetch_highest_degree_symptoms(tx):
    """Return the five symptoms attached to the most diseases, as a list of dicts."""
    return tx.run(highest_degree_query, limit=5).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_highest_degree_symptoms)
df = pd.DataFrame(result)
df.head()

In [None]:
# Symptoms with the lowest degree (linked to the fewest diseases)
# Question: what does a low degree mean here?
lowest_degree_query = """
            MATCH (s:Symptom)-[r:SYMPTOM_OF]->(d:Disease)
            RETURN 
                s.name as symptom, 
                collect(d.name) as diseases, 
                count(d) as number_of_diseases
            ORDER BY number_of_diseases ASC LIMIT $limit
            """


def fetch_lowest_degree_symptoms(tx):
    """Return the five symptoms attached to the fewest diseases, as a list of dicts."""
    return tx.run(lowest_degree_query, limit=5).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_lowest_degree_symptoms)
df = pd.DataFrame(result)
df.head()

In [None]:
# Diseases that have both "fever" and "swelling" as symptoms
shared_disease_query = """
            MATCH (s1:Symptom{name:"fever"})-[:SYMPTOM_OF]->(d:Disease)<-[:SYMPTOM_OF]-(s2:Symptom{name:"swelling"})
            RETURN d.name as disease
            """


def fetch_shared_diseases(tx):
    """Return the diseases linked to both symptoms, as a list of dicts."""
    return tx.run(shared_disease_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_shared_diseases)
df = pd.DataFrame(result)
df.head()

In [None]:
# Diseases that have both "fever" and "swelling" as symptoms,
# ranked by the sum of the two relationship weights
weighted_disease_query = """
            MATCH (s1:Symptom{name:"fever"})-[r1:SYMPTOM_OF]->(d:Disease)<-[r2:SYMPTOM_OF]-(s2:Symptom{name:"swelling"})
            RETURN d.name as disease, 
                   r1.weight + r2.weight as weight 
            ORDER BY weight DESC 
            """


def fetch_weighted_diseases(tx):
    """Return the shared diseases with their combined weight, heaviest first."""
    return tx.run(weighted_disease_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_weighted_diseases)
df = pd.DataFrame(result)
df.head()

# Graph data science
Can we arrange symptoms based on the diseases they share?

In [None]:
# Project Disease and Symptom nodes plus SYMPTOM_OF relationships into the
# in-memory graph 'p1', then report its size.
# NOTE(review): assumes this graphdatascience client returns the Graph object
# directly from project() - confirm against the installed client version.
G1 = gds.graph.project('p1', ['Disease', 'Symptom'], ['SYMPTOM_OF'])
f"Nodes: {G1.node_count()}, Relationships {G1.relationship_count()}"

In [None]:
# Dry run of node similarity on G1 in stats mode, using the same metric and
# cutoff as the write call that follows.
similarity_stats_config = {
    'similarityMetric': 'OVERLAP',
    'similarityCutoff': 0.6,
}
gds.nodeSimilarity.stats(G1, **similarity_stats_config)

In [None]:
# Persist the similarity results as NARROWER_THAN relationships carrying a
# `similarity` property.
similarity_write_config = {
    'similarityMetric': 'OVERLAP',
    'similarityCutoff': 0.6,
    'writeRelationshipType': 'NARROWER_THAN',
    'writeProperty': 'similarity',
}
gds.nodeSimilarity.write(G1, **similarity_write_config)

In [None]:
# Drop the 'p1' projection now that the similarity results have been written.
G1.drop()

In [None]:
# Touch-up: remove symmetric NARROWER_THAN relationships, i.e. those whose
# reverse relationship also exists.
# NOTE(review): both relationships of a mutual pair match this pattern, so
# both are deleted - confirm that dropping the pair entirely is intended.
remove_symmetric_query = """
            MATCH (s1:Symptom)-[r:NARROWER_THAN]->(s2:Symptom)
            WHERE (s2)-[:NARROWER_THAN]->(s1)
            DELETE r
            RETURN count(*) as relationships_deleted
            """


def delete_symmetric_relationships(tx):
    """Delete NARROWER_THAN relationships that have a reverse twin; return the count row."""
    # The query takes no parameters; the unused limit/itemid/visitorid
    # arguments that used to be passed here have been removed.
    return tx.run(remove_symmetric_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(delete_symmetric_relationships)
df = pd.DataFrame(result)
df.head()

In [None]:
# Touch-up: make NARROWER_THAN point towards the symptom with the larger set
# of adjacent diseases. Relationships pointing the other way are recreated in
# the opposite direction (keeping `similarity`) and the original is deleted.
reverse_direction_query = """
            MATCH (s1:Symptom)-[r:NARROWER_THAN]->(s2:Symptom)
            WHERE size( (s1)-[:SYMPTOM_OF]->() ) > size( (s2)-[:SYMPTOM_OF]->() )
            CREATE (s2)-[:NARROWER_THAN{similarity: r.similarity}]->(s1)
            DELETE r
            RETURN count(*) as relationships_reversed
            """


def reverse_misdirected_relationships(tx):
    """Flip NARROWER_THAN relationships that start at the bigger symptom; return the count row."""
    return tx.run(reverse_direction_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(reverse_misdirected_relationships)
df = pd.DataFrame(result)
df.head()

In [None]:
# Touch-up: remove transitive shortcuts - a direct NARROWER_THAN relationship
# is deleted when the same two symptoms are also connected by a path of
# 2 to 20 NARROWER_THAN hops.
remove_transitive_query = """
            MATCH (s:Symptom)-[r:NARROWER_THAN]->(s2:Symptom)
            WHERE (s)-[:NARROWER_THAN*2..20]->(s2)
            DELETE r
            RETURN count(*) as relationships_transitive
            """


def delete_transitive_relationships(tx):
    """Delete direct relationships that duplicate a longer path; return the count row."""
    return tx.run(remove_transitive_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(delete_transitive_relationships)
df = pd.DataFrame(result)
df.head()

In [None]:
# Query the new NARROWER_THAN relationships: two-hop paths ending at "pain"
pain_paths_query = """
            MATCH (l2:Symptom)-[:NARROWER_THAN]->(l1:Symptom)-[:NARROWER_THAN]->(root:Symptom{name:"pain"})
            RETURN l2.name + '->' + l1.name + '->' + root.name as path
            ORDER BY l2.name, l1.name
            """


def fetch_pain_paths(tx):
    """Return every l2->l1->pain path as a formatted string, as a list of dicts."""
    return tx.run(pain_paths_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_pain_paths)
df = pd.DataFrame(result)
df.head()

## Conclusions
We have now learned the Taxonomy of our data :)

In [None]:
# Let's see which communities we have
# What type of graph is this?
# Project only Symptom nodes and NARROWER_THAN relationships into 'p2'.
G2 = gds.graph.project('p2', ['Symptom'], ['NARROWER_THAN'])
f"Nodes: {G2.node_count()}, Relationships {G2.relationship_count()}"

In [None]:
# Weakly connected components on G2 in stats mode: summary only.
gds.wcc.stats(G2)

In [None]:
# Run WCC in mutate mode, storing the result on the G2 projection as `wccId`.
gds.wcc.mutate(G2, mutateProperty='wccId')

In [None]:
# Write `wccId` from the projection back to the database so that the Cypher
# queries below can read it as a node property.
gds.graph.writeNodeProperties(G2, ['wccId'])

In [None]:
# Let's check the small islands: communities with fewer than 5 symptoms
# NOTE(review): the disease list is evaluated per symptom, so it acts as an
# extra grouping key next to communityId - confirm that is the wanted shape.
small_islands_query = """
            MATCH (s:Symptom)
            WITH s.wccId as communityId, count(*) as communitySize where communitySize < 5
            MATCH (s:Symptom{wccId:communityId})
            RETURN communityId, 
                   collect(s.name) as symptoms, 
                   communitySize, 
                   [ (s)-[:SYMPTOM_OF]->(d) | d.name ] as diseases
            ORDER BY communityId
            """


def fetch_small_islands(tx):
    """Return symptoms and diseases of the small communities, as a list of dicts."""
    return tx.run(small_islands_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_small_islands)
df = pd.DataFrame(result)
df.head()

In [None]:
# Let's break down the largest community: list the five biggest communities
community_sizes_query = """
            MATCH (s:Symptom)
            WITH s.wccId as communityId, count(*) as communitySize
            RETURN communityId,communitySize order by communitySize desc limit 5
            """


def fetch_community_sizes(tx):
    """Return the five largest communities with their sizes, biggest first."""
    return tx.run(community_sizes_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_community_sizes)
df = pd.DataFrame(result)
df.head()

In [None]:
# Id of the first community listed in `df`.
# Assumes `df` still holds the community-size table, sorted largest first.
# .iloc[0] selects by position, so it does not depend on the index labels;
# int() turns the numpy scalar into a plain Python int before it is formatted
# into query text further down.
wccId = int(df["communityId"].iloc[0])
print(wccId)


In [None]:
# Sub-project 'p3' from G2: keep only nodes of the selected community and
# every relationship type.
community_node_filter = f'n.wccId={wccId}'
G3 = gds.beta.graph.project.subgraph('p3', G2, community_node_filter, '*')
f"Nodes: {G3.node_count()}, Relationships {G3.relationship_count()}"

In [None]:
# Louvain community detection on the G3 subgraph in stats mode: summary only.
gds.louvain.stats(G3)

In [None]:
# Run Louvain on G3 and write the result to the database as `louvainId`.
gds.louvain.write(G3, writeProperty='louvainId')

In [None]:
# Drop the 'p3' subgraph projection; it is not used after this point.
G3.drop()

In [None]:
# Drop the 'p2' projection; it is not used after this point.
G2.drop()

## Conclusions
Well, sometimes we fail. Hundreds of communities are not useful (see Bloom). But then again, our NARROWER_THAN relationships form a tree structure, so maybe we are better off just looking at the different tree roots.

In [None]:
# Tag tree roots with the Root label and return them.
# A root is a symptom with incoming NARROWER_THAN paths (1 to 10 hops) and no
# outgoing NARROWER_THAN relationship; treeSize is the number of distinct
# symptoms reaching it.
label_roots_query = """
            MATCH (root:Symptom)<-[r:NARROWER_THAN*1..10]-(s:Symptom)
            WHERE NOT (root)-[:NARROWER_THAN]->()
            WITH root, count(distinct(s)) as treeSize
            SET root:Root, 
            root.treeSize = treeSize
            RETURN root.name as symptom, treeSize
            ORDER BY treeSize desc
            """


def label_tree_roots(tx):
    """Set the Root label and treeSize on every root; return the roots, largest tree first."""
    return tx.run(label_roots_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(label_tree_roots)
df = pd.DataFrame(result)
df.head()

## Take a look in Neo4j Bloom
Add the Root label to the scene and size the nodes by treeSize.

In [None]:
# Store a treeSize on every symptom: the number of distinct symptoms that
# reach it over 0 to 10 NARROWER_THAN hops (0 hops counts the node itself).
tree_size_query = """
            MATCH (s:Symptom)<-[r:NARROWER_THAN*0..10]-(n:Symptom)
            WITH s, count(distinct(n)) as treeSize
            SET s.treeSize = treeSize
            RETURN count(*) as nodesUpdated
            """


def assign_tree_sizes(tx):
    """Set treeSize on each Symptom node; return the count row as a list of dicts."""
    return tx.run(tree_size_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(assign_tree_sizes)
df = pd.DataFrame(result)
df.head()

In [349]:
# Fetch data for the visualisation: two levels of symptoms below the three
# roots with the largest trees.
hierarchy_query = """
            MATCH (root:Root)
            WITH root ORDER BY root.treeSize DESC LIMIT 3
            MATCH (l2:Symptom)-[:NARROWER_THAN]->(l1:Symptom)-[:NARROWER_THAN]->(root)
            RETURN l2.name as l2, l1.name as l1, root.name as root, l2.treeSize as size
            """


def fetch_hierarchy_rows(tx):
    """Return (l2, l1, root, size) rows for the three biggest roots, as a list of dicts."""
    return tx.run(hierarchy_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_hierarchy_rows)
df = pd.DataFrame(result)
df.head()

Unnamed: 0,l2,l1,root,size
0,hemianopsia homonymous,breath sounds decreased,shortness of breath,4
1,ache,haemoptysis,shortness of breath,1
2,tinnitus,haemoptysis,shortness of breath,1
3,welt,haemoptysis,shortness of breath,1
4,st segment elevation,orthopnea,shortness of breath,1


In [350]:
# Sunburst of the hierarchy: root -> l1 -> l2, wedge size taken from `size`
# and colour from the root symptom.
hierarchy_levels = ['root', 'l1', 'l2']
fig = px.sunburst(
    df, path=hierarchy_levels, values='size', color='root'
).update_layout(title_text="Symptoms Hierarchy", font_size=10)
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



## Important
We got 100+ roots, but to me they look more useful. Then again, it depends on the question you want to answer.

We should of course have studied the tree structure to get familiar with our data before we even tried to formulate or answer any questions.

# Let's pin our graph to the wall
Graphs are non-Euclidean, so let's transform ours into a Euclidean space. Node embeddings to the rescue!


In [None]:
# Cypher projection 'p4': diseases of the selected community, connected when
# they share a symptom; the number of matching rows becomes `weight`.
disease_node_query = """ MATCH (n:Disease)<-[:SYMPTOM_OF]-(s:Symptom)
        WHERE s.wccId = {wccId} 
        RETURN id(n) as id""".format(wccId=wccId)
shared_symptom_rel_query = """ MATCH (n1:Disease)<-[:SYMPTOM_OF]-(s:Symptom)-[:SYMPTOM_OF]->(n2:Disease)
        WHERE s.wccId = {wccId} 
        RETURN id(n1) as source, id(n2) as target, count(*) as weight""".format(wccId=wccId)

G4 = gds.graph.project.cypher('p4', disease_node_query, shared_symptom_rel_query)
f"Nodes: {G4.node_count()}, Relationships {G4.relationship_count()}"

In [None]:
# Compute a 2-dimensional FastRP embedding per disease (weighted by the
# number of shared symptoms) and write it to the `embedding` node property.
# FastRP starts from random projections, so the seed is fixed to make the
# embedding, and the scatter plot built from it, reproducible across runs.
gds.fastRP.write(G4, 
    embeddingDimension=2, 
    iterationWeights=[0.2, 1.0, 0.9, 0.7],
    nodeSelfInfluence=0.1,
    relationshipWeightProperty='weight', 
    randomSeed=42,
    writeProperty='embedding'
)

In [None]:
# Read each disease's name and the two embedding coordinates for plotting
embedding_query = """
            MATCH (d:Disease)
            RETURN d.name as points, d.embedding[0] as x, d.embedding[1] as y
            """


def fetch_embeddings(tx):
    """Return (points, x, y) for every Disease node, as a list of dicts."""
    return tx.run(embedding_query).data()


with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(fetch_embeddings)
df = pd.DataFrame(result)
df.head()

In [None]:
# Scatter the embeddings and label every point with its disease name,
# shifting the label slightly to the right of the marker.
ax = df.plot.scatter(x='x', y='y', alpha=0.5, figsize=(10,10))
for label, x_pos, y_pos in zip(df.points, df.x, df.y):
    ax.annotate(label, (x_pos + 0.05, y_pos))

In [None]:
# Drop the 'p4' projection; the embeddings have already been written out.
G4.drop()