In [None]:
!pip install graphdatascience==1.15a2

Collecting graphdatascience==1.15a2
  Downloading graphdatascience-1.15a2-py3-none-any.whl.metadata (7.8 kB)
Collecting multimethod<3.0,>=1.0 (from graphdatascience==1.15a2)
  Downloading multimethod-2.0-py3-none-any.whl.metadata (9.2 kB)
Collecting neo4j<6.0,>=4.4.12 (from graphdatascience==1.15a2)
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting textdistance<5.0,>=4.0 (from graphdatascience==1.15a2)
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading graphdatascience-1.15a2-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multimethod-2.0-py3-none-any.whl (9.8 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: 

In [None]:
!pip install --upgrade numpy

Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m706.4 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is 

In [None]:
import pandas as pd
from google.colab import userdata

In [None]:
# Read the three Aura API credentials from Colab's `userdata` store, in the
# same order as before, so that no secret is hard-coded in the notebook.
CLIENT_ID, CLIENT_SECRET, TENANT_ID = (
    userdata.get(secret_name)
    for secret_name in ("CLIENT_ID", "CLIENT_SECRET", "TENANT_ID")
)

## Set Up Sessions
Set up sessions with credentials and then spin up a *session*

In [None]:
from graphdatascience.session import GdsSessions, AuraAPICredentials, AlgorithmCategory, CloudLocation
from datetime import timedelta

# Authenticate against the Aura API with the credentials loaded earlier.
api_credentials = AuraAPICredentials(CLIENT_ID, CLIENT_SECRET, TENANT_ID)
sessions = GdsSessions(api_credentials=api_credentials)

# Session settings: the name used to refer to the session, and where it runs.
name = "my-new-session-sm"
cloud_location = CloudLocation(provider="gcp", region="europe-west1")

# Ask the service for a memory size given the expected graph size and the
# algorithm families we intend to run.
# NOTE(review): 20 nodes / 50 relationships is far smaller than the graph
# uploaded later in this notebook (301 nodes / 3509 relationships) — confirm
# the estimate is still adequate.
planned_categories = [AlgorithmCategory.CENTRALITY, AlgorithmCategory.NODE_EMBEDDING]
memory = sessions.estimate(
    node_count=20,
    relationship_count=50,
    algorithm_categories=planned_categories,
)

# Obtain the session via get_or_create, with a five-hour time-to-live.
session_settings = {
    "session_name": name,
    "memory": memory,
    "ttl": timedelta(hours=5),
    "cloud_location": cloud_location,
}
gds = sessions.get_or_create(**session_settings)

## Taking a Look at the Data
We will be using [Synthea](https://synthetichealth.github.io/synthea/) to generate mock data. Synthea creates realistic mock healthcare data. Our goal will be to model patient similarity, that way we could see if there is an ideal patient plan for similar patients.

We will be looking at patients and the procedures they have undergone. One thing we will need to change is the `ID`, as it contains non-numeric characters.

In [None]:
# Load the patient table from disk and preview its first five rows.
patients = pd.read_csv(filepath_or_buffer="Patients.csv")
patients.head(n=5)

Unnamed: 0,ID,PREFIX,FIRST,MIDDLE,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,ZIP
0,41313b42-6ce6-fa01-6021-6041fc6b1a26,Mr.,Linwood526,,Orn563,,,M,white,nonhispanic,M,Boston Massachusetts US,414 Gusikowski Grove,Worcester,Massachusetts,1603
1,06df5af7-aabe-008f-2c56-edb2a080c52a,Mr.,Beau391,Gail741,Goldner995,,,,black,nonhispanic,M,Leominster Massachusetts US,635 Ullrich Meadow Apt 38,Boston,Massachusetts,2129
2,5c8a31b6-6309-f047-71f9-c78778250acd,Mr.,Isreal8,Dan465,Jakubowski832,,,,white,nonhispanic,M,Wrentham Massachusetts US,1082 Wehner Ferry Unit 83,Dartmouth,Massachusetts,0
3,a0f3cc78-810a-3ec6-36cf-221b374350d6,Mrs.,Loralee922,Sommer719,Crona259,,Bashirian201,M,asian,nonhispanic,F,Springfield Massachusetts US,829 Cassin Gate,Newton,Massachusetts,2462
4,b78b64e2-a653-5e37-1854-99a70d14ca3f,,Jerald662,Quentin28,Koepp521,,,,white,nonhispanic,M,Bridgewater Massachusetts US,173 DuBuque Orchard Unit 69,Millbury,Massachusetts,0


Procedures contains both the id of the patient and a code for the procedure:

In [None]:
# Load the procedure table from disk and preview its first five rows.
procedures = pd.read_csv(filepath_or_buffer="Procedures.csv")
procedures.head(n=5)

Unnamed: 0,STARTDATE,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
0,2015-06-19 05:03:41.000,2015-06-19 05:18:41.000,a0f3cc78-810a-3ec6-36cf-221b374350d6,3e3cd9ec-d27a-98ac-c4a4-51bc4420cb7b,http://snomed.info/sct,252160004,Standard pregnancy test (procedure),9046.75,72892002.0,Normal pregnancy (finding)
1,2015-06-19 05:03:41.000,2015-06-19 05:18:41.000,a0f3cc78-810a-3ec6-36cf-221b374350d6,3e3cd9ec-d27a-98ac-c4a4-51bc4420cb7b,http://snomed.info/sct,169230002,Ultrasound scan for fetal viability (procedure),4049.7,72892002.0,Normal pregnancy (finding)
2,2015-06-19 05:03:41.000,2015-06-19 05:18:41.000,a0f3cc78-810a-3ec6-36cf-221b374350d6,3e3cd9ec-d27a-98ac-c4a4-51bc4420cb7b,http://snomed.info/sct,274804006,Evaluation of uterine fundal height (procedure),3774.39,72892002.0,Normal pregnancy (finding)
3,2015-06-19 05:03:41.000,2015-06-19 05:18:41.000,a0f3cc78-810a-3ec6-36cf-221b374350d6,3e3cd9ec-d27a-98ac-c4a4-51bc4420cb7b,http://snomed.info/sct,225158009,Auscultation of the fetal heart (procedure),8837.86,72892002.0,Normal pregnancy (finding)
4,2015-06-19 05:03:41.000,2015-06-19 05:18:41.000,a0f3cc78-810a-3ec6-36cf-221b374350d6,3e3cd9ec-d27a-98ac-c4a4-51bc4420cb7b,http://snomed.info/sct,44608003,Blood group typing (procedure),2283.31,72892002.0,Normal pregnancy (finding)


## Converting Id to Numeric
Next, we are going to create a numeric id for the patient ids in procedures. First we need to make sure that our new ids don't collide with the procedure codes. One way to do that is to give the patient ids more digits than any procedure code has.

Let's see how long the ids in `CODE` are in the `procedures` dataframe:

In [None]:
# Number of characters in each procedure code, computed once and reused below.
code_lengths = procedures['CODE'].astype(str).str.len()

# True only when every code has the same number of characters.
all_same_length = code_lengths.nunique() == 1

# Show how many codes there are of each length.
code_lengths.value_counts()

Unnamed: 0_level_0,count
CODE,Unnamed: 1_level_1
9,178462
8,24862
10,17273
15,11033
7,2274
17,323
13,7


And then we will create one that is a bit longer and doesn't have any leading 0s.

In [None]:
import pandas as pd

# Patient UUIDs that occur in `procedures`, in order of first appearance.
unique_patients = procedures['PATIENT'].unique()

# Numeric patient ids start at 10**18: a 19-digit value with no leading zero
# that still fits in a signed 64-bit integer (2**63 - 1 is about 9.22e18).
start_value = 10**18

# Patient ids and procedure codes end up as node ids in the same graph, so
# they must never overlap. Fail loudly if any code reaches the patient range.
# Non-numeric codes are coerced to NaN and therefore ignored by this check.
largest_code = pd.to_numeric(procedures['CODE'], errors='coerce').max()
if largest_code >= start_value:
    raise ValueError(
        f"Procedure code {largest_code} is not below {start_value}; "
        "patient ids would collide with procedure codes."
    )

# One consecutive pure-Python integer per unique patient.
numeric_ids = list(range(start_value, start_value + len(unique_patients)))

# UUID -> numeric id lookup, stored as Python ints (dtype='object').
# NOTE(review): presumably 'object' is used so that a later `.map` producing
# NaN for unknown UUIDs does not upcast the ids to float64, which cannot
# represent values this large exactly — confirm before changing the dtype.
patient_id_map = pd.Series(numeric_ids, index=unique_patients, dtype='object')

# Attach the numeric id to every procedure row.
procedures['PATIENT2'] = procedures['PATIENT'].map(patient_id_map)

Next, we need to ensure that `PATIENT` in `patients` and `PATIENT2` in `procedures` hold the same numeric id for the same patient.

In [None]:
# Look up the numeric id for each patient UUID. The map is built only from
# UUIDs seen in `procedures`, so patients without any procedure presumably
# come out as NaN here — verify if those rows matter downstream.
patient_numeric_ids = patients['ID'].map(patient_id_map)
patients['PATIENT'] = patient_numeric_ids

## Prepping for graph.construct
First we are going to create a dataframe that only contains the ids for patients who have had kidney disease.

We do need to do some mild clean up to make sure that everything has the right names.

For the dataframe representing nodes:
- The first column should be called `nodeId`
- The ids must be numeric, so we use the numeric patient ids created above instead of the original UUID strings

For the dataframe representing relationships:
- We need to have columns called `sourceNodeId` and `targetNodeId`
- Optionally, we can name the relationship in a column called `relationshipType` (we leave it out in this example)

Additionally, we are going to be looking just at patients who have kidney disease, so we are going to just look at patients with certain disease codes.

In [None]:
# Reason codes that mark a procedure as kidney-related.
kidney_disease_codes = {431857002, 46177005, 161665007, 698306007}

# Keep only the procedure rows whose REASONCODE is one of those codes.
has_kidney_reason = procedures['REASONCODE'].isin(kidney_disease_codes)
kidney_procedures = procedures[has_kidney_reason]

# One node per distinct patient (numeric id) among those rows.
kidney_patient_ids = kidney_procedures['PATIENT2'].unique()
kidney_patients_vw = pd.DataFrame(data={'nodeId': kidney_patient_ids})

Then we are going to do the same for procedures. This time we are just going to be looking for procedures that kidney patients have undergone.

In [None]:
# Every procedure row that belongs to a kidney patient, whatever its reason.
is_kidney_patient = procedures['PATIENT2'].isin(kidney_patient_ids)
kidney_patient_procedures = procedures[is_kidney_patient]

# One node per distinct procedure code among those rows.
distinct_codes = kidney_patient_procedures['CODE'].unique()
kidney_patient_procedures_vw = pd.DataFrame({'nodeId': distinct_codes})

Finally create a view that represents the relationship between the kidney patients and all the procedures they have had.  

This will be the relationship used in the bipartite graph projection for Jaccard similarity

In [None]:
# Distinct (patient, procedure code) pairs: a patient who had the same
# procedure several times contributes a single pair.
patient_code_pairs = kidney_patient_procedures[['PATIENT2', 'CODE']]
kidney_patient_procedure_relationship = patient_code_pairs.drop_duplicates()

# Use the endpoint column names expected for the relationship dataframe:
# the patient is the source and the procedure code is the target.
endpoint_columns = {'PATIENT2': 'sourceNodeId', 'CODE': 'targetNodeId'}
relationships = kidney_patient_procedure_relationship.rename(columns=endpoint_columns)

Finally, we are going to combine the `nodeId`s for patients and procedures into one dataframe called `nodes`.

In [None]:
# Stack patient nodes and procedure nodes into a single node table with a
# fresh 0..n-1 index.
node_frames = [kidney_patients_vw, kidney_patient_procedures_vw]
nodes = pd.concat(node_frames, ignore_index=True)

## Projecting a Graph and Running Patient Similarity
Next we are going to quickly create a graph using `graph.construct`.

In [None]:
graph_name = "patients"

# Re-running this cell must not fail on a leftover projection, so drop any
# graph that already carries this name before constructing it again.
graph_already_exists = gds.graph.exists(graph_name)["exists"]
if graph_already_exists:
    gds.graph.drop(graph_name)
    print(f"Graph '{graph_name}' dropped.")

# Upload the node and relationship dataframes as the patient/procedure graph.
G = gds.graph.construct(graph_name, nodes, relationships)

Uploading Nodes:   0%|          | 0/301 [00:00<?, ?Records/s]

Uploading Relationships:   0%|          | 0/3509 [00:00<?, ?Records/s]

In [None]:
# Stream node-similarity results for the patient/procedure graph, using the
# algorithm's default configuration.
similarity = gds.nodeSimilarity.stream(G)

# Display the resulting (node1, node2, similarity) table.
similarity

 Node Similarity:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,node1,node2,similarity
0,1000000000000000021,1000000000000000781,0.900000
1,1000000000000000021,1000000000000000095,0.875000
2,1000000000000000021,1000000000000000687,0.812500
3,1000000000000000021,1000000000000000313,0.781250
4,1000000000000000021,1000000000000000748,0.777778
...,...,...,...
775,1000000000000001178,1000000000000000265,0.540541
776,1000000000000001178,1000000000000000096,0.527027
777,1000000000000001178,1000000000000001007,0.520000
778,1000000000000001178,1000000000000000914,0.513889


We can now use this similarity dataframe to build a new graph projection and then run louvain to see if we can build communities from our pairwise calculation.

In [None]:
# Every node id that appears on either side of a similarity pair, flattened
# row by row and deduplicated in order of first appearance.
pair_endpoints = similarity[['node1', 'node2']].values.ravel()
nodes_sim = pd.DataFrame({'nodeId': pd.unique(pair_endpoints)})

# Relationship table: node1 -> node2, carrying the similarity score as the
# `weight` column.
similarity_columns = {
    'node1': 'sourceNodeId',
    'node2': 'targetNodeId',
    'similarity': 'weight',
}
relationships_sim = similarity.rename(columns=similarity_columns)

Now, we create a new graph projection using the similarity scores:  

In [None]:
graph_name = "patients_sim"

# Drop any earlier projection with this name so the cell can be re-run.
graph_already_exists = gds.graph.exists(graph_name)["exists"]
if graph_already_exists:
    gds.graph.drop(graph_name)
    print(f"Graph '{graph_name}' dropped.")

# Upload the patient-to-patient similarity graph. Note that `G` now refers to
# this graph rather than the patient/procedure graph built earlier.
G = gds.graph.construct(graph_name, nodes_sim, relationships_sim)



Uploading Nodes:   0%|          | 0/78 [00:00<?, ?Records/s]

Uploading Relationships:   0%|          | 0/780 [00:00<?, ?Records/s]

And then we run Louvain against it. This will allow us to bucket similar patients together into communities. From this we can build out similar treatment programs for similar patients!

In [None]:
# Run Louvain community detection on the similarity graph.
# The similarity score was uploaded as the `weight` column of the relationship
# dataframe; pass it as the relationship weight so that stronger similarities
# count for more. Without this argument the scores are not used at all.
gds.louvain.stream(
    G,
    relationshipWeightProperty="weight",
)

 Louvain:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,nodeId,intermediateCommunityIds,communityId
0,1000000000000000021,,0
1,1000000000000000781,,0
2,1000000000000000095,,0
3,1000000000000000687,,0
4,1000000000000000313,,0
...,...,...,...
73,1000000000000001147,,0
74,1000000000000000730,,16
75,1000000000000001037,,16
76,1000000000000000735,,0


Finally, we must close the session and end our billing.

In [None]:
# Delete the session created at the top of the notebook. Reusing `name`
# (instead of repeating the string literal) guarantees that the session being
# deleted is the one that was created, even if the name is changed later.
sessions.delete(session_name=name)



True