## Install and import dependencies

First, we need to install the `graphdatascience` package and load all of our secrets.

In [36]:
%pip install graphdatascience



## Authentication & Session Setup

### Authentication
You must first generate your credentials in Neo4j Aura. Afterwards, you can store your credentials securely using *colab secrets*.


In [37]:
from google.colab import userdata

# Connection details for the Neo4j database. Secret values are read from the
# Colab secrets store so they never appear in the notebook itself.
NEO4J_URI = userdata.get('PHARMA_URI')
NEO4J_USERNAME = "neo4j"
NEO4J_DATABASE = "neo4j"
NEO4J_PASSWORD = userdata.get('PHARMA_PASSWORD')

# Identifiers of the Aura instance.
AURA_INSTANCEID = userdata.get('PHARMA_INSTANCEID')
AURA_INSTANCENAME = "supply-chain-pharma"

# For use in Google Colab.
# This credential is the Organization ID.
TENANT_ID = userdata.get('TENANT_ID')

# These credentials were generated after the creation of the API endpoint.
CLIENT_SECRET = userdata.get('CLIENT_SECRET')
CLIENT_ID = userdata.get('CLIENT_ID')

### Establishing a Session

Estimate the memory required from the graph size, then create a session with a 5-hour TTL (the value passed as `ttl=timedelta(hours=5)` in the code below).

In [38]:
from datetime import timedelta

from graphdatascience.session import (
    AlgorithmCategory,
    AuraAPICredentials,
    CloudLocation,
    DbmsConnectionInfo,
    GdsSessions,
)

# Client used to estimate, create, list and delete sessions, authenticated
# with the Aura API credentials loaded earlier.
api_credentials = AuraAPICredentials(CLIENT_ID, CLIENT_SECRET, TENANT_ID)
sessions = GdsSessions(api_credentials=api_credentials)

# Name under which the session is created and later looked up.
session_name = "pharma"

# Ask for a memory size suited to the graph size and the algorithm families
# this notebook uses.
# NOTE(review): the PageRank output further down shows node IDs above 140,000,
# so these counts look like placeholders -- confirm against the real graph size.
memory = sessions.estimate(
    node_count=1000,
    relationship_count=5000,
    algorithm_categories=[
        AlgorithmCategory.CENTRALITY,
        AlgorithmCategory.NODE_EMBEDDING,
    ],
)

# Connection details of the Neo4j database the session is created against.
db_connection_info = DbmsConnectionInfo(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

In [39]:
# Reuse the session called `session_name` if it already exists; otherwise
# create it with the estimated memory and the time-to-live set here.
session_ttl = timedelta(hours=5)

gds = sessions.get_or_create(
    session_name=session_name,
    memory=memory,
    # Original author's note: this is checking for a bolt server currently.
    db_connection=db_connection_info,
    ttl=session_ttl,
)

Next, we create a projected graph in Neo4j GDS called "supply_chain" using three types of existing relationships from the database.

This query collects edges for the graph from three sources:

1.	**Suppliers → Materials**
	  
    •	From Suppliers to raw materials (API, BULK, DP, etc.)
	  
    •	Via SUPPLIES_RM relationship
2.	**Materials → Materials/Products**
	  
    •	PRODUCT_FLOW edges between any two nodes with material/product labels
	  
    •	Represents transformation or flow in the supply chain
3.	**Product → Distributor**
	  
    •	DISTRIBUTED_BY reversed to become Product → Distributor
	  
    •	So that the flow always goes forward through the supply chain

Each is assigned a weight of 1.0 to be used as a property.


In [119]:
graph_name = "supply_chain"

# Drop any projection left over from an earlier run so this cell can be
# re-executed without a name clash.
graph_already_projected = gds.graph.exists(graph_name)["exists"]
if graph_already_projected:
    gds.graph.drop(graph_name)

# The three UNION branches each emit (source, target, weight) rows; all rows
# are projected as a single 'SUPPLY_PATH' relationship type with a `weight`
# relationship property.
query = """
CALL {
    MATCH (sup:Suppliers)-[:SUPPLIES_RM]->(mat)
    WHERE mat:API OR mat:BULK OR mat:DP OR mat:FG OR mat:Batch
    RETURN sup AS source, mat AS target, 1.0 AS weight

    UNION

    MATCH (a)-[:PRODUCT_FLOW]->(b)
    WHERE (a:API OR a:BULK OR a:DP OR a:FG OR a:Batch OR a:Product)
      AND (b:API OR b:BULK OR b:DP OR b:FG OR b:Batch OR b:Product)
    RETURN a AS source, b AS target, 1.0 AS weight

    UNION

    MATCH (prod:Product)-[:DISTRIBUTED_BY]->(dist:Distributor)
    RETURN prod AS source, dist AS target, 1.0 AS weight
}
RETURN gds.graph.project.remote(
    source,
    target,
    {
        sourceNodeLabels: labels(source),
        targetNodeLabels: labels(target),
        relationshipType: 'SUPPLY_PATH',
        relationshipProperties: {weight:weight}
    }
)
"""

# Only the graph handle is kept; the second element of the result is discarded.
supply_chain_graph, _ = gds.graph.project(graph_name=graph_name, query=query)

 Graph creation from Triplets:   0%|          | 0/100 [00:00<?, ?%/s]

## Running our Algorithm
We then run PageRank to see how important each node is, based on the number and importance of its incoming relationships.

In [75]:
# Stream PageRank scores for the projected graph; the result has one row per
# node with `nodeId` and `score` columns.
pagerank_result = gds.pageRank.stream(supply_chain_graph, maxIterations=20, dampingFactor=0.85)

# Show the highest-scoring nodes first.
pagerank_result.sort_values("score", ascending=False)

 PageRank:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,nodeId,score
92317,5297,207.773806
92327,5307,206.804818
92339,5319,99.182765
92349,5329,96.975366
92374,5354,95.667966
...,...,...
58208,94862,0.150000
58210,94863,0.150000
58212,94864,0.150000
9978,63580,0.150000


Certain node types are more important than others when dealing with a supply chain. APIs (active pharmaceutical ingredients) are critical. Let's look at which APIs are most central in our pharmaceutical supply chain.

In [78]:
# Rank every node by the PageRank score streamed above and collect the IDs.
top_nodes = pagerank_result.sort_values(by="score", ascending=False)
node_ids = top_nodes["nodeId"].tolist()

# Look up descriptive properties for the API nodes among those IDs.
# The score is deliberately NOT read from the database: PageRank was run in
# `stream` mode, so nothing in this notebook writes a `PR` property and
# `n.PR` would be null (or stale from an earlier, unrelated run).
api_properties = gds.run_cypher("""
WITH $nodeIds AS nodeIds
UNWIND nodeIds AS nid
MATCH (n:API)
WHERE id(n) = nid
RETURN id(n) AS nodeId,
       labels(n) AS labels,
       n.companyName AS companyName,
       n.productSKU AS productSKU,
       n.name AS name,
       n.location AS location
ORDER BY nodeId
""", params={"nodeIds": node_ids})

# Attach the freshly computed score under the same column name as before so
# the displayed table keeps its shape.
pagerank_scores = top_nodes[["nodeId", "score"]].rename(columns={"score": "pageRank"})
resolved_nodes = api_properties.merge(pagerank_scores, on="nodeId", how="left")

resolved_nodes.sort_values(by="pageRank", ascending=False)

Unnamed: 0,nodeId,labels,companyName,productSKU,name,location,pageRank
1589,17644,"[Product, API]",,49c300cb-a791-4bbe-a977-4ebb01556a87,,Cape Town/ZA,2.309709
2309,23807,"[Product, API]",,b1a55d84-e60b-46d2-aaa4-c5da25fcb87f,,Quito/EC,2.250992
2639,26739,"[Product, API]",,2bd20d47-5cee-442f-ade1-ac3307b07ac5,,Quito/EC,2.234151
1569,17493,"[Product, API]",,b117a894-86d6-470d-bc3a-e39ea463335b,,Cape Town/ZA,2.182209
7129,143540,"[Product, API]",,982780dd-647d-4ef2-9bf8-608922e81ec5,,Venice/IT,2.168439
...,...,...,...,...,...,...,...
7210,144133,"[Product, API]",,208f200a-5d51-4eb8-97ae-f14168e1e0ba,,Montevideo/UY,0.277500
7230,144365,"[Product, API]",,8b417dbc-228e-4ca0-9477-b6c955ed81c6,,Sao Jose dos Campos/BR,0.277500
470,6308,"[Product, API]",,bfbfde03-fbf0-4595-b390-b5524a7f8e4c,,Philadelphia PA/US,0.277500
3850,36719,"[Product, API]",,6e3217a5-7b6e-4014-bdee-31df81da1d57,,Lima/PE,0.277500


It would be interesting to know if there are some APIs that are structurally similar to some of our most critical APIs. If there are, that would mean that we could potentially use them as replacements if one were to become unavailable.

We can use graph embeddings to convert the graph structure surrounding the APIs into numeric vectors.

In [120]:
# Compute FastRP embeddings and write them to the `apiEmbedding` node property.
# A fixed random seed keeps the embeddings reproducible between runs.
# NOTE(review): `nodeLabels=["API"]` presumably restricts the computation to API
# nodes only, which would ignore their non-API neighbours -- confirm that this
# is the intended notion of "structure surrounding the APIs".
# NOTE(review): `res` is whatever `write` returns; presumably a write summary
# rather than the embedding vectors themselves -- confirm before indexing it.
res = gds.fastRP.write(
    supply_chain_graph,
    nodeLabels=["API"],
    embeddingDimension=128,
    randomSeed=42,
    writeProperty="apiEmbedding",
)

In [88]:
# Preview the embeddings stored on the first five API nodes (ordered by SKU)
# to check that the `apiEmbedding` property is populated.
embedding_preview_query = """
    MATCH (m:API)
    RETURN m.productSKU AS sku, m.apiEmbedding AS apiEmbedding
    ORDER BY sku
    LIMIT 5
    """

gds.run_cypher(embedding_preview_query)

Unnamed: 0,sku,apiEmbedding
0,000715d3-3551-4db1-b7f3-a74eb4ac9a6f,"[0.0, 0.0, -0.15075567364692688, -0.1507556736..."
1,000feefa-62e5-4c8c-8fbe-bf69a0557a09,"[0.1348399668931961, 0.0, 0.0, -0.144337564706..."
2,0022100a-fae3-4292-8d3f-402d3f94ff29,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,002ae3b5-8e09-4730-b495-be9b928a845a,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,002ced81-39a6-4bcf-bdc2-e605c5677e27,"[0.0, -0.15249855816364288, 0.0, 0.0, -0.15249..."


We can then use k-nearest neighbors (KNN) to determine how similar different nodes are to each other based on these embeddings. Let's take a look at one of our top nodes and see if there are any structurally similar APIs:

In [113]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Neighbours requested per API. Each node is normally returned as its own
# nearest neighbour, so after the self-match filter below this leaves up to
# N_NEIGHBORS - 1 candidates per SKU.
N_NEIGHBORS = 5

# Load the embeddings from the database inside this cell. `res` holds the
# return value of `gds.fastRP.write(...)`, and no cell in this notebook builds
# a frame with `sku` / `embedding` columns, so indexing `res` for them only
# worked with leftover kernel state.
api_embeddings = gds.run_cypher("""
    MATCH (m:API)
    WHERE m.apiEmbedding IS NOT NULL
    RETURN m.productSKU AS sku, m.apiEmbedding AS embedding
    ORDER BY sku
""")
if api_embeddings.empty:
    raise ValueError("No API node has an 'apiEmbedding' property; run the FastRP cell first.")

# Convert embeddings to an (n_nodes, embedding_dimension) matrix.
X = np.vstack(api_embeddings["embedding"].values)
sku = api_embeddings["sku"].values

# Run KNN on cosine distance; cap k at the sample count so a tiny graph does
# not make scikit-learn raise.
knn = NearestNeighbors(n_neighbors=min(N_NEIGHBORS, len(sku)), metric="cosine")
knn.fit(X)

distances, indices = knn.kneighbors(X)

# One row per (sku, neighbour) pair. The repeat count is taken from the result
# shape so it always agrees with the k actually used.
knn_df = pd.DataFrame({
    "sku": np.repeat(sku, indices.shape[1]),
    "neighbor_sku": sku[indices.ravel()],
    "distance": distances.ravel(),
})

# Drop rows where a node was matched with itself.
knn_df = knn_df[knn_df["sku"] != knn_df["neighbor_sku"]]

In [118]:
# Show the nearest neighbours found for one API, identified by its SKU.
target_sku = "5faba79f-e4e1-4ef7-9d35-d70b24f948ca"
knn_df.loc[knn_df["sku"] == target_sku]

Unnamed: 0,sku,neighbor_sku,distance
6,5faba79f-e4e1-4ef7-9d35-d70b24f948ca,21fada67-5bec-400c-8ae1-cafdb6b91fcc,0.476133
7,5faba79f-e4e1-4ef7-9d35-d70b24f948ca,6d30dbd5-9ad2-4648-bf2f-a8b32f92a8dd,0.573361
8,5faba79f-e4e1-4ef7-9d35-d70b24f948ca,d5e6498c-00fd-48b2-bfc3-7417d54de8cc,0.663016
9,5faba79f-e4e1-4ef7-9d35-d70b24f948ca,a13e07c7-f0f7-478c-ae12-82e5160ab03a,0.692503


And with that we can safely close the session!

In [None]:
# Delete the session created earlier in this notebook. The name must match the
# one used at creation (`session_name`); the previous hard-coded
# "my-new-session" did not, so the real session was never deleted by this cell.
sessions.delete(session_name=session_name)