In [None]:
!pip install graphdatascience==1.15a2

Collecting graphdatascience==1.15a2
  Downloading graphdatascience-1.15a2-py3-none-any.whl.metadata (7.8 kB)
Downloading graphdatascience-1.15a2-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphdatascience
  Attempting uninstall: graphdatascience
    Found existing installation: graphdatascience 1.12
    Uninstalling graphdatascience-1.12:
      Successfully uninstalled graphdatascience-1.12
Successfully installed graphdatascience-1.15a2


In [None]:
!pip install --upgrade numpy

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which i

In [None]:
# Fetch the three Aura API credentials by name instead of hard-coding them.
# NOTE(review): `userdata` is not imported in any visible cell — presumably
# `from google.colab import userdata`; confirm and import it before this cell.
CLIENT_ID, CLIENT_SECRET, TENANT_ID = (
    userdata.get(secret_name)
    for secret_name in ("CLIENT_ID", "CLIENT_SECRET", "TENANT_ID")
)

## Set Up Sessions
Authenticate with the credentials above, then spin up (or reconnect to) a *session*.

In [None]:
from graphdatascience.session import GdsSessions, AuraAPICredentials, AlgorithmCategory, CloudLocation
from datetime import timedelta

# Authenticate against the Aura API. `sessions` is the handle used to size and
# create the session here, and to delete it at the end of the notebook.
sessions = GdsSessions(
    api_credentials=AuraAPICredentials(CLIENT_ID, CLIENT_SECRET, TENANT_ID)
)

name = "my-new-session-sm"
cloud_location = CloudLocation(provider="gcp", region="europe-west1")

# Ask the service for a memory size based on the expected graph size and the
# algorithm categories listed.
# NOTE(review): the upload progress recorded further down reports 421 nodes and
# 1398 relationships, so these counts look like placeholders — confirm the
# estimate is still adequate for the real data.
memory = sessions.estimate(
    algorithm_categories=[AlgorithmCategory.CENTRALITY, AlgorithmCategory.NODE_EMBEDDING],
    relationship_count=50,
    node_count=20,
)

# `gds` is the session client that every later cell talks to.
gds = sessions.get_or_create(
    session_name=name,
    cloud_location=cloud_location,
    memory=memory,
    ttl=timedelta(hours=5),
)

## Load in Data
Load data from my GitHub repository describing how the stations of the NYC subway connect to one another.

In [None]:
import pandas as pd

# Relationship list: one row per connection between two stations.
lines = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/corydonbaylor/aura-graph-analytics/refs/heads/main/mta_subways/data/lines.csv"
)

# Node list: one row per station (later cells use its `id` and `station_name` columns).
stations = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/corydonbaylor/aura-graph-analytics/refs/heads/main/mta_subways/data/nodes.csv"
)


# Creating a Projection
You can create a projection directly from pandas DataFrames. We have two DataFrames: one that represents stations and one that represents lines.

Currently, Graph Analytics only accepts directed graphs, so we need to explicitly create the relationships going in the other direction as well.


In [None]:
# Build the reverse direction of every relationship by swapping the two
# endpoint columns; concat aligns on column names, so the swapped frame lines
# up with the original one.
lines2 = lines.rename(
    columns=dict(sourceNodeId='targetNodeId', targetNodeId='sourceNodeId')
)

# NOTE(review): `lines` is overwritten with its own doubled version, so
# re-running this cell appends the reversed copies again. Run it exactly once
# per kernel (or reload the CSV first).
lines = pd.concat((lines, lines2), ignore_index=True)
lines

Unnamed: 0,sourceNodeId,targetNodeId,relationshipType
0,0,1,GOES_TO
1,1,2,GOES_TO
2,2,3,GOES_TO
3,3,4,GOES_TO
4,4,5,GOES_TO
...,...,...,...
1393,336,408,GOES_TO
1394,32,336,GOES_TO
1395,34,32,GOES_TO
1396,61,34,GOES_TO


We do need to do some mild clean up to make sure that everything has the right names.

For the dataframe representing nodes:
- The first column should be called `nodeId`
- It cannot contain text (string) columns, so we will have to drop the station names

For the dataframe representing relationships:
- We need to have columns called `sourceNodeId` and `targetNodeId`
- As well as what we want to call that relationship in a column called `relationshipType`

In [None]:
# Rename the id column to the name the projection expects. `stations` keeps
# its other columns (the name lookup later needs `station_name`).
stations = stations.rename({'id': 'nodeId'}, axis='columns')

# The node frame handed to the projection carries only the id column.
nodes = stations.loc[:, ['nodeId']]
nodes

Unnamed: 0,nodeId
0,0
1,1
2,2
3,3
4,4
...,...
416,416
417,417
418,418
419,419


## Graph Construct
Using `graph.construct`, we can easily create a projection.

In [None]:
graph_name = "subways"

# Drop any projection already registered under this name so the cell can be
# re-run within the same session.
if gds.graph.exists(graph_name)["exists"]:
    gds.graph.drop(graph_name)
    print(f"Graph '{graph_name}' dropped.")

# Build the projection from the node and relationship DataFrames. Pass
# `graph_name` rather than repeating the literal, so the existence check and
# the construction can never refer to different graphs.
G = gds.graph.construct(graph_name, nodes, lines)

Graph 'subways' dropped.


Uploading Nodes:   0%|          | 0/421 [00:00<?, ?Records/s]

Uploading Relationships:   0%|          | 0/1398 [00:00<?, ?Records/s]

## Returning Results
We will use Dijkstra's shortest-path algorithm to see how we can move through the system efficiently.

We can create a simple wrapper function below, so that we can use the names of stations rather than their `nodeIds`.

And with that, let's see how to get from Grand Army Plaza in Brooklyn to Times Square:

In [None]:
# Lookup table: station name -> node id (a later duplicate name would win).
station_crosswalk = {
    station_name: node_id
    for station_name, node_id in zip(stations['station_name'], stations['nodeId'])
}

def get_shortest_path(source_station, target_station, G):
    """Return the shortest path between two stations, keyed by station name.

    Station names are translated to node ids through the module-level
    ``station_crosswalk``; the search itself runs through the module-level
    ``gds`` client using Dijkstra's algorithm.

    Args:
        source_station: Name of the start station, as it appears in
            ``station_crosswalk``.
        target_station: Name of the destination station.
        G: Graph projection to search.

    Returns:
        dict mapping station name -> node id, in path order from source to
        target. Empty when the algorithm returns no path.

    Raises:
        ValueError: If either station name is not in ``station_crosswalk``.
    """
    # Fail early with a readable message instead of sending `None` as a node
    # id to the server when a name is misspelled.
    unknown = [
        station
        for station in (source_station, target_station)
        if station not in station_crosswalk
    ]
    if unknown:
        raise ValueError(f"Unknown station name(s): {unknown}")

    result = gds.shortestPath.dijkstra.stream(
        G,
        sourceNode=station_crosswalk[source_station],
        targetNode=station_crosswalk[target_station],
    )

    # No rows means no path was found (for example after removing a station
    # from the projection); report that as an empty path instead of failing
    # on the `[0]` lookup below.
    if len(result) == 0 or len(result['nodeIds']) == 0:
        return {}

    node_ids = result['nodeIds'][0]

    # Invert the crosswalk so the ids on the path can be turned back into names.
    id_to_station = {node_id: station for station, node_id in station_crosswalk.items()}
    return {
        id_to_station[node_id]: node_id
        for node_id in node_ids
        if node_id in id_to_station
    }

# Example: Grand Army Plaza to Times Square on the full network projected as `G`.
source_station, target_station = "Grand Army Plaza - Bk", "Times Sq-42 St - M"

# Despite its name, `path_df` is a dict of station name -> node id in path order.
path_df = get_shortest_path(source_station, target_station, G)

path_df

{'Grand Army Plaza - Bk': 69,
 'Bergen St - Bk': 68,
 'Atlantic Av-Barclays Ctr - Bk': 67,
 'Canal St - M': 32,
 '14 St-Union Sq - M': 104,
 '34 St-Herald Sq - M': 230,
 'Times Sq-42 St - M': 24}

But what if one of those stations were closed? What would be the quickest path then? Let's see what would happen if Herald Square were closed:

In [None]:
def exclude_node(nodes_df, lines_df, node_to_exclude):
    """Return copies of the node and relationship frames without one node.

    Args:
        nodes_df: DataFrame with a ``nodeId`` column.
        lines_df: DataFrame with ``sourceNodeId`` and ``targetNodeId`` columns.
        node_to_exclude: Id of the node to remove.

    Returns:
        Tuple ``(nodes, lines)``: the rows of ``nodes_df`` whose id differs
        from ``node_to_exclude`` and the rows of ``lines_df`` that neither
        start nor end at it. Original index labels are kept; the inputs are
        not modified.
    """
    keep_node = nodes_df['nodeId'].ne(node_to_exclude)

    # A relationship is dropped when either of its endpoints is the excluded node.
    touches_excluded = (
        lines_df['sourceNodeId'].eq(node_to_exclude)
        | lines_df['targetNodeId'].eq(node_to_exclude)
    )
    return nodes_df[keep_node], lines_df[~touches_excluded]

# Look the closed station up by name instead of hard-coding its node id
# (230 in the recorded path output), so the cell stays correct if ids change.
closed_nodes, closed_lines = exclude_node(
    nodes, lines, station_crosswalk["34 St-Herald Sq - M"]
)

We then need to create a new projection without Herald Square:


In [None]:
graph_name = "exclude"

# Start from a clean slate: remove a projection of the same name if a previous
# run of this cell left one behind.
if gds.graph.exists(graph_name)["exists"]:
    gds.graph.drop(graph_name)
    print(f"Graph '{graph_name}' dropped.")

# NOTE: this rebinds `G`; from here on it refers to the projection built from
# the frames without the closed station, not the full "subways" graph.
G = gds.graph.construct(
    graph_name,
    closed_nodes,
    closed_lines,
)

Graph 'exclude' dropped.


Uploading Nodes:   0%|          | 0/420 [00:00<?, ?Records/s]

Uploading Relationships:   0%|          | 0/1366 [00:00<?, ?Records/s]

And rerun to see the new path:

In [None]:
# Same trip as before, but `G` now refers to the "exclude" projection, i.e.
# the network without Herald Square.
source_station = "Grand Army Plaza - Bk"
target_station = "Times Sq-42 St - M"

path_df = get_shortest_path(source_station, target_station, G)

# Leave the result as the cell's last expression (as in the first run) so the
# notebook displays it instead of printing it.
path_df

{'Grand Army Plaza - Bk': 69, 'Bergen St - Bk': 68, 'Atlantic Av-Barclays Ctr - Bk': 67, 'Canal St - M': 32, 'Chambers St - M': 34, '14 St - M': 29, '34 St-Penn Station - M': 25, 'Times Sq-42 St - M': 24}


Don't forget to close your session!

In [None]:
# Delete the session created at the top of the notebook. Reuse `name` instead
# of repeating the literal so this always targets the session that was
# actually created, even if the name is changed there.
sessions.delete(session_name=name)

True