# Graph Feature Engineering

In [1]:
import math
import os

import neo4j
import numpy as np
import pandas as pd
import psycopg2
from IPython.display import display

## Postgres will hold our tables of features

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Args:
        query: SQL select statement handed to pandas.read_sql_query.
        rollback_before_flag: when true, roll back the module-level
            ``connection`` before the query runs.
        rollback_after_flag: when true, roll back the module-level
            ``connection`` after the query has been read.

    Returns:
        The query result as a DataFrame. Every float64 column whose non-null
        values are all whole numbers is converted to the nullable 'Int64'
        dtype (nulls become <NA>).
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        present = df[column].dropna()

        # Vectorized whole-number test. Infinite values are treated as
        # "not an integer" instead of raising from math.floor().
        is_whole = np.isfinite(present) & (present == np.floor(present))

        if is_whole.all():
            df[column] = df[column].astype('Int64')

    return df
    

In [3]:
# Connection settings can be overridden through environment variables so that
# real credentials never need to be edited into the notebook; the fallbacks
# are the values this notebook has always used.
connection = psycopg2.connect(
    user = os.environ.get("POSTGRES_USER", "postgres"),
    password = os.environ.get("POSTGRES_PASSWORD", "ucb"),
    host = os.environ.get("POSTGRES_HOST", "postgres"),
    port = os.environ.get("POSTGRES_PORT", "5432"),
    database = os.environ.get("POSTGRES_DB", "postgres")
)

In [4]:
# Single shared cursor used by every cell below that writes to Postgres.
cursor = connection.cursor()

## Neo4j

In [5]:
# URI and credentials can be overridden through environment variables; the
# fallbacks are the values this notebook has always used.
driver = neo4j.GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "neo4j://neo4j:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205"),
    ),
)

In [6]:
# Single shared session, bound to the "neo4j" database, used by every Cypher call below.
session = driver.session(database="neo4j")

In [7]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Uses the module-level ``session``. Takes no arguments and returns nothing.
    """

    # DETACH DELETE removes every node together with its relationships in one
    # statement, replacing the earlier two-pass delete. consume() waits for the
    # statement to finish so a failure is raised here rather than later.
    query = "match (node) detach delete node"
    session.run(query).consume()

In [8]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the shared session and return the rows as a DataFrame.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    Column names are the keys of the result.
    """

    result = session.run(query, **kwargs)

    rows = []
    for record in result:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=result.keys())

In [9]:
def my_neo4j_nodes_relationships():
    """Display all nodes and relationships and print the graph density.

    Runs two Cypher queries through my_neo4j_run_query_pandas, displays both
    result frames, then prints density = 2 * relationships / (nodes * (nodes - 1)).
    With fewer than two nodes there are no possible pairs, so the density is
    reported as 0.0 instead of dividing by zero.
    """

    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")

    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """

    df = my_neo4j_run_query_pandas(query)

    number_nodes = df.shape[0]

    display(df)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")

    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    df = my_neo4j_run_query_pandas(query)

    number_relationships = df.shape[0]

    display(df)

    # NOTE(review): the match above counts each directed relationship once, so
    # a track stored in both directions contributes twice to this figure;
    # confirm that this is the intended definition of density.
    possible_pairs = number_nodes * (number_nodes - 1)

    if possible_pairs:
        density = (2 * number_relationships) / possible_pairs
    else:
        # empty or single-node graph: nothing to connect
        density = 0.0

    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

## We will use our same disconnected graph of the high speed railways

In [10]:
def my_create_disconnected_graph():
    """Create the disconnected graph of railway stations.

    Wipes the database first, then creates 14 Station nodes and 30 directed
    TRACK relationships (every track is stored once in each direction) in a
    single CREATE statement. Some stations carry a ``label`` property and
    some do not.
    """
    
    my_neo4j_wipe_out_database()

    query = """

    CREATE
      (seattle:Station {name: 'Seattle', latitude: 47.6062, longitude: -122.3321, label: 1}),
      (berkeley:Station {name: 'Berkeley', latitude: 37.8715, longitude: -122.2730, label: 1}),
      (losangeles:Station {name: 'Los Angeles', latitude: 34.0522, longitude: -118.2437, label: 1}),
      (denver:Station {name: 'Denver', latitude: 39.7392, longitude: -104.9903}),
      (dallas:Station {name: 'Dallas', latitude: 32.7767, longitude: -96.7970}),
      (chicago:Station {name: 'Chicago', latitude: 41.8781, longitude: -87.6298}),
      (newyork:Station {name: 'New York', latitude: 40.7128, longitude: -74.0060, label: 2}),
      (washington:Station {name: 'Washington', latitude: 38.9072, longitude: -77.0369, label: 2}),
      (miami:Station {name: 'Miami', latitude: 25.7617, longitude: -80.1918, label: 2}),
      (anchorage:Station {name: 'Anchorage', latitude: 61.2181, longitude: -149.9003, label:3}),
      (fairbanks:Station {name: 'Fairbanks', latitude: 64.8378, longitude: -147.7164}),
      (sanjuan:Station {name: 'San Juan', latitude: 18.4655, longitude: -66.1057, label:4}),
      (ponce:Station {name: 'Ponce', latitude: 18.0111, longitude: -66.6141}),
      (mayaguez:Station {name: 'Mayaguez', latitude: 18.2013, longitude: -67.1452}),
      (seattle)-[:TRACK {track_miles: 798}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 798}]->(seattle),
      (seattle)-[:TRACK {track_miles: 1303}]->(denver),
      (denver)-[:TRACK {track_miles: 1303}]->(seattle),
      (berkeley)-[:TRACK {track_miles: 1240}]->(denver),
      (denver)-[:TRACK {track_miles: 1240}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 376}]->(losangeles),
      (losangeles)-[:TRACK {track_miles: 376}]->(berkeley),
      (losangeles)-[:TRACK {track_miles: 1436}]->(dallas),
      (dallas)-[:TRACK {track_miles: 1436}]->(losangeles),
      (denver)-[:TRACK {track_miles: 1003}]->(chicago),
      (chicago)-[:TRACK {track_miles: 1003}]->(denver),
      (denver)-[:TRACK {track_miles: 794}]->(dallas),
      (dallas)-[:TRACK {track_miles: 794}]->(denver),
      (chicago)-[:TRACK {track_miles: 794}]->(newyork),
      (newyork)-[:TRACK {track_miles: 794}]->(chicago),
      (dallas)-[:TRACK {track_miles: 1329}]->(washington),
      (washington)-[:TRACK {track_miles: 1329}]->(dallas),
      (newyork)-[:TRACK {track_miles: 226}]->(washington),
      (washington)-[:TRACK {track_miles: 226}]->(newyork),
      (washington)-[:TRACK {track_miles: 1053}]->(miami),
      (miami)-[:TRACK {track_miles: 1053}]->(washington),
      (anchorage)-[:TRACK {track_miles: 359}]->(fairbanks),
      (fairbanks)-[:TRACK {track_miles: 359}]->(anchorage),
      (sanjuan)-[:TRACK {track_miles: 71}]->(ponce),
      (ponce)-[:TRACK {track_miles: 71}]->(sanjuan),
      (ponce)-[:TRACK {track_miles: 57}]->(mayaguez),
      (mayaguez)-[:TRACK {track_miles: 57}]->(ponce),
      (mayaguez)-[:TRACK {track_miles: 120}]->(sanjuan),
      (sanjuan)-[:TRACK {track_miles: 120}]->(mayaguez)


    """

    session.run(query)

In [11]:
# Rebuild the railway graph from scratch (the function wipes the database first).
my_create_disconnected_graph()

In [12]:
# Show the nodes, the relationships and the density of the graph just created.
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Anchorage,[Station]
1,Berkeley,[Station]
2,Chicago,[Station]
3,Dallas,[Station]
4,Denver,[Station]
5,Fairbanks,[Station]
6,Los Angeles,[Station]
7,Mayaguez,[Station]
8,Miami,[Station]
9,New York,[Station]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,Anchorage,[Station],TRACK,Fairbanks,[Station]
1,Berkeley,[Station],TRACK,Denver,[Station]
2,Berkeley,[Station],TRACK,Los Angeles,[Station]
3,Berkeley,[Station],TRACK,Seattle,[Station]
4,Chicago,[Station],TRACK,Denver,[Station]
5,Chicago,[Station],TRACK,New York,[Station]
6,Dallas,[Station],TRACK,Denver,[Station]
7,Dallas,[Station],TRACK,Los Angeles,[Station]
8,Dallas,[Station],TRACK,Washington,[Station]
9,Denver,[Station],TRACK,Berkeley,[Station]


-------------------------
  Density: 0.3
-------------------------


# Lab: Neo4j - Graphy Features

## There is some variation in what is a "graphy feature" and what is a "graph algorithm feature";  many of the graphy features require a graph algorithm to be run to calculate the feature;  a better approach is to consider what can be single valued in a relational table of nodes;  for those who have had Machine Learning, features are typically input in an ML algorithm as a table structure

## Create a relational table in Postgres for the graphy_features: node, degree, closeness, betweenness, triangle_count, clustering_coefficient, and community

In [13]:
# Clear any open or failed transaction before issuing DDL.
connection.rollback()

# Recreate the table from scratch on every run: one row per node, one column per feature.
query = """

drop table if exists graphy_features
;

create table graphy_features(
    node varchar(32),
    degree numeric(5),
    closeness numeric(5,4),
    betweenness numeric(5),
    triangle_count numeric(5),
    clustering_coefficient numeric(5,4),
    community numeric(5)
)
;

"""

cursor.execute(query)

connection.commit()

In [14]:
def my_get_node_list():
    "return the names of all nodes in the current graph, sorted"

    records = session.run("match (n) return n.name as name")

    return sorted(record["name"] for record in records)

In [15]:
connection.rollback()

query = """

insert into graphy_features
values
(%s, 0, 0, 0, 0, 0, 0)
;

"""

node_list = my_get_node_list()

# one row per node, every feature initialised to zero
cursor.executemany(query, [(node,) for node in node_list])

connection.commit()

In [16]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,0,0,0,0,0,0
1,Berkeley,0,0,0,0,0,0
2,Chicago,0,0,0,0,0,0
3,Dallas,0,0,0,0,0,0
4,Denver,0,0,0,0,0,0
5,Fairbanks,0,0,0,0,0,0
6,Los Angeles,0,0,0,0,0,0
7,Mayaguez,0,0,0,0,0,0
8,Miami,0,0,0,0,0,0
9,New York,0,0,0,0,0,0


## For each node, calculate the degree centrality and update our graphy_features table

In [17]:
# Drop any earlier 'ds_graph' projection; the `false` argument is presumably
# GDS's fail-if-missing flag, so this does not error on a first run (confirm against the GDS docs).
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# Project Station nodes and TRACK relationships, carrying track_miles as a relationship property.
query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f0653d66c40>

In [18]:
query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

result = session.run(query)

# write each streamed degree score onto the matching node row
update_sql = "update graphy_features set degree = %s where node = %s"

cursor.executemany(update_sql, [(r["degree"], r["name"]) for r in result])

connection.commit()

In [19]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,1,0,0,0,0,0
1,Berkeley,3,0,0,0,0,0
2,Chicago,2,0,0,0,0,0
3,Dallas,3,0,0,0,0,0
4,Denver,4,0,0,0,0,0
5,Fairbanks,1,0,0,0,0,0
6,Los Angeles,2,0,0,0,0,0
7,Mayaguez,2,0,0,0,0,0
8,Miami,1,0,0,0,0,0
9,New York,2,0,0,0,0,0


## For each node, calculate the closeness centrality and update our graphy_features table; we will use harmonic centrality to calculate closeness since it handles disconnected subgraphs and gives much smoother results

In [20]:
# Drop any earlier 'ds_graph' projection; the `false` argument is presumably
# GDS's fail-if-missing flag, so this does not error when the graph is absent (confirm against the GDS docs).
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# Project Station nodes and TRACK relationships, carrying track_miles as a relationship property.
query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f0653defc40>

In [21]:
query = """

CALL gds.closeness.harmonic.stream('ds_graph', {})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY closeness DESC

"""

result = session.run(query)

# write each streamed harmonic closeness score onto the matching node row
update_sql = "update graphy_features set closeness = %s where node = %s"

cursor.executemany(update_sql, [(r["closeness"], r["name"]) for r in result])

connection.commit()

In [22]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,1,0.0769,0,0,0,0
1,Berkeley,3,0.3782,0,0,0,0
2,Chicago,2,0.359,0,0,0,0
3,Dallas,3,0.4231,0,0,0,0
4,Denver,4,0.4487,0,0,0,0
5,Fairbanks,1,0.0769,0,0,0,0
6,Los Angeles,2,0.3462,0,0,0,0
7,Mayaguez,2,0.1538,0,0,0,0
8,Miami,1,0.2692,0,0,0,0
9,New York,2,0.3462,0,0,0,0


## For each node, calculate the betweenness centrality and update our graphy_features table; 

In [23]:
# Drop any earlier 'ds_graph' projection; the `false` argument is presumably
# GDS's fail-if-missing flag, so this does not error when the graph is absent (confirm against the GDS docs).
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# Project Station nodes and TRACK relationships; track_miles is needed as the weight for betweenness below.
query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f0653e16610>

In [24]:
query = """

CALL gds.betweenness.stream('ds_graph', {relationshipWeightProperty: 'track_miles'})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

result = session.run(query)

# write each streamed betweenness score onto the matching node row
update_sql = "update graphy_features set betweenness = %s where node = %s"

cursor.executemany(update_sql, [(r["betweenness"], r["name"]) for r in result])

connection.commit()


In [25]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,1,0.0769,0,0,0,0
1,Berkeley,3,0.3782,6,0,0,0
2,Chicago,2,0.359,14,0,0,0
3,Dallas,3,0.4231,10,0,0,0
4,Denver,4,0.4487,18,0,0,0
5,Fairbanks,1,0.0769,0,0,0,0
6,Los Angeles,2,0.3462,6,0,0,0
7,Mayaguez,2,0.1538,0,0,0,0
8,Miami,1,0.2692,0,0,0,0
9,New York,2,0.3462,12,0,0,0


## For each node, calculate the triangle count and update our graphy_features table

In [26]:
# Drop any earlier 'ds_graph' projection; the `false` argument is presumably
# GDS's fail-if-missing flag, so this does not error when the graph is absent (confirm against the GDS docs).
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# This projection differs from the earlier ones: TRACK is projected as UNDIRECTED and without track_miles.
query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {orientation: 'UNDIRECTED'}})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f0653e1ef10>

In [27]:
query = """

CALL gds.triangleCount.stream('ds_graph')
YIELD nodeId, triangleCount
RETURN gds.util.asNode(nodeId).name AS name, triangleCount as triangle_count
ORDER BY triangleCount DESC, name

"""

result = session.run(query)

# write each streamed triangle count onto the matching node row
update_sql = "update graphy_features set triangle_count = %s where node = %s"

cursor.executemany(update_sql, [(r["triangle_count"], r["name"]) for r in result])

connection.commit()



In [28]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,1,0.0769,0,0,0,0
1,Berkeley,3,0.3782,6,1,0,0
2,Chicago,2,0.359,14,0,0,0
3,Dallas,3,0.4231,10,0,0,0
4,Denver,4,0.4487,18,1,0,0
5,Fairbanks,1,0.0769,0,0,0,0
6,Los Angeles,2,0.3462,6,0,0,0
7,Mayaguez,2,0.1538,0,1,0,0
8,Miami,1,0.2692,0,0,0,0
9,New York,2,0.3462,12,0,0,0


## For each node, calculate the clustering coefficient and update our graphy_features table

In [29]:
# Drop any earlier 'ds_graph' projection; the `false` argument is presumably
# GDS's fail-if-missing flag, so this does not error when the graph is absent (confirm against the GDS docs).
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# TRACK is projected as UNDIRECTED and without track_miles, same as for the triangle count.
query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {orientation: 'UNDIRECTED'}})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f0653817c70>

In [30]:
query = """

CALL gds.localClusteringCoefficient.stream('ds_graph')
YIELD nodeId, localClusteringCoefficient
RETURN gds.util.asNode(nodeId).name AS name, localClusteringCoefficient as clustering_coefficient
ORDER BY localClusteringCoefficient DESC, name

"""

result = session.run(query)

# write each streamed local clustering coefficient onto the matching node row
update_sql = "update graphy_features set clustering_coefficient = %s where node = %s"

cursor.executemany(update_sql, [(r["clustering_coefficient"], r["name"]) for r in result])

connection.commit()


In [31]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,1,0.0769,0,0,0.0,0
1,Berkeley,3,0.3782,6,1,0.3333,0
2,Chicago,2,0.359,14,0,0.0,0
3,Dallas,3,0.4231,10,0,0.0,0
4,Denver,4,0.4487,18,1,0.1667,0
5,Fairbanks,1,0.0769,0,0,0.0,0
6,Los Angeles,2,0.3462,6,0,0.0,0
7,Mayaguez,2,0.1538,0,1,1.0,0
8,Miami,1,0.2692,0,0,0.0,0
9,New York,2,0.3462,12,0,0.0,0


## For each node, calculate the community and update our graphy_features table;  we will use Louvain Modularity since it gave us the best results

In [32]:
# Drop any earlier 'ds_graph' projection; the `false` argument is presumably
# GDS's fail-if-missing flag, so this does not error when the graph is absent (confirm against the GDS docs).
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# Back to the directed projection with track_miles. The shortest-path cell
# further down runs against this same 'ds_graph' without re-projecting.
query = """

CALL gds.graph.project('ds_graph', 'Station', 'TRACK', 
                      {relationshipProperties: 'track_miles'})
"""

session.run(query)

<neo4j._sync.work.result.Result at 0x7f0653824ac0>

In [33]:
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

result = session.run(query)

# only the final community id is stored; intermediate_community is returned but not used
update_sql = "update graphy_features set community = %s where node = %s"

cursor.executemany(update_sql, [(r["community"], r["name"]) for r in result])

connection.commit()

In [34]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,closeness,betweenness,triangle_count,clustering_coefficient,community
0,Anchorage,1,0.0769,0,0,0.0,10
1,Berkeley,3,0.3782,6,1,0.3333,3
2,Chicago,2,0.359,14,0,0.0,6
3,Dallas,3,0.4231,10,0,0.0,3
4,Denver,4,0.4487,18,1,0.1667,3
5,Fairbanks,1,0.0769,0,0,0.0,10
6,Los Angeles,2,0.3462,6,0,0.0,3
7,Mayaguez,2,0.1538,0,1,1.0,12
8,Miami,1,0.2692,0,0,0.0,6
9,New York,2,0.3462,12,0,0.0,6


## We can also query stats such as average, standard deviation, etc.

In [35]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

# the one-row summary CTE is cross-joined onto every node row, so each node
# carries the table-wide average and standard deviation next to its own value
query = """

with summary as (

select avg(degree) as avg_degree,
       stddev(degree) as std_degree,
       avg(closeness) as avg_closeness,
       stddev(closeness) as std_closeness,
       avg(betweenness) as avg_betweenness,
       stddev(betweenness) as std_betweenness,
       avg(triangle_count) as avg_triangle_count,
       stddev(triangle_count) as std_triangle_count,
       avg(clustering_coefficient) as avg_clustering_coefficient,
       stddev(clustering_coefficient) as std_clustering_coefficient
from graphy_features

)

select a.node,
       a.degree, b.avg_degree, b.std_degree,
       a.closeness, b.avg_closeness, b.std_closeness,
       a.betweenness, b.avg_betweenness, b.std_betweenness,
       a.triangle_count, b.avg_triangle_count, b.std_triangle_count,
       a.clustering_coefficient, b.avg_clustering_coefficient, b.std_clustering_coefficient,
       a.community
from graphy_features as a,
     summary as b
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,degree,avg_degree,std_degree,closeness,avg_closeness,std_closeness,betweenness,avg_betweenness,std_betweenness,triangle_count,avg_triangle_count,std_triangle_count,clustering_coefficient,avg_clustering_coefficient,std_clustering_coefficient,community
0,Anchorage,1,2.142857,0.864438,0.0769,0.280207,0.130481,0,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,10
1,Berkeley,3,2.142857,0.864438,0.3782,0.280207,0.130481,6,6,7.103629,1,0.428571,0.513553,0.3333,0.321429,0.455095,3
2,Chicago,2,2.142857,0.864438,0.359,0.280207,0.130481,14,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,6
3,Dallas,3,2.142857,0.864438,0.4231,0.280207,0.130481,10,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,3
4,Denver,4,2.142857,0.864438,0.4487,0.280207,0.130481,18,6,7.103629,1,0.428571,0.513553,0.1667,0.321429,0.455095,3
5,Fairbanks,1,2.142857,0.864438,0.0769,0.280207,0.130481,0,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,10
6,Los Angeles,2,2.142857,0.864438,0.3462,0.280207,0.130481,6,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,3
7,Mayaguez,2,2.142857,0.864438,0.1538,0.280207,0.130481,0,6,7.103629,1,0.428571,0.513553,1.0,0.321429,0.455095,12
8,Miami,1,2.142857,0.864438,0.2692,0.280207,0.130481,0,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,6
9,New York,2,2.142857,0.864438,0.3462,0.280207,0.130481,12,6,7.103629,0,0.428571,0.513553,0.0,0.321429,0.455095,6


# Lab: Neo4j - Graph Algorithm Features

## For our Graph Algorithm Features, we will consider a graph algorithm feature to be anything that is multivalued and/or not specific to a single node;  we will create two tables: one for shortest paths and one for minimum spanning trees

## Create a relational table in Postgres for the graph_algorithm_features_1: source, target, cost, hops, path_string, path, path_costs

In [36]:
# Clear any open or failed transaction before issuing DDL.
connection.rollback()

# Recreate the shortest-path table from scratch: one row per (source, target) pair.
# path and path_costs are varchar columns; the cell that fills them passes Python lists.
query = """

drop table if exists graph_algorithm_features_1
;

create table graph_algorithm_features_1(
    source varchar(32),
    target varchar(32),
    cost numeric(5),
    hops numeric(5),
    path_string varchar(100),
    path varchar(100),
    path_costs varchar(100)
)
;

"""

cursor.execute(query)

connection.commit()

## For each node loop through all nodes and run Dijkstra's algorithm for shortest path;

In [37]:
connection.rollback()

neo4j_query = """

MATCH (source:Station {name: $source}), (target:Station {name: $target})
CALL gds.shortestPath.dijkstra.stream(
    'ds_graph', 
    { sourceNode: source, 
      targetNode: target, 
      relationshipWeightProperty: 'track_miles'
    }
)
YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
RETURN
    gds.util.asNode(sourceNode).name AS from,
    gds.util.asNode(targetNode).name AS to,
    totalCost,
    [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
    costs
ORDER BY index

"""

postgres_query = """

insert into graph_algorithm_features_1
values
(%s, %s, %s, %s, %s, %s, %s)
;

"""

node_list = my_get_node_list()

# every ordered pair of distinct stations; pairs with no route return no rows
# and therefore produce no insert
for source in node_list:
    for target in node_list:

        if source == target:
            continue

        result = session.run(neo4j_query, source=source, target=target)

        for r in result:

            path_list = r["nodes"]

            # hops = number of edges = number of nodes on the path minus one
            hops = len(path_list) - 1

            # node names run together with no separator
            path_string = "".join(path_list)

            row = (r["from"], r["to"], r["totalCost"], hops,
                   path_string, path_list, r["costs"])

            cursor.execute(postgres_query, row)

connection.commit()

## source, target, cost, hops, path_string can go directly into machine learning algorithms

## path_string would probably use a string kernel with Levenshtein distances for a measure of distance

## path and path_costs would be more suitable to constructing vectors

In [38]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graph_algorithm_features_1
order by source, target

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,source,target,cost,hops,path_string,path,path_costs
0,Anchorage,Fairbanks,359,1,AnchorageFairbanks,"{Anchorage,Fairbanks}","{0.0,359.0}"
1,Berkeley,Chicago,2243,2,BerkeleyDenverChicago,"{Berkeley,Denver,Chicago}","{0.0,1240.0,2243.0}"
2,Berkeley,Dallas,1812,2,BerkeleyLos AngelesDallas,"{Berkeley,""Los Angeles"",Dallas}","{0.0,376.0,1812.0}"
3,Berkeley,Denver,1240,1,BerkeleyDenver,"{Berkeley,Denver}","{0.0,1240.0}"
4,Berkeley,Los Angeles,376,1,BerkeleyLos Angeles,"{Berkeley,""Los Angeles""}","{0.0,376.0}"
...,...,...,...,...,...,...,...
75,Washington,Denver,2023,3,WashingtonNew YorkChicagoDenver,"{Washington,""New York"",Chicago,Denver}","{0.0,226.0,1020.0,2023.0}"
76,Washington,Los Angeles,2765,2,WashingtonDallasLos Angeles,"{Washington,Dallas,""Los Angeles""}","{0.0,1329.0,2765.0}"
77,Washington,Miami,1053,1,WashingtonMiami,"{Washington,Miami}","{0.0,1053.0}"
78,Washington,New York,226,1,WashingtonNew York,"{Washington,""New York""}","{0.0,226.0}"


## 20 shortest paths with the highest cost

In [39]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graph_algorithm_features_1
order by cost desc, source, target
limit 20

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,source,target,cost,hops,path_string,path,path_costs
0,Miami,Seattle,4379,5,MiamiWashingtonNew YorkChicagoDenverSeattle,"{Miami,Washington,""New York"",Chicago,Denver,Se...","{0.0,1053.0,1279.0,2073.0,3076.0,4379.0}"
1,Seattle,Miami,4379,5,SeattleDenverChicagoNew YorkWashingtonMiami,"{Seattle,Denver,Chicago,""New York"",Washington,...","{0.0,1303.0,2306.0,3100.0,3326.0,4379.0}"
2,Berkeley,Miami,4194,4,BerkeleyLos AngelesDallasWashingtonMiami,"{Berkeley,""Los Angeles"",Dallas,Washington,Miami}","{0.0,376.0,1812.0,3141.0,4194.0}"
3,Miami,Berkeley,4194,4,MiamiWashingtonDallasLos AngelesBerkeley,"{Miami,Washington,Dallas,""Los Angeles"",Berkeley}","{0.0,1053.0,2382.0,3818.0,4194.0}"
4,Los Angeles,Miami,3818,3,Los AngelesDallasWashingtonMiami,"{""Los Angeles"",Dallas,Washington,Miami}","{0.0,1436.0,2765.0,3818.0}"
5,Miami,Los Angeles,3818,3,MiamiWashingtonDallasLos Angeles,"{Miami,Washington,Dallas,""Los Angeles""}","{0.0,1053.0,2382.0,3818.0}"
6,Seattle,Washington,3326,4,SeattleDenverChicagoNew YorkWashington,"{Seattle,Denver,Chicago,""New York"",Washington}","{0.0,1303.0,2306.0,3100.0,3326.0}"
7,Washington,Seattle,3326,4,WashingtonNew YorkChicagoDenverSeattle,"{Washington,""New York"",Chicago,Denver,Seattle}","{0.0,226.0,1020.0,2023.0,3326.0}"
8,Berkeley,Washington,3141,3,BerkeleyLos AngelesDallasWashington,"{Berkeley,""Los Angeles"",Dallas,Washington}","{0.0,376.0,1812.0,3141.0}"
9,Washington,Berkeley,3141,3,WashingtonDallasLos AngelesBerkeley,"{Washington,Dallas,""Los Angeles"",Berkeley}","{0.0,1329.0,2765.0,3141.0}"


## 20 shortest paths with the lowest cost

In [40]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graph_algorithm_features_1
order by cost, source, target
limit 20

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,source,target,cost,hops,path_string,path,path_costs
0,Mayaguez,Ponce,57,1,MayaguezPonce,"{Mayaguez,Ponce}","{0.0,57.0}"
1,Ponce,Mayaguez,57,1,PonceMayaguez,"{Ponce,Mayaguez}","{0.0,57.0}"
2,Ponce,San Juan,71,1,PonceSan Juan,"{Ponce,""San Juan""}","{0.0,71.0}"
3,San Juan,Ponce,71,1,San JuanPonce,"{""San Juan"",Ponce}","{0.0,71.0}"
4,Mayaguez,San Juan,120,1,MayaguezSan Juan,"{Mayaguez,""San Juan""}","{0.0,120.0}"
5,San Juan,Mayaguez,120,1,San JuanMayaguez,"{""San Juan"",Mayaguez}","{0.0,120.0}"
6,New York,Washington,226,1,New YorkWashington,"{""New York"",Washington}","{0.0,226.0}"
7,Washington,New York,226,1,WashingtonNew York,"{Washington,""New York""}","{0.0,226.0}"
8,Anchorage,Fairbanks,359,1,AnchorageFairbanks,"{Anchorage,Fairbanks}","{0.0,359.0}"
9,Fairbanks,Anchorage,359,1,FairbanksAnchorage,"{Fairbanks,Anchorage}","{0.0,359.0}"


## Create a relational table in Postgres for the graph_algorithm_features_2: node, cost, mst (minimum spanning tree)

In [41]:
# Clear any open or failed transaction before issuing DDL.
connection.rollback()

# Recreate the minimum-spanning-tree table from scratch: one row per start node.
# mst is a varchar column; the cell that fills it passes a Python list of [source, destination] pairs.
query = """

drop table if exists graph_algorithm_features_2
;

create table graph_algorithm_features_2(
    node varchar(32),
    cost numeric(5),
    mst varchar(1000)
)
;

"""

cursor.execute(query)

connection.commit()

## For each node find the minimum spanning tree

In [42]:
# Remove MST relationships written by a previous spanning-tree run.
neo4j_query_1 = """

MATCH ()-[r:MST]-()
DELETE r

"""

# Drop the in-memory projection.
neo4j_query_2 = """

CALL gds.graph.drop('ds_graph', false) yield graphName

"""

# Project Station nodes with undirected, weighted TRACK relationships.
neo4j_query_3 = """

CALL gds.graph.project('ds_graph', 'Station', 
                        {
                            TRACK: {
                                properties: 'track_miles',
                                orientation: 'UNDIRECTED'
                            }
                        }
                       )

"""

# Compute the spanning tree from $source and write it back as MST
# relationships carrying a writeCost property.
neo4j_query_4 = """

MATCH (n:Station {name: $source})
CALL gds.spanningTree.write('ds_graph',
                                          {sourceNode: n,
                                           relationshipWeightProperty: 'track_miles',
                                           writeProperty: 'writeCost',
                                           writeRelationshipType: 'MST'
                                          }
                                         )
YIELD preProcessingMillis, computeMillis, writeMillis, effectiveNodeCount
RETURN preProcessingMillis, computeMillis, writeMillis, effectiveNodeCount;


"""

# Read back every distinct MST relationship reachable from $source.
neo4j_query_5 = """

MATCH path = (n:Station {name: $source})-[:MST*]-()
WITH relationships(path) AS rels
UNWIND rels AS rel
WITH DISTINCT rel AS rel
RETURN startNode(rel).name AS source, endNode(rel).name AS destination, rel.writeCost AS cost

"""

postgres_query = """

insert into graph_algorithm_features_2
values
(%s, %s, %s)
;

"""

connection.rollback()

node_list = my_get_node_list()

# The projection holds only Station nodes and TRACK relationships, so the MST
# relationships written inside the loop never change it. Build it once here
# instead of dropping and re-projecting it for every node.
session.run(neo4j_query_2)

session.run(neo4j_query_3)

for source in node_list:

    # start each node from a clean slate so only its own tree is read back
    session.run(neo4j_query_1)

    session.run(neo4j_query_4, source=source)

    result = session.run(neo4j_query_5, source=source)

    cost = 0

    mst = []

    for r in result:

        cost += r["cost"]

        mst.append([r["source"], r["destination"]])

    cursor.execute(postgres_query, (source, cost, mst))

connection.commit()

## node and cost can go directly into machine learning algorithms


## mst would be more suitable to constructing vectors

In [44]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graph_algorithm_features_2
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,cost,mst
0,Anchorage,359,"{{Anchorage,Fairbanks}}"
1,Berkeley,6284,"{{Berkeley,""Los Angeles""},{Berkeley,Denver},{D..."
2,Chicago,6284,"{{Chicago,Denver},{Denver,Berkeley},{Berkeley,..."
3,Dallas,6284,"{{Dallas,Denver},{Denver,Berkeley},{Berkeley,S..."
4,Denver,6284,"{{Denver,Dallas},{Denver,Berkeley},{Berkeley,""..."
5,Fairbanks,359,"{{Fairbanks,Anchorage}}"
6,Los Angeles,6284,"{{""Los Angeles"",Berkeley},{Berkeley,Denver},{D..."
7,Mayaguez,128,"{{Mayaguez,Ponce},{Ponce,""San Juan""}}"
8,Miami,6284,"{{Miami,Washington},{Washington,""New York""},{""..."
9,New York,6284,"{{""New York"",Washington},{Washington,Miami},{""..."


## smallest cost mst's

Note: when the minimum spanning tree algorithm went from alpha to beta in neo4j, they changed it from directed to undirected. One consequence is that in our example, they will all have the same cost.  Personal opinion:  this is a really bad decision on their part, and I hope they change it back to directed, or at least give an option for directed in the next release.

In [45]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graph_algorithm_features_2
order by cost, node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,cost,mst
0,Mayaguez,128,"{{Mayaguez,Ponce},{Ponce,""San Juan""}}"
1,Ponce,128,"{{Ponce,Mayaguez},{Ponce,""San Juan""}}"
2,San Juan,128,"{{""San Juan"",Ponce},{Ponce,Mayaguez}}"
3,Anchorage,359,"{{Anchorage,Fairbanks}}"
4,Fairbanks,359,"{{Fairbanks,Anchorage}}"
5,Berkeley,6284,"{{Berkeley,""Los Angeles""},{Berkeley,Denver},{D..."
6,Chicago,6284,"{{Chicago,Denver},{Denver,Berkeley},{Berkeley,..."
7,Dallas,6284,"{{Dallas,Denver},{Denver,Berkeley},{Berkeley,S..."
8,Denver,6284,"{{Denver,Dallas},{Denver,Berkeley},{Berkeley,""..."
9,Los Angeles,6284,"{{""Los Angeles"",Berkeley},{Berkeley,Denver},{D..."


## largest cost mst's

Note: when the minimum spanning tree algorithm went from alpha to beta in neo4j, they changed it from directed to undirected. One consequence is that in our example, they will all have the same cost.  Personal opinion:  this is a really bad decision on their part, and I hope they change it back to directed, or at least give an option for directed in the next release.

In [46]:
# roll back around the read so the select never sits in an open transaction
rollback_before_flag = rollback_after_flag = True

query = """

select * 
from graph_algorithm_features_2
order by cost desc, node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,cost,mst
0,Berkeley,6284,"{{Berkeley,""Los Angeles""},{Berkeley,Denver},{D..."
1,Chicago,6284,"{{Chicago,Denver},{Denver,Berkeley},{Berkeley,..."
2,Dallas,6284,"{{Dallas,Denver},{Denver,Berkeley},{Berkeley,S..."
3,Denver,6284,"{{Denver,Dallas},{Denver,Berkeley},{Berkeley,""..."
4,Los Angeles,6284,"{{""Los Angeles"",Berkeley},{Berkeley,Denver},{D..."
5,Miami,6284,"{{Miami,Washington},{Washington,""New York""},{""..."
6,New York,6284,"{{""New York"",Washington},{Washington,Miami},{""..."
7,Seattle,6284,"{{Seattle,Berkeley},{Berkeley,Denver},{Denver,..."
8,Washington,6284,"{{Washington,""New York""},{""New York"",Chicago},..."
9,Anchorage,359,"{{Anchorage,Fairbanks}}"
