# Graph Centrality Algorithms

## Web server interface at https://xxxx:7473

#### Update - since the videos were filmed, neo4j requires a longer, more complex password, so the newest password is here:

**Username: neo4j**

**Password: ucb_mids_w205**

**In the web server interface, run the same query from last week to return all nodes and all relationships:**

```
match (n) return n
```

In [2]:
import neo4j

import pandas as pd

from IPython.display import display

In [3]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [4]:
session = driver.session(database="neo4j")

In [5]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Runs against the module-level ``session``. Takes no arguments and
    returns nothing.
    """

    # DETACH DELETE removes every node together with any relationships
    # attached to it, so a single statement replaces the earlier two-pass
    # "delete relationships, then delete leftover nodes" approach.
    query = "match (node) detach delete node"

    # consume() drains the result, so the call only returns once the server
    # has finished the delete and any server-side error is raised here.
    session.run(query).consume()

In [6]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a Cypher query and hand the rows back as a pandas DataFrame.

    ``query`` is the Cypher text; any keyword arguments are forwarded
    unchanged to ``session.run``. The returned DataFrame has one row per
    record and one column per result key.
    """

    cursor = session.run(query, **kwargs)

    # Collect the values of each record in the order they are streamed.
    rows = []
    for record in cursor:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=cursor.keys())

In [7]:
def my_neo4j_nodes_relationships():
    """Print all the nodes and relationships, followed by the graph density.

    Nodes are listed by name with their labels; relationships are listed as
    (start node, type, end node) ordered by the two node names. Both tables
    are fetched with ``my_neo4j_run_query_pandas`` and shown with
    ``display``. Returns nothing.

    The density is reported as 0.0 when the graph has fewer than two nodes,
    because the formula's denominator is zero in that case.
    """

    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")

    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """

    df = my_neo4j_run_query_pandas(query)

    number_nodes = df.shape[0]

    display(df)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")

    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    df = my_neo4j_run_query_pandas(query)

    number_relationships = df.shape[0]

    display(df)

    # NOTE(review): this is the undirected density formula applied to a count
    # of directed relationships. When every track is stored once per
    # direction the value is twice the usual density - confirm that this is
    # the intended definition.
    possible_pairs = number_nodes * (number_nodes - 1)

    # An empty or single-node graph has no possible pairs; report 0.0 instead
    # of raising ZeroDivisionError.
    if possible_pairs:
        density = (2 * number_relationships) / possible_pairs
    else:
        density = 0.0

    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

## Connected Graph - same graph as last week; high speed rail

In [8]:
def my_create_connected_graph():
    """Rebuild the database as the connected high speed rail graph.

    Wipes the existing data, then creates nine Station nodes (name,
    latitude, longitude) and, for every pair of linked stations, one TRACK
    relationship in each direction carrying ``track_miles``.
    """

    # Start from an empty database so repeated calls do not duplicate data.
    my_neo4j_wipe_out_database()

    create_statement = """

    CREATE
      (seattle:Station {name: 'Seattle', latitude: 47.6062, longitude: -122.3321}),
      (berkeley:Station {name: 'Berkeley', latitude: 37.8715, longitude: -122.2730}),
      (losangeles:Station {name: 'Los Angeles', latitude: 34.0522, longitude: -118.2437}),
      (denver:Station {name: 'Denver', latitude: 39.7392, longitude: -104.9903}),
      (dallas:Station {name: 'Dallas', latitude: 32.7767, longitude: -96.7970}),
      (chicago:Station {name: 'Chicago', latitude: 41.8781, longitude: -87.6298}),
      (newyork:Station {name: 'New York', latitude: 40.7128, longitude: -74.0060}),
      (washington:Station {name: 'Washington', latitude: 38.9072, longitude: -77.0369}),
      (miami:Station {name: 'Miami', latitude: 25.7617, longitude: -80.1918}),
      (seattle)-[:TRACK {track_miles: 798}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 798}]->(seattle),
      (seattle)-[:TRACK {track_miles: 1303}]->(denver),
      (denver)-[:TRACK {track_miles: 1303}]->(seattle),
      (berkeley)-[:TRACK {track_miles: 1240}]->(denver),
      (denver)-[:TRACK {track_miles: 1240}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 376}]->(losangeles),
      (losangeles)-[:TRACK {track_miles: 376}]->(berkeley),
      (losangeles)-[:TRACK {track_miles: 1436}]->(dallas),
      (dallas)-[:TRACK {track_miles: 1436}]->(losangeles),
      (denver)-[:TRACK {track_miles: 1003}]->(chicago),
      (chicago)-[:TRACK {track_miles: 1003}]->(denver),
      (denver)-[:TRACK {track_miles: 794}]->(dallas),
      (dallas)-[:TRACK {track_miles: 794}]->(denver),
      (chicago)-[:TRACK {track_miles: 794}]->(newyork),
      (newyork)-[:TRACK {track_miles: 794}]->(chicago),
      (dallas)-[:TRACK {track_miles: 1329}]->(washington),
      (washington)-[:TRACK {track_miles: 1329}]->(dallas),
      (newyork)-[:TRACK {track_miles: 226}]->(washington),
      (washington)-[:TRACK {track_miles: 226}]->(newyork),
      (washington)-[:TRACK {track_miles: 1053}]->(miami),
      (miami)-[:TRACK {track_miles: 1053}]->(washington)


    """

    session.run(create_statement)

In [9]:
my_create_connected_graph()

In [10]:
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Berkeley,[Station]
1,Chicago,[Station]
2,Dallas,[Station]
3,Denver,[Station]
4,Los Angeles,[Station]
5,Miami,[Station]
6,New York,[Station]
7,Seattle,[Station]
8,Washington,[Station]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,Berkeley,[Station],TRACK,Denver,[Station]
1,Berkeley,[Station],TRACK,Los Angeles,[Station]
2,Berkeley,[Station],TRACK,Seattle,[Station]
3,Chicago,[Station],TRACK,Denver,[Station]
4,Chicago,[Station],TRACK,New York,[Station]
5,Dallas,[Station],TRACK,Denver,[Station]
6,Dallas,[Station],TRACK,Los Angeles,[Station]
7,Dallas,[Station],TRACK,Washington,[Station]
8,Denver,[Station],TRACK,Berkeley,[Station]
9,Denver,[Station],TRACK,Chicago,[Station]


-------------------------
  Density: 0.6
-------------------------


## Disconnected Graph - add two disconnected subgraphs to our graph: Anchorage and Fairbanks in Alaska; San Juan, Ponce, and Mayaguez in Puerto Rico

In [11]:
def my_create_disconnected_graph():
    """Rebuild the database as the disconnected high speed rail graph.

    Wipes the existing data, then creates the nine mainland Station nodes
    and their TRACK relationships, plus two isolated subgraphs:
    Anchorage-Fairbanks, and San Juan-Ponce-Mayaguez. Every track is stored
    as one TRACK relationship in each direction carrying ``track_miles``.
    """

    # Start from an empty database so repeated calls do not duplicate data.
    my_neo4j_wipe_out_database()

    create_statement = """

    CREATE
      (seattle:Station {name: 'Seattle', latitude: 47.6062, longitude: -122.3321}),
      (berkeley:Station {name: 'Berkeley', latitude: 37.8715, longitude: -122.2730}),
      (losangeles:Station {name: 'Los Angeles', latitude: 34.0522, longitude: -118.2437}),
      (denver:Station {name: 'Denver', latitude: 39.7392, longitude: -104.9903}),
      (dallas:Station {name: 'Dallas', latitude: 32.7767, longitude: -96.7970}),
      (chicago:Station {name: 'Chicago', latitude: 41.8781, longitude: -87.6298}),
      (newyork:Station {name: 'New York', latitude: 40.7128, longitude: -74.0060}),
      (washington:Station {name: 'Washington', latitude: 38.9072, longitude: -77.0369}),
      (miami:Station {name: 'Miami', latitude: 25.7617, longitude: -80.1918}),
      (anchorage:Station {name: 'Anchorage', latitude: 61.2181, longitude: -149.9003}),
      (fairbanks:Station {name: 'Fairbanks', latitude: 64.8378, longitude: -147.7164}),
      (sanjuan:Station {name: 'San Juan', latitude: 18.4655, longitude: -66.1057}),
      (ponce:Station {name: 'Ponce', latitude: 18.0111, longitude: -66.6141}),
      (mayaguez:Station {name: 'Mayaguez', latitude: 18.2013, longitude: -67.1452}),
      (seattle)-[:TRACK {track_miles: 798}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 798}]->(seattle),
      (seattle)-[:TRACK {track_miles: 1303}]->(denver),
      (denver)-[:TRACK {track_miles: 1303}]->(seattle),
      (berkeley)-[:TRACK {track_miles: 1240}]->(denver),
      (denver)-[:TRACK {track_miles: 1240}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 376}]->(losangeles),
      (losangeles)-[:TRACK {track_miles: 376}]->(berkeley),
      (losangeles)-[:TRACK {track_miles: 1436}]->(dallas),
      (dallas)-[:TRACK {track_miles: 1436}]->(losangeles),
      (denver)-[:TRACK {track_miles: 1003}]->(chicago),
      (chicago)-[:TRACK {track_miles: 1003}]->(denver),
      (denver)-[:TRACK {track_miles: 794}]->(dallas),
      (dallas)-[:TRACK {track_miles: 794}]->(denver),
      (chicago)-[:TRACK {track_miles: 794}]->(newyork),
      (newyork)-[:TRACK {track_miles: 794}]->(chicago),
      (dallas)-[:TRACK {track_miles: 1329}]->(washington),
      (washington)-[:TRACK {track_miles: 1329}]->(dallas),
      (newyork)-[:TRACK {track_miles: 226}]->(washington),
      (washington)-[:TRACK {track_miles: 226}]->(newyork),
      (washington)-[:TRACK {track_miles: 1053}]->(miami),
      (miami)-[:TRACK {track_miles: 1053}]->(washington),
      (anchorage)-[:TRACK {track_miles: 359}]->(fairbanks),
      (fairbanks)-[:TRACK {track_miles: 359}]->(anchorage),
      (sanjuan)-[:TRACK {track_miles: 71}]->(ponce),
      (ponce)-[:TRACK {track_miles: 71}]->(sanjuan),
      (ponce)-[:TRACK {track_miles: 57}]->(mayaguez),
      (mayaguez)-[:TRACK {track_miles: 57}]->(ponce),
      (mayaguez)-[:TRACK {track_miles: 120}]->(sanjuan),
      (sanjuan)-[:TRACK {track_miles: 120}]->(mayaguez)


    """

    session.run(create_statement)

In [12]:
my_create_disconnected_graph()

In [13]:
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Anchorage,[Station]
1,Berkeley,[Station]
2,Chicago,[Station]
3,Dallas,[Station]
4,Denver,[Station]
5,Fairbanks,[Station]
6,Los Angeles,[Station]
7,Mayaguez,[Station]
8,Miami,[Station]
9,New York,[Station]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,Anchorage,[Station],TRACK,Fairbanks,[Station]
1,Berkeley,[Station],TRACK,Denver,[Station]
2,Berkeley,[Station],TRACK,Los Angeles,[Station]
3,Berkeley,[Station],TRACK,Seattle,[Station]
4,Chicago,[Station],TRACK,Denver,[Station]
5,Chicago,[Station],TRACK,New York,[Station]
6,Dallas,[Station],TRACK,Denver,[Station]
7,Dallas,[Station],TRACK,Los Angeles,[Station]
8,Dallas,[Station],TRACK,Washington,[Station]
9,Denver,[Station],TRACK,Berkeley,[Station]


-------------------------
  Density: 0.3
-------------------------


# Lab: Neo4j - Degree Centrality

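## Degree Centrality - number of relationships a node has; can count incoming, outgoing, or both depending on the orientation of the projected relationships (the default projection used here counts outgoing relationships)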

## Connected Graph

In [14]:
my_create_connected_graph()

In [15]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd2442f7ca0>

In [16]:
query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,degree
0,Denver,4.0
1,Berkeley,3.0
2,Dallas,3.0
3,Washington,3.0
4,Chicago,2.0
5,Los Angeles,2.0
6,New York,2.0
7,Seattle,2.0
8,Miami,1.0


## Disconnected Graph 

In [17]:
my_create_disconnected_graph()

In [18]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a7f4a90>

In [19]:
query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,degree
0,Denver,4.0
1,Berkeley,3.0
2,Dallas,3.0
3,Washington,3.0
4,Chicago,2.0
5,Los Angeles,2.0
6,Mayaguez,2.0
7,New York,2.0
8,Ponce,2.0
9,San Juan,2.0


# Lab: Neo4j - Closeness Centrality, Wasserman and Faust, Harmonic Centrality

## Closeness Centrality - inverse of the average shortest path distance between a node and all other nodes; high closeness - short distances to other nodes, able to spread info most efficiently

## Connected Graph

In [20]:
my_create_connected_graph()

In [21]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a81e700>

In [22]:
query = """

CALL gds.closeness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)



Unnamed: 0,name,closeness
0,Denver,0.615385
1,Dallas,0.615385
2,Washington,0.533333
3,Chicago,0.5
4,Berkeley,0.470588
5,Los Angeles,0.470588
6,New York,0.470588
7,Seattle,0.444444
8,Miami,0.363636


## Disconnected Graph - does not handle small disconnected subsets very well; treats distance as infinity; most end up with a closeness of 1.0 which is misleading as you might assume they are well connected in the entire graph

In [23]:
my_create_disconnected_graph()

In [24]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd24431dfd0>

In [25]:
query = """

CALL gds.closeness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)



Unnamed: 0,name,closeness
0,Anchorage,1.0
1,Fairbanks,1.0
2,San Juan,1.0
3,Ponce,1.0
4,Mayaguez,1.0
5,Denver,0.615385
6,Dallas,0.615385
7,Washington,0.533333
8,Chicago,0.5
9,Berkeley,0.470588


## Wasserman and Faust - better handling of disconnected graphs - standard algorithm sees distances to nodes in disconnected subgraphs as infinity - skews calculations

## Connected Graph

In [26]:
my_create_connected_graph()

In [27]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a7f4340>

In [28]:
query = """

CALL gds.closeness.stream('ds_graph',
                               {useWassermanFaust: true}
                              )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)



Unnamed: 0,name,closeness
0,Denver,0.615385
1,Dallas,0.615385
2,Washington,0.533333
3,Chicago,0.5
4,Berkeley,0.470588
5,Los Angeles,0.470588
6,New York,0.470588
7,Seattle,0.444444
8,Miami,0.363636


## Disconnected Graph - handles much better; closeness numbers are much more in line with their actual closeness to other nodes in the entire graph

In [29]:
my_create_disconnected_graph()

In [30]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a807af0>

In [31]:
query = """

CALL gds.closeness.stream('ds_graph',
                               {useWassermanFaust: true}
                              )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)



Unnamed: 0,name,closeness
0,Denver,0.378698
1,Dallas,0.378698
2,Washington,0.328205
3,Chicago,0.307692
4,Berkeley,0.289593
5,Los Angeles,0.289593
6,New York,0.289593
7,Seattle,0.273504
8,Miami,0.223776
9,San Juan,0.153846


## Harmonic Centrality - another approach to handle disconnected graphs - instead of summing distances, sum the inverses, inverse of infinity is zero; smooths out extremes; generally the most accurate picture for all nodes

## Connected Graph

In [32]:
my_create_connected_graph()

In [33]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a83fd30>

In [34]:
query = """

CALL gds.closeness.harmonic.stream('ds_graph', {})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY closeness DESC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,closeness
0,Denver,0.729167
1,Dallas,0.6875
2,Washington,0.645833
3,Berkeley,0.614583
4,Chicago,0.583333
5,Los Angeles,0.5625
6,New York,0.5625
7,Seattle,0.552083
8,Miami,0.4375


## Disconnected Graph - handles the disconnected subgraphs and is overall much smoother

In [35]:
my_create_disconnected_graph()

In [36]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a8078b0>

In [37]:
query = """

CALL gds.closeness.harmonic.stream('ds_graph', {})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY closeness DESC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,closeness
0,Denver,0.448718
1,Dallas,0.423077
2,Washington,0.397436
3,Berkeley,0.378205
4,Chicago,0.358974
5,Los Angeles,0.346154
6,New York,0.346154
7,Seattle,0.339744
8,Miami,0.269231
9,San Juan,0.153846


# Lab: Neo4j - Betweenness Centrality, Randomized-Approximate Brandes

## Betweenness Centrality - all pairs shortest path, for each node how many paths pass through the node; high betweenness - control point, bridge, high influence over flow within graph; pivotal node - lies on every path between two other nodes

## Connected Graph

In [38]:
my_create_connected_graph()

In [39]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {properties: 'track_miles'}})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a7f4af0>

In [40]:
query = """

CALL gds.betweenness.stream('ds_graph', {relationshipWeightProperty: 'track_miles'})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,betweenness
0,Denver,18.0
1,Washington,18.0
2,Chicago,14.0
3,New York,12.0
4,Dallas,10.0
5,Berkeley,6.0
6,Los Angeles,6.0
7,Seattle,0.0
8,Miami,0.0


## Disconnected Graph - the all pairs shortest path will not find paths for disconnected subgraphs; sets them to zero

In [41]:
my_create_disconnected_graph()

In [42]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {properties: 'track_miles'}})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a794d90>

In [43]:
query = """

CALL gds.betweenness.stream('ds_graph', {relationshipWeightProperty: 'track_miles'})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,betweenness
0,Denver,18.0
1,Washington,18.0
2,Chicago,14.0
3,New York,12.0
4,Dallas,10.0
5,Berkeley,6.0
6,Los Angeles,6.0
7,Seattle,0.0
8,Miami,0.0
9,Anchorage,0.0


## Randomized-Approximate Brandes - betweenness can be very time consuming and expensive due to all pairs shortest path; approximates betweenness centrality; random subsets of nodes; either choose nodes randomly uniformly or choose nodes randomly and throw out those with degree less than average; can also limit depth of shortest path algorithm

## Connected Graph

In [44]:
my_create_connected_graph()

In [45]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a794b80>

In [46]:
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

sampling_size = 2
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,Dallas,6.0
1,Denver,4.5
2,Washington,3.0
3,Berkeley,2.0
4,Los Angeles,1.5
5,Chicago,1.0
6,Seattle,0.0
7,New York,0.0
8,Miami,0.0


In [47]:
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

sampling_size = 5
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,Dallas,13.0
1,Denver,10.0
2,Washington,5.0
3,Berkeley,3.0
4,New York,3.0
5,Los Angeles,2.0
6,Chicago,2.0
7,Seattle,0.0
8,Miami,0.0


In [48]:
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

sampling_size = 7
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,Dallas,17.0
1,Denver,14.0
2,Washington,14.0
3,New York,4.0
4,Berkeley,3.0
5,Los Angeles,3.0
6,Chicago,2.0
7,Seattle,0.0
8,Miami,0.0


## Disconnected Graph - the all pairs shortest path will not find paths for disconnected subgraphs; sets them to zero

In [49]:
my_create_disconnected_graph()

In [50]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a794730>

In [51]:
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

sampling_size = 2
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,Dallas,6.0
1,Denver,4.5
2,Washington,3.0
3,Berkeley,2.0
4,Los Angeles,1.5
5,Chicago,1.0
6,Seattle,0.0
7,New York,0.0
8,Miami,0.0
9,Anchorage,0.0


In [52]:
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

sampling_size = 5
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,Dallas,13.0
1,Denver,10.0
2,Washington,5.0
3,Berkeley,3.0
4,New York,3.0
5,Los Angeles,2.0
6,Chicago,2.0
7,Seattle,0.0
8,Miami,0.0
9,Anchorage,0.0


In [53]:
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

sampling_size = 7
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,Dallas,17.0
1,Denver,14.0
2,Washington,14.0
3,New York,4.0
4,Berkeley,3.0
5,Los Angeles,3.0
6,Chicago,2.0
7,Seattle,0.0
8,Miami,0.0
9,Anchorage,0.0


# Lab: Neo4j - Page Rank, Personalized PageRank

## Page Rank - Larry Page of Google; overall influence of a node in graph; direct influence; influence of incoming relationships; and so forth; knowing a lot of influential people makes you more influential

## Connected Graph

In [54]:
my_create_connected_graph()

In [55]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a9198b0>

In [56]:
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,Washington,1.039914
1,Denver,1.032801
2,Berkeley,1.011995
3,Dallas,1.004832
4,New York,0.992025
5,Chicago,0.987711
6,Los Angeles,0.983614
7,Seattle,0.979777
8,Miami,0.967332


## Disconnected Graph - nodes in disconnected subgraphs get a really high page rank; for the entire graph this may be misleading; be careful!

In [57]:
my_create_disconnected_graph()

In [58]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a878610>

In [59]:
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,Washington,1.039914
1,Denver,1.032801
2,Berkeley,1.011995
3,Dallas,1.004832
4,Anchorage,1.0
5,Fairbanks,1.0
6,Mayaguez,1.0
7,Ponce,1.0
8,San Juan,1.0
9,New York,0.992025


## Personalized Page Rank - Page Rank from a single node; what's important to a specific user; target recommendations to a specific user

## Connected Graph

In [60]:
my_create_connected_graph()

In [61]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a81e6d0>

In [62]:
query = """

MATCH (siteA:Station {name: $source})
CALL gds.pageRank.stream('ds_graph', {
  maxIterations: $max_iterations,
  dampingFactor: $damping_factor,
  sourceNodes: [siteA]
})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY score DESC, name ASC

"""

source = "Berkeley"
max_iterations = 20
damping_factor = 0.85

my_neo4j_run_query_pandas(query, source=source, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,Berkeley,0.284265
1,Denver,0.181564
2,Seattle,0.118596
3,Los Angeles,0.108447
4,Dallas,0.1003
5,Washington,0.058049
6,Chicago,0.054699
7,New York,0.039152
8,Miami,0.016171


## Disconnected Graph - note that personalized page rank only considers nodes that are connected;  we will try Berkeley, then San Juan

In [63]:
my_create_disconnected_graph()

In [64]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fd20a7b2a30>

In [65]:
query = """

MATCH (siteA:Station {name: $source})
CALL gds.pageRank.stream('ds_graph', {
  maxIterations: $max_iterations,
  dampingFactor: $damping_factor,
  sourceNodes: [siteA]
})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY score DESC, name ASC

"""

source = "Berkeley"
max_iterations = 20
damping_factor = 0.85

my_neo4j_run_query_pandas(query, source=source, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,Berkeley,0.284265
1,Denver,0.181564
2,Seattle,0.118596
3,Los Angeles,0.108447
4,Dallas,0.1003
5,Washington,0.058049
6,Chicago,0.054699
7,New York,0.039152
8,Miami,0.016171
9,Anchorage,0.0


In [66]:
query = """

MATCH (siteA:Station {name: $source})
CALL gds.pageRank.stream('ds_graph', {
  maxIterations: $max_iterations,
  dampingFactor: $damping_factor,
  sourceNodes: [siteA]
})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY score DESC, name ASC

"""

source = "San Juan"
max_iterations = 20
damping_factor = 0.85

my_neo4j_run_query_pandas(query, source=source, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,San Juan,0.390589
1,Mayaguez,0.285326
2,Ponce,0.285326
3,Anchorage,0.0
4,Berkeley,0.0
5,Chicago,0.0
6,Dallas,0.0
7,Denver,0.0
8,Fairbanks,0.0
9,Los Angeles,0.0
