# Graph Community Detection Algorithms

In [1]:
import os

import neo4j
import pandas as pd
from IPython.display import display

In [2]:
# Connection settings come from the environment when set (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so credentials need not live in the notebook; the fallbacks are
# the values this notebook originally hard-coded, so existing setups keep working.
driver = neo4j.GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "neo4j://neo4j:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205"),
    ),
)

In [3]:
# Open one session against the "neo4j" database; the helper functions and cells
# below all reuse this notebook-level `session`.
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Runs on the notebook-level ``session``. Takes no arguments and returns None.
    """

    # DETACH DELETE removes every node together with all of its relationships,
    # so a single statement replaces the earlier two-pass delete.
    query = "match (node) detach delete node"

    # consume() drains the result so the delete has finished (or raised) before
    # this function returns, rather than when the next query is issued.
    session.run(query).consume()

In [5]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the notebook-level session and return a DataFrame.

    Args:
        query: Cypher text to execute.
        **kwargs: forwarded unchanged to ``session.run`` (e.g. query parameters).

    Returns:
        pandas.DataFrame with one row per returned record and one column per
        result key.
    """
    cursor = session.run(query, **kwargs)

    # Pull every record first, then read the column names from the same result.
    rows = [record.values() for record in cursor]
    column_names = cursor.keys()

    return pd.DataFrame(rows, columns=column_names)

In [6]:
def my_neo4j_nodes_relationships():
    """Display all nodes and relationships, then print the graph density.

    Queries the database through ``my_neo4j_run_query_pandas``, shows the node
    table and the relationship table with ``display``, and prints the density
    rounded to one decimal place. Takes no arguments and returns None.

    The density is reported as 0.0 when the graph has fewer than two nodes
    (for example right after a wipe), where the ratio would otherwise divide
    by zero.
    """

    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")

    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """

    nodes_df = my_neo4j_run_query_pandas(query)
    number_nodes = nodes_df.shape[0]

    display(nodes_df)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")

    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    relationships_df = my_neo4j_run_query_pandas(query)
    number_relationships = relationships_df.shape[0]

    display(relationships_df)

    # NOTE(review): the pattern above matches directed relationships, so a track
    # stored once per direction is counted twice and then doubled again by the
    # factor of 2 below -- confirm this is the intended density definition.
    possible_pairs = number_nodes * (number_nodes - 1)

    # possible_pairs is 0 for an empty or single-node graph; avoid ZeroDivisionError.
    density = (2 * number_relationships) / possible_pairs if possible_pairs else 0.0

    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

## Connected Graph - same graph as last week; high speed rail; for labels 1 = west coast, 2 = east coast

In [7]:
def my_create_connected_graph():
    """Wipe the database and create the connected high speed rail graph.

    Creates nine Station nodes (name, latitude, longitude) and 22 TRACK
    relationships carrying ``track_miles``; every track is stored once in each
    direction. Seattle, Berkeley and Los Angeles carry ``label: 1``; New York,
    Washington and Miami carry ``label: 2``; Denver, Dallas and Chicago have no
    ``label`` property. Takes no arguments and returns None.
    """

    my_neo4j_wipe_out_database()

    query = """

    CREATE
      (seattle:Station {name: 'Seattle', latitude: 47.6062, longitude: -122.3321, label: 1}),
      (berkeley:Station {name: 'Berkeley', latitude: 37.8715, longitude: -122.2730, label: 1}),
      (losangeles:Station {name: 'Los Angeles', latitude: 34.0522, longitude: -118.2437, label: 1}),
      (denver:Station {name: 'Denver', latitude: 39.7392, longitude: -104.9903}),
      (dallas:Station {name: 'Dallas', latitude: 32.7767, longitude: -96.7970}),
      (chicago:Station {name: 'Chicago', latitude: 41.8781, longitude: -87.6298}),
      (newyork:Station {name: 'New York', latitude: 40.7128, longitude: -74.0060, label: 2}),
      (washington:Station {name: 'Washington', latitude: 38.9072, longitude: -77.0369, label: 2}),
      (miami:Station {name: 'Miami', latitude: 25.7617, longitude: -80.1918, label: 2}),
      (seattle)-[:TRACK {track_miles: 798}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 798}]->(seattle),
      (seattle)-[:TRACK {track_miles: 1303}]->(denver),
      (denver)-[:TRACK {track_miles: 1303}]->(seattle),
      (berkeley)-[:TRACK {track_miles: 1240}]->(denver),
      (denver)-[:TRACK {track_miles: 1240}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 376}]->(losangeles),
      (losangeles)-[:TRACK {track_miles: 376}]->(berkeley),
      (losangeles)-[:TRACK {track_miles: 1436}]->(dallas),
      (dallas)-[:TRACK {track_miles: 1436}]->(losangeles),
      (denver)-[:TRACK {track_miles: 1003}]->(chicago),
      (chicago)-[:TRACK {track_miles: 1003}]->(denver),
      (denver)-[:TRACK {track_miles: 794}]->(dallas),
      (dallas)-[:TRACK {track_miles: 794}]->(denver),
      (chicago)-[:TRACK {track_miles: 794}]->(newyork),
      (newyork)-[:TRACK {track_miles: 794}]->(chicago),
      (dallas)-[:TRACK {track_miles: 1329}]->(washington),
      (washington)-[:TRACK {track_miles: 1329}]->(dallas),
      (newyork)-[:TRACK {track_miles: 226}]->(washington),
      (washington)-[:TRACK {track_miles: 226}]->(newyork),
      (washington)-[:TRACK {track_miles: 1053}]->(miami),
      (miami)-[:TRACK {track_miles: 1053}]->(washington)


    """

    # consume() drains the result so the CREATE has finished (or raised) before
    # this function returns, rather than when the next query is issued.
    session.run(query).consume()

In [8]:
# Load the connected graph; the helper wipes the database before creating it.
my_create_connected_graph()

In [9]:
# Display the node table, the relationship table, and the density of the current graph.
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Berkeley,[Station]
1,Chicago,[Station]
2,Dallas,[Station]
3,Denver,[Station]
4,Los Angeles,[Station]
5,Miami,[Station]
6,New York,[Station]
7,Seattle,[Station]
8,Washington,[Station]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,Berkeley,[Station],TRACK,Denver,[Station]
1,Berkeley,[Station],TRACK,Los Angeles,[Station]
2,Berkeley,[Station],TRACK,Seattle,[Station]
3,Chicago,[Station],TRACK,Denver,[Station]
4,Chicago,[Station],TRACK,New York,[Station]
5,Dallas,[Station],TRACK,Denver,[Station]
6,Dallas,[Station],TRACK,Los Angeles,[Station]
7,Dallas,[Station],TRACK,Washington,[Station]
8,Denver,[Station],TRACK,Berkeley,[Station]
9,Denver,[Station],TRACK,Chicago,[Station]


-------------------------
  Density: 0.6
-------------------------


## Disconnected Graph - add two disconnected subgraphs to our graph: Anchorage and Fairbanks in Alaska; San Juan, Ponce, and Mayaguez in Puerto Rico; labels: 1 = west coast, 2 = east coast, 3 = Alaska, 4 = Puerto Rico

In [13]:
def my_create_disconnected_graph():
    """Wipe the database and create the disconnected high speed rail graph.

    Creates the same nine mainland Station nodes and 22 TRACK relationships as
    the connected graph, plus two subgraphs with no track to the mainland:
    Anchorage-Fairbanks, and San Juan-Ponce-Mayaguez. That gives 14 stations
    and 30 TRACK relationships, every track stored once in each direction.
    Anchorage carries ``label: 3`` and San Juan carries ``label: 4``;
    Fairbanks, Ponce and Mayaguez have no ``label`` property. Takes no
    arguments and returns None.
    """

    my_neo4j_wipe_out_database()

    query = """

    CREATE
      (seattle:Station {name: 'Seattle', latitude: 47.6062, longitude: -122.3321, label: 1}),
      (berkeley:Station {name: 'Berkeley', latitude: 37.8715, longitude: -122.2730, label: 1}),
      (losangeles:Station {name: 'Los Angeles', latitude: 34.0522, longitude: -118.2437, label: 1}),
      (denver:Station {name: 'Denver', latitude: 39.7392, longitude: -104.9903}),
      (dallas:Station {name: 'Dallas', latitude: 32.7767, longitude: -96.7970}),
      (chicago:Station {name: 'Chicago', latitude: 41.8781, longitude: -87.6298}),
      (newyork:Station {name: 'New York', latitude: 40.7128, longitude: -74.0060, label: 2}),
      (washington:Station {name: 'Washington', latitude: 38.9072, longitude: -77.0369, label: 2}),
      (miami:Station {name: 'Miami', latitude: 25.7617, longitude: -80.1918, label: 2}),
      (anchorage:Station {name: 'Anchorage', latitude: 61.2181, longitude: -149.9003, label:3}),
      (fairbanks:Station {name: 'Fairbanks', latitude: 64.8378, longitude: -147.7164}),
      (sanjuan:Station {name: 'San Juan', latitude: 18.4655, longitude: -66.1057, label:4}),
      (ponce:Station {name: 'Ponce', latitude: 18.0111, longitude: -66.6141}),
      (mayaguez:Station {name: 'Mayaguez', latitude: 18.2013, longitude: -67.1452}),
      (seattle)-[:TRACK {track_miles: 798}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 798}]->(seattle),
      (seattle)-[:TRACK {track_miles: 1303}]->(denver),
      (denver)-[:TRACK {track_miles: 1303}]->(seattle),
      (berkeley)-[:TRACK {track_miles: 1240}]->(denver),
      (denver)-[:TRACK {track_miles: 1240}]->(berkeley),
      (berkeley)-[:TRACK {track_miles: 376}]->(losangeles),
      (losangeles)-[:TRACK {track_miles: 376}]->(berkeley),
      (losangeles)-[:TRACK {track_miles: 1436}]->(dallas),
      (dallas)-[:TRACK {track_miles: 1436}]->(losangeles),
      (denver)-[:TRACK {track_miles: 1003}]->(chicago),
      (chicago)-[:TRACK {track_miles: 1003}]->(denver),
      (denver)-[:TRACK {track_miles: 794}]->(dallas),
      (dallas)-[:TRACK {track_miles: 794}]->(denver),
      (chicago)-[:TRACK {track_miles: 794}]->(newyork),
      (newyork)-[:TRACK {track_miles: 794}]->(chicago),
      (dallas)-[:TRACK {track_miles: 1329}]->(washington),
      (washington)-[:TRACK {track_miles: 1329}]->(dallas),
      (newyork)-[:TRACK {track_miles: 226}]->(washington),
      (washington)-[:TRACK {track_miles: 226}]->(newyork),
      (washington)-[:TRACK {track_miles: 1053}]->(miami),
      (miami)-[:TRACK {track_miles: 1053}]->(washington),
      (anchorage)-[:TRACK {track_miles: 359}]->(fairbanks),
      (fairbanks)-[:TRACK {track_miles: 359}]->(anchorage),
      (sanjuan)-[:TRACK {track_miles: 71}]->(ponce),
      (ponce)-[:TRACK {track_miles: 71}]->(sanjuan),
      (ponce)-[:TRACK {track_miles: 57}]->(mayaguez),
      (mayaguez)-[:TRACK {track_miles: 57}]->(ponce),
      (mayaguez)-[:TRACK {track_miles: 120}]->(sanjuan),
      (sanjuan)-[:TRACK {track_miles: 120}]->(mayaguez)


    """

    # consume() drains the result so the CREATE has finished (or raised) before
    # this function returns, rather than when the next query is issued.
    session.run(query).consume()

In [11]:
# Load the disconnected graph; the helper wipes the database before creating it.
my_create_disconnected_graph()

In [12]:
# Display the node table, the relationship table, and the density of the current graph.
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Anchorage,[Station]
1,Berkeley,[Station]
2,Chicago,[Station]
3,Dallas,[Station]
4,Denver,[Station]
5,Fairbanks,[Station]
6,Los Angeles,[Station]
7,Mayaguez,[Station]
8,Miami,[Station]
9,New York,[Station]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,Anchorage,[Station],TRACK,Fairbanks,[Station]
1,Berkeley,[Station],TRACK,Denver,[Station]
2,Berkeley,[Station],TRACK,Los Angeles,[Station]
3,Berkeley,[Station],TRACK,Seattle,[Station]
4,Chicago,[Station],TRACK,Denver,[Station]
5,Chicago,[Station],TRACK,New York,[Station]
6,Dallas,[Station],TRACK,Denver,[Station]
7,Dallas,[Station],TRACK,Los Angeles,[Station]
8,Dallas,[Station],TRACK,Washington,[Station]
9,Denver,[Station],TRACK,Berkeley,[Station]


-------------------------
  Density: 0.3
-------------------------


# Lab: Neo4j - Triangle Count, Clustering Coefficient, Overall Relationship Density

## Triangle Count - number of triangles that pass through a node

## Connected Graph

In [14]:
# Rebuild the connected graph so the triangle count starts from a known database state.
my_create_connected_graph()

In [15]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes with TRACK treated
# as UNDIRECTED.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {orientation: 'UNDIRECTED'}})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb9c2dfd30>

In [16]:
# Stream the triangle count of every node in the 'ds_graph' projection and show
# it by station name, highest count first and then alphabetically.
query = """

CALL gds.triangleCount.stream('ds_graph')
YIELD nodeId, triangleCount
RETURN gds.util.asNode(nodeId).name AS name, triangleCount as triangle_count
ORDER BY triangleCount DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,triangle_count
0,Berkeley,1
1,Denver,1
2,Seattle,1
3,Chicago,0
4,Dallas,0
5,Los Angeles,0
6,Miami,0
7,New York,0
8,Washington,0


## Disconnected Graph

In [17]:
# Rebuild the disconnected graph so the triangle count starts from a known database state.
my_create_disconnected_graph()

In [18]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes with TRACK treated
# as UNDIRECTED.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {orientation: 'UNDIRECTED'}})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb628dd8b0>

In [19]:
# Stream the triangle count of every node in the 'ds_graph' projection and show
# it by station name, highest count first and then alphabetically.
query = """

CALL gds.triangleCount.stream('ds_graph')
YIELD nodeId, triangleCount
RETURN gds.util.asNode(nodeId).name AS name, triangleCount as triangle_count
ORDER BY triangleCount DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,triangle_count
0,Berkeley,1
1,Denver,1
2,Mayaguez,1
3,Ponce,1
4,San Juan,1
5,Seattle,1
6,Anchorage,0
7,Chicago,0
8,Dallas,0
9,Fairbanks,0


## Since we didn't have many triangles, here is an example from the Neo4j documentation

In [20]:
# Replace the database contents with a six-person KNOWS graph (eight directed
# relationships) that contains more triangles than the rail graphs.
my_neo4j_wipe_out_database()

query = """

CREATE
  (alice:Person {name: 'Alice'}),
  (michael:Person {name: 'Michael'}),
  (karin:Person {name: 'Karin'}),
  (chris:Person {name: 'Chris'}),
  (will:Person {name: 'Will'}),
  (mark:Person {name: 'Mark'}),

  (michael)-[:KNOWS]->(karin),
  (michael)-[:KNOWS]->(chris),
  (will)-[:KNOWS]->(michael),
  (mark)-[:KNOWS]->(michael),
  (mark)-[:KNOWS]->(will),
  (alice)-[:KNOWS]->(michael),
  (will)-[:KNOWS]->(chris),
  (chris)-[:KNOWS]->(karin)

"""

# consume() drains the result so the CREATE finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
session.run(query).consume();


<neo4j._sync.work.result.Result at 0x7fdb627b9520>

In [21]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Person nodes with KNOWS treated
# as UNDIRECTED.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Person', {KNOWS: {orientation: 'UNDIRECTED'}})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb62804bb0>

In [22]:
# Stream the triangle count of every node in the 'ds_graph' projection and show
# it by person name, highest count first and then alphabetically.
query = """

CALL gds.triangleCount.stream('ds_graph')
YIELD nodeId, triangleCount
RETURN gds.util.asNode(nodeId).name AS name, triangleCount as triangle_count
ORDER BY triangleCount DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,triangle_count
0,Michael,3
1,Chris,2
2,Will,2
3,Karin,1
4,Mark,1
5,Alice,0


## Clustering Coefficient - if A is connected to B and A is connected to C, probability that B is connected to C; probability that neighbors of a node are connected; 1.0 means full clique - every node connected to every other node

## Connected Graph

In [23]:
# Rebuild the connected graph so the clustering coefficient starts from a known database state.
my_create_connected_graph()

In [24]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes with TRACK treated
# as UNDIRECTED.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {orientation: 'UNDIRECTED'}})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb62804580>

In [25]:
# Stream the local clustering coefficient of every node in the 'ds_graph'
# projection and show it by station name, highest coefficient first.
query = """

CALL gds.localClusteringCoefficient.stream('ds_graph')
YIELD nodeId, localClusteringCoefficient
RETURN gds.util.asNode(nodeId).name AS name, localClusteringCoefficient as clustering_coefficient
ORDER BY localClusteringCoefficient DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,clustering_coefficient
0,Seattle,1.0
1,Berkeley,0.333333
2,Denver,0.166667
3,Chicago,0.0
4,Dallas,0.0
5,Los Angeles,0.0
6,Miami,0.0
7,New York,0.0
8,Washington,0.0


## Disconnected Graph

In [26]:
# Rebuild the disconnected graph so the clustering coefficient starts from a known database state.
my_create_disconnected_graph()

In [27]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes with TRACK treated
# as UNDIRECTED.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', {TRACK: {orientation: 'UNDIRECTED'}})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627b9fa0>

In [28]:
# Stream the local clustering coefficient of every node in the 'ds_graph'
# projection and show it by station name, highest coefficient first.
query = """

CALL gds.localClusteringCoefficient.stream('ds_graph')
YIELD nodeId, localClusteringCoefficient
RETURN gds.util.asNode(nodeId).name AS name, localClusteringCoefficient as clustering_coefficient
ORDER BY localClusteringCoefficient DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,clustering_coefficient
0,Mayaguez,1.0
1,Ponce,1.0
2,San Juan,1.0
3,Seattle,1.0
4,Berkeley,0.333333
5,Denver,0.166667
6,Anchorage,0.0
7,Chicago,0.0
8,Dallas,0.0
9,Fairbanks,0.0


## Example from the Neo4j documentation

In [29]:
# Replace the database contents with the six-person KNOWS graph (eight directed
# relationships) used as the clustering coefficient example.
my_neo4j_wipe_out_database()

query = """

CREATE
  (alice:Person {name: 'Alice'}),
  (michael:Person {name: 'Michael'}),
  (karin:Person {name: 'Karin'}),
  (chris:Person {name: 'Chris'}),
  (will:Person {name: 'Will'}),
  (mark:Person {name: 'Mark'}),

  (michael)-[:KNOWS]->(karin),
  (michael)-[:KNOWS]->(chris),
  (will)-[:KNOWS]->(michael),
  (mark)-[:KNOWS]->(michael),
  (mark)-[:KNOWS]->(will),
  (alice)-[:KNOWS]->(michael),
  (will)-[:KNOWS]->(chris),
  (chris)-[:KNOWS]->(karin)

"""

# consume() drains the result so the CREATE finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
session.run(query).consume();


<neo4j._sync.work.result.Result at 0x7fdb9c2d0f70>

In [30]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Person nodes with KNOWS treated
# as UNDIRECTED.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Person', {KNOWS: {orientation: 'UNDIRECTED'}})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627b9bb0>

In [31]:
# Stream the local clustering coefficient of every node in the 'ds_graph'
# projection and show it by person name, highest coefficient first.
query = """

CALL gds.localClusteringCoefficient.stream('ds_graph')
YIELD nodeId, localClusteringCoefficient
RETURN gds.util.asNode(nodeId).name AS name, localClusteringCoefficient as clustering_coefficient
ORDER BY localClusteringCoefficient DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,clustering_coefficient
0,Karin,1.0
1,Mark,1.0
2,Chris,0.666667
3,Will,0.666667
4,Michael,0.3
5,Alice,0.0


# Lab: Neo4j - Strongly Connected Components (SCC), Connected Components, Connected Clusters

## Strongly Connected Components - group of nodes, each node is reachable from every other node in the group, must use direction

## Connected Graph

In [33]:
# Rebuild the connected graph so the strongly connected components run starts from a known database state.
my_create_connected_graph()

In [34]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships with no orientation override, so stored direction is kept.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK')"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627e3be0>

## Note:  if you see a 0 for component, that just means that it's component 0.  In computer science it's very common to start counting from 0.

In [35]:
# Stream the strongly connected component id of every node (empty configuration
# map) and show it by station name, highest component id first.
query = """

CALL gds.scc.stream('ds_graph', {})
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId AS component
ORDER BY component DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,component
0,Berkeley,0
1,Chicago,0
2,Dallas,0
3,Denver,0
4,Los Angeles,0
5,Miami,0
6,New York,0
7,Seattle,0
8,Washington,0


## Disconnected Graph

In [36]:
# Rebuild the disconnected graph so the strongly connected components run starts from a known database state.
my_create_disconnected_graph()

In [37]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships with no orientation override, so stored direction is kept.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK')"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627593a0>

In [38]:
# Stream the strongly connected component id of every node (empty configuration
# map) and show it by station name, highest component id first.
query = """

CALL gds.scc.stream('ds_graph', {})
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId AS component
ORDER BY component DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,component
0,Mayaguez,11
1,Ponce,11
2,San Juan,11
3,Anchorage,9
4,Fairbanks,9
5,Berkeley,0
6,Chicago,0
7,Dallas,0
8,Denver,0
9,Los Angeles,0


## Example from the Neo4j documentation

In [39]:
# Replace the database contents with a six-user FOLLOW graph (ten directed
# relationships) used as the strongly connected components example.
my_neo4j_wipe_out_database()

query = """

CREATE (nAlice:User {name:'Alice'})
CREATE (nBridget:User {name:'Bridget'})
CREATE (nCharles:User {name:'Charles'})
CREATE (nDoug:User {name:'Doug'})
CREATE (nMark:User {name:'Mark'})
CREATE (nMichael:User {name:'Michael'})

CREATE (nAlice)-[:FOLLOW]->(nBridget)
CREATE (nAlice)-[:FOLLOW]->(nCharles)
CREATE (nMark)-[:FOLLOW]->(nDoug)
CREATE (nMark)-[:FOLLOW]->(nMichael)
CREATE (nBridget)-[:FOLLOW]->(nMichael)
CREATE (nDoug)-[:FOLLOW]->(nMark)
CREATE (nMichael)-[:FOLLOW]->(nAlice)
CREATE (nAlice)-[:FOLLOW]->(nMichael)
CREATE (nBridget)-[:FOLLOW]->(nAlice)
CREATE (nMichael)-[:FOLLOW]->(nBridget);

"""

# consume() drains the result so the CREATE finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
session.run(query).consume();


<neo4j._sync.work.result.Result at 0x7fdb9f571490>

In [40]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project User nodes and FOLLOW
# relationships with no orientation override, so stored direction is kept.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'User', 'FOLLOW')"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627c3ee0>

In [41]:
# Stream the strongly connected component id of every node (empty configuration
# map) and show it by user name, highest component id first.
query = """

CALL gds.scc.stream('ds_graph', {})
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId AS component
ORDER BY component DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,component
0,Doug,3
1,Mark,3
2,Charles,2
3,Alice,0
4,Bridget,0
5,Michael,0


## Connected Components - direction not considered

## Connected Graph

In [42]:
# Rebuild the connected graph so the connected components run starts from a known database state.
my_create_connected_graph()

In [43]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships together with the `track_miles` relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb6275e1f0>

In [44]:
# Stream the gds.wcc component id of every node and show it by station name,
# highest component id first. No configuration map is passed to the call.
query = """

CALL gds.wcc.stream('ds_graph')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId as component
ORDER BY componentId DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,component
0,Berkeley,0
1,Chicago,0
2,Dallas,0
3,Denver,0
4,Los Angeles,0
5,Miami,0
6,New York,0
7,Seattle,0
8,Washington,0


## Disconnected Graph

In [45]:
# Rebuild the disconnected graph so the connected components run starts from a known database state.
my_create_disconnected_graph()

In [46]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships together with the `track_miles` relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627c1fd0>

In [47]:
# Stream the gds.wcc component id of every node and show it by station name,
# highest component id first. No configuration map is passed to the call.
query = """

CALL gds.wcc.stream('ds_graph')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId as component
ORDER BY componentId DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,component
0,Mayaguez,9
1,Ponce,9
2,San Juan,9
3,Anchorage,7
4,Fairbanks,7
5,Berkeley,0
6,Chicago,0
7,Dallas,0
8,Denver,0
9,Los Angeles,0


## Example from the Neo4j documentation

In [48]:
# Replace the database contents with a six-user LINK graph (four weighted
# directed relationships) used as the connected components example.
my_neo4j_wipe_out_database()

query = """

CREATE
  (nAlice:User {name: 'Alice'}),
  (nBridget:User {name: 'Bridget'}),
  (nCharles:User {name: 'Charles'}),
  (nDoug:User {name: 'Doug'}),
  (nMark:User {name: 'Mark'}),
  (nMichael:User {name: 'Michael'}),

  (nAlice)-[:LINK {weight: 0.5}]->(nBridget),
  (nAlice)-[:LINK {weight: 4}]->(nCharles),
  (nMark)-[:LINK {weight: 1.1}]->(nDoug),
  (nMark)-[:LINK {weight: 2}]->(nMichael);

"""

# consume() drains the result so the CREATE finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
session.run(query).consume();


<neo4j._sync.work.result.Result at 0x7fdb627c1580>

In [49]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project User nodes and LINK relationships
# together with the `weight` relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = "CALL gds.graph.project('ds_graph', 'User', 'LINK', {relationshipProperties: 'weight'})"
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb6275e6d0>

In [50]:
# Stream the gds.wcc component id of every node and show it by user name,
# highest component id first. No configuration map is passed to the call.
query = """

CALL gds.wcc.stream('ds_graph')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId as component
ORDER BY componentId DESC, name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,component
0,Doug,3
1,Mark,3
2,Michael,3
3,Alice,0
4,Bridget,0
5,Charles,0


# Lab: Neo4j - Label Propagation, Infer Groups Based on Node Labels

## Label Propagation Algorithm (LPA) - fast; used where grouping is less clear; nodes pass labels to neighbors; if a neighbor gets multiple labels: choose label with highest presence in neighborhood, node weights, or relationship weights; push labels: unweighted, serial; pull labels: weighted, parallel

## Connected Graph

In [51]:
# Rebuild the connected graph so label propagation starts from a known database state.
my_create_connected_graph()

In [52]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships together with the `label` node property and the `track_miles`
# relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = """

CALL gds.graph.project('ds_graph', 'Station', 'TRACK', 
                      {nodeProperties: 'label', relationshipProperties: 'track_miles'})
"""

session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb6275e490>

In [53]:
# Stream the label propagation community of every node and show it by station
# name, ordered by community and then name.
# NOTE(review): no configuration map is passed, so the projected `label` and
# `track_miles` properties appear to be unused by this call -- confirm intended.
query = """

CALL gds.labelPropagation.stream('ds_graph')
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).name AS Name, Community
ORDER BY Community, Name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,Name,Community
0,Berkeley,19
1,Chicago,19
2,Dallas,19
3,Denver,19
4,Los Angeles,19
5,Miami,19
6,New York,19
7,Seattle,19
8,Washington,19


## Disconnected Graph

In [54]:
# Rebuild the disconnected graph so label propagation starts from a known database state.
my_create_disconnected_graph()

In [55]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships together with the `label` node property and the `track_miles`
# relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = """

CALL gds.graph.project('ds_graph', 'Station', 'TRACK', 
                      {nodeProperties: 'label', relationshipProperties: 'track_miles'})
"""

session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb627c1d90>

In [56]:
# Stream the label propagation community of every node and show it by station
# name, ordered by community and then name.
# NOTE(review): no configuration map is passed, so the projected `label` and
# `track_miles` properties appear to be unused by this call -- confirm intended.
query = """

CALL gds.labelPropagation.stream('ds_graph')
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).name AS Name, Community
ORDER BY Community, Name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,Name,Community
0,Berkeley,28
1,Chicago,28
2,Dallas,28
3,Denver,28
4,Los Angeles,28
5,Miami,28
6,New York,28
7,Seattle,28
8,Washington,28
9,Anchorage,37


## Example from the Neo4j documentation

In [57]:
# Replace the database contents with a six-user FOLLOW graph whose nodes carry
# a `seed_label` and whose ten directed relationships carry a `weight`.
my_neo4j_wipe_out_database()

query = """

CREATE
  (alice:User {name: 'Alice', seed_label: 52}),
  (bridget:User {name: 'Bridget', seed_label: 21}),
  (charles:User {name: 'Charles', seed_label: 43}),
  (doug:User {name: 'Doug', seed_label: 21}),
  (mark:User {name: 'Mark', seed_label: 19}),
  (michael:User {name: 'Michael', seed_label: 52}),

  (alice)-[:FOLLOW {weight: 1}]->(bridget),
  (alice)-[:FOLLOW {weight: 10}]->(charles),
  (mark)-[:FOLLOW {weight: 1}]->(doug),
  (bridget)-[:FOLLOW {weight: 1}]->(michael),
  (doug)-[:FOLLOW {weight: 1}]->(mark),
  (michael)-[:FOLLOW {weight: 1}]->(alice),
  (alice)-[:FOLLOW {weight: 1}]->(michael),
  (bridget)-[:FOLLOW {weight: 1}]->(alice),
  (michael)-[:FOLLOW {weight: 1}]->(bridget),
  (charles)-[:FOLLOW {weight: 1}]->(doug)

"""

# consume() drains the result so the CREATE finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
session.run(query).consume();


<neo4j._sync.work.result.Result at 0x7fdb627c1400>

In [58]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project User nodes and FOLLOW
# relationships together with the `seed_label` node property and the `weight`
# relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = """

CALL gds.graph.project('ds_graph', 'User', 'FOLLOW', 
                      {nodeProperties: 'seed_label', relationshipProperties: 'weight'})

"""
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb6286bd30>

## Unweighted

In [59]:
# Unweighted run: stream the label propagation community of every node with no
# configuration map, ordered by community and then name.
# NOTE(review): the projected `seed_label` and `weight` properties appear to be
# unused by this call -- confirm intended.
query = """

CALL gds.labelPropagation.stream('ds_graph')
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).name AS Name, Community
ORDER BY Community, Name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,Name,Community
0,Alice,42
1,Bridget,42
2,Michael,42
3,Charles,45
4,Doug,45
5,Mark,45


## Weighted

In [60]:
# Weighted run: the same label propagation stream, but with the projected
# `weight` relationship property passed as relationshipWeightProperty.
query = """

CALL gds.labelPropagation.stream('ds_graph', { relationshipWeightProperty: 'weight' })
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).name AS Name, Community
ORDER BY Community, Name

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,Name,Community
0,Bridget,43
1,Michael,43
2,Alice,45
3,Charles,45
4,Doug,45
5,Mark,45


# Lab: Neo4j - Louvain Modularity, Grouping Quality, Hierarchies

## Louvain Modularity - what-if analysis; tries different groupings; modularity = how well a node is assigned to a group using relationship weights and densities; creates a hierarchy of groups at different scales; issues: tends to merge smaller groups into larger groups, brick walls where several options have the same modularity

## Update - intermediate communities are now working!  (The videos were recorded before intermediate communities were working.)

## Connected Graph

In [64]:
# Rebuild the connected graph so the Louvain run starts from a known database state.
my_create_connected_graph()

In [62]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships together with the `track_miles` relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = """

CALL gds.graph.project('ds_graph', 'Station', 'TRACK', 
                      {relationshipProperties: 'track_miles'})
"""

session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb6275ee50>

In [63]:
# Stream the Louvain community of every node, requesting the intermediate
# community ids as well, ordered by community and then station name.
# NOTE(review): no relationshipWeightProperty is passed, so the projected
# `track_miles` property appears to be unused by this call -- confirm intended.
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,community,intermediate_community
0,Berkeley,1,"[1, 1]"
1,Denver,1,"[1, 1]"
2,Seattle,1,"[1, 1]"
3,Dallas,3,"[3, 3]"
4,Los Angeles,3,"[3, 3]"
5,Chicago,6,"[6, 6]"
6,Miami,6,"[8, 6]"
7,New York,6,"[6, 6]"
8,Washington,6,"[8, 6]"


## Disconnected Graph

In [65]:
# Rebuild the disconnected graph so the Louvain run starts from a known database state.
my_create_disconnected_graph()

In [66]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project Station nodes and TRACK
# relationships together with the `track_miles` relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = """

CALL gds.graph.project('ds_graph', 'Station', 'TRACK', 
                      {relationshipProperties: 'track_miles'})
"""

session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb6276de80>

In [67]:
# Stream the Louvain community of every node, requesting the intermediate
# community ids as well, ordered by community and then station name.
# NOTE(review): no relationshipWeightProperty is passed, so the projected
# `track_miles` property appears to be unused by this call -- confirm intended.
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,community,intermediate_community
0,Berkeley,3,"[1, 3]"
1,Dallas,3,"[3, 3]"
2,Denver,3,"[1, 3]"
3,Los Angeles,3,"[3, 3]"
4,Seattle,3,"[1, 3]"
5,Chicago,6,"[6, 6]"
6,Miami,6,"[8, 6]"
7,New York,6,"[6, 6]"
8,Washington,6,"[8, 6]"
9,Anchorage,10,"[10, 10]"


## Example from the Neo4j documentation

In [68]:
# Replace the database contents with a six-user LINK graph: Alice, Bridget and
# Charles carry `seed: 42`, the others have no seed, and the seven directed
# relationships carry a `weight`.
my_neo4j_wipe_out_database()

query = """

CREATE
  (nAlice:User {name: 'Alice', seed: 42}),
  (nBridget:User {name: 'Bridget', seed: 42}),
  (nCharles:User {name: 'Charles', seed: 42}),
  (nDoug:User {name: 'Doug'}),
  (nMark:User {name: 'Mark'}),
  (nMichael:User {name: 'Michael'}),

  (nAlice)-[:LINK {weight: 1}]->(nBridget),
  (nAlice)-[:LINK {weight: 1}]->(nCharles),
  (nCharles)-[:LINK {weight: 1}]->(nBridget),

  (nAlice)-[:LINK {weight: 5}]->(nDoug),

  (nMark)-[:LINK {weight: 1}]->(nDoug),
  (nMark)-[:LINK {weight: 1}]->(nMichael),
  (nMichael)-[:LINK {weight: 1}]->(nMark);

"""

# consume() drains the result so the CREATE finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
session.run(query).consume();


<neo4j._sync.work.result.Result at 0x7fdb627c1100>

In [69]:
# Drop any earlier 'ds_graph' projection (the `false` argument tells GDS not to
# fail when it does not exist), then project User nodes and LINK relationships
# together with the `seed` node property and the `weight` relationship property.
# consume() drains each result so the call finishes (or raises) in this cell;
# the trailing ';' keeps the summary repr out of the cell output.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query).consume()

query = """

CALL gds.graph.project('ds_graph', 'User', 'LINK', 
                      {nodeProperties: 'seed', relationshipProperties: 'weight'})

"""
session.run(query).consume();

<neo4j._sync.work.result.Result at 0x7fdb62804cd0>

In [70]:
# Stream the Louvain community of every node, requesting the intermediate
# community ids as well, ordered by community and then user name.
# NOTE(review): neither seedProperty nor relationshipWeightProperty is passed,
# so the projected `seed` and `weight` properties appear to be unused by this
# call -- confirm intended.
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,community,intermediate_community
0,Alice,1,[1]
1,Bridget,1,[1]
2,Charles,1,[1]
3,Doug,3,[3]
4,Mark,3,[3]
5,Michael,3,[3]
