# **Environment Setup**

In [None]:
try:
    from graphdatascience import GraphDataScience
except:
    !pip install graphdatascience
    from graphdatascience import GraphDataScience

from graphdatascience.session import SessionMemory, DbmsConnectionInfo, GdsSessions, AuraAPICredentials, AlgorithmCategory
from datetime import timedelta
from getpass import getpass
import pandas as pd
from collections import defaultdict

# **Connecting to Neo4j GDS**

Enter your Aura API credentials to be used for creating the compute for your GDS session.

In [None]:
# Prompt for the Aura API credentials with getpass so they are not written into the notebook.
client_id = getpass("Aura client id")
client_secret = getpass("Aura client secret")

Create the sessions object.

In [None]:
# Build the sessions manager from the client id and client secret.
sessions = GdsSessions(api_credentials=AuraAPICredentials(client_id, client_secret))

Enter the connection information for your source database.

In [None]:
# URI of the source database; the user name and password are prompted for via getpass.
db_uri = "neo4j+s://81c642ac.databases.neo4j.io"
db_user = getpass("database user")
db_password = getpass("database password")

Estimate the RAM for the session that we will create.

In [None]:
# Ask for a memory estimate based on the expected graph size and on every
# algorithm category that is exercised in this lab.
memory = sessions.estimate(
    node_count=900_000,
    relationship_count=4_000_000,
    algorithm_categories=[
        AlgorithmCategory.CENTRALITY,
        AlgorithmCategory.COMMUNITY_DETECTION,
        AlgorithmCategory.PATH_FINDING,
        AlgorithmCategory.SIMILARITY,
        AlgorithmCategory.NODE_EMBEDDING,
    ],
)

Create the session.

In [None]:
# Connection details for the source database.
db_connection = DbmsConnectionInfo(db_uri, db_user, db_password)

# Get (or create) the session named "algorithm_lab", sized by the estimate
# and with a 60 minute time-to-live.
gds = sessions.get_or_create(
    session_name="algorithm_lab",
    db_connection=db_connection,
    memory=memory,
    ttl=timedelta(hours=1),
)

## **1. Centrality**
### Find the five Articles with the highest influence, based on CITES relationships.

### 1.1 Create the graph projection
*Use Article nodes and CITES relationships*

Cypher Projection documentation can be found here: https://neo4j.com/docs/graph-data-science-client/current/gds-session/#_syntax_2

In [None]:
# Exercise: write a Cypher projection over Article nodes and CITES relationships,
# then use it to create the projected graph and its result.
query = """your code here"""

G_citations, result = #your code here

In [None]:
#@title Solution:
# Remote Cypher projection: each (Article)-[:CITES]->(target) pair matched here is
# handed to gds.graph.project.remote. The target node has no label constraint.
query = """MATCH (a1:Article)-[:CITES]->(a2)
           RETURN gds.graph.project.remote(a1, a2)"""

# Project under the name "citations"; `result` is shown as the cell output.
G_citations, result = gds.graph.project("citations", query)
result

### 1.2 Execute the PageRank algorithm
Add PageRank scores to the in memory graph (use mutate mode).

PageRank documentation can be found here: https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/

In [None]:
# Exercise: run PageRank in mutate mode on the citations projection.
page_rank_results = #your code here

In [None]:
#@title Solution:
# Mutate mode: the scores are stored on the projection under the "pageRank" property.
page_rank_results = gds.pageRank.mutate(G_citations, mutateProperty="pageRank")
page_rank_results


### 1.3 Check results
Stream the PageRank property from the in memory graph to a pandas data frame. Along with the node id and page rank score, include the article title in the resulting pandas data frame.

Once you have the pandas data frame, sort it in pandas to show the top 5 articles with the highest PageRank.

In [None]:
# Exercise: replace the placeholder argument with the parameters needed to stream
# the pageRank property together with the article title.
page_rank_df = gds.graph.nodeProperties.stream('your parameters here')
# Show the five rows with the highest pageRank value.
page_rank_df.sort_values("pageRank", ascending=False).head(5)


In [None]:
#@title Solution:
# Stream the pageRank property from the projection and request the "title"
# database property alongside it, with one column per property.
page_rank_df = gds.graph.nodeProperties.stream(G_citations, ["pageRank"], db_node_properties=["title"], separate_property_columns=True)
# Show the five rows with the highest pageRank value.
page_rank_df.sort_values("pageRank", ascending=False).head(5)

### 1.4 Delete the projected graph

In [None]:
#your code here

In [None]:
#@title Solution:
# The citations projection is not used again after this point.
G_citations.drop()

# **2. Community detection**
### Identify communities of authors who tend to collaborate with each other

### 2.1 Create the graph projection
*Use Author nodes, undirected COLLABORATED_WITH relationships, and the publications property on those relationships.*

Cypher Projection documentation can be found here: https://neo4j.com/docs/graph-data-science/current/management-ops/graph-creation/graph-project-cypher-projection/


In [None]:
# Exercise: write a Cypher projection over Author nodes and COLLABORATED_WITH
# relationships (with their publications property), then create the projection.
query = """your code here"""

g_coauthors, result = #your code here

In [None]:
#@title Solution:
# Remote Cypher projection of Author collaboration pairs; the relationship type is
# named explicitly and r.publications is carried along as a relationship property.
query = """
    MATCH (a:Author)-[r:COLLABORATED_WITH]->(b)
    RETURN gds.graph.project.remote(a, b,
      {
        relationshipType: 'COLLABORATED_WITH',
        relationshipProperties: {publications: r.publications}
      }
    )
"""

# COLLABORATED_WITH is declared undirected for this projection.
g_coauthors, result = gds.graph.project("coauthors", query, undirected_relationship_types=["COLLABORATED_WITH"])

result

### 2.2 Check the number of weakly connected components
*It's probably not useful to run Louvain on tiny disconnected components*

WCC documentation can be found here: https://neo4j.com/docs/graph-data-science/current/algorithms/wcc/

In [None]:
# Exercise: use WCC to check the weakly connected components of g_coauthors.
wcc_results = #your code here

In [None]:
#@title Solution:
# Run WCC in stats mode on the coauthor projection and display the returned result.
wcc_results = gds.wcc.stats(g_coauthors)
wcc_results

Take a closer look at the component distribution:

In [None]:
# Display only the componentDistribution entry of the WCC result.
wcc_results['componentDistribution']

We see what looks like one giant component, and lots of little islands. Let's trim the tiny islands and work only with the big component.

### 2.3 Trim the tiny components

Run WCC in mutate mode to add a componentId property to nodes.

In [None]:
# Mutate mode: store each node's component under the "componentId" property of the projection.
gds.wcc.mutate(g_coauthors, mutateProperty = "componentId")

Stream the component ids to a pandas data frame. Group the data frame by component id and find the id with the largest count using pandas.

In [None]:
# Exercise: stream the component id node property into a pandas data frame.
wcc_df = #your code here
# value_counts() orders by descending frequency, so index[0] is the most common value.
component_id = wcc_df['propertyValue'].value_counts().index[0]
print(component_id)

In [None]:
#@title Solution:
# Stream the componentId property of every node in the projection.
wcc_df = gds.graph.nodeProperty.stream(g_coauthors, "componentId")
# value_counts() orders by descending frequency, so index[0] is the most common value.
component_id = wcc_df['propertyValue'].value_counts().index[0]
print(component_id)

Use gds.graph.filter() to create a filtered subgraph where all nodes have the component id of the largest WCC component.

In [None]:
# your code here

In [None]:
#@title Solution:
# Keep only the nodes that belong to the largest component. The node filter uses the
# component_id computed from the streamed WCC results rather than a hard-coded id,
# so it stays correct when the largest component is not numbered 0.
# "*" keeps every relationship between the remaining nodes.
g_connected_coauthors, result = gds.graph.filter(
    "connected_coauthors",
    g_coauthors,
    f"n.componentId = {component_id}",
    "*",
)
result

Drop the old graph projection.

In [None]:
# Later cells work on g_connected_coauthors, so the unfiltered projection can go.
gds.graph.drop(g_coauthors)

### 2.4 Run Louvain in *stats* mode and inspect the results
Louvain documentation can be found here: https://neo4j.com/docs/graph-data-science/current/algorithms/louvain/

In [None]:
# Exercise: run Louvain in stats mode on g_connected_coauthors.
louvain_stats = #your code here
louvain_stats

In [None]:
#@title Solution:
# Stats mode, using the publications relationship property as the weight.
louvain_stats = gds.louvain.stats(g_connected_coauthors, relationshipWeightProperty="publications")
louvain_stats

Observe the modularity increase over successive iterations of the algorithm:

In [None]:
# Display only the modularities entry of the Louvain stats result.
louvain_stats['modularities']

Observe the distribution of community sizes:

In [None]:
# Display only the communityDistribution entry of the Louvain stats result.
louvain_stats['communityDistribution']

### 2.5 Examine Louvain communities

Run louvain in mutate mode to create a coauthorCommunityId property.

In [None]:
# Exercise: run Louvain in mutate mode so that it creates a coauthorCommunityId property.
louvain_results = #your code here
louvain_results

In [None]:
#@title Solution:
# Mutate mode: weight by publications and store the community under "coauthorCommunityId".
louvain_results = gds.louvain.mutate(g_connected_coauthors, relationshipWeightProperty="publications", mutateProperty="coauthorCommunityId")
louvain_results

*Note that the community count and max community size here are a little different from when we ran in stats mode. This is not a perfectly deterministic algorithm. Communities on successive runs will be similar, but some edge cases might float between neighboring communities.*

Stream the Louvain community ids into a dataframe. Then group by the community id and count the number of rows. Sort by row count and return the top 20 rows.

In [None]:
# Exercise: stream the Louvain community ids, count the rows per community id,
# and sort by that count.
coauthor_community_df = #your code here
coauthor_community_df.head(20)

In [None]:
#@title Solution:
# Stream the coauthorCommunityId property of every node in the projection.
coauthor_community_df = gds.graph.nodeProperty.stream(g_connected_coauthors, "coauthorCommunityId")
# value_counts() gives the number of nodes per community id, largest first.
community_size_df = coauthor_community_df['propertyValue'].value_counts()
community_size_df.head(20)


# **3. Path Finding**
Find the shortest path between two authors based on COLLABORATED_WITH relationships, using Dijkstra's algorithm.

*This exercise reuses the g_connected_coauthors projection from exercise 2. If you have not done exercise 2, complete 2.1 to 2.3 now.*

### 3.1 Get node IDs
Using Cypher, fetch the node IDs for authors **W F Sewell** and **Y Suzuki**

In [None]:
# Exercise: replace the placeholder inside the string with a Cypher query that
# returns the node ids of the two authors.
node_id_df = gds.run_cypher("""
  #your cypher here
""")
node_id_df

In [None]:
#@title Solution:
# Look up both authors by fullName and return their node ids as columns id1 and id2.
node_id_df = gds.run_cypher("""
    MATCH (a1:Author {fullName: "W F Sewell"}), (a2:Author {fullName: "Y Suzuki"})
    RETURN id(a1) as id1, id(a2) as id2""")
node_id_df

### 3.2 Execute Dijkstra's algorithm
Pass those node IDs as parameters to the Dijkstra algorithm, and return all the node IDs along the path.

*Note that the GDS driver wants a Python integer, not a NumPy int64, so we're casting the ID before passing it to the function.*

Dijkstra documentation can be found here: https://neo4j.com/docs/graph-data-science/current/algorithms/dijkstra-source-target/

In [None]:
# Exercise: run Dijkstra between the two node ids fetched in 3.1.
shortest_path = #your code here
shortest_path

In [None]:
#@title Solution:
# Source-target Dijkstra between the two authors. The ids are read from the
# first row of node_id_df and cast to built-in ints before being passed on.
shortest_path = gds.shortestPath.dijkstra.stream(
    g_connected_coauthors,
    sourceNode=int(node_id_df.loc[0, "id1"]),
    targetNode=int(node_id_df.loc[0, "id2"]),
)
shortest_path

Call the asNode function to get python objects representing each node in the path, and get the fullName property from each node.

In [None]:
# Resolve every node id on the first returned path to a node and read its fullName.
[gds.util.asNode(nodeId).get('fullName') for nodeId in shortest_path['nodeIds'][0]]

### 3.3 Delete the projected graph

In [None]:
# The connected coauthor projection is not used by the remaining sections.
gds.graph.drop(g_connected_coauthors)

# **4. Similarity**

## 4.1 Neighborhood-based similarity
Find the authors who have a high Jaccard similarity, based on the papers they have coauthored.

### 4.1.1 Create the graph projection
Use Article nodes, Author nodes, and WROTE relationships. Include source node labels, target node labels, and relationship type in your projection because we will want to distinguish the WROTE relationships from the new `HAS_SIMILAR_NEIGHBORS` relationships that we will be creating with the algorithm.

In [None]:
# Exercise: project Author and Article nodes with WROTE relationships, including
# source node labels, target node labels and the relationship type.
query = """your code here"""

g_article_authors, result = #your code here
result


In [None]:
#@title Solution:
# Remote Cypher projection of (Author)-[:WROTE]->(Article) pairs. Labels and the
# relationship type are passed explicitly so they are available on the projection.
query = """MATCH (au:Author)-[:WROTE]->(ar:Article)
           RETURN gds.graph.project.remote(au, ar,
                    {
                      sourceNodeLabels:'Author',
                      targetNodeLabels:'Article',
                      relationshipType: 'WROTE'
                    })"""

# Project under the name "article_authors"; `result` is shown as the cell output.
g_article_authors, result = gds.graph.project("article_authors", query)
result

### 4.1.2 Run Node Similarity in stats mode
This takes a few minutes because of the quadratic comparison.

Node Similarity documentation can be found here: https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/

In [None]:
# Exercise: run Node Similarity in stats mode on g_article_authors.
similarity_stats = #your code here
similarity_stats

In [None]:
#@title Solution:
# Node Similarity in stats mode. The same three settings are reused for the
# mutate run in 4.1.3 so that both runs are comparable.
similarity_stats = gds.nodeSimilarity.stats(
    g_article_authors,
    degreeCutoff=4,
    topK=3,
    similarityCutoff=0.5,
)
similarity_stats

18,200 pairs seems reasonable. Let's look at those results in detail.

### 4.1.3 Call node similarity in mutate mode
Remember that you need to specify `mutateRelationshipType` and `mutateProperty`. Call the new relationship `HAS_SIMILAR_NEIGHBORS` and the new property `similarity`.

In [None]:
#your code here

In [None]:
#@title Solution:
# Node Similarity in mutate mode: results are added to the projection as
# HAS_SIMILAR_NEIGHBORS relationships carrying a "similarity" property.
gds.nodeSimilarity.mutate(
    g_article_authors,
    mutateRelationshipType="HAS_SIMILAR_NEIGHBORS",
    mutateProperty="similarity",
    degreeCutoff=4,
    topK=3,
    similarityCutoff=0.5,
)

Use the gds.graph.relationshipProperty.stream procedure to stream the results.

In [None]:
# Stream the "similarity" property of the HAS_SIMILAR_NEIGHBORS relationships.
similar_neighbors_df = gds.graph.relationshipProperty.stream(g_article_authors, "similarity", "HAS_SIMILAR_NEIGHBORS")

Get a table of node ids and author names. Join to the similar_neighbors_df.

In [None]:
# Lookup table of node id -> author full name, used to label the similarity rows.
author_names_df = gds.run_cypher("MATCH (a:Author) RETURN id(a) AS nodeId, a.fullName AS fullName")

In [None]:
# Attach the source author's name and give the streamed score a descriptive name.
# The helper nodeId column from the lookup table is dropped after each join.
similar_neighbors_df = pd.merge(similar_neighbors_df, author_names_df, left_on='sourceNodeId', right_on='nodeId', how='left')
similar_neighbors_df = similar_neighbors_df.rename(columns={'fullName': 'sourceAuthorName', 'propertyValue': 'jaccardSimilarity'})
similar_neighbors_df = similar_neighbors_df.drop(columns=['nodeId'])

# Attach the target author's name in the same way.
similar_neighbors_df = pd.merge(similar_neighbors_df, author_names_df, left_on='targetNodeId', right_on='nodeId', how='left')
similar_neighbors_df = similar_neighbors_df.rename(columns={'fullName': 'targetAuthorName'})
similar_neighbors_df = similar_neighbors_df.drop(columns=['nodeId'])

# Sort by the renamed score column: 'propertyValue' no longer exists at this point,
# so sorting by it would raise a KeyError.
similar_neighbors_df.sort_values("jaccardSimilarity", ascending=False).head(20)


### 4.1.4 Delete the projected graph

In [None]:
# Section 4.2 keeps working on g_article_authors, including the HAS_SIMILAR_NEIGHBORS
# relationships added in 4.1.3, so dropping it here would break the cells that follow.
# Set this to False only if you are stopping after section 4.1.
continue_to_section_4_2 = True

if not continue_to_section_4_2:
    g_article_authors.drop()

## 4.2 Embeddings-based similarity

Project Authors and Articles into a vector space. Then find the Authors who are nearest to each other in the vector space.

### 4.2.1 Create an undirected WROTE relationship
*Recall that FastRP prefers undirected graphs.* You could create a new graph projection with WROTE as undirected, but you can also create a new undirected relationship in your existing graph based on the existing directed `WROTE` relationship.

Use the function `gds.graph.relationships.toUndirected()` documented here to create a new `WROTE_UNDIRECTED` relationship: https://neo4j.com/docs/graph-data-science/current/management-ops/graph-update/to-undirected/


In [None]:
# Exercise: no Cypher query and no new projection are needed for this step.
# Add a WROTE_UNDIRECTED relationship type to the existing g_article_authors
# projection, derived from its directed WROTE relationships.
#your code here

In [None]:
#@title Solution:
# Derive WROTE_UNDIRECTED from the directed WROTE relationships of the existing projection.
gds.graph.relationships.toUndirected(g_article_authors, "WROTE", mutate_relationship_type="WROTE_UNDIRECTED")

### 4.2.2 Check memory requirements
Run a memory estimation to make sure we have enough heap to run FastRP on this graph projection

In [None]:
# Memory estimate for the FastRP mutate run. The configuration here is kept
# identical to the one used for the actual run in the next step.
fastRP_estimate = gds.fastRP.mutate.estimate(
    g_article_authors,
    mutateProperty="fastRPEmbedding",
    relationshipTypes=["WROTE_UNDIRECTED"],
    embeddingDimension=512,
    iterationWeights=[0.0, 1.0, 1.0],
    randomSeed=25,
)
fastRP_estimate

Looks like we're OK, because heapPercentageMax is less than 1.

### 4.2.2 Add embeddings to the projected graph
Use the same FastRP parameters that we used to estimate the needed memory.

FastRP documentation can be found here: https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp/

In [None]:
# Exercise: run FastRP in mutate mode with the same parameters as the memory estimate.
fastRP_result = #your code here
fastRP_result

In [None]:
#@title Solution:
# FastRP in mutate mode over the undirected WROTE relationships; the embedding
# is stored on the projection as "fastRPEmbedding". randomSeed is fixed.
fastRP_result = gds.fastRP.mutate(
    g_article_authors,
    mutateProperty="fastRPEmbedding",
    relationshipTypes=["WROTE_UNDIRECTED"],
    embeddingDimension=512,
    iterationWeights=[0.0, 1.0, 1.0],
    randomSeed=25,
)
fastRP_result

### 4.2.3 Identify similarities with KNN

Use KNN to find similarities between Authors. We want to use the randomWalk initial sampler, so we use filtered KNN. That way we can walk along a path that connects Authors via Article nodes, but the similarity relationships we create will be filtered so that the source nodes and target nodes always have the Author label.

* Run filtered KNN in mutate mode.
* Only use the `WROTE_UNDIRECTED` relationship for the initialization random walks.
* Base the KNN calculation on the `fastRPEmbedding` property on the nodes.
* Set both the source node filter and the target node filter to include only `Author` nodes, not `Article` nodes.
* Find the top 3 most similar nodes for each Author node.
* Use a similarity cutoff of 0.75.
* Call your new relationship `HAS_SIMILAR_EMBEDDING` and your new property `similarity`.

In [None]:
# Your code here

In [None]:
#@title Solution:
# Filtered KNN in mutate mode on the FastRP embedding. Source and target nodes are
# both restricted to the Author label, and the random-walk sampler only follows
# WROTE_UNDIRECTED. Results become HAS_SIMILAR_EMBEDDING relationships.
knn_result_df = gds.knn.filtered.mutate(
    g_article_authors,
    nodeProperties="fastRPEmbedding",
    relationshipTypes=["WROTE_UNDIRECTED"],
    initialSampler="randomWalk",
    sourceNodeFilter="Author",
    targetNodeFilter="Author",
    topK=3,
    similarityCutoff=0.75,
    mutateRelationshipType="HAS_SIMILAR_EMBEDDING",
    mutateProperty="similarity",
)
knn_result_df

### 4.2.4 Compare embedding based similarity with neighborhood based similarity

Stream a dataframe that contains both the `HAS_SIMILAR_EMBEDDING` and `HAS_SIMILAR_NEIGHBORS` relationships so that we can compare.

In [None]:
# Stream the "similarity" property for both similarity relationship types in one data frame.
similarity_df = gds.graph.relationshipProperties.stream(g_article_authors, "similarity", ["HAS_SIMILAR_EMBEDDING", "HAS_SIMILAR_NEIGHBORS"])

I had an AI assistant write code to reshape the dataframe according to this prompt:


> I have a pandas dataframe called similarity_df. It has columns sourceNodeId, targetNodeId, relationshipType, and propertyValue. The relationship type column has two values: HAS_SIMILAR_EMBEDDING and HAS_SIMILAR_NEIGHBORS. I would like to reshape the data so that it has three columns: sourceNodeId, HAS_SIMILAR_EMBEDDING, and HAS_SIMILAR_NEIGHBORS. In the has SIMILAR_EMBEDDING and HAS_SIMILAR_NEIGHBORS columns, there should be a list of tuples. The first value in the tuple should be the value of targetNodeId. The second value in the tuple should be the value of propertyValue. The list should be sorted in descending order of propertyValue.



In [None]:
# Per source node, one list of (targetNodeId, propertyValue) pairs for each
# of the two similarity relationship types.
grouped_data = defaultdict(lambda: {"HAS_SIMILAR_EMBEDDING": [], "HAS_SIMILAR_NEIGHBORS": []})

# Walk the four relevant columns in lockstep, one relationship per step.
for source, target, rel_type, prop_value in zip(
    similarity_df['sourceNodeId'],
    similarity_df['targetNodeId'],
    similarity_df['relationshipType'],
    similarity_df['propertyValue'],
):
    grouped_data[source][rel_type].append((target, prop_value))

# Order every list in place so that the highest propertyValue comes first.
for neighbours_by_type in grouped_data.values():
    for pairs in neighbours_by_type.values():
        pairs.sort(key=lambda pair: pair[1], reverse=True)

# One output row per source node, with one column per relationship type.
result_df = pd.DataFrame(
    [
        {
            'sourceNodeId': node_id,
            'HAS_SIMILAR_EMBEDDING': neighbours['HAS_SIMILAR_EMBEDDING'],
            'HAS_SIMILAR_NEIGHBORS': neighbours['HAS_SIMILAR_NEIGHBORS'],
        }
        for node_id, neighbours in grouped_data.items()
    ]
)

We have a lot more `HAS_SIMILAR_EMBEDDING` relationships than `HAS_SIMILAR_NEIGHBORS` relationships because we required authors to have written at least 4 articles before computing `HAS_SIMILAR_NEIGHBORS`. Filter the dataframe to only those where HAS_SIMILAR_NEIGHBORS is not a zero-length list.

Take a look at some of these results in Neo4j Browser with a Cypher query like this one: `MATCH p = (n)-[:WROTE]->() WHERE id(n) IN [591477, 591479] RETURN p`

In [None]:
# Keep only the rows whose HAS_SIMILAR_NEIGHBORS list is non-empty and show the first 20.
result_df[result_df['HAS_SIMILAR_NEIGHBORS'].map(lambda x: len(x)) > 0].head(20)

## Clean up session.

In [None]:
# "algorithm_lab" is the name the session was created with (session_name in
# get_or_create), not its id, so the session has to be deleted by name.
sessions.delete(session_name="algorithm_lab")