Create a `.env` file in the same directory as this notebook and add the following lines:

```env
NEO4J_USERNAME=your_username
NEO4J_PASSWORD=your_password
NEO4J_ENDPOINT=your_endpoint
```

Install the Python dependencies.

In [None]:
pip install -r requirements.txt

Connect to the database.

In [1]:
from databaseconnection import DatabaseConnection

# Build the GDS client through the project's connection helper.
# NOTE(review): the helper presumably reads the NEO4J_* values from the .env
# file described at the top of the notebook — confirm in databaseconnection.py.
connection = DatabaseConnection()
gds = connection.get_database_connection()

# Last expression of the cell: displays a version string when the connection works.
gds.version()

'2.3.2'

The cell above should display a version number if the connection to the database was successful.

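Reference: [FastRP and kNN example notebook (GDS Python client)](https://github.com/neo4j/graph-data-science-client/blob/main/examples/fastrp-and-knn.ipynb)

Reference: [Cypher projection — single label and relationship type example](https://neo4j.com/docs/graph-data-science/current/management-ops/projections/graph-project-cypher/#graph-project-example-single-label-type)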

In [3]:
# Surveys whose respondents (and their answers) are left out of the projection.
excluded_survey_ids = ["ff07d216-33e4-464d-8c35-c2fd3962282c", "c29919fa-793f-4b81-af49-92cef6c681fb", "6d0bec28-d5a4-46c3-9d3c-c17c7bf38ce8"]

# Node query, four branches combined with UNION ALL:
#   1. Respondent nodes surveyed by a non-excluded survey
#   2. Answer nodes given by those respondents
#   3. all QuestionAlternative nodes
#   4. all Question nodes
# Every branch returns `position`, defaulting to 0 when the property is missing.
# The WITH alias of the first branch is not in scope after UNION ALL, which is
# why the second branch interpolates the id list again.
node_query = f"""
WITH {excluded_survey_ids} AS excludedSurveyIds
MATCH (r:Respondent)<-[:SURVEYED]-(s:Survey) WHERE NOT s.id IN excludedSurveyIds
RETURN id(r) AS id, coalesce(r.position, 0) AS position, labels(r) AS labels
UNION ALL
MATCH (a:Answer)<-[:HAS_ANSWERED]-(r:Respondent)<-[:SURVEYED]-(s:Survey) WHERE NOT s.id IN {excluded_survey_ids}
RETURN id(a) AS id, coalesce(a.position, 0) AS position, labels(a) AS labels
UNION ALL
MATCH (qa:QuestionAlternative)
RETURN id(qa) AS id, coalesce(qa.position, 0) AS position, labels(qa) AS labels
UNION ALL
MATCH (q:Question)
RETURN id(q) AS id, coalesce(q.position, 0) AS position, labels(q) AS labels
"""

# Relationship query: every relationship of the four types is returned.
# The excluded-survey list is not needed here (the original bound it in an
# unused WITH clause): relationships touching nodes that the node query did not
# return are skipped by the projection because validateRelationships=False.
relationship_query = """
MATCH (n)-[r:HAS_ANSWERED|CHOSE|IS_ANSWER_TO|CONSISTS_OF]->(m)
RETURN id(n) AS source, id(m) AS target, type(r) AS type
"""


graph_name = 'respondentAnswerOnlyStudents'

# gds.graph.exists returns a Series holding both `graphName` and `exists`.
# Calling .any() on it is always truthy (the graph name is a non-empty string),
# which made gds.graph.get raise on a fresh database. Read the `exists` field.
if gds.graph.exists(graph_name)["exists"]:
    gds.graph.drop(gds.graph.get(graph_name))

G, result = gds.graph.project.cypher(
    graph_name,
    node_query,
    relationship_query,
    validateRelationships=False
)

print(f"The projection took {result['projectMillis']} ms")
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")

The projection took 800 ms
Graph 'respondentAnswerOnlyStudents' node count: 195704
Graph 'respondentAnswerOnlyStudents' node labels: ['Answer', 'Respondent', 'Question', 'QuestionAlternative']


In [65]:
# FastRP settings: `position` is used as a node feature, the result is stored
# in the in-memory graph under `embedding`, and the seed is fixed.
fastrp_config = {
    "mutateProperty": "embedding",
    "featureProperties": ["position"],
    "randomSeed": 42,
    "embeddingDimension": 128,
    # Ten iterations: the first weighted 0.8, the remaining nine weighted 1.
    "iterationWeights": [0.8] + [1] * 9,
}

result = gds.fastRP.mutate(G, **fastrp_config)

embedding_count = result["nodePropertiesWritten"]
print(f"Number of embedding vectors produced: {embedding_count}")

FastRP:   0%|          | 0/100 [00:00<?, ?%/s]

Number of embedding vectors produced: 195704


In [7]:
# Native projection of the four node labels; only QuestionAlternative carries
# the `position` property (defaulting to 0 when missing).
node_projection = {
    "Respondent": {},
    "Answer": {},
    "Question": {},
    "QuestionAlternative": { "properties": { "position": { "defaultValue": 0 }}}
}
relationship_projection = ["HAS_ANSWERED", "CHOSE", "IS_ANSWER_TO", "CONSISTS_OF"]

full_graph_name = "respondentAnswer"

# gds.graph.project fails when a graph with this name already exists, so drop a
# leftover projection first to keep the cell re-runnable.
if gds.graph.exists(full_graph_name)["exists"]:
    gds.graph.drop(gds.graph.get(full_graph_name))

# NOTE: this rebinds `G`. From here on `G` is the `respondentAnswer` graph, which
# has no `embedding` node property.
G, result = gds.graph.project(full_graph_name, node_projection, relationship_projection)

print(f"The projection took {result['projectMillis']} ms")
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")

The projection took 314 ms
Graph 'respondentAnswer' node count: 225427
Graph 'respondentAnswer' node labels: ['Answer', 'Respondent', 'Question', 'QuestionAlternative']


## TODO: Continue here

In [10]:
# Stream Node Similarity results for the current graph with default settings;
# the returned result is the cell's displayed output.
gds.nodeSimilarity.stream(G)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

In [8]:
# Persist Node Similarity results to the database as NODE_SIMILARITY
# relationships carrying a `score` property.
node_similarity_write_config = {
    "writeRelationshipType": "NODE_SIMILARITY",
    "writeProperty": "score",
}
gds.nodeSimilarity.write(G, **node_similarity_write_config)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

preProcessingMillis                                                       0
computeMillis                                                        587588
writeMillis                                                           21275
postProcessingMillis                                                     -1
nodesCompared                                                        223883
relationshipsWritten                                                2203454
similarityDistribution    {'p1': 0.8000025749206543, 'max': 1.0000071525...
configuration             {'topK': 10, 'writeConcurrency': 4, 'similarit...
Name: 0, dtype: object

In [66]:
# kNN needs the `embedding` node property, which FastRP added to the
# 'respondentAnswerOnlyStudents' projection only. When the notebook is run top
# to bottom, `G` has been rebound to the 'respondentAnswer' projection by now,
# so fetch the embedding graph explicitly instead of relying on `G`.
embedding_graph = gds.graph.get("respondentAnswerOnlyStudents")

# concurrency=1, sampleRate=1.0, deltaThreshold=0.0 and a fixed seed are the
# settings used for a deterministic kNN run; each node gets its top 2 neighbours
# written back as SIMILAR relationships with a `score` property.
result = gds.knn.write(
    embedding_graph,
    topK=2,
    nodeProperties=["embedding"],
    randomSeed=42,
    concurrency=1,
    sampleRate=1.0,
    deltaThreshold=0.0,
    writeRelationshipType="SIMILAR",
    writeProperty="score"
)

print(f"Relationships produced: {result['relationshipsWritten']}")
print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")

Knn:   0%|          | 0/100 [00:00<?, ?%/s]

Relationships produced: 391408
Nodes compared: 195704
Mean similarity: 0.9817089798142201


In [67]:
# List Respondent pairs connected by a SIMILAR relationship with a positive
# score, most similar first (ties broken by the two respondent ids).
similar_respondents_query = """
        MATCH (p1:Respondent)-[r:SIMILAR]->(p2:Respondent)
        WHERE r.score > 0.0
        RETURN p1.id AS person1, p2.id AS person2, r.score AS similarity
        ORDER BY similarity DESCENDING, person1, person2
    """
gds.run_cypher(similar_respondents_query)

Unnamed: 0,person1,person2,similarity
0,169fd8ab-4ea6-485f-97eb-4c2de2e1a384,d683115f-2307-4ac4-9c6c-7dc6dde09e48,0.919332
1,d683115f-2307-4ac4-9c6c-7dc6dde09e48,169fd8ab-4ea6-485f-97eb-4c2de2e1a384,0.919332
2,487b5581-8126-4ead-8468-d1954fd249fe,96aa2830-97ee-442e-85e6-5743fc2fb081,0.918723
3,96aa2830-97ee-442e-85e6-5743fc2fb081,487b5581-8126-4ead-8468-d1954fd249fe,0.918723
4,486162f2-9bc5-4e91-9cdf-52897c4216f9,edba4f5d-67ec-4dc7-9a4b-2c76573632cf,0.917563
...,...,...,...
5999,ad4e6c78-7676-4f8c-8fb0-fd3b43212143,b23ee2ac-e945-47a1-ac95-56591c6aa437,0.827936
6000,c00c06f3-3cbc-4bc2-bd95-b3a7f24476e5,145e03db-3d59-451f-b5eb-d002a9588ce2,0.826752
6001,c00c06f3-3cbc-4bc2-bd95-b3a7f24476e5,6f69ae99-bda7-40e7-bf76-fda37b9f059b,0.826072
6002,1de99987-d2a6-476f-933c-c9f56d6aa06b,152b7cd4-bc61-4bf5-aea3-643f3ec3d722,0.819342


### Before removing duplicate relationships
![Before removing duplicate relationships](fastrp_knn_duplicate_relationships.png)

### Remove duplicate relationships

In [68]:
# For every pair of Respondents linked by SIMILAR in both directions, delete the
# relationship that starts at the node with the lower internal id, leaving one
# relationship per pair.
delete_mirrored_similar_query = """
    MATCH (r1:Respondent)-[rel1:SIMILAR]->(r2:Respondent)
    WHERE id(r1) < id(r2) AND EXISTS((r2)-[:SIMILAR]->(r1))
    WITH rel1
    DELETE rel1
    """
gds.run_cypher(delete_mirrored_similar_query)

### After removing duplicate relationships
![After removing duplicate relationships](fastrp_knn_removed_duplicate_relationships.png)

## Community detection for the similar Respondents

### First, project the `Respondent` nodes and their `SIMILAR` relationships, including the `score` property

In [2]:
similar_graph_name = "knnSimilarRespondents"

# gds.graph.project fails when a graph with this name already exists, so drop a
# leftover projection first to keep the cell re-runnable.
if gds.graph.exists(similar_graph_name)["exists"]:
    gds.graph.drop(gds.graph.get(similar_graph_name))

# Project only Respondent nodes and their SIMILAR relationships (treated as
# undirected), keeping the `score` relationship property.
# NOTE: this rebinds `G` to the new projection.
G, result = gds.graph.project(
    similar_graph_name,
    ["Respondent"],
    {
        "SIMILAR": { "orientation": "UNDIRECTED" }
    },
    relationshipProperties="score"
)

print(f"The projection took {result['projectMillis']} ms")
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")

The projection took 359 ms
Graph 'knnSimilarRespondents' node count: 3475
Graph 'knnSimilarRespondents' node labels: ['Respondent']


### Louvain

In [13]:
# Unweighted Louvain: stream the community assignment of every node and count
# how many distinct communities were found.
result = gds.louvain.stream(G)

community_count = result["communityId"].nunique()
print(f"Number of communities: {community_count}")

Louvain:   0%|          | 0/100 [00:00<?, ?%/s]

Number of communities: 25


In [12]:
# Weighted Louvain: same as the unweighted run, but the `score` relationship
# property is used as the relationship weight.
louvain_weighted_config = {"relationshipWeightProperty": "score"}
result = gds.louvain.stream(G, **louvain_weighted_config)

community_count = result["communityId"].nunique()
print(f"Number of communities: {community_count}")

Louvain:   0%|          | 0/100 [00:00<?, ?%/s]

36

In [14]:
# Run weighted Louvain and persist each node's community id to the database
# under the `louvainCommunity` property, then report the summary statistics.
louvain_write_config = {
    "relationshipWeightProperty": "score",
    "writeProperty": "louvainCommunity",
}
result = gds.louvain.write(G, **louvain_write_config)

community_count = result["communityCount"]
modularity = result["modularity"]
print(f"No. of communities: {community_count}")
print(f"Modularity: {modularity}")

Louvain:   0%|          | 0/100 [00:00<?, ?%/s]

No. of communities: 38
Modularity: 0.7312269661112804


### Unweighted Louvain
```cypher
CALL gds.louvain.stream('knnSimilarRespondents') YIELD nodeId, communityId, intermediateCommunityIds RETURN COUNT(DISTINCT(communityId));
```

### Weighted Louvain

```cypher
CALL gds.louvain.stream('knnSimilarRespondents', { relationshipWeightProperty: 'score' }) YIELD nodeId, communityId, intermediateCommunityIds RETURN COUNT(DISTINCT(communityId));
```

### APOC does not seem to be installed

https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases  
https://neo4j.com/labs/apoc/5/installation/



### Visualize

- https://neo4j.com/labs/apoc/5/export/gephi/
- https://gephi.org/

### APOC installed

Getting APOC to work required editing the configuration.

### Tried visualizing with Gephi

```cypher
MATCH (r1:Respondent)-[s:SIMILAR]-(r2:Respondent)
WITH collect(DISTINCT(r1)) AS res1, collect(DISTINCT(r2)) AS res2, collect(s) AS similarRels
CALL apoc.export.graphml.data(res1 + res2, similarRels, null, {stream:true})
YIELD file, nodes, relationships, properties, data
RETURN file, nodes, relationships, properties, data;
```

Further reading on Gephi:

https://towardsdatascience.com/community-detection-of-the-countries-of-the-world-with-neo4j-graph-data-science-4d3a022f8399

https://tbgraph.wordpress.com/2017/04/01/neo4j-to-gephi/