In [1]:

import os

# Connection settings for the Neo4j instance used throughout this notebook.
# Each value can be overridden with an environment variable so that personal
# credentials never have to be saved in the notebook; the literals below are
# the fallback values used when the variable is not set.
# NOTE: the name DB_ULR (sic, "URL") is kept as-is because the cell that
# creates the driver and the GDS client refers to it by this name.
DB_ULR = os.environ.get("NEO4J_URI", "neo4j+s://summit.graphdatabase.ninja:7687")
DB_USER = os.environ.get("NEO4J_USERNAME", "attendee12")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "trinity12")
DB_NAME = os.environ.get("NEO4J_DATABASE", "eurosong12")

# Create driver
Also set the DB_NAME

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px 
from neo4j import GraphDatabase # Python database driver
from graphdatascience import GraphDataScience # Python GDS client

# A little ceremony: build the Neo4j driver and the GDS client from the same
# endpoint and credentials, then point the GDS client at the target database.
_auth = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=_auth)
gds = GraphDataScience(DB_ULR, auth=_auth)
gds.set_database(DB_NAME)

# Basic data profiling: answering some questions with Cypher

In [3]:
# How many nodes does the database contain?
node_count_query = 'match (n) return count(*) as nodeCount'
df = pd.DataFrame(gds.run_cypher(node_count_query))
df.head()

Unnamed: 0,nodeCount
0,54


In [4]:
# Who won in 2006?
# Sum the weight of every incoming 2006 jury and public vote per country
# and keep the five highest totals.
winners_2006_query = """
    MATCH (c:Country)<-[vote:VOTE_2006_JURY|VOTE_2006_PUBLIC]-()
    RETURN c.name as country, sum(vote.weight) as score
    ORDER BY score DESC LIMIT 5
    """
df = pd.DataFrame(gds.run_cypher(winners_2006_query))
df.head()

Unnamed: 0,country,score
0,Finland,292
1,Russia,248
2,Bosnia & Herzegovina,229
3,Romania,172
4,Sweden,170


In [5]:
# Does country-X almost always vote for country-Y?
#
# Step 1: for each target country, count the distinct incoming relationship
#         types (`totalentries`), ignoring SPLIT_INTO / WAS_RENAMED and any
#         type whose name contains 'PUBLIC'; keep targets above MIN_ENTRIES.
# Step 2: for each (source, target) pair, count the relationships from source
#         to target under the same filter (`votes`) and keep the pairs where
#         votes exceed MIN_VOTE_SHARE of the target's `totalentries`.
#
# The thresholds are passed as Cypher parameters so they can be tuned here
# without editing the query text. run_cypher already returns a DataFrame, so
# the result is used directly.
MIN_ENTRIES = 15       # target must exceed this many distinct vote types
MIN_VOTE_SHARE = 0.80  # source must exceed this share of the target's entries
df = gds.run_cypher(
    """
    MATCH (target:Country)<-[r]-()
    WHERE NOT type(r) IN ['SPLIT_INTO','WAS_RENAMED']
    AND NOT type(r) CONTAINS 'PUBLIC'
    WITH target, count(DISTINCT type(r)) AS totalentries
    WHERE totalentries > $minEntries
    MATCH (target)<-[r]-(source:Country)
    WHERE NOT type(r) IN ['SPLIT_INTO','WAS_RENAMED']
    AND NOT type(r) CONTAINS 'PUBLIC'
    WITH target, source, count(r) as votes, totalentries
    WHERE votes > totalentries * $minShare
    RETURN source.name AS `country-X`, target.name as `country-Y`, votes, totalentries ORDER BY totalentries+votes DESC
    """,
    params={'minEntries': MIN_ENTRIES, 'minShare': MIN_VOTE_SHARE},
)
df.head()

Unnamed: 0,country-X,country-Y,votes,totalentries
0,Norway,Sweden,34,42
1,Sweden,Denmark,27,33
2,Spain,Italy,22,25
3,Switzerland,Italy,21,25
4,Estonia,Russia,18,20


In [6]:
# Out-degree statistics across all Country nodes.
# The [*0..1] pattern also matches the zero-length path, so every country
# produces at least one row; subtracting 1 from the row count removes it.
degree_stats_query = """
    match (c:Country)-[*0..1]->()
    with c, count(*)-1 as degree
    return avg(degree), stDev(degree), min(degree), max(degree)
    """
df = pd.DataFrame(gds.run_cypher(degree_stats_query))
df.head()

Unnamed: 0,avg(degree),stDev(degree),min(degree),max(degree)
0,252.185185,146.062286,0,470


In [7]:
# Outgoing relationships of Sweden, counted per relationship type.
# The type name is returned in the `year` column and sorted descending.
sweden_votes_query = """
    match (c:Country{name:"Sweden"})-[r]->()
    return type(r) as year, count(*) as numberOfVotes order by year desc
    """
df = pd.DataFrame(gds.run_cypher(sweden_votes_query))
df.head()

Unnamed: 0,year,numberOfVotes
0,VOTE_2018_PUBLIC,10
1,VOTE_2018_JURY,10
2,VOTE_2017_PUBLIC,10
3,VOTE_2017_JURY,10
4,VOTE_2016_PUBLIC,10


# Let's try out some algorithms
Can we arrange symptoms based on diseases they share?

In [11]:
# Project every Country node together with all relationship types ('*') into
# an in-memory graph named 'p1'.
# A projection left in the catalog by an earlier or interrupted run is dropped
# first: projecting under a name that already exists fails, which would make
# this cell break on re-run.
if gds.graph.exists('p1')['exists']:
    gds.graph.get('p1').drop()
G1, project_stats = gds.graph.project('p1', ['Country'], '*')
# Hint: G1.node_count() and G1.relationship_count() give the projected sizes.

In [14]:
# Display the statistics that the projection call returned alongside the graph object.
project_stats

nodeProjection            {'Country': {'label': 'Country', 'properties':...
relationshipProjection    {'__ALL__': {'orientation': 'NATURAL', 'aggreg...
graphName                                                                p1
nodeCount                                                                54
relationshipCount                                                     13618
projectMillis                                                             4
Name: 0, dtype: object

In [13]:
# Run WCC on G1 in stats mode to get summary figures such as componentCount.
gds.wcc.stats(G1)

componentCount                                                           2
componentDistribution    {'p99': 53, 'min': 1, 'max': 53, 'mean': 27.0,...
postProcessingMillis                                                     2
preProcessingMillis                                                      0
computeMillis                                                            1
configuration            {'jobId': '839a12d0-bb58-4106-adb3-3ff06c297e6...
Name: 0, dtype: object

In [15]:
# Run WCC on G1 in write mode, storing the result under the node property `wcc`.
gds.wcc.write(G1, writeProperty = 'wcc')

writeMillis                                                              1
nodePropertiesWritten                                                   54
componentCount                                                           2
componentDistribution    {'p99': 53, 'min': 1, 'max': 53, 'mean': 27.0,...
postProcessingMillis                                                     2
preProcessingMillis                                                      0
computeMillis                                                            0
configuration            {'jobId': 'c1283b46-25cd-4779-8e1c-e1e36527681...
Name: 0, dtype: object

In [16]:
# Which country ended up in its own community?
# Group countries by their `wcc` property and keep only the group with the
# fewest members.
smallest_community_query = """
    match (c:Country)
    return c.wcc as community, count(*) as numberOfContries, collect(c.name) as countries order by numberOfContries asc limit 1
    """
df = pd.DataFrame(gds.run_cypher(smallest_community_query))
df.head()

Unnamed: 0,community,numberOfContries,countries
0,29,1,[Macedonia]


In [17]:
# Remove the in-memory projection behind G1.
G1.drop()

In [18]:
# Project the Country nodes and their VOTE_1975_JURY relationships as an
# undirected graph named 'eurosong1975', carrying the `weight` property.
# A projection with the same name left over from an earlier or interrupted run
# is dropped first, because projecting under an existing name fails.
if gds.graph.exists('eurosong1975')['exists']:
    gds.graph.get('eurosong1975').drop()
G2, project_stats = gds.graph.project(
    'eurosong1975',
    ['Country'],
    {'VOTE_1975_JURY': {'orientation': 'UNDIRECTED', 'properties': 'weight'}},
)
project_stats

nodeProjection            {'Country': {'label': 'Country', 'properties':...
relationshipProjection    {'VOTE_1975_JURY': {'orientation': 'UNDIRECTED...
graphName                                                      eurosong1975
nodeCount                                                                54
relationshipCount                                                       380
projectMillis                                                             4
Name: 0, dtype: object

In [20]:
# List the projected graphs together with their size and configuration details.
gds.graph.list()

Unnamed: 0,degreeDistribution,graphName,database,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema
0,"{'p99': 27, 'min': 0, 'max': 28, 'mean': 7.037...",eurosong1975,eurosong12,2343 KiB,2399296,54,380,{'relationshipProjection': {'VOTE_1975_JURY': ...,0.132774,2022-06-20T12:22:56.634192000+00:00,2022-06-20T12:22:56.639228000+00:00,"{'graphProperties': {}, 'relationships': {'VOT..."


In [21]:
# Remove the in-memory projection behind G2.
G2.drop()

In [22]:
# Cypher projection 'p3': only the countries that have at least one
# VOTE_1975_JURY relationship (in either direction), connected by those
# relationships in their stored direction and carrying `weight`.
# A projection named 'p3' left over from an earlier or interrupted run is
# dropped first, because projecting under an existing name fails.
if gds.graph.exists('p3')['exists']:
    gds.graph.get('p3').drop()
G3, project_stats = gds.graph.project.cypher(
    'p3',
    "MATCH (c:Country) WHERE EXISTS ((c)-[:VOTE_1975_JURY]-()) RETURN id(c) as id, labels(c) as labels",
    "MATCH (s:Country)-[r:VOTE_1975_JURY]->(t:Country) RETURN id(s) as source, id(t) as target, type(r) as type, r.weight as weight",
)
project_stats

nodeQuery            MATCH (c:Country) WHERE EXISTS ((c)-[:VOTE_197...
relationshipQuery    MATCH (s:Country)-[r:VOTE_1975_JURY]->(t:Count...
graphName                                                           p3
nodeCount                                                           19
relationshipCount                                                  190
projectMillis                                                        4
Name: 0, dtype: object

In [37]:
# Weighted PageRank on G3, streamed back as one (nodeId, score) row per node.
pagerank_settings = {
    "maxIterations": 20,
    "dampingFactor": 0.85,
    "relationshipWeightProperty": "weight",
}
df_rank = gds.pageRank.stream(G3, **pagerank_settings)
df_rank.head()




Unnamed: 0,nodeId,score
0,7,0.397409
1,15,1.091695
2,16,1.554096
3,19,0.3998
4,23,1.224899


In [77]:

# Attach country names to the PageRank scores and show the top ten.
#
# The rows are built column by column with explicit int/float conversion.
# Converting the whole frame at once (df_rank.values.tolist()) upcasts the
# integer nodeId column to float, so ids were sent as e.g. 49.0, compared
# against id(c) as floats, and came back as floats in the result.
rank_rows = [
    [int(node_id), float(score)]
    for node_id, score in zip(df_rank["nodeId"], df_rank["score"])
]
df = gds.run_cypher(
    """
        unwind $p as x
        match (c:Country) where id(c) = x[0]
        return c.name, x[0] as nodeId, x[1] as score order by score desc
    """, params = { 'p' : rank_rows })
df.head(10)

Unnamed: 0,c.name,nodeId,score
0,The Netherlands,49.0,2.137752
1,United Kingdom,52.0,1.930939
2,Italy,25.0,1.560787
3,France,16.0,1.554096
4,Luxembourg,28.0,1.50057
5,Switzerland,48.0,1.352768
6,Ireland,23.0,1.224899
7,Finland,15.0,1.091695
8,Sweden,47.0,1.058185
9,Israel,24.0,0.830797


In [None]:
# Remove the in-memory projection behind G3.
G3.drop()