In [1]:

import os  # imported locally: this configuration cell runs before the main import cell

# Connection settings for the Neo4j database.
# Each value can be overridden through an environment variable so that real
# credentials never have to be typed into (or committed with) the notebook.
# The literals below are only the fallback defaults.
# NOTE: the name DB_ULR (sic) is kept as-is because the driver-setup cell refers to it.
DB_ULR = os.environ.get("NEO4J_URI", "neo4j+s://summit.graphdatabase.ninja:7687")
DB_USER = os.environ.get("NEO4J_USER", "attendee12")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "fix me")  # placeholder: set NEO4J_PASSWORD instead
DB_NAME = os.environ.get("NEO4J_DATABASE", "eurosong12")

# Create driver
Also select the database named by `DB_NAME`.

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px 
from neo4j import GraphDatabase # Python database driver
from graphdatascience import GraphDataScience # Python GDS client

# A little ceremony: build the Neo4j driver and the GDS client from the same
# endpoint and credentials, then point the GDS client at the target database.
credentials = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=credentials)
gds = GraphDataScience(DB_ULR, auth=credentials)
gds.set_database(DB_NAME)

# Basic data profiling: answering some questions with Cypher

In [3]:
# Sanity check: how many nodes does the database contain in total?
node_count_query = 'match (n) return count(*) as nodeCount'
df = pd.DataFrame(gds.run_cypher(node_count_query))
df.head()

Unnamed: 0,nodeCount
0,54


In [28]:
# Who won in 2006?
# Sum the weights of all incoming 2006 jury and public vote relationships per
# country and keep the five highest totals.
winners_2006_query = """
    MATCH (c:Country)<-[vote:VOTE_2006_JURY|VOTE_2006_PUBLIC]-()
    RETURN c.name as country, sum(vote.weight) as score
    ORDER BY score DESC LIMIT 5
    """
df = pd.DataFrame(gds.run_cypher(winners_2006_query))
df.head()

Unnamed: 0,country,score
0,Finland,292
1,Russia,248
2,Bosnia & Herzegovina,229
3,Romania,172
4,Sweden,170


In [29]:
# Does country-X almost always vote for country-Y?
# Step 1: per target country, count the distinct incoming relationship types
#         (ignoring SPLIT_INTO / WAS_RENAMED and anything containing 'PUBLIC')
#         and keep only targets with more than 15 of them.
# Step 2: per (source, target) pair, count the matching relationships and keep
#         the pairs where that count exceeds 80% of the target's total.
loyal_voters_query = """
    MATCH (target:Country)<-[r]-()
    WHERE NOT type(r) IN ['SPLIT_INTO','WAS_RENAMED']
    AND NOT type(r) CONTAINS 'PUBLIC'
    WITH target, count(DISTINCT type(r)) AS totalentries
    WHERE totalentries > 15
    MATCH (target)<-[r]-(source:Country)
    WHERE NOT type(r) IN ['SPLIT_INTO','WAS_RENAMED']
    AND NOT type(r) CONTAINS 'PUBLIC'
    WITH target, source, count(r) as votes, totalentries
    WHERE votes > totalentries * 0.80
    RETURN source.name AS `country-X`, target.name as `country-Y`, votes, totalentries ORDER BY totalentries+votes DESC
    """
df = pd.DataFrame(gds.run_cypher(loyal_voters_query))
df.head()

Unnamed: 0,country-X,country-Y,votes,totalentries
0,Norway,Sweden,34,42
1,Sweden,Denmark,27,33
2,Spain,Italy,22,25
3,Switzerland,Italy,21,25
4,Estonia,Russia,18,20


In [19]:
# Summary statistics of the number of outgoing relationships per country.
# The variable-length pattern [*0..1] also matches the zero-length path, so a
# country without any outgoing relationship still produces one row; subtracting
# 1 from the row count removes that extra match and yields the actual degree.
out_degree_stats_query = """
    match (c:Country)-[*0..1]->()
    with c, count(*)-1 as degree
    return avg(degree), stDev(degree), min(degree), max(degree)
    """
df = pd.DataFrame(gds.run_cypher(out_degree_stats_query))
df.head()

Unnamed: 0,avg(degree),stDev(degree),min(degree),max(degree)
0,252.185185,146.062286,0,470


In [8]:
# Outgoing relationships of Sweden, counted per relationship type.
# Note: the `year` column holds the relationship type name (e.g. VOTE_2018_JURY),
# not a bare year; rows are sorted on that name in descending order.
sweden_votes_query = """
    match (c:Country{name:"Sweden"})-[r]->()
    return type(r) as year, count(*) as numberOfVotes order by year desc
    """
df = pd.DataFrame(gds.run_cypher(sweden_votes_query))
df.head()

Unnamed: 0,year,numberOfVotes
0,VOTE_2018_PUBLIC,10
1,VOTE_2018_JURY,10
2,VOTE_2017_PUBLIC,10
3,VOTE_2017_JURY,10
4,VOTE_2016_PUBLIC,10


# Let's try out some algorithms
Can we group countries into communities based on the votes they share?

In [9]:
# Project every Country node together with all relationship types into an
# in-memory graph named 'p1'.
# Depending on the graphdatascience client version, project() returns either the
# Graph object itself (early releases) or a (Graph, result) tuple (later
# releases). Normalise to the Graph object so the calls below work with both.
projection = gds.graph.project('p1', ['Country'], '*')
G1 = projection[0] if isinstance(projection, tuple) else projection

# Last expression of the cell: a one-line summary of the projected graph's size.
f"Nodes: {G1.node_count()}, Relationships {G1.relationship_count()}"

'Nodes: 54, Relationships 13618'

In [10]:
# Run Weakly Connected Components in stats mode on the projected graph; the
# result is a summary (component count, component size distribution, timings).
gds.wcc.stats(G1)

{'componentCount': 2,
 'componentDistribution': {'p99': 53,
  'min': 1,
  'max': 53,
  'mean': 27.0,
  'p90': 53,
  'p50': 1,
  'p999': 53,
  'p95': 53,
  'p75': 53},
 'postProcessingMillis': 17,
 'preProcessingMillis': 0,
 'computeMillis': 4,
 'configuration': {'jobId': '76650f8c-1251-4a51-bae1-02bd27d5ccfd',
  'seedProperty': None,
  'consecutiveIds': False,
  'threshold': 0.0,
  'relationshipWeightProperty': None,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'username': None,
  'concurrency': 4}}

In [11]:
# Run Weakly Connected Components in write mode: each node's component id is
# stored in the `wcc` node property so it can be queried with Cypher afterwards.
gds.wcc.write(G1, writeProperty="wcc")

{'writeMillis': 14,
 'nodePropertiesWritten': 54,
 'componentCount': 2,
 'componentDistribution': {'p99': 53,
  'min': 1,
  'max': 53,
  'mean': 27.0,
  'p90': 53,
  'p50': 1,
  'p999': 53,
  'p95': 53,
  'p75': 53},
 'postProcessingMillis': 3,
 'preProcessingMillis': 0,
 'computeMillis': 1,
 'configuration': {'jobId': '811f2607-9418-47c6-9942-9dd7d566674b',
  'writeConcurrency': 4,
  'seedProperty': None,
  'consecutiveIds': False,
  'writeProperty': 'wcc',
  'threshold': 0.0,
  'relationshipWeightProperty': None,
  'nodeLabels': ['*'],
  'sudo': False,
  'relationshipTypes': ['*'],
  'concurrency': 4,
  'username': None}}

In [24]:
# What country ended up in its own community
df = pd.DataFrame( 
    gds.run_cypher("""
    match (c:Country)
    return c.wcc as community, count(*) as numberOfContries, collect(c.name) as countries order by numberOfContries asc limit 1
    """) 
)
df.head()

Unnamed: 0,community,numberOfContries,countries
0,29,1,[Macedonia]


In [20]:
# Clean up: drop the projected graph referenced by G1 once the analysis is done.
G1.drop()