# **Environment Setup**

In [None]:
!pip install pandas
import pandas as pd

try:
    from graphdatascience import GraphDataScience
except:
    !pip install graphdatascience
    from graphdatascience import GraphDataScience


# **Connect to GDS**

Enter your VM IP and database password

In [None]:
# Connection settings used by the GDS client below.
# Replace both values with your own VM address and database password.
password = "DS_Training"
bolt_url = "neo4j://34.130.126.98:8443"

Connect to Neo4j with the GDS Client:

In [None]:
# Create the GDS client: connects to `bolt_url` as user "neo4j" and targets
# the "pubmed" database for all subsequent calls made through `gds`.
gds = GraphDataScience(
    bolt_url,
    auth=("neo4j", password),
    database="pubmed",
)

# **1.  View the Graph Model**

1a. In a separate tab, open Neo4j Browser:   `http://[your-vm-ip]:7474`

*Use the same connection info as in the **Connect to GDS** section above*

1b. Write a query to view the current data model:  `CALL db.schema.visualization()`

<img src="https://drive.google.com/uc?id=145-YpeZAufXlWAdUi0opi6fE6vTfjV3O" width='250'/>

# **2.  Explore Label & Type Distribution**

### 2a. Write a command to output the number of nodes of each label, including multi-labels:

In [None]:
# Exercise 2a: replace the placeholder line below with your own query.
# The placeholder uses `//` because that is Cypher's line-comment marker;
# `#` is not a comment in Cypher.
gds.run_cypher("""
    // your Cypher here
""")

In [None]:
#@title Solution:
# Group all nodes by their full label list (so each multi-label combination
# forms its own group) and count the nodes in each group.
gds.run_cypher(
    """MATCH (n)
    RETURN labels(n) as labels,
    count(*) as nodeCount"""
)

Unnamed: 0,labels,nodeCount
0,[Article],450000
1,[Journal],3416
2,[Author],425119


### 2b. Write a command to output the total number of relationships of each type:

In [None]:
# Exercise 2b: replace the placeholder line below with your own query.
# The placeholder uses `//` because that is Cypher's line-comment marker;
# `#` is not a comment in Cypher.
gds.run_cypher("""
    // your Cypher here
""")

In [None]:
#@title Solution:
# Count relationships, grouped by relationship type together with the label
# lists of the source and target nodes, so each row is one
# (type, source labels, target labels) combination.
gds.run_cypher("""MATCH (s)-[r]->(t)
                  RETURN type(r) AS relType,
                  count(*) AS relCount,
                  labels(s) AS sourceLabels,
                  labels(t) AS targetLabels
                  """)

Unnamed: 0,relType,relCount,sourceLabels,targetLabels
0,IN_JOURNAL,450000,[Article],[Journal]
1,WROTE,1118301,[Author],[Article]
2,CITES,134778,[Article],[Article]
3,PUBLISHED_IN,1118301,[Author],[Journal]


# **3.   Explore Label--Property Associations**

### 3a. Write a command to show which properties are associated with each label

In [None]:
# Exercise 3a: replace the placeholder line below with your own query.
# The placeholder uses `//` because that is Cypher's line-comment marker;
# `#` is not a comment in Cypher.
labelprops = gds.run_cypher("""
    // your Cypher here
""")

# Display the query result as the cell output.
labelprops

In [None]:
#@title Solution:
# Call the built-in schema procedure that lists, per node label, each
# property name with its types and whether it is mandatory.
labelprops = gds.run_cypher("CALL db.schema.nodeTypeProperties")
labelprops


Unnamed: 0,nodeType,nodeLabels,propertyName,propertyTypes,mandatory
0,:`Author`,[Author],fullName,[String],True
1,:`Author`,[Author],lastName,[String],False
2,:`Author`,[Author],foreName,[String],False
3,:`Author`,[Author],initials,[String],False
4,:`Journal`,[Journal],title,[String],True
5,:`Article`,[Article],pmid,[Long],True
6,:`Article`,[Article],title,[String],True
7,:`Article`,[Article],publicationYear,[Long],False
8,:`Article`,[Article],publicationMonth,[String],False


# **4.   Node Degree Analysis**

### 4a. Project the entire graph
*This allows us to use GDS tools to analyze structure*


In [None]:
# Project the whole database into an in-memory GDS graph. The call returns
# the graph handle and the result of the projection call.
allCitations, result = gds.graph.project(
    "all_citations",  # name of the projected graph
    "*",              # node projection: every label
    "*",              # relationship projection: every type
)

# Display the graph handle as the cell output.
allCitations

### 4b. Write a command to display node degree statistics
*Include the min, max, and mean node degree, as well as some key percentiles*

In [None]:
# Exercise 4b: replace `None` with your GDS call. A bare `degree_stats =`
# with only a comment after it is a SyntaxError, so the placeholder binds
# None to keep the cell valid Python until it is filled in.
degree_stats = None  # your code here

In [None]:
#@title Solution:
# Run the degree algorithm in stats mode on the projected graph.
degree_stats = gds.degree.stats(allCitations)

# Show only the distribution summary (min, max, mean and percentiles).
degree_stats["centralityDistribution"]

{'min': 0.0,
 'max': 366.00194549560547,
 'p90': 6.000022888183594,
 'p999': 68.00048065185547,
 'p99': 26.00011444091797,
 'p50': 2.0000076293945312,
 'p75': 2.0000076293945312,
 'p95': 12.000053405761719,
 'mean': 3.2114687794297923}

# **5.   Component Analysis**

### 5a. Write a query to display the number and size of connected components
*Use the same projection as in exercise 4*



In [None]:
# Exercise 5a: replace `None` with your GDS call. A bare `wcc_stats =` with
# only a comment after it is a SyntaxError, so the placeholder binds None to
# keep the cell valid Python until it is filled in.
wcc_stats = None  # your code here

In [None]:
#@title Solution:
# Run the WCC (connected components) algorithm in stats mode on the
# projection created in exercise 4.
wcc_stats = gds.wcc.stats(allCitations)

# Pair the number of components with the summary of their sizes.
# NOTE: this rebinds `result`, which the projection cell in 4a also assigns.
result = (
    wcc_stats["componentCount"],
    wcc_stats["componentDistribution"],
)

result

(103,
 {'min': 2,
  'p5': 2,
  'max': 877719,
  'p999': 877719,
  'p99': 78,
  'p1': 2,
  'p10': 3,
  'p90': 17,
  'p50': 4,
  'p25': 3,
  'p75': 8,
  'p95': 26,
  'mean': 8529.456310679612})