In [None]:
#! pip install neo4j

# Creating a knowledge graph in neo4j 

Here we provide a sample notebook for knowledge graph (KG) construction using hypothetical graph data with 3 types of nodes and one relationship type. The general steps for setting up the KG on a personal computer are:

- First download and install the neo4j Desktop on your computer.
- Create a new project.
- Click the ```ADD``` tab to the right of the project and add a local DBMS. Make sure you note your username and password for future access.
- Install the ```apoc``` library as a plugin.
- Open the config folder and place the provided ```neo4j.config``` and ```apoc.config``` files there:
    a. Click on the 3 dots to the right of the graph DBMS.
    b. Click open.
    c. Click configuration.
    d. Copy and paste the provided neo4j.config and apoc.config files there.
- Start the graph database
- Click open tab of the DBMS to open neo4j Browser.
- Install ```anaconda``` python and run ```Jupyter application```
- Use this notebook to communicate with graph database

In [1]:
import pandas as pd
import json
import time
from neo4j import GraphDatabase

#### Make a connection with Graph Database

In [2]:
import os
import pandas as pd
from neo4j import GraphDatabase

# Connection settings are read from the environment so that real credentials
# do not have to be saved inside the notebook. The fallbacks are the values
# this cell previously hard-coded, so an unconfigured local setup still works.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "1234")

driver = GraphDatabase.driver(uri=NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# NOTE(review): the loader functions in this notebook open their own
# short-lived sessions via `with driver.session()`, so this module-level
# session is kept only for backward compatibility with cells that may use it.
session = driver.session()

In [3]:
driver

<neo4j._sync.driver.BoltDriver at 0x7f8fd37807c0>

#### Create Constraints for uniqueness of nodes

In [4]:
def create_constraints(driver):
    """Create a uniqueness constraint on ``nodeId`` for Node1, Node2 and Node3.

    Every statement carries ``IF NOT EXISTS`` so the function can be called
    repeatedly (for example on Restart & Run All) without the second call
    failing because the constraints are already present.

    Parameters
    ----------
    driver
        Object whose ``session()`` returns a context manager yielding a
        session with a ``run(query)`` method (a Neo4j driver in this notebook).
    """
    labels = ("Node1", "Node2", "Node3")
    # Constraint names are unchanged: UniqueNode1IdConstraint, ...Node2..., ...Node3...
    queries = [
        f"CREATE CONSTRAINT Unique{label}IdConstraint IF NOT EXISTS "
        f"FOR (n:{label}) REQUIRE n.nodeId IS UNIQUE"
        for label in labels
    ]
    with driver.session() as session:
        for constraint in queries:
            session.run(constraint)

In [5]:
'''UNCOMMENT AND RUN THIS CELL ONLY ONCE'''
#create_constraints(driver)

### Populate Node
- There are three arbitrary node types
- There is one edge type between nodes.

In [6]:
def _merge_nodes_from_json(label, data):
    """MERGE one ``label`` node per record that APOC loads from ``data``.

    Shared implementation behind create_node1/2/3, which previously were three
    copies of the same code differing only in the node label.
    """
    def tx_function(tx, data):
        # The file location travels as the $data query parameter instead of
        # being concatenated into the Cypher text, so quotes or other special
        # characters in the path cannot alter the query. `label` is only ever
        # one of the fixed strings supplied by the wrappers below; it has to be
        # formatted into the text because it is part of the MERGE pattern.
        query = (
            "WITH $data AS url "
            "CALL apoc.load.json(url) YIELD value "
            f"MERGE (n:{label} {{nodeId: value.nodeId}}) "
            # Properties are only written when the node is newly created;
            # an existing node with the same nodeId is left untouched.
            "ON CREATE SET n.property1 = value.property1, "
            "n.property2 = value.property2, "
            "n.description = value.description"
        )
        tx.run(query, data=data)

    # Uses the notebook-level `driver`; the session is closed on exit.
    with driver.session() as session:
        session.execute_write(tx_function, data)


def create_node1(data):
    """Load the JSON at ``data`` through APOC and MERGE its records as Node1 nodes.

    Parameters
    ----------
    data : str
        Location string handed to ``apoc.load.json``.
    """
    _merge_nodes_from_json("Node1", data)


def create_node2(data):
    """Load the JSON at ``data`` through APOC and MERGE its records as Node2 nodes.

    Parameters
    ----------
    data : str
        Location string handed to ``apoc.load.json``.
    """
    _merge_nodes_from_json("Node2", data)


def create_node3(data):
    """Load the JSON at ``data`` through APOC and MERGE its records as Node3 nodes.

    Parameters
    ----------
    data : str
        Location string handed to ``apoc.load.json``.
    """
    _merge_nodes_from_json("Node3", data)

In [7]:
# Load the Node1 records and report the wall-clock time the load took.
data = "data/n1.json"

t1 = time.time()
create_node1(data)
t2 = time.time()

elapsed = t2 - t1
print("success! total time: ", elapsed)

success! total time:  1.462674856185913


In [8]:
# Load the Node2 records and report the wall-clock time the load took.
data = "data/n2.json"

t1 = time.time()
create_node2(data)
t2 = time.time()

elapsed = t2 - t1
print("success! total time: ", elapsed)

success! total time:  1.2738971710205078


In [9]:
# Load the Node3 records and report the wall-clock time the load took.
data = "data/n3.json"

t1 = time.time()
create_node3(data)
t2 = time.time()

elapsed = t2 - t1
print("success! total time: ", elapsed)

success! total time:  0.2110300064086914


### Create Edges

In [10]:
def create_edge(data):
    """Load edge records from ``data`` through APOC and MERGE the relationships.

    For every record, the nodes whose ``nodeId`` equals ``source_nodeId`` and
    ``target_nodeId`` are matched and joined by a ``HAS_RELATIONSHIP`` edge.
    MERGE means an already existing edge between the pair is not duplicated.

    Parameters
    ----------
    data : str
        Location string handed to ``apoc.load.json``.
    """
    def tx_function(tx, data):
        # The location is passed as the $data query parameter rather than being
        # concatenated into the Cypher text, so special characters in it cannot
        # change the query.
        query = (
            "WITH $data AS url "
            "CALL apoc.load.json(url) YIELD value "
            # No label in either MATCH: a node of any type with that nodeId matches.
            "MATCH (na {nodeId: value.source_nodeId}) "
            "MATCH (nb {nodeId: value.target_nodeId}) "
            "MERGE (na)-[:HAS_RELATIONSHIP]->(nb)"
        )
        tx.run(query, data=data)

    # Uses the notebook-level `driver`; the session is closed on exit.
    with driver.session() as session:
        session.execute_write(tx_function, data)

In [11]:
# Load the relationship records and report the wall-clock time the load took.
data = "data/relation.json"

t1 = time.time()
create_edge(data)
t2 = time.time()

elapsed = t2 - t1
print("success! total time: ", elapsed)

success! total time:  1.1760571002960205


### Template Data Preparation

The code below is not necessary for your class project. It generates arbitrary sample node and edge data used to construct the template knowledge graph.

In [None]:
import random

def get_node_detail():
    """Generate random field values for one synthetic node.

    Returns
    -------
    tuple of str
        ``(nodeId, property1, property2, paragraph)``. The first three are
        random tokens of 5 to 7 characters. ``paragraph`` is 100 random tokens
        of 2 to 8 characters, each followed by a single space.
    """
    alphabet = 'a@b#c$d%^ef&g*hijklmnopqrstuvwxyz'

    def random_token(min_len, max_len):
        # random.choice draws from the whole alphabet. The previous hard-coded
        # index bound (0..26) silently excluded its last six characters.
        length = random.randint(min_len, max_len)
        return "".join(random.choice(alphabet) for _ in range(length))

    nodeId = random_token(5, 7)
    property1 = random_token(5, 7)
    property2 = random_token(5, 7)

    # Each word keeps its trailing space, so the paragraph ends with a space.
    paragraph = "".join(random_token(2, 8) + " " for _ in range(100))
    return nodeId, property1, property2, paragraph

In [None]:
# Draw one sample record; the four returned values are unpacked into separate names.
nodeId, property1, property2, paragraph = get_node_detail()

In [None]:
nodeId

In [None]:
def node_generator(N):
    """Build a list of ``N`` synthetic node records.

    Each record is a dict with the keys ``nodeId``, ``property1``,
    ``property2`` and ``description``, filled in that order from one call to
    ``get_node_detail()``.
    """
    field_names = ("nodeId", "property1", "property2", "description")
    return [dict(zip(field_names, get_node_detail())) for _ in range(N)]

In [None]:
# Three independent batches of 100 synthetic records, one batch per node type.
# Despite the *_dict names, each value is a list of record dicts.
n1_dict, n2_dict, n3_dict = (node_generator(100) for _ in range(3))

In [None]:
# Write each batch of node records to its own file. Previously n1_dict was
# dumped into all three files, so the Node2/Node3 files did not contain the
# records that the relation data refers to.
for path, records in (("data/n1.json", n1_dict),
                      ("data/n2.json", n2_dict),
                      ("data/n3.json", n3_dict)):
    with open(path, "w") as f:
        json.dump(records, f)

In [None]:
# Flatten the three batches into one list of {node_type, nodeId} entries,
# keeping the Node1 -> Node2 -> Node3 order.
allnodes = [
    {"node_type": node_type, "nodeId": item['nodeId']}
    for node_type, records in (("Node1", n1_dict), ("Node2", n2_dict), ("Node3", n3_dict))
    for item in records
]
    

In [None]:
len(allnodes)

In [None]:
random.choice(allnodes)

In [None]:
# Build 500 random edges. Both endpoints are drawn independently from allnodes,
# so self-loops and repeated pairs are possible.
relation = []
edge = "HAS_RELATIONSHIP"
for _ in range(500):
    nodeA = random.choice(allnodes)
    nodeB = random.choice(allnodes)

    relation.append({"source_nodeId": nodeA["nodeId"],
                     "source_type": nodeA['node_type'],
                     "target_nodeId": nodeB["nodeId"],
                     # Key was previously misspelled as "tareget_type".
                     "target_type": nodeB['node_type'],
                     "relation": edge})

In [None]:
len(relation)

In [None]:
#relation

In [None]:
# Serialise the edge list first, then write the text out in one go.
relation_json = json.dumps(relation)
with open("data/relation.json", "w") as f:
    f.write(relation_json)

### References

https://neo4j.com/docs/cypher-cheat-sheet/current/