In [1]:
from neo4j import GraphDatabase
import pandas as pd

import time

In [2]:
df_nodes = pd.read_csv("reuters_nodes - Sheet1.csv")
df_nodes

Unnamed: 0,node_id,node_name,node_NER
0,1,Japan,LOCATION
1,2,Sumita,PERSON
2,3,Paris,LOCATION
3,4,USA,LOCATION
4,5,Tokyo,LOCATION


In [3]:
df_rel = pd.read_csv("reuters_rel - Sheet1.csv")
df_rel

Unnamed: 0,Source,Target,Relationship
0,2,1,WAS_BORN
1,2,5,WAS_BORN
2,2,3,LIVED_IN
3,2,4,STUDIED_IN
4,5,1,IS_CAPITAL


In [4]:
class Neo4jConnection:
    
    def __init__(self, uri, user, pwd):
        
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        
        if self.__driver is not None:
            self.__driver.close()
        
    def query(self, query, parameters=None, db=None):
        
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        
        try: 
            session = self.__driver.session(database=db) if db is not None else self.__driver.session() 
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally: 
            if session is not None:
                session.close()
        return response

In [13]:
conn = Neo4jConnection(uri="bolt://23.20.234.60:7687", user="neo4j", pwd="elapses-calculation-presumption")

In [14]:
conn.query('CREATE CONSTRAINT UniqueEntityId ON (e:Entity) ASSERT e.id IS UNIQUE')

[]

In [19]:
def add_nodes(rows, batch_size=10000):

    query = ''' UNWIND $rows AS row
                MERGE (:Node {name: row.node_name, id: row.node_id, type: row.node_NER})
                RETURN count(*) as total
    '''
    return insert_data(query, rows, batch_size)


def add_edges(rows, batch_size=50000):
    
    
    query = """ UNWIND $rows AS row
                MATCH (src:Node {id: row.Source}), (tar:Node {id: row.Target})
                CREATE (src)-[:%s]->(tar)
    """ % relationship
    
    return insert_data(query, rows, batch_size)

def insert_data(query, rows, batch_size = 10000):

    total = 0
    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):

        res = conn.query(query, parameters={'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
        try:
            total += res[0]['total']
        except:
            total += 0
        batch += 1
        result = {"total":total, "batches":batch, "time":time.time()-start}
        print(result)

    return result

In [16]:
add_nodes(df_nodes)

{'total': 5, 'batches': 1, 'time': 2.4524519443511963}


{'total': 5, 'batches': 1, 'time': 2.4524519443511963}

In [17]:
for relationship in df_rel['Relationship'].unique():
    print(relationship)

WAS_BORN
LIVED_IN
STUDIED_IN
IS_CAPITAL


In [18]:
for relationship in df_rel['Relationship'].unique():
    print(relationship)
    y = df_rel[df_rel['Relationship'] == relationship]
    #print(y.shape)
    add_edges(y)

WAS_BORN
{'total': 0, 'batches': 1, 'time': 2.0683369636535645}
LIVED_IN
{'total': 0, 'batches': 1, 'time': 0.904289722442627}
STUDIED_IN
{'total': 0, 'batches': 1, 'time': 0.3974142074584961}
IS_CAPITAL
{'total': 0, 'batches': 1, 'time': 0.3996567726135254}


In [24]:
# Update node labels based on node_NER list
conn.query ('''
MATCH (n:Entity) 
CALL apoc.create.addLabels(n, n.node_NER) 
YIELD node 
RETURN node
''')

[]