# Building a Knowledge Graph Using Neo4j

### Import Packages


In [10]:
import pandas as pd
from py2neo import Graph, Node, Relationship, NodeMatcher
from neo4j import GraphDatabase
import matplotlib
import matplotlib.pyplot as plt
import yaml

In [31]:
### Import Parameters

In [11]:


# Load the configuration file.
# The encoding is pinned to UTF-8 so the credentials are decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Accessing keys and credentials
# openai_api_key = config["openai"]["api_key"]
neo4j_uri = config["neo4j"]["uri"]
neo4j_user = config["neo4j"]["user"]
neo4j_password = config["neo4j"]["password"]

### Establishing a Connection with Neo4j

In [32]:

### Neo4j connection settings, aliased from the values read out of the config
NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD = neo4j_uri, neo4j_user, neo4j_password

# Open the py2neo handle, authenticating with the (user, password) pair
graph = Graph(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD),
)

### Sanity-check the connection by counting every node currently stored
graph.run("MATCH (n) return count(n)")


count(n)
46206


In [33]:
class Neo4jConnection:
    """Small convenience wrapper around a ``neo4j`` driver.

    The driver is created eagerly in ``__init__``. Driver-creation and query
    failures are reported with ``print`` instead of being raised, so a failed
    query yields ``None`` rather than an exception.

    The object can also be used as a context manager, in which case the
    driver is closed when the ``with`` block exits.
    """

    def __init__(self, uri: str, user: str, pwd: str) -> None:
        """Store the credentials and try to create the driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            pwd: Password used for authentication.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: leave the driver as None; query() will refuse to run.
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never swallow errors."""
        self.close()
        return False

    def close(self) -> None:
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return its records as a list.

        Args:
            query: Statement passed to ``session.run``.
            parameters: Optional parameters passed alongside the statement.
            db: Optional database name; when ``None`` the driver's default
                session is used.

        Returns:
            A list of the records produced by ``session.run``, or ``None``
            if opening the session or running the query raised an exception
            (the error is printed).

        Raises:
            AssertionError: If the driver was never created.
        """
        # Raised explicitly (rather than via ``assert``) so the guard is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialise the result while the session is still open.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [22]:
# Driver-based connection used for the bulk-load query below; it is built from
# the same URI and credentials as the py2neo `graph` handle.
conn = Neo4jConnection(uri=NEO4J_URI, user=NEO4J_USER, pwd=NEO4J_PASSWORD)

In [34]:
# df = pd.read_csv("../data/curated.csv")
# print(df.columns)

In [35]:
# df.article_id.count()

#### Load Data from the Curated CSV

In [28]:
# Cypher statement that bulk-loads curated.csv through apoc.periodic.iterate:
#   * the first inner statement streams the CSV rows (LOAD CSV WITH HEADERS);
#   * the second inner statement is applied to each row, with the config
#     {batchSize: 100, parallel: false, failOnError: false}.
# For every row it MERGEs an ARTICLE node keyed on title = prod_name, plus
# DEPT, COLOR, INDEX and GARMENT nodes keyed on name, and then MERGEs the four
# relationships from the article to those nodes.
# NOTE(review): ARTICLE is matched on prod_name and its other properties are
# set only ON CREATE, so rows sharing a prod_name collapse into a single node
# that keeps the first row's article_id -- confirm this is intended.
# NOTE(review): "file:///curated.csv" is presumably resolved against the Neo4j
# server's import directory -- verify against the server configuration.
# NOTE(review): confirm that failOnError is honoured by the installed APOC
# version.
query = """CALL apoc.periodic.iterate(
  'LOAD CSV WITH HEADERS FROM "file:///curated.csv" AS row RETURN row',
  '
  WITH row
  MERGE (i:ARTICLE {title: row.prod_name})
    ON CREATE SET 
      i.article_id = row.article_id, 
      i.product_code = row.product_code,
      i.detail_desc = row.detail_desc, 
      i.product_group_name = row.product_group_name, 
      i.product_type_name = row.product_type_name
  
  MERGE (a:DEPT {name: row.department_name})
  
  MERGE (b:COLOR {name: row.colour_group_name})
  
  MERGE (c:INDEX {name: row.index_name})
    ON CREATE SET 
      c.index_group_name = row.index_group_name
  
  MERGE (d:GARMENT {name: row.garment_group_name})
  
  MERGE (i)-[:DEPARTMENT_OF]->(a)
  MERGE (i)-[:COLOURED_IN]->(b)
  MERGE (i)-[:GROUP_OF]->(c)
  MERGE (i)-[:TYPE_OF]->(d)
  ',
  {batchSize: 100, parallel: false, failOnError: false}
);"""


In [29]:
# Run the bulk-load statement. Neo4jConnection.query returns the result
# records as a list, or None if the query failed (the error is printed).
conn.query(query)

[<Record batches=1056 total=105542 timeTaken=535 committedOperations=105542 failedOperations=0 failedBatches=0 retries=0 errorMessages={} batch={'total': 1056, 'errors': {}, 'committed': 1056, 'failed': 0} operations={'total': 105542, 'errors': {}, 'committed': 105542, 'failed': 0} wasTerminated=False failedParams={} updateStatistics={'relationshipsDeleted': 0, 'relationshipsCreated': 231096, 'nodesDeleted': 0, 'nodesCreated': 46206, 'labelsRemoved': 0, 'labelsAdded': 46206, 'propertiesSet': 275591}>]