In [144]:
from neo4j import GraphDatabase
import os
import requests
from dotenv import load_dotenv
from getpass import getpass

# get credentials
load_dotenv('target-db.env', override=True)

uri = os.getenv('NEO4J_URI')
username = os.getenv('NEO4J_USERNAME')
password = os.getenv('NEO4J_PASSWORD')

if not uri:
  uri = getpass("Please enter your Neo4j URI: ")
if not username:
  username = getpass("Please enter your Neo4j username: ")
if not password:
  password = getpass("Please enter your Neo4j password: ")

driver = GraphDatabase.driver(uri, auth=(username, password))

In [145]:
from neo4j import RoutingControl

#create uniqueness constraint if not exists
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Generator) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Bus) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Transformer) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Link) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Station) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)


driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Customer) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Installation) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Region) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Consumption) REQUIRE (n.id) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Ticket) REQUIRE (n.ticketNumber) IS NODE KEY',
    #database_=DATABASE,
    routing_=RoutingControl.WRITE
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x10e095bd0>, keys=[])

In [146]:
data_dir = "source-data"
# helper function
def chunks(xs, n=1_000):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

In [147]:
import pandas as pd
from neo4j import RoutingControl

generator_df = pd.read_csv(os.path.join(data_dir,'generators.csv'))

for records in chunks(generator_df.to_dict(orient='records')):
    res = generator_df = driver.execute_query("""
      UNWIND $records as rec
      MERGE (g:Generator {id:rec.ID})
      MERGE (b:Bus {id:rec.BUS_ID})
      MERGE (g)-[r:CONNECTED]->(b)
      SET
        g.capacity = rec.CAPACITY,
        g.category = rec.CATEGORY,
        g.geometry = point({latitude: rec.LATITUDE, longitude: rec.LONGITUDE}),
        g.mb_symbol = rec.MB_SYMBOL,
        g.name_eng = rec.NAME_ENG,
        g.name_nat = rec.NAME_NAT,
        g.symbol = rec.SYMBOL,
        g.tso = rec.TSO,
        g.visible = rec.VISIBLE
      RETURN count(rec) AS records_upserted
    """,
        #database_=DATABASE,
        routing_=RoutingControl.WRITE,
        result_transformer_= lambda r: r.data(),
        records = records
    )
    print(res)
generator_df

[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 172}]


[{'records_upserted': 172}]

In [148]:
bus_df = pd.read_csv(os.path.join(data_dir,'buses.csv'))

for records in chunks(bus_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (b:Bus {id: rec.ID})
      MERGE (s:Station {id: rec.STATION_ID})
      MERGE (b)-[:IN_STATION]->(s)
      SET
        b.category = rec.CATEGORY,
        b.geometry = point({latitude: rec.LATITUDE, longitude: rec.LONGITUDE}),
        b.mb_symbol = rec.MB_SYMBOL,
        b.name_eng = rec.NAME_ENG,
        b.name_nat = rec.NAME_NAT,
        b.symbol = rec.SYMBOL,
        b.tso = rec.TSO,
        b.visible = rec.VISIBLE,
        b.voltage = rec.VOLTAGE
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 893}]


In [149]:
transformer_df = pd.read_csv(os.path.join(data_dir,'transformers.csv'))

for records in chunks(transformer_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (t:Transformer {id: rec.ID})
      MERGE (b:Bus {id: rec.BUS_ID})
      MERGE (t)-[:CONNECTED]->(b)
      SET
        t.dst_dc = rec.DST_DC,
        t.dst_voltage = rec.DST_VOLTAGE,
        t.geometry = point({latitude: rec.LATITUDE, longitude: rec.LONGITUDE}),
        t.src_dc = rec.SRC_DC,
        t.src_voltage = rec.SRC_VOLTAGE,
        t.symbol = rec.SYMBOL
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

[{'records_upserted': 1000}]
[{'records_upserted': 60}]


In [150]:
link_df = pd.read_csv(os.path.join(data_dir,'links.csv'))

for records in chunks(link_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (l:Link {id: rec.ID})
      MERGE (b:Bus {id: rec.BUS_ID})
      MERGE (l)-[:CONNECTED]->(b)
      SET
        l.circuits = rec.CIRCUITS,
        l.dc = rec.DC,
        l.length_m = rec.LENGTH_M,
        l.shape_leng = rec.SHAPE_LENG,
        l.symbol = rec.SYMBOL,
        l.t9_code = rec.T9_CODE,
        l.underground = rec.UNDERGROUND,
        l.visible = rec.VISIBLE,
        l.voltage = rec.VOLTAGE
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 784}]


In [151]:
station_df = pd.read_csv(os.path.join(data_dir,'stations.csv'))

for records in chunks(station_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (s:Station {id: rec.ID})
      SET
        s.name_eng = rec.NAME_ENG,
        s.geometry = point({latitude: rec.LATITUDE, longitude: rec.LONGITUDE})
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)


[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 1000}]
[{'records_upserted': 811}]


In [152]:
customer_df = pd.read_csv(os.path.join(data_dir,'customers.csv'))

for records in chunks(customer_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (c:Customer {id: rec.ID})
      SET
        c.name = rec.NAME,
        c.type = rec.TYPE
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)


[{'records_upserted': 230}]


In [153]:
installation_df

Unnamed: 0,ID,LINK_ID,CUSTOMER_ID,REGION_ID,INSTALLATIONDATE,NOME,TYPE
0,1,10437.0,37813be6-a3bd-45e5-913b-589355110e02,96322,2024-05-16,Instalação0,
1,2,2716.0,c76c36f3-1447-466b-816c-cccba0122f75,96323,2024-05-16,Instalação1,
2,3,10437.0,,96324,2024-05-16,Instalação2,
3,4,9474.0,7239773e-24bb-46b4-8cfd-72b23b76cad2,96325,2024-05-16,Instalação3,
4,5,9474.0,cf96a870-f659-42b7-b0cb-1955582c7219,96322,2024-05-16,Instalação4,
5,6,2688.0,a4eb7262-6c69-43af-9f80-1cda219bb0e2,96323,2024-05-16,Instalação5,
6,7,,,96324,2024-05-16,Instalação6,
7,8,2688.0,c1001c52-1028-4311-900b-328434ee5ff1,96325,2024-05-16,Instalação7,
8,9,,7e8875ad-872d-47e9-ba2a-b27cdf13fe32,96322,2024-05-16,Instalação8,
9,10,2699.0,52383124-7d02-40dc-8657-a9b9da222a49,96323,2024-05-16,Instalação9,


In [154]:
installation_df = pd.read_csv(os.path.join(data_dir,'installations.csv'))

for records in chunks(installation_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (i:Installation {id: rec.ID})
      SET
        i.installationDate = rec.INSTALLATIONDATE,
        i.nome = rec.NOME,
        i.type = rec.TYPE
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

# Due to data quality issues a minority of installations are missing customers, links, and/or region. We work around for now
for records in chunks(installation_df[~installation_df.LINK_ID.isna()].to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (i:Installation {id: rec.ID})
      MERGE (l:Link {id: rec.LINK_ID})
      MERGE (l)-[:LINK_HAS_INSTALLATION]->(i)
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

for records in chunks(installation_df[~installation_df.CUSTOMER_ID.isna()].to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (i:Installation {id: rec.ID})
      MERGE (c:Customer {id: rec.CUSTOMER_ID})
      MERGE (c)-[:CUSTOMER_HAS_INSTALLATION]->(i)
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

for records in chunks(installation_df[~installation_df.REGION_ID.isna()].to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (i:Installation {id: rec.ID})
      MERGE (r:Region {id: rec.REGION_ID})
      MERGE (i)-[:INSTALL_HAS_REGION]->(r)
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)


[{'records_upserted': 30}]
[{'records_upserted': 26}]
[{'records_upserted': 23}]
[{'records_upserted': 30}]


In [155]:
region_df = pd.read_csv(os.path.join(data_dir,'regions.csv'))

for records in chunks(region_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (r:Region {id: rec.ID})
      SET r.name = rec.NAME
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

[{'records_upserted': 4}]


In [156]:
consumption_df = pd.read_csv(os.path.join(data_dir,'consumption_logs.csv'))

for records in chunks(consumption_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (c:Consumption {id: rec.ID})
      MERGE (i:Installation {id: rec.INSTALLATION_ID})
      MERGE (i)-[:INSTALL_HAS_CONSUMPTION]->(c)
      SET
        c.referenceDate = rec.REFERENCEDATE,
        c.quantity = rec.QUANTIDADE,
        c.consumptionValue = rec.CONSUMPTIONVALUE,
        c.invoiceValue = rec.INVOICEVALUE,
        c.newConsumptionValue = rec.NEWCONSUMPTIONVALUE
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)


[{'records_upserted': 360}]


In [157]:
ticket_df = pd.read_csv(os.path.join(data_dir,'tickets.csv'))

for records in chunks(ticket_df.to_dict(orient='records')):
    res = driver.execute_query("""
      UNWIND $records AS rec
      MERGE (t:Ticket {ticketNumber: rec.TICKETNUMBER})
      MERGE (c:Customer {id: rec.CUSTOMER_ID})
      MERGE (c)-[:CREATED_TICKET]->(t)
      SET
        t.createdDate = rec.CREATEDATE,
        t.resolutionDate = rec.RESOLUTIONDATE,
        t.severity = rec.SEVERITY,
        t.status = rec.STATUS
      RETURN count(rec) AS records_upserted
    """, routing_=RoutingControl.WRITE, result_transformer_=lambda r: r.data(), records=records)
    print(res)

[{'records_upserted': 194}]
