In [1]:
# Project: MIDS W205 Spring 2024 Project 3
# Author: Timothy Majidzadeh
# Purpose: Build the Neo4j graph of the BART stations, 
# modify it for our business case, and test shortest path algorithms.
# Date Created: April 5th, 2024
# Date Updated: April 8th, 2024
# Notes: Re-uses code from T. Majidzadeh's execution of Exercises 3-1 through 3-4.

In [2]:
# Import packages needed for exercises 3-1 thru 3-4

import neo4j
import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [3]:
# Define functions needed for exercise 3-1.
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas DataFrame.

    Args:
        query: SQL text handed unchanged to ``pd.read_sql_query``.
        rollback_before_flag: if truthy, roll back the module-level
            ``connection`` before the query is run.
        rollback_after_flag: if truthy, roll back ``connection`` after the
            rows have been read.

    Returns:
        The query result. Every ``float64`` column whose non-NaN values are
        all finite whole numbers is converted to the nullable ``Int64`` dtype
        (NaN becomes ``<NA>``); all other columns are returned as read.
    """
    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # Fix the float columns that really should be integers: integer columns
    # that contain NULLs are read back as float64.
    for column in df:
        if df[column].dtype != "float64":
            continue

        values = df[column].to_numpy()
        present = values[~np.isnan(values)]

        # An infinite value has no integer representation, so such a column is
        # left as float64 instead of raising while testing for a fraction.
        if np.isfinite(present).all() and (present == np.floor(present)).all():
            df[column] = df[column].astype("Int64")

    return df
    
# Module-level PostgreSQL connection shared by my_select_query_pandas and
# rollback_and_query.
# NOTE(review): the credentials are hard-coded here; consider reading them
# from the environment before sharing this notebook.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

# Single cursor reused by rollback_and_query and by the cells that call
# cursor.fetchall() afterwards.
cursor = connection.cursor()

def my_read_csv_file(file_name, limit):
    """Print the first ``limit`` rows of a CSV file, then a row-count summary.

    Args:
        file_name: path of the CSV file to read.
        limit: maximum number of rows to print; every row is still counted.

    Each printed row is the list of strings produced by ``csv.reader``. The
    final line reports how many rows were printed and how many the file holds.
    """
    total_rows = 0

    # The context manager closes the file even if reading fails; newline=""
    # is how the csv module expects its input files to be opened.
    with open(file_name, "r", newline="") as csv_file:
        for total_rows, row in enumerate(csv.reader(csv_file), start=1):
            if total_rows <= limit:
                print(row)

    print("\nPrinted ", min(limit, total_rows), "lines of ", total_rows, "total lines.")

In [4]:
# Define additional functions needed for exercise 3-3.
# Module-level Neo4j driver and session; the my_neo4j_* helpers in this cell
# run their Cypher through this session.
# NOTE(review): the password is hard-coded here; consider reading it from the
# environment before sharing this notebook.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Two statements are run, in order, on the module-level ``session``: the
    first deletes every relationship together with its start node, the second
    deletes whatever nodes remain.
    """
    cleanup_statements = (
        "match (node)-[relationship]->() delete node, relationship",
        "match (node) delete node",
    )

    for statement in cleanup_statements:
        session.run(statement)
    
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a query and return the results in a pandas DataFrame.

    ``query`` and any keyword arguments are forwarded to ``session.run``.
    Each returned record becomes one row; the column names come from the
    result's keys.
    """
    records = session.run(query, **kwargs)

    # Collect the rows first, then ask the result for its column names.
    rows = []
    for record in records:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=records.keys())

def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships.

    Both counts are the row counts of DataFrames returned by
    ``my_neo4j_run_query_pandas``: one row per node for the first query, one
    row per directed relationship for the second.
    """
    node_query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    node_total = len(my_neo4j_run_query_pandas(node_query).index)

    relationship_query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    relationship_total = len(my_neo4j_run_query_pandas(relationship_query).index)

    rule = "-------------------------"
    print(rule)
    print("  Nodes:", node_total)
    print("  Relationships:", relationship_total)
    print(rule)

def my_neo4j_create_node(station_name):
    """Create one node labelled ``Station`` whose ``name`` is ``station_name``.

    The name is passed as a query parameter rather than spliced into the
    Cypher text.
    """
    cypher = """
    
    CREATE (:Station {name: $station_name})
    
    """
    params = {"station_name": station_name}

    session.run(cypher, **params)

def my_neo4j_create_relationship_one_way(from_station, to_station, weight):
    """Create a single weighted ``LINK`` relationship from one station to another.

    Both stations are looked up by ``name`` among existing ``Station`` nodes;
    the relationship points from ``from_station`` to ``to_station`` and stores
    ``weight`` as its ``weight`` property.
    """
    cypher = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """
    params = {
        "from_station": from_station,
        "to_station": to_station,
        "weight": weight,
    }

    session.run(cypher, **params)
    
def my_neo4j_create_relationship_two_way(from_station, to_station, weight):
    """Create weighted ``LINK`` relationships in both directions between two stations.

    Both stations are looked up by ``name`` among existing ``Station`` nodes;
    one statement creates the forward and the reverse relationship, each
    carrying the same ``weight`` property.
    """
    cypher = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """
    params = {
        "from_station": from_station,
        "to_station": to_station,
        "weight": weight,
    }

    session.run(cypher, **params)
    
def my_neo4j_shortest_path(from_station, to_station):
    """Given a from station and a to station, run and print the shortest path.

    The in-memory graph ``ds_graph`` is dropped and re-projected from the
    ``Station`` nodes and ``LINK`` relationships on every call, then Dijkstra
    is streamed between the two named nodes using the ``weight`` property.
    For each result the total cost, the total divided by 60 (labelled
    "Minutes"), and one line per node (name, step cost, cumulative cost) are
    printed.
    """
    # Rebuild the projection so it reflects the current contents of the database.
    setup_statements = (
        "CALL gds.graph.drop('ds_graph', false)",
        "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})",
    )
    for statement in setup_statements:
        session.run(statement)

    path_query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    for record in session.run(path_query, source=from_station, target=to_station):

        total_cost = int(record['totalCost'])

        print("\n--------------------------------")
        print("   Total Cost: ", total_cost)
        print("   Minutes: ", round(total_cost / 60.0,1))
        print("--------------------------------")

        node_names = record['nodes']
        cumulative_costs = record['costs']

        # costs are cumulative along the path; the step cost is the
        # difference from the previous node's cumulative cost.
        previous = 0
        for position, node_name in enumerate(node_names):
            cumulative = int(cumulative_costs[position])
            print(node_name + ", " + str(cumulative - previous)  + ", " + str(cumulative))
            previous = cumulative

In [5]:
# Define a function to reduce connection.rollback(), cursor.execute(), and connection.commit() statements.
def rollback_and_query(query):
    """Roll back, execute ``query`` on the shared cursor, then commit.

    Uses the module-level ``connection`` and ``cursor``. Nothing is returned;
    callers that need rows call ``cursor.fetchall()`` afterwards.
    """
    # Roll back first so the statement starts in a fresh transaction.
    connection.rollback()
    cursor.execute(query)
    connection.commit()

In [6]:
# Load the tables, including exercise 3-1 & new tables.
# Drop the table left over from a previous run. "if exists" keeps this cell
# from failing on a fresh database where the table was never created.
query = """
drop table if exists stations
"""

rollback_and_query(query)

In [7]:
# Drop the table left over from a previous run. "if exists" keeps this cell
# from failing on a fresh database where the table was never created.
query = """
drop table if exists lines
"""

rollback_and_query(query)

In [8]:
# Drop the table left over from a previous run. "if exists" keeps this cell
# from failing on a fresh database where the table was never created.
query = """
drop table if exists travel_times
"""

rollback_and_query(query)

In [9]:
# Drop the table left over from a previous run. "if exists" keeps this cell
# from failing on a fresh database where the table was never created.
query = """
drop table if exists additional_nodes
"""

rollback_and_query(query)

In [10]:
# Drop the table left over from a previous run. "if exists" keeps this cell
# from failing on a fresh database where the table was never created.
query = """
drop table if exists direct_driving_times
"""

rollback_and_query(query)

In [11]:
# Drop the table left over from a previous run. "if exists" keeps this cell
# from failing on a fresh database where the table was never created.
query = """
drop table if exists driving_times_to_stations
"""

rollback_and_query(query)

In [12]:
# Create the stations table: one row per station with its coordinates and a
# transfer time (presumably seconds, like the travel times — TODO confirm).
query = """

create table stations(
    station varchar(32),
    latitude numeric(9, 6),
    longitude numeric(9, 6),
    transfer_time numeric(3)
)

"""
rollback_and_query(query)

In [13]:
# Create the lines table: each row places one station at a sequence position
# on a named line.
query = """

create table lines(
    line varchar(6),
    sequence numeric(2),
    station varchar(32)
)

"""
rollback_and_query(query)

In [14]:
# Create the travel_times table: a travel time between a pair of stations
# (a later query in this notebook labels it "travel time in seconds").
query = """

create table travel_times(
    station_1 varchar(32),
    station_2 varchar(32),
    travel_time numeric(3)
)

"""

rollback_and_query(query)

In [15]:
# Create the additional_nodes table: non-station locations (the Acme and
# customer rows loaded below) with the same columns as stations.
query = """

create table additional_nodes(
    location varchar(32),
    latitude numeric(9, 6),
    longitude numeric(9, 6),
    transfer_time numeric(3)
)

"""

rollback_and_query(query)

In [16]:
# Create the direct_driving_times table: a driving time between two
# locations (presumably seconds — TODO confirm against the CSV).
query = """

create table direct_driving_times(
    location_1 varchar(32),
    location_2 varchar(32),
    travel_time numeric(4)
)

"""

rollback_and_query(query)

In [17]:
# Create the driving_times_to_stations table: a driving time between a
# location (location_1) and a station (location_2), as the graph-building
# cell later unpacks it (presumably seconds — TODO confirm).
query = """

create table driving_times_to_stations(
    location_1 varchar(32),
    location_2 varchar(32),
    travel_time numeric(4)
)

"""

rollback_and_query(query)

In [18]:
# Bulk-load stations.csv into the stations table: comma-delimited CSV with a
# header row, empty fields read as NULL.
# NOTE(review): the path is absolute and project-specific; the file must exist
# at this location for the cell to run.
query = """

copy stations (
    station,
    latitude,
    longitude,
    transfer_time)
from '/user/projects/project-3-timothy-majidzadeh/data/stations.csv'
    delimiter ','
    NULL
    ''
    csv
    header;

"""

rollback_and_query(query)

In [19]:
# Bulk-load lines.csv into the lines table: comma-delimited CSV with a header
# row, empty fields read as NULL.
# NOTE(review): the path is absolute and project-specific; the file must exist
# at this location for the cell to run.
query = """

copy lines (
    line,
    sequence,
    station
    )
from '/user/projects/project-3-timothy-majidzadeh/data/lines.csv'
    delimiter ','
    NULL
    ''
    csv
    header;

"""

rollback_and_query(query)

In [20]:
# Bulk-load travel_times.csv into the travel_times table: comma-delimited CSV
# with a header row, empty fields read as NULL.
# NOTE(review): the path is absolute and project-specific; the file must exist
# at this location for the cell to run.
query = """

copy travel_times (
    station_1,
    station_2,
    travel_time)
from '/user/projects/project-3-timothy-majidzadeh/data/travel_times.csv'
    delimiter ','
    NULL
    ''
    csv
    header;

"""

rollback_and_query(query)

In [21]:
# Bulk-load additional_nodes.csv into the additional_nodes table:
# comma-delimited CSV with a header row, empty fields read as NULL.
# NOTE(review): the path is absolute and project-specific; the file must exist
# at this location for the cell to run.
query = """

copy additional_nodes(
    location,
    latitude,
    longitude,
    transfer_time)
from '/user/projects/project-3-timothy-majidzadeh/data/additional_nodes.csv'
    delimiter ','
    NULL
    ''
    csv
    header;
"""

rollback_and_query(query)

In [22]:
# Bulk-load direct_driving_times.csv into the direct_driving_times table:
# comma-delimited CSV with a header row, empty fields read as NULL.
# NOTE(review): the path is absolute and project-specific; the file must exist
# at this location for the cell to run.
query = """

copy direct_driving_times(
    location_1,
    location_2,
    travel_time)
from '/user/projects/project-3-timothy-majidzadeh/data/direct_driving_times.csv'
    delimiter ','
    NULL
    ''
    csv
    header;
"""

rollback_and_query(query)

In [23]:
# Bulk-load driving_times_to_stations.csv into the driving_times_to_stations
# table: comma-delimited CSV with a header row, empty fields read as NULL.
# NOTE(review): the path is absolute and project-specific; the file must exist
# at this location for the cell to run.
query = """

copy driving_times_to_stations(
    location_1,
    location_2,
    travel_time)
from '/user/projects/project-3-timothy-majidzadeh/data/driving_times_to_stations.csv'
    delimiter ','
    NULL
    ''
    csv
    header;
"""

rollback_and_query(query)

In [24]:
# Sanity check: display the rows just loaded into additional_nodes.
query = """
select * from additional_nodes
"""

# Roll back both before and after the read so the select leaves no open
# transaction behind.
rollback_before_flag = True
rollback_after_flag = True
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,location,latitude,longitude,transfer_time
0,Acme Gourmet Meals,37.855658,-122.260248,0
1,Customer A,37.876238,-122.259558,0
2,Customer B,37.773386,-122.212141,0
3,Customer C,37.795521,-122.393616,0
4,Customer D,37.801461,-122.272673,0
5,Customer E,37.934118,-122.361343,0


In [25]:
# Exercise 3-3: Build the graph using queries from exercise 3-2.
# Start from an empty Neo4j database: the node and relationship helpers use
# CREATE, so re-running the cells below without wiping would add duplicates.
my_neo4j_wipe_out_database()

In [26]:
# Every station gets two nodes: 'depart <station>' and 'arrive <station>'.
query = """

select station
from stations
order by station

"""

rollback_and_query(query)

rows = cursor.fetchall()

for (station,) in rows:
    for prefix in ('depart ', 'arrive '):
        my_neo4j_create_node(prefix + station)

In [27]:
# Acme and each customer get a node of their own plus a 'drive <location>'
# node, joined in both directions at zero cost.
query = """

select location
from additional_nodes
order by location

"""

rollback_and_query(query)

rows = cursor.fetchall()

for (location,) in rows:
    drive_node = 'drive ' + location

    print(location)

    my_neo4j_create_node(location)
    my_neo4j_create_node(drive_node)
    my_neo4j_create_relationship_two_way(drive_node, location, 0)

Acme Gourmet Meals
Customer A
Customer B
Customer C
Customer D
Customer E


In [28]:
# One '<line> <station>' node per row of the lines table, linked at zero cost
# from the station's 'depart' node and to its 'arrive' node.
# The sort key is the `line` column; the earlier `order by station, lines`
# named the table itself and only worked through whole-row comparison.
query = """

select station,
    line
from lines
order by station, line

"""

rollback_and_query(query)

rows = cursor.fetchall()

for station, line in rows:
    line_node = line + ' ' + station

    my_neo4j_create_node(line_node)
    my_neo4j_create_relationship_one_way('depart ' + station, line_node, 0)
    my_neo4j_create_relationship_one_way(line_node, 'arrive ' + station, 0)

In [29]:
# For every station served by more than one line, link each ordered pair of
# its line nodes, weighted by that station's transfer time.
query = """

select li_1.station,
    li_1.line as from_line,
    li_2.line as to_line,
    st.transfer_time
from lines as li_1
join lines as li_2
    on li_1.station = li_2.station
join stations as st
    on li_1.station = st.station
where li_1.line != li_2.line
order by li_1.station, li_1.line, li_2.line

"""

rollback_and_query(query)

rows = cursor.fetchall()

for station, from_line, to_line, transfer_time in rows:
    # The numeric column is converted to a plain int before it is sent to Neo4j.
    my_neo4j_create_relationship_one_way(
        from_line + ' ' + station,
        to_line + ' ' + station,
        int(transfer_time),
    )

In [30]:
# Pair each station with the next one on the same line (sequence + 1) and
# look up the travel time in either column order of travel_times; each pair
# then gets a two-way relationship between its line nodes.
query = """

select 
    li_1.line,
    li_1.station as "from station",
    li_2.station as "to station",
    tt.travel_time as "travel time in seconds"
from (select
    line,
    sequence + 1 as sequence,
    station
from lines) as li_1
join lines as li_2
    on li_1.line = li_2.line
    and li_1.sequence = li_2.sequence
join travel_times as tt
    on li_1.station = tt.station_1
    and li_2.station = tt.station_2

union

select 
    li_1.line,
    li_1.station as "from station",
    li_2.station as "to station",
    tt.travel_time as "travel time in seconds"
from (select
    line,
    sequence + 1 as sequence,
    station
from lines) as li_1
join lines as li_2
    on li_1.line = li_2.line
    and li_1.sequence = li_2.sequence
join travel_times as tt
    on li_2.station = tt.station_1
    and li_1.station = tt.station_2

order by line, "from station", "to station"

"""

rollback_and_query(query)

rows = cursor.fetchall()

for line, from_station, to_station, travel_time in rows:
    my_neo4j_create_relationship_two_way(
        line + ' ' + from_station,
        line + ' ' + to_station,
        int(travel_time),
    )

In [31]:
# Driving links between the 'drive' nodes of the locations in
# direct_driving_times. Driving routes are assumed to be two-way.
query = """

select * 
from direct_driving_times

"""

rollback_and_query(query)

rows = cursor.fetchall()

for location_1, location_2, travel_time in rows:
    seconds = int(travel_time)

    print(location_1, location_2, seconds)

    my_neo4j_create_relationship_two_way('drive ' + location_1, 'drive ' + location_2, seconds)

Acme Gourmet Meals Customer A 660
Acme Gourmet Meals Customer B 1020
Acme Gourmet Meals Customer C 1260
Acme Gourmet Meals Customer D 780
Acme Gourmet Meals Customer E 1260


In [32]:
# Driving links between locations and stations: from the location's 'drive'
# node to the station's 'depart' node, and from the station's 'arrive' node
# back to the 'drive' node, both with the same weight.
query = """

select * 
from driving_times_to_stations

"""

rollback_and_query(query)

rows = cursor.fetchall()

for location, station, travel_time in rows:
    seconds = int(travel_time)
    drive_node = 'drive ' + location

    print(location, station, seconds)

    my_neo4j_create_relationship_one_way(drive_node, 'depart ' + station, seconds)
    my_neo4j_create_relationship_one_way('arrive ' + station, drive_node, seconds)

Acme Gourmet Meals Downtown Berkeley 540
Acme Gourmet Meals Ashby 300
Acme Gourmet Meals Rockridge 420
Acme Gourmet Meals MacArthur 540
Customer A Downtown Berkeley 300
Customer A North Berkeley 480
Customer B Fruitvale 300
Customer C Embarcadero 240
Customer D 12th Street 120
Customer E Richmond 180


In [33]:
# Print the node and relationship counts of the graph as a sanity check.
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 226
  Relationships: 694
-------------------------


In [37]:
# For each customer, print the shortest path from the MacArthur departure
# node and then from Acme itself.
for customer in ('A', 'B', 'C', 'D', 'E'):
    for origin in ('depart MacArthur', 'Acme Gourmet Meals'):
        my_neo4j_shortest_path(origin, 'Customer ' + customer)


--------------------------------
   Total Cost:  720
   Minutes:  12.0
--------------------------------
depart MacArthur, 0, 0
orange MacArthur, 0, 0
orange Ashby, 240, 240
orange Downtown Berkeley, 180, 420
arrive Downtown Berkeley, 0, 420
drive Customer A, 300, 720
Customer A, 0, 720

--------------------------------
   Total Cost:  660
   Minutes:  11.0
--------------------------------
Acme Gourmet Meals, 0, 0
drive Acme Gourmet Meals, 0, 0
drive Customer A, 660, 660
Customer A, 0, 660

--------------------------------
   Total Cost:  1080
   Minutes:  18.0
--------------------------------
depart MacArthur, 0, 0
orange MacArthur, 0, 0
orange 19th Street, 180, 180
orange 12th Street, 120, 300
orange Lake Merritt, 180, 480
orange Fruitvale, 300, 780
arrive Fruitvale, 0, 780
drive Customer B, 300, 1080
Customer B, 0, 1080

--------------------------------
   Total Cost:  1020
   Minutes:  17.0
--------------------------------
Acme Gourmet Meals, 0, 0
drive Acme Gourmet Meals, 0, 0
dri