# Graph Data Science for Logistics: Experimentation with Air Cargo Shipment Data

In [69]:
import configparser
import os

import pandas as pd

In [70]:
# Path to the INI file holding the Neo4j connection settings. Set the
# NEO4J_CONFIG_PATH environment variable to point at a different file; the
# original author's location is kept as the fallback.
CONFIG_PATH = os.environ.get(
    'NEO4J_CONFIG_PATH',
    '/Users/zachblumenfeld/devtools/aura-freight-demo.ini',
)

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the settings were never loaded.
# Fail here with a clear message instead of a bare KeyError on the lookups below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Neo4j config file not found or unreadable: {CONFIG_PATH}')

# Connection settings from the [NEO4J] section of the INI file.
HOST = config['NEO4J']['HOST']
USERNAME = config['NEO4J']['USERNAME']
PASSWORD = config['NEO4J']['PASSWORD']

In [71]:
from graphdatascience import GraphDataScience

# Open the GDS client using the host and credentials loaded from the config
# file; substitute the URI and credentials that match your own Neo4j setup.
neo4j_credentials = (USERNAME, PASSWORD)
gds = GraphDataScience(HOST, auth=neo4j_credentials, aura_ds=True)

## Airport Importance and Risks

In [72]:
# Link airports to each other directly so that location importance and risk
# can be analysed on an airport-to-airport graph. For now this is written to
# the database itself rather than done through a Cypher projection.
# One SENDS_TO relationship is merged per (shipmentId, legId) of a DEPART hop.
sends_to_query = '''
    MATCH(a1:Airport)<-[:LOCATED_AT]-(d1:Departure)-[r:DEPART]->(d2:Arrival)-[:LOCATED_AT]->(a2:Airport)
    MERGE(a1)-[s:SENDS_TO {shipmentId:r.shipmentId, legId: r.legId}]->(a2)
    RETURN count(s)
'''
gds.run_cypher(sends_to_query)

Unnamed: 0,count(s)
0,16167


In [73]:
# What are the most important airports?
# Project the Airport nodes and the SENDS_TO relationships into an in-memory
# graph named 'proj'; the second return value of project() is not needed here.
g, _ = gds.graph.project('proj', ['Airport'], ['SENDS_TO'])

In [74]:
# Betweenness centrality on the projected airport graph; the per-node score is
# written back to the database as the `globalImportance` node property.
gds.betweenness.write(g, writeProperty='globalImportance')

nodePropertiesWritten                                                   237
writeMillis                                                               7
centralityDistribution    {'p99': 6337.062499999069, 'min': 0.0, 'max': ...
postProcessingMillis                                                    181
preProcessingMillis                                                       0
computeMillis                                                            17
configuration             {'writeConcurrency': 4, 'writeProperty': 'glob...
Name: 0, dtype: object

In [75]:

# Rank airports by the globalImportance score, highest first (top 20).
top_importance_query = '''
    MATCH(a:Airport)
    RETURN a.airportId, a.globalImportance
    ORDER BY a.globalImportance DESC LIMIT 20
'''
gds.run_cypher(top_importance_query)

Unnamed: 0,a.airportId,a.globalImportance
0,128,12382.027312
1,349,10998.689127
2,700,6337.058278
3,815,6290.492129
4,555,3631.742909
5,256,2362.820536
6,308,1822.031209
7,485,1495.812769
8,149,1391.657845
9,431,1103.351073


In [76]:
# Drop the airport projection; the graph name 'proj' is used again by the
# projection in the Route Finding section.
g.drop()

## Route Finding

In [77]:
# Aggregate paths to get summary cost metrics for routes.
# Possible aggregate cost metrics: min, p25, median, mean, p75, p95, p99, max
# (only the mean and the 99th percentile are materialised here).
#
# For every directed pair of non-Airport nodes (n0)-[r]->(n1), the effective
# minutes, planned minutes and their difference (effective - planned) are
# aggregated over all relationships r between the pair and stored on a single
# SUPPLIES relationship.
#
# The MERGE is directed (n0)->(n1) on purpose: an undirected MERGE also
# matches an existing relationship pointing the other way, so a pair connected
# in both directions would end up with one SUPPLIES relationship and the
# metrics of the second direction would be silently dropped. The later graph
# projection treats SUPPLIES as directed.
#
# ON CREATE SET means an existing SUPPLIES relationship keeps its old values
# when this cell is re-run.
gds.run_cypher('''
    MATCH(n0) WHERE NOT n0:Airport
    MATCH(n1) WHERE NOT n1:Airport
    MATCH(n0)-[r]->(n1)
    WITH n0, n1,
        r.effectiveMinutes AS effectiveMinutes,
        r.plannedMinutes AS plannedMinutes,
        r.effectiveMinutes - r.plannedMinutes AS diffMinutes
    WITH n0, n1,
    avg(effectiveMinutes) AS eMinutesMean, percentileCont(effectiveMinutes, 0.99) AS eMinutesP99,
    avg(plannedMinutes) AS pMinutesMean, percentileCont(plannedMinutes, 0.99) AS pMinutesP99,
    avg(diffMinutes) AS dMinutesMean, percentileCont(diffMinutes, 0.99) AS dMinutesP99
    MERGE(n0)-[s:SUPPLIES]->(n1)
    ON CREATE SET s.eMinutesMean = eMinutesMean,  s.eMinutesP99 = eMinutesP99,
        s.pMinutesMean = pMinutesMean,  s.pMinutesP99 = pMinutesP99,
        s.dMinutesMean = dMinutesMean,  s.dMinutesP99 = dMinutesP99
    RETURN count(s)
''')

Unnamed: 0,count(s)
0,2138


In [78]:
# Project the route node labels listed below together with the SUPPLIES
# relationships, carrying every aggregated cost metric as a relationship
# property so any of them can be used as a path weight.
route_node_labels = [
    'Entry', 'DepartureCheckpoint', 'Departure',
    'Arrival', 'ArrivalCheckpoint', 'Destination',
]
supplies_cost_properties = [
    'eMinutesMean', 'eMinutesP99',
    'pMinutesMean', 'pMinutesP99',
    'dMinutesMean', 'dMinutesP99',
]
g, _ = gds.graph.project(
    'proj',
    route_node_labels,
    {'SUPPLIES': {'properties': supplies_cost_properties}},
)

In [79]:
def get_airport_ids(row):
    """Return the distinct ``airportId`` values along ``row.path.nodes``.

    Values are kept in first-seen order; a node without an ``airportId`` key
    contributes ``None`` (at most once).
    """
    first_seen = {}
    for node in row.path.nodes:
        # dict keys preserve insertion order, so repeats are dropped in place
        first_seen.setdefault(node.get('airportId'), None)
    return list(first_seen)

def calculate_shortest_paths(entry_id, destination_id, cost_metric, number_of_paths=5):
    """Return the cheapest routes from an Entry node to a Destination node.

    Looks up the ``Entry`` node and the ``Destination`` node by their
    ``airportId`` property, then streams Yen's k-shortest-paths over the
    projected graph ``g`` using ``cost_metric`` as the relationship weight.

    Parameters
    ----------
    entry_id
        ``airportId`` of the ``Entry`` node the route starts from.
    destination_id
        ``airportId`` of the ``Destination`` node the route ends at.
    cost_metric : str
        Name of the relationship property used as the path weight
        (for example ``'eMinutesMean'``).
    number_of_paths : int, default 5
        Maximum number of paths to return (the ``k`` of Yen's algorithm).

    Returns
    -------
    pandas.DataFrame
        One row per path with the columns ``totalCost`` and ``airportPath``
        (the de-duplicated airport ids visited, in order). The frame is empty
        when no path exists between the two nodes.
    """
    source_id = gds.find_node_id(['Entry'], {'airportId': entry_id})
    target_id = gds.find_node_id(['Destination'], {'airportId': destination_id})
    paths_df = gds.shortestPath.yens.stream(
        g,
        sourceNode=source_id,
        targetNode=target_id,
        k=number_of_paths,
        relationshipWeightProperty=cost_metric,
    )
    # Build the column row by row instead of DataFrame.apply(axis=1): on an
    # empty result apply() returns an empty DataFrame, which cannot be assigned
    # to a single column, so "no path found" used to raise instead of
    # returning an empty table.
    airport_paths = [get_airport_ids(row) for _, row in paths_df.iterrows()]
    paths_df['airportPath'] = pd.Series(airport_paths, index=paths_df.index, dtype=object)
    return paths_df[['totalCost', 'airportPath']]

In [80]:
# Shortest paths between two airports, ranked by the mean effective minutes
# of each hop (top 7 routes).
airport_id_1, airport_id_2 = 614, 485
calculate_shortest_paths(
    airport_id_1,
    airport_id_2,
    cost_metric='eMinutesMean',
    number_of_paths=7,
)

Unnamed: 0,totalCost,airportPath
0,2849.608752,"[614, 815, 485]"
1,3635.915528,"[614, 783, 815, 485]"
2,3725.751888,"[614, 128, 485]"
3,4075.9764,"[614, 815, 281, 485]"
4,4115.476511,"[614, 815, 809, 485]"
5,4133.521208,"[614, 815, 128, 485]"
6,4356.054115,"[614, 349, 815, 485]"


In [81]:
# Same airport pair, but ranked by the 99th-percentile effective minutes of
# each hop, i.e. a near-worst-case view of the routes (top 7 routes).
calculate_shortest_paths(
    airport_id_1,
    airport_id_2,
    cost_metric='eMinutesP99',
    number_of_paths=7,
)

Unnamed: 0,totalCost,airportPath
0,15785.91,"[614, 815, 485]"
1,16289.62,"[614, 128, 485]"
2,16737.48,"[614, 349, 485]"
3,16830.1,"[614, 815, 809, 485]"
4,17356.06,"[614, 815, 216, 485]"
5,17878.39,"[614, 815, 431, 485]"
6,17990.02,"[614, 128, 694, 485]"


In [82]:
# Drop the route-finding projection now that the path queries are done.
g.drop()

Note for future. Consider:

1. Using other aggregate metrics for comparison in the SUPPLIES relationships for path-finding costs. This includes other percentiles, but also using planned times (the `pMinutes*` properties) or the differences between effective and planned times (the `dMinutes*` properties).
2. Finding paths with the SUPPLIES relationships but evaluating the risk of those paths via the distribution of historical performance along them, i.e. avoiding "flaw of averages" issues.

Also, for path finding we need to be thoughtful about whether we are restricting to paths over single shipments, and about inbound vs. outbound traffic, as this affects the interpretation and formality of the results. This is all stuff to look into if we want to pursue this dataset further.