# Graph Data Science for Logistics: Experimentation with Air Cargo Shipment Data

In [1]:
import configparser
import os

import pandas as pd

In [2]:
# Location of the INI file holding the Neo4j connection settings.
# Set the NEO4J_CONFIG_PATH environment variable to point at your own file; the original
# local path is kept as the default so existing setups keep working unchanged.
config_path = os.environ.get('NEO4J_CONFIG_PATH', '/Users/zachblumenfeld/devtools/aura-freight-demo.ini')

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it parsed,
# so check the result here to fail with a clear message instead of a KeyError on 'NEO4J' below.
if not config.read(config_path):
    raise FileNotFoundError(f'Neo4j config file not found or unreadable: {config_path}')

# Connection settings are expected under the [NEO4J] section of the file.
HOST = config['NEO4J']['HOST']
USERNAME = config['NEO4J']['USERNAME']
PASSWORD = config['NEO4J']['PASSWORD']

In [3]:
from graphdatascience import GraphDataScience

# Use Neo4j URI and credentials according to your setup.
# HOST, USERNAME and PASSWORD are the values read from the [NEO4J] section of the config file above.
# NOTE(review): aura_ds=True presumably targets an AuraDS instance — confirm for your deployment.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=True)

## Logistic Network Statistics

### Shipment Statistics

In [4]:
# Total number of shipments: distinct shipmentId values found on DEPART relationships.
shipment_count_query = '''
    MATCH()-[r:DEPART]->() RETURN count(DISTINCT r.shipmentId) AS numberOfShipments
'''
gds.run_cypher(shipment_count_query)

Unnamed: 0,numberOfShipments
0,3942


In [5]:
# Shipments pivoted by inbound leg count.
# The highest legNumber seen on a shipment's DEPART relationships is taken as its number of
# inbound legs, and shipments are then counted per value.
# ORDER BY makes the row order deterministic (Cypher gives no ordering guarantee without it),
# matching the ordered leg pivot further down in this notebook.
gds.run_cypher('''
    MATCH()-[r:DEPART]->()
    WITH r.shipmentId AS shipmentId, max(r.legNumber) AS numberOfInboundLegs
    RETURN numberOfInboundLegs, count(shipmentId) AS shipmentCount
    ORDER BY numberOfInboundLegs
''')

Unnamed: 0,numberOfInboundLegs,shipmentCount
0,3,1366
1,2,1258
2,1,1318


### Shipment Leg Statistics

In [6]:
# Number of distinct legs per logistic type.
# A DEPART relationship with legNumber -1 is labelled OUTBOUND, every other value INBOUND.
leg_count_query = '''
    MATCH()-[r:DEPART]->()
    WITH DISTINCT r.legId AS legId,
        CASE r.legNumber
            WHEN -1 THEN "OUTBOUND"
            ELSE "INBOUND"
        END AS logisticType
    RETURN logisticType, count(legId) AS numberOfLegs
'''
gds.run_cypher(leg_count_query)

Unnamed: 0,logisticType,numberOfLegs
0,OUTBOUND,3942
1,INBOUND,7932


In [7]:
# Legs pivoted by logistic type and segment count.
# For each leg, the highest segmentNumber on its DEPART relationships is used as the segment
# count; legNumber -1 marks the leg as OUTBOUND, anything else as INBOUND.
leg_pivot_query = '''
    MATCH()-[r:DEPART]->()
    WITH r.legId AS legId,
    max(r.segmentNumber) AS numberOfSegments,
    CASE r.legNumber
        WHEN -1 THEN "OUTBOUND"
        ELSE "INBOUND"
    END AS logisticType
    RETURN logisticType, numberOfSegments, count(legId) AS legCount ORDER BY logisticType, numberOfSegments
'''
gds.run_cypher(leg_pivot_query)

Unnamed: 0,logisticType,numberOfSegments,legCount
0,INBOUND,1,5555
1,INBOUND,2,2332
2,INBOUND,3,45
3,OUTBOUND,1,2097
4,OUTBOUND,2,1819
5,OUTBOUND,3,26


In [8]:
# Planned-duration statistics for the ACCEPT step.
# plannedMinutes on ACCEPT relationships is aggregated per (start-node airportId, end-node airportId)
# pair, most frequent pairs first; describe() then summarises those per-pair figures.
# The `//` fragments inside the query are disabled Cypher alternatives (effective-time stats, other filters).
accept_stats_query = '''
    MATCH(n0)-[r:ACCEPT]->(n1) //WHERE n0.airportId = n1.airportId
    WITH n0.airportId as departureAirportId, r, r.effectiveMinutes AS effectiveMinutes, r.plannedMinutes AS plannedMinutes, n1.airportId AS arrivalAirportId
    //RETURN departureAirportId, arrivalAirportId, avg(effectiveMinutes) AS mean, stDev(effectiveMinutes) AS stDev, min(effectiveMinutes) AS min, max(effectiveMinutes) AS max, count(*) AS cnt
    RETURN departureAirportId, arrivalAirportId, avg(plannedMinutes), stDev(plannedMinutes), min(plannedMinutes), max(plannedMinutes), count(*) AS cnt
    ORDER BY cnt DESC //departureAirportId, arrivalAirportId
'''
df = gds.run_cypher(accept_stats_query)
df.describe()

Unnamed: 0,departureAirportId,arrivalAirportId,avg(plannedMinutes),stDev(plannedMinutes),min(plannedMinutes),max(plannedMinutes),cnt
count,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,461.009217,461.009217,876.38095,247.568144,594.133641,1471.470046,74.502304
std,210.950802,210.950802,483.321692,302.890387,450.628528,972.751762,235.576757
min,100.0,100.0,115.0,0.0,35.0,115.0,1.0
25%,281.0,281.0,594.25,0.0,235.0,785.0,3.0
50%,472.0,472.0,799.509615,151.325341,505.0,1220.0,13.0
75%,643.0,643.0,1050.0,350.506867,780.0,1940.0,59.0
max,815.0,815.0,2618.333333,1444.030817,2346.0,5140.0,2205.0


In [9]:
# Planned-duration statistics for the DEPART step.
# plannedMinutes on DEPART relationships is aggregated per (start-node airportId, end-node airportId)
# pair, most frequent pairs first; describe() then summarises those per-pair figures.
# The `//` fragments inside the query are disabled Cypher alternatives (effective-time stats, other ordering).
depart_stats_query = '''
    MATCH(n0)-[r:DEPART]->(n1)
    WITH n0.airportId as departureAirportId, r, r.effectiveMinutes AS effectiveMinutes, r.plannedMinutes AS plannedMinutes, n1.airportId AS arrivalAirportId
    //RETURN departureAirportId, arrivalAirportId, avg(effectiveMinutes) AS mean, stDev(effectiveMinutes) AS stDev, min(effectiveMinutes) AS min, max(effectiveMinutes) AS max, count(*) AS cnt
    RETURN departureAirportId, arrivalAirportId, avg(plannedMinutes), stDev(plannedMinutes), min(plannedMinutes), max(plannedMinutes), count(*) AS cnt
    ORDER BY cnt DESC //departureAirportId, arrivalAirportId
'''
df = gds.run_cypher(depart_stats_query)
df.describe()
# To inspect airport pairs with suspiciously short planned times instead:
#df[df['min(plannedMinutes)'] < 20]

Unnamed: 0,departureAirportId,arrivalAirportId,avg(plannedMinutes),stDev(plannedMinutes),min(plannedMinutes),max(plannedMinutes),cnt
count,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0
mean,435.342739,442.607469,496.851241,211.242489,318.623237,944.083817,13.416598
std,221.11227,217.081769,531.135968,411.752266,426.301257,1263.960696,28.274034
min,101.0,100.0,5.0,0.0,5.0,5.0,1.0
25%,256.0,256.0,180.0,0.0,150.0,200.0,1.0
50%,403.0,413.0,300.0,2.886751,180.0,330.0,4.0
75%,609.0,610.0,621.6,289.020348,300.0,1335.0,13.0
max,815.0,815.0,4985.0,6201.326471,4780.0,12675.0,444.0


__TODO: Data Validation and Refinement__

The stats above are a bit peculiar and may call for some data-quality checks and refinement. Specifically:
1. Minimum planned depart times can be really low...like 1 minute, and effective times can be down to a minute as well. This is not characteristic of flight times. ACCEPT planned stats have more reasonable minimums, which may suggest that they are actually the true representation of flight times, meaning I misinterpreted the source data model during ETL. However, effective accept times still get super low.
2. Variance for both depart and accept times appears relatively high, which again is not characteristic of flights: along the same flight path, one would expect these times to be relatively similar.

### Airport Statistics

In [27]:
# Airports pivoted by arrivals and departures.
# The query counts DEPART relationships ending at each airport's Arrival nodes and starting from
# its Departure nodes; the two perc* columns express each airport's count as a fraction of the
# network-wide total for that column.
airport_traffic_query = '''
    MATCH (a:Airport)<-[:LOCATED_AT]-(v:Arrival)
    OPTIONAL MATCH (v)<-[r:DEPART]-()
    WITH  a, count(r) as numberOfArrivals
    MATCH (a:Airport)<-[:LOCATED_AT]-(d:Departure)
    OPTIONAL MATCH (d)-[r:DEPART]->()
    RETURN a.airportId AS airportId, numberOfArrivals, count(r) AS numberOfDepartures ORDER BY numberOfDepartures DESC
'''
airport_df = gds.run_cypher(airport_traffic_query)
airport_df = airport_df.assign(
    percOfArrivals=lambda frame: frame['numberOfArrivals'] / frame['numberOfArrivals'].sum(),
    percOfDepartures=lambda frame: frame['numberOfDepartures'] / frame['numberOfDepartures'].sum(),
)
airport_df

Unnamed: 0,airportId,numberOfArrivals,numberOfDepartures,percOfArrivals,percOfDepartures
0,815,1091,2240,0.067483,0.138554
1,128,1839,2195,0.113750,0.135770
2,700,2205,2003,0.136389,0.123894
3,349,1312,1104,0.081153,0.068287
4,485,758,711,0.046886,0.043978
...,...,...,...,...,...
232,477,2,0,0.000124,0.000000
233,488,1,0,0.000062,0.000000
234,502,4,0,0.000247,0.000000
235,504,4,0,0.000247,0.000000


In [22]:
# Summary statistics over the per-airport arrivals/departures table built in the previous cell.
airport_df.describe()

Unnamed: 0,airportId,numberOfArrivals,numberOfDepartures
count,237.0,237.0,237.0
mean,460.759494,68.21519,68.21519
std,209.516729,226.327234,260.498919
min,100.0,0.0,0.0
25%,281.0,2.0,0.0
50%,470.0,10.0,5.0
75%,641.0,51.0,32.0
max,815.0,2205.0,2240.0


## Airport Importance and Risks

In [72]:
# Link airports directly so location importance and risk can be analysed on an airport-level graph.
# Each DEPART between a Departure node and an Arrival node yields a SENDS_TO relationship between
# the two airports they are LOCATED_AT, keyed by shipmentId and legId.
# This is written to the database rather than built with a Cypher projection, for now.
sends_to_query = '''
    MATCH(a1:Airport)<-[:LOCATED_AT]-(d1:Departure)-[r:DEPART]->(d2:Arrival)-[:LOCATED_AT]->(a2:Airport)
    MERGE(a1)-[s:SENDS_TO {shipmentId:r.shipmentId, legId: r.legId}]->(a2)
    RETURN count(s)
'''
gds.run_cypher(sends_to_query)

Unnamed: 0,count(s)
0,16167


In [73]:
# What are the most important airports? Project the airport-to-airport network
# (Airport nodes linked by SENDS_TO relationships) into the GDS graph catalog to find out.
# A projection named 'proj' left over from an earlier or interrupted run would make
# gds.graph.project fail, so drop it first to keep this cell re-runnable.
if gds.graph.exists('proj')['exists']:
    gds.graph.get('proj').drop()
g, _ = gds.graph.project('proj', ['Airport'], ['SENDS_TO'])

In [74]:
# Betweenness centrality on the projected graph `g`; each node's score is written back to the
# database as the `globalImportance` property, which the next query reads.
gds.betweenness.write(g, writeProperty='globalImportance')

nodePropertiesWritten                                                   237
writeMillis                                                               7
centralityDistribution    {'p99': 6337.062499999069, 'min': 0.0, 'max': ...
postProcessingMillis                                                    181
preProcessingMillis                                                       0
computeMillis                                                            17
configuration             {'writeConcurrency': 4, 'writeProperty': 'glob...
Name: 0, dtype: object

In [75]:

# The 20 airports with the highest globalImportance property, highest first.
top_airports_query = '''
    MATCH(a:Airport)
    RETURN a.airportId, a.globalImportance
    ORDER BY a.globalImportance DESC LIMIT 20
'''
gds.run_cypher(top_airports_query)

Unnamed: 0,a.airportId,a.globalImportance
0,128,12382.027312
1,349,10998.689127
2,700,6337.058278
3,815,6290.492129
4,555,3631.742909
5,256,2362.820536
6,308,1822.031209
7,485,1495.812769
8,149,1391.657845
9,431,1103.351073


In [76]:
# Drop the 'proj' projection; the same graph name is projected again in the Route Finding section.
g.drop()

## Route Finding

In [77]:
# Aggregate Paths to get summary cost metrics for routes
# possible agg cost metrics: min, p25, median, mean, p75, p95, p99, max
#
# For every ordered pair of non-Airport nodes joined by at least one relationship, compute the mean
# and 99th percentile of effective minutes, planned minutes and their difference (effective - planned),
# then store them on a single SUPPLIES relationship from the start node to the end node.
#
# Two details matter for correctness:
# * The MERGE is directed. An undirected MERGE matches an existing relationship in EITHER direction,
#   so if the data ever links the same two nodes both ways, the second direction would reuse the
#   first one's SUPPLIES relationship and its own stats would be dropped; path finding on the
#   projection follows relationship direction, so each direction needs its own relationship.
# * SUPPLIES relationships themselves are excluded from the source pattern, so re-running the cell
#   aggregates only the underlying step relationships and not the summaries it created earlier.
# ON CREATE SET means an existing SUPPLIES relationship keeps the stats it was created with.
gds.run_cypher('''
    MATCH(n0)-[r]->(n1)
    WHERE NOT n0:Airport AND NOT n1:Airport AND type(r) <> 'SUPPLIES'
    WITH n0, n1,
        r.effectiveMinutes AS effectiveMinutes,
        r.plannedMinutes AS plannedMinutes,
        r.effectiveMinutes - r.plannedMinutes AS diffMinutes
    WITH n0, n1,
    avg(effectiveMinutes) AS eMinutesMean, percentileCont(effectiveMinutes, 0.99) AS eMinutesP99,
    avg(plannedMinutes) AS pMinutesMean, percentileCont(plannedMinutes, 0.99) AS pMinutesP99,
    avg(diffMinutes) AS dMinutesMean, percentileCont(diffMinutes, 0.99) AS dMinutesP99
    MERGE(n0)-[s:SUPPLIES]->(n1)
    ON CREATE SET s.eMinutesMean = eMinutesMean,  s.eMinutesP99 = eMinutesP99,
        s.pMinutesMean = pMinutesMean,  s.pMinutesP99 = pMinutesP99,
        s.dMinutesMean = dMinutesMean,  s.dMinutesP99 = dMinutesP99
    RETURN count(s)
''')

Unnamed: 0,count(s)
0,2138


In [78]:
# Project the step-level supply network for route finding: all non-Airport node labels, linked by
# SUPPLIES relationships carrying the six aggregated cost properties so any of them can be used
# as a path weight.
# A projection named 'proj' left over from an earlier or interrupted run would make
# gds.graph.project fail, so drop it first to keep this cell re-runnable.
if gds.graph.exists('proj')['exists']:
    gds.graph.get('proj').drop()
g, _ = gds.graph.project('proj', ['Entry', 'DepartureCheckpoint', 'Departure', 'Arrival', 'ArrivalCheckpoint', 'Destination'],
    {'SUPPLIES':{'properties':['eMinutesMean', 'eMinutesP99', 'pMinutesMean', 'pMinutesP99', 'dMinutesMean', 'dMinutesP99']}})

In [79]:
def get_airport_ids(row):
    """Return the airport ids visited along ``row.path``, in path order and without repeats.

    Each node on the path contributes ``node.get('airportId')``; only the first occurrence of a
    value is kept, so several nodes at the same airport collapse into a single entry.
    """
    # A dict keeps insertion order and ignores repeated keys, giving an ordered de-duplication.
    first_seen = {}
    for node in row.path.nodes:
        first_seen.setdefault(node.get('airportId'))
    return list(first_seen)

def calculate_shortest_paths(entry_id, destination_id, cost_metric, number_of_paths=5):
    """Rank the cheapest routes from an entry airport to a destination airport.

    Runs ``gds.shortestPath.yens.stream`` on the notebook-level projection ``g``, from the
    ``Entry`` node to the ``Destination`` node identified by the given airport ids.

    Parameters
    ----------
    entry_id
        ``airportId`` of the ``Entry`` node the routes start from.
    destination_id
        ``airportId`` of the ``Destination`` node the routes end at.
    cost_metric
        Name of the relationship property passed as ``relationshipWeightProperty``.
    number_of_paths
        Value passed as ``k``, the number of paths requested.

    Returns
    -------
    pandas.DataFrame
        Columns ``totalCost`` and ``airportPath`` (the de-duplicated airport ids along each path).
        The frame is empty when the algorithm returns no path.
    """
    source_id = gds.find_node_id(['Entry'], {'airportId': entry_id})
    target_id = gds.find_node_id(['Destination'], {'airportId': destination_id})
    paths_df = gds.shortestPath.yens.stream(g, sourceNode=source_id, targetNode=target_id, k=number_of_paths,
                                                   relationshipWeightProperty=cost_metric)
    # Build the column row by row instead of DataFrame.apply(axis=1): on an empty result apply
    # hands back a DataFrame rather than a Series, which makes the column assignment raise.
    airport_paths = [get_airport_ids(row) for _, row in paths_df.iterrows()]
    paths_df['airportPath'] = pd.Series(airport_paths, index=paths_df.index, dtype=object)
    return paths_df[['totalCost', 'airportPath']]

In [80]:
# Shortest Path - the 7 cheapest routes between two airports, costed by average effective times
airport_id_1, airport_id_2 = 614, 485
calculate_shortest_paths(
    entry_id=airport_id_1,
    destination_id=airport_id_2,
    cost_metric='eMinutesMean',
    number_of_paths=7,
)

Unnamed: 0,totalCost,airportPath
0,2849.608752,"[614, 815, 485]"
1,3635.915528,"[614, 783, 815, 485]"
2,3725.751888,"[614, 128, 485]"
3,4075.9764,"[614, 815, 281, 485]"
4,4115.476511,"[614, 815, 809, 485]"
5,4133.521208,"[614, 815, 128, 485]"
6,4356.054115,"[614, 349, 815, 485]"


In [81]:
# Shortest Path - the 7 cheapest routes between the same two airports, costed by 99th percentile
# effective times, i.e. near worst case scenarios
calculate_shortest_paths(
    entry_id=airport_id_1,
    destination_id=airport_id_2,
    cost_metric='eMinutesP99',
    number_of_paths=7,
)

Unnamed: 0,totalCost,airportPath
0,15785.91,"[614, 815, 485]"
1,16289.62,"[614, 128, 485]"
2,16737.48,"[614, 349, 485]"
3,16830.1,"[614, 815, 809, 485]"
4,17356.06,"[614, 815, 216, 485]"
5,17878.39,"[614, 815, 431, 485]"
6,17990.02,"[614, 128, 694, 485]"


In [82]:
# Drop the 'proj' projection now that the route-finding queries above have run.
g.drop()

Note for future. Consider:

1. Using other aggregate metrics for comparison in the SUPPLIES relationships for path finding costs.  This includes other percentiles but also using predicted time or the differences between effective and predicted times.
2. Finding paths with the SUPPLIES relationships but evaluating risk of paths via the distribution of historical performance along those paths, i.e. avoiding flaw-of-averages issues.

Also, for path finding we need to be thoughtful about whether we are restricting to paths over single shipments and about inbound vs outbound traffic, as this affects the interpretation and formality of results....all stuff to look into if we want to pursue this dataset further.