# Create a network graph of flight data

In [1]:
import sqlalchemy as sa
import pandas as pd
import pprint
from datetime import datetime
import time
from datetime import timedelta
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import math
from networkx.drawing.nx_agraph import graphviz_layout

# SQLAlchemy URL of the PostgreSQL database "FlightData" on localhost (no credentials embedded).
CONNECTION_STRING_SQLALCHEMY = 'postgresql://localhost/FlightData'
# Module-level engine built from the URL above.
con = sa.create_engine(CONNECTION_STRING_SQLALCHEMY)

In [2]:
# helper functions 
def runQuery(sql):
    """Run a SQL query against the configured database and return the result.

    Args:
        sql: Query handed unchanged to ``pandas.read_sql``.

    Returns:
        pandas.DataFrame holding the query result.
    """
    engine = sa.create_engine(CONNECTION_STRING_SQLALCHEMY)
    try:
        return pd.read_sql(sql, engine, params=None)
    finally:
        # A new engine (with its own connection pool) is created on every
        # call, so release its pooled connections instead of leaking them.
        engine.dispose()

def setInEdges(collection, source):
    """Record one ``(source, item.text)`` edge per item of ``collection``.

    The pairs are appended to the global ``edgeList``, which must already
    exist when a non-empty collection is passed.
    """
    for item in collection:
        edgeList.append((source, item.text))

def setOutEdges(collection, target):
    """Record one ``(item.text, target)`` edge per item of ``collection``.

    The pairs are appended to the global ``edgeList``, which must already
    exist when a non-empty collection is passed.
    """
    for item in collection:
        edgeList.append((item.text, target))

## helpers related to data load

In [3]:
def loadAirports(G, airports):
    """Add one node per airport row to ``G`` and return ``G``.

    Args:
        G: Graph object exposing ``add_node(key, **attrs)``.
        airports: DataFrame with the columns ``iata``, ``name``,
            ``latitude``, ``longitude``, ``totalflights``,
            ``numberofdelays``, ``delaypercentage``, ``carrierdelay``,
            ``weatherdelay``, ``nasdelay``, ``securitydelay`` and
            ``lateaircraftdelay``.

    Returns:
        The same ``G``, with each node keyed by its IATA code.
    """
    # Columns copied onto the node under their own name, in this order.
    stat_columns = ('totalflights', 'numberofdelays', 'delaypercentage',
                    'carrierdelay', 'weatherdelay', 'nasdelay',
                    'securitydelay', 'lateaircraftdelay')

    for _, airport in airports.iterrows():
        airportcode = airport['iata']
        stats = {column: airport[column] for column in stat_columns}
        # `long` must come from the longitude column and `lat` from the
        # latitude column; they were previously stored the wrong way round.
        G.add_node(airportcode, code=airportcode, name=airport['name'],
                   long=airport['longitude'], lat=airport['latitude'],
                   **stats)
    return G

def loadRoutes(G, routes):
    """Add one edge per route row to ``G`` and return ``G``.

    Each row contributes an ``origin`` -> ``destination`` edge whose
    attributes are taken from the ``count``, ``total_arrival_delay_minutes``,
    ``number_of_delays`` and ``delaypercentage`` columns.
    """
    for _, row in routes.iterrows():
        edge_attrs = {
            "weight": row["count"],
            "arrivaldelaymins": row["total_arrival_delay_minutes"],
            "numofdelays": row["number_of_delays"],
            "delaypercentage": row["delaypercentage"],
        }
        G.add_edge(row["origin"], row["destination"], **edge_attrs)
    return G
    
def getOverviewGraph(airports, routes):
    """Build the airport/route graph and drop airports with no routes.

    Args:
        airports: DataFrame of airport rows, passed to ``loadAirports``.
        routes: DataFrame of route rows, passed to ``loadRoutes``.

    Returns:
        A ``networkx.MultiDiGraph`` with airports as nodes and routes as
        edges, with isolated nodes removed. Summary statistics are printed
        before and after the removal.
    """
    print("start graph load")

    # initiate NX objects
    G = nx.MultiDiGraph()

    print("creating airport nodes")
    G = loadAirports(G, airports)
    print("Loading edges")
    G = loadRoutes(G, routes)

    # network summary
    print("number of nodes:", len(G))
    print("number of edges:", nx.number_of_edges(G))
    print("Graph Density:", nx.density(G))

    # Materialise the isolates: on NetworkX >= 2.0 nx.isolates() is a lazy
    # iterator, which has no len() and must not be consumed while nodes are
    # being removed from the graph it walks.
    isolatedAirports = list(nx.isolates(G))  # airports with no flight data?
    print("Total isolated airports found:", len(isolatedAirports))

    G.remove_nodes_from(isolatedAirports)

    print("number of nodes post isolated:", len(G))
    print("number of edges post isolated:", nx.number_of_edges(G))
    print("Graph Density post isolated:", nx.density(G))

    print("end graph load")

    return G



## load the edges 
### possible combinations 
- Load edges from flights that are delayed only?
- Load by flight number (capture the unique flights between airports). Repeat flights are reflected by edge weight.
- Load edges by carrier (e.g. AA, Delta).
- Load by all flights (capture any flights between two airports). Repeat flights are reflected by edge weight.
- Load all flights and save the aggregated delay in the nodes (which airport has the largest amount of delay?).
- Load all flights and capture the total number of delays as opposed to the delay in minutes.
- Filter by month.
- Filter daily?
- Filter by year?

In [14]:
# Load the two input tables with pandas; both paths are relative to the working directory.
# `airports` and `routes` are the frames handed to getOverviewGraph to create nodes and edges.

airports = pd.read_csv('data/airline_stats_network_2008.csv')
routes = pd.read_csv('data/flightdata2008.csv')

In [15]:
# Build the airport/route graph; getOverviewGraph prints summary statistics and removes isolated airports.
G = getOverviewGraph(airports=airports, routes=routes)

start graph load
creating airport nodes
Loading edges
number of nodes: 3376
number of edges: 5112
Graph Density: 0.0004486571879936809
Total isolated airports found: 3073
number of nodes post isolated: 303
number of edges post isolated: 5112
Graph Density post isolated: 0.055865189167923414
end graph load


## carry out centrality calculations

In [16]:
#G = nx.convert_node_labels_to_integers(G)

In [17]:
# Per-airport centrality scores: betweenness (using the edge attribute "weight"), out-degree and in-degree.
# NOTE(review): NetworkX documents path weights as distances, while loadRoutes stores the route's
# `count` column as "weight" — confirm that a higher count counting as a "longer" route is intended.
bb = nx.betweenness_centrality(G, weight="weight")
outdeg = nx.out_degree_centrality(G)
indeg = nx.in_degree_centrality(G)
#eigen = nx.eigenvector_centrality(G)
# Sanity checks that the results are dicts; only the last expression is displayed as cell output.
isinstance(bb, dict)
isinstance(outdeg, dict)
isinstance(indeg, dict)
#bb

True

In [18]:
# Print every airport with its betweenness score, highest score first.
for airport, score in sorted(bb.items(), key=lambda pair: pair[1], reverse=True):
    print(airport, score)

ATL 0.20952914518675297
SLC 0.11620390070112355
DFW 0.10728373636220448
MSP 0.10246774639976602
ORD 0.08818055269441359
ANC 0.07210662358084305
DTW 0.06876935137212326
DEN 0.06515843513538519
IAH 0.059574650373151516
LAX 0.04913096877988335
SFO 0.045973263655420565
CVG 0.03935605915038366
SEA 0.03833730171145242
PHX 0.03209772737994136
LAS 0.019531435564660353
JNU 0.01831326887987506
MCO 0.017215400158154592
EWR 0.01649355928996716
CLT 0.016352049443426712
MEM 0.011102657032244159
JFK 0.010548467789107683
PDX 0.009312287579390895
MKE 0.008346809648940256
IAD 0.007696964652453864
LGA 0.006958580279886503
KTN 0.0064578703805563496
FLL 0.005396587018072953
BOS 0.005334885699833588
HNL 0.005117438740266624
CLE 0.005078859906523481
TPA 0.004419449714196751
FAI 0.003793287194455131
BWI 0.0037741554830833324
AUS 0.0037735067731941674
SAN 0.0034893941065182667
SMF 0.003488903206837548
JAX 0.0034564340884064124
COS 0.0033936974878581083
DCA 0.0026792730619862237
ABQ 0.002552799504367757
MIA 0.0

In [19]:
# Store each centrality score on its node under a descriptive attribute name.
centrality_by_name = {"betweenness": bb, "outdegree": outdeg, "indegree": indeg}
for attr_name, scores in centrality_by_name.items():
    nx.set_node_attributes(G=G, values=scores, name=attr_name)

In [20]:
# Spot-check that the three centrality attributes were stored on the nodes,
# using ATL as the sample airport. Each lookup gets its own name; previously
# `between` was rebound to the out-degree and in-degree dicts, so the name
# no longer described its contents after this cell ran.
between = nx.get_node_attributes(G, 'betweenness')
print(between['ATL'])
out_degree_by_airport = nx.get_node_attributes(G, 'outdegree')
print(out_degree_by_airport['ATL'])
in_degree_by_airport = nx.get_node_attributes(G, 'indegree')
print(in_degree_by_airport['ATL'])

0.20952914518675297
0.5695364238410596
0.5728476821192053


In [21]:
# Print the attribute dict of every edge.
# G.edges(data=True) is used instead of G.edges_iter(data=True): edges_iter was
# removed in NetworkX 2.0, while edges(data=True) yields the same
# (origin, destination, attributes) triples on both 1.x and 2.x.
for origin, destination, attr in G.edges(data=True):
    print(attr)
#for airport,attr in G.nodes(data=True):
#   print(attr)

{'weight': 853, 'arrivaldelaymins': 9643, 'numofdelays': 183, 'delaypercentage': 21}
{'weight': 805, 'arrivaldelaymins': 672, 'numofdelays': 87, 'delaypercentage': 10}
{'weight': 465, 'arrivaldelaymins': -1333, 'numofdelays': 53, 'delaypercentage': 11}
{'weight': 247, 'arrivaldelaymins': -1092, 'numofdelays': 16, 'delaypercentage': 6}
{'weight': 997, 'arrivaldelaymins': 5168, 'numofdelays': 152, 'delaypercentage': 15}
{'weight': 3, 'arrivaldelaymins': 73, 'numofdelays': 3, 'delaypercentage': 100}
{'weight': 9, 'arrivaldelaymins': 962, 'numofdelays': 8, 'delaypercentage': 88}
{'weight': 1425, 'arrivaldelaymins': 21810, 'numofdelays': 364, 'delaypercentage': 25}
{'weight': 2, 'arrivaldelaymins': 34, 'numofdelays': 1, 'delaypercentage': 50}
{'weight': 2660, 'arrivaldelaymins': 13168, 'numofdelays': 293, 'delaypercentage': 11}
{'weight': 368, 'arrivaldelaymins': 4549, 'numofdelays': 103, 'delaypercentage': 27}
{'weight': 1067, 'arrivaldelaymins': 2029, 'numofdelays': 129, 'delaypercentage'

{'weight': 378, 'arrivaldelaymins': 2034, 'numofdelays': 70, 'delaypercentage': 18}
{'weight': 836, 'arrivaldelaymins': 9677, 'numofdelays': 172, 'delaypercentage': 20}
{'weight': 412, 'arrivaldelaymins': -571, 'numofdelays': 49, 'delaypercentage': 11}
{'weight': 365, 'arrivaldelaymins': 837, 'numofdelays': 53, 'delaypercentage': 14}
{'weight': 1209, 'arrivaldelaymins': -5573, 'numofdelays': 96, 'delaypercentage': 7}
{'weight': 236, 'arrivaldelaymins': -2654, 'numofdelays': 29, 'delaypercentage': 12}
{'weight': 637, 'arrivaldelaymins': 2905, 'numofdelays': 84, 'delaypercentage': 13}
{'weight': 366, 'arrivaldelaymins': -85, 'numofdelays': 50, 'delaypercentage': 13}
{'weight': 732, 'arrivaldelaymins': 9460, 'numofdelays': 176, 'delaypercentage': 24}
{'weight': 362, 'arrivaldelaymins': 2767, 'numofdelays': 69, 'delaypercentage': 19}
{'weight': 702, 'arrivaldelaymins': 6222, 'numofdelays': 162, 'delaypercentage': 23}
{'weight': 7685, 'arrivaldelaymins': 18576, 'numofdelays': 734, 'delayper

{'weight': 728, 'arrivaldelaymins': 1706, 'numofdelays': 111, 'delaypercentage': 15}
{'weight': 3402, 'arrivaldelaymins': 21335, 'numofdelays': 611, 'delaypercentage': 17}
{'weight': 359, 'arrivaldelaymins': 35, 'numofdelays': 33, 'delaypercentage': 9}
{'weight': 874, 'arrivaldelaymins': -827, 'numofdelays': 89, 'delaypercentage': 10}
{'weight': 2825, 'arrivaldelaymins': -2142, 'numofdelays': 301, 'delaypercentage': 10}
{'weight': 973, 'arrivaldelaymins': 5492, 'numofdelays': 168, 'delaypercentage': 17}
{'weight': 1101, 'arrivaldelaymins': 8703, 'numofdelays': 182, 'delaypercentage': 16}
{'weight': 204, 'arrivaldelaymins': 2204, 'numofdelays': 48, 'delaypercentage': 23}
{'weight': 1554, 'arrivaldelaymins': -3463, 'numofdelays': 157, 'delaypercentage': 10}
{'weight': 1692, 'arrivaldelaymins': 9423, 'numofdelays': 344, 'delaypercentage': 20}
{'weight': 2804, 'arrivaldelaymins': 21209, 'numofdelays': 597, 'delaypercentage': 21}
{'weight': 1452, 'arrivaldelaymins': 889, 'numofdelays': 214,

In [22]:
# Export one CSV row per airport node: its IATA code, the attributes loaded
# from the airports table, and the centrality scores stored on the node.
import csv

# newline='' is what the csv module requires of files it writes to; without it
# every row is followed by a blank line on platforms that translate '\n'.
with open('exports/airportList.csv', 'w', newline='') as csvfile:
    fieldnames = ['iata', 'name','long','lat','totalflights','numberofdelays',
                  'delaypercentage','carrierdelay','weatherdelay','nasdelay','securitydelay',
                 'lateaircraftdelay','betweenness','outdegree','indegree']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # Every column except 'iata' is a node attribute of the same name, so the
    # row is derived from `fieldnames` instead of repeating each key by hand.
    attribute_fields = fieldnames[1:]
    for airport, attr in G.nodes(data=True):
        row = {'iata': airport}
        row.update((field, attr[field]) for field in attribute_fields)
        writer.writerow(row)



In [23]:
# edge list  

def getLat(code):
    """Return the ``latitude`` of the first ``airports`` row whose ``iata`` equals ``code``."""
    matches = airports.loc[airports["iata"] == code, "latitude"]
    return matches.values[0]

def getLong(code):
    """Return the ``longitude`` of the first ``airports`` row whose ``iata`` equals ``code``."""
    matches = airports.loc[airports["iata"] == code, "longitude"]
    return matches.values[0]

# Export one CSV row per edge: both endpoints with their coordinates (looked up
# in the airports table via getLat/getLong) plus the edge's delay attributes.
# newline='' is what the csv module requires of files it writes to; without it
# every row is followed by a blank line on platforms that translate '\n'.
with open('exports/edgeList.csv', 'w', newline='') as csvfile:
    fieldnames = ['origin','originlat','originlong',
                  'destination','destinationlat','destinationlong',
                  'weight','arrivaldelaymins','numofdelays','delaypercentage']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # G.edges(data=True) is used instead of G.edges_iter(data=True): edges_iter
    # was removed in NetworkX 2.0, and edges(data=True) yields the same
    # (origin, destination, attributes) triples on both 1.x and 2.x.
    for origin, destination, attr in G.edges(data=True):
        writer.writerow({'origin': origin,
                         'originlat': getLat(origin),
                         'originlong': getLong(origin),
                         'destination': destination,
                         'destinationlat': getLat(destination),
                         'destinationlong': getLong(destination),
                         'weight': attr['weight'],
                         'arrivaldelaymins': attr['arrivaldelaymins'],
                         'numofdelays': attr['numofdelays'],
                         'delaypercentage': attr['delaypercentage']
                        })

In [156]:
# Write the graph to a GraphML file named "flightsOverviewDelays.graphml" (path relative to the working directory).
nx.write_graphml(G, "flightsOverviewDelays.graphml")

In [None]:
# Release: rebind G to None so the notebook no longer holds this reference to the graph.
G = None