## Data Format: XML

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

from nose.tools import ok_, assert_equal

In [7]:
# Only the origin and destination airport codes are needed, so just those two
# columns are read from the flight-records CSV.
dest_origin = pd.read_csv(
    '/Users/Constance/2001.csv',
    usecols=('Dest', 'Origin'),
    encoding='latin-1',
)

# Show the first rows to confirm the columns loaded.
print(dest_origin.head())

  Origin Dest
0    BWI  CLT
1    BWI  CLT
2    BWI  CLT
3    BWI  CLT
4    BWI  CLT


### 1. Function: get_total_flights()

In [15]:
def get_total_flights(df):
    '''
    Counts, for every airport code, how many rows of df mention it either
    as a destination or as an origin.

    Parameters
    ----------
    df: pandas.DataFrame
        Must contain the columns 'Dest' and 'Origin'.

    Returns
    -------
    pandas.Series
        int32 totals indexed by the codes found in 'Dest' and/or 'Origin'.
        A code that appears in only one of the two columns is still included.
    '''

    arrivals = df.groupby('Dest').size()
    departures = df.groupby('Origin').size()

    # fill_value=0 treats a code missing from one side as zero flights there,
    # instead of turning its total into NaN.
    combined = arrivals.add(departures, fill_value=0)
    return combined.astype(np.int32)

In [16]:
# Combined arrival + departure counts for every airport code in dest_origin.
flights = get_total_flights(dest_origin)
print(flights)

ABE     10524
ABI      5134
ABQ     72477
ACT      5368
ACY         1
ADQ      1452
AKN       568
ALB     32713
AMA     12267
ANC     42381
APF       725
ATL    503163
AUS     85809
AVL      3172
AVP      2893
AZO      5290
BDL     71983
BET      2306
BFL      3338
BGM       751
BGR      7417
BHM     37566
BIL      6249
BIS      2779
BMI      2869
BNA    112603
BOI     24152
BOS    266032
BPT      3481
BQN       518
        ...  
SHV     12011
SIT      2758
SJC    144653
SJT      4505
SJU     52957
SLC    152859
SMF     80394
SNA     86871
SPS      3985
SRQ      9044
STL    324477
STT      6723
STX      1817
SUX       546
SWF      2386
SYR     22281
TLH      2957
TOL      4483
TPA    137286
TRI      1095
TUL     45562
TUS     39101
TVC      5067
TXK      3475
TYR      6361
TYS     11131
VPS      3455
WRG      1452
XNA     11749
YAK      1450
Length: 231, dtype: int32


In [17]:
# Case 1: codes show up on both sides, so their counts must be summed
# (A: 3 as Dest + 2 as Origin, B: 1 + 3, C: 1 + 0).
test1 = pd.DataFrame({
    'Dest': ['A', 'B', 'A', 'A', 'C'],
    'Origin': ['B', 'A', 'B', 'B', 'A'],
})
answer1 = pd.Series(data=[5, 4, 1], index=['A', 'B', 'C'], dtype=np.int32)

# Case 2: destinations and origins are disjoint, so every code counts once.
test2 = pd.DataFrame({
    'Dest': ['A', 'B'],
    'Origin': ['C', 'D'],
})
answer2 = pd.Series(data=[1, 1, 1, 1], index=['A', 'B', 'C', 'D'], dtype=np.int32)

for frame, expected in ((test1, answer1), (test2, answer2)):
    ok_(get_total_flights(frame).equals(expected))

In [18]:
# Order the airports from most to fewest flights and keep the first twenty.
top20 = flights.sort_values(ascending=False).head(20)
print(top20)

ORD    682636
DFW    624361
ATL    503163
LAX    450019
PHX    368631
STL    324477
DTW    297522
MSP    284955
LAS    272293
BOS    266032
DEN    265184
IAH    257193
CLT    256626
SFO    243473
EWR    241016
PHL    239390
LGA    232964
PIT    212738
SEA    205486
BWI    199674
dtype: int32


### 2. Function: is_delayed()

In [122]:
def is_delayed(xml):
    '''
    Parses an airport-status XML document and reports its delay range.

    Parameter
    ---------
    xml (str): the XML document to parse. Its root element is expected to
        contain a 'Delay' element and a 'Status' element that holds
        'MinDelay' and 'MaxDelay'.

    Returns
    -------
    None if the 'Delay' tag in the XML is not true, e.g. <Delay>false</Delay>,
    or if there is no 'Delay' tag at all.
    If the 'Delay' tag is true, e.g. <Delay>true</Delay>, the function returns
    a tuple of two items, the text of 'MinDelay' and of 'MaxDelay', taken from
    the first 'Status' element.
    For example, when
    <Delay>true</Delay>
    <MinDelay>16 minutes</MinDelay>
    <MaxDelay>30 minutes</MaxDelay>
    the function returns ('16 minutes', '30 minutes').
    An item is None when its tag is empty or missing.

    Raises
    ------
    xml.etree.ElementTree.ParseError if xml is not well-formed.
    '''

    root = ET.fromstring(xml)

    # Elements are looked up by tag name, not by child position, so that a
    # reordered or extended document cannot make us read the wrong field.
    delay_flag = (root.findtext('Delay') or '').strip()
    if delay_flag != 'true':
        return None

    status = root.find('Status')

    def _text_of(tag):
        # Text of the named child of <Status>; None if absent or empty.
        node = status.find(tag) if status is not None else None
        return node.text if node is not None else None

    return (_text_of('MinDelay'), _text_of('MaxDelay'))

In [124]:
# Sample airport-status XML for ORD with <Delay>true</Delay>,
# <MinDelay>16 minutes</MinDelay> and <MaxDelay>30 minutes</MaxDelay>.
# strip() removes the newline that follows the opening quotes, so the XML
# declaration is the very first thing in the string.
# Note: this rebinds test1, which the get_total_flights test cell used for a
# DataFrame.
test1 = '''
<?xml version="1.0" encoding="UTF-8"?><AirportStatus> 
					<Delay>true</Delay>
				
					<IATA>ORD</IATA>
				
					<State>Illinois</State>
				
					<Name>Chicago OHare International</Name>
				
					<Weather><Visibility>10.00</Visibility>
				
					<Weather>Partly Cloudy</Weather>
				
					<Meta><Credit>NOAA&apos;s National Weather Service</Credit>
				
					<Updated>12:51 PM Local</Updated>
				
					<Url>http://weather.gov/</Url></Meta>
				
					<Temp>53.0 F (11.7 C)</Temp>
				
					<Wind>Northwest at 12.7mph</Wind></Weather>
				
					<ICAO>KORD</ICAO>
				
					<City>Chicago</City>
				
					<Status><Reason>VOL:Multi-taxi</Reason>
				
					<ClosureBegin></ClosureBegin>
				
					<EndTime></EndTime>
				
					<MinDelay>16 minutes</MinDelay>
				
					<AvgDelay></AvgDelay>
				
					<MaxDelay>30 minutes</MaxDelay>
				
					<ClosureEnd></ClosureEnd>
				
					<Trend>Increasing</Trend>
				
					<Type>Departure</Type></Status>
				</AirportStatus> 
'''.strip()

# Sample airport-status XML for SFO with <Delay>false</Delay>; the MinDelay
# and MaxDelay elements are present but empty.
# strip() removes the newline that follows the opening quotes, so the XML
# declaration is the very first thing in the string.
# Note: this rebinds test2, which the get_total_flights test cell used for a
# DataFrame.
test2 = '''
<?xml version="1.0" encoding="UTF-8"?><AirportStatus> 
					<Delay>false</Delay>
				
					<IATA>SFO</IATA>
				
					<State>California</State>
				
					<Name>San Francisco International</Name>
				
					<Weather><Visibility>10.00</Visibility>
				
					<Weather>Partly Cloudy</Weather>
				
					<Meta><Credit>NOAA&apos;s National Weather Service</Credit>
				
					<Updated>12:56 PM Local</Updated>
				
					<Url>http://weather.gov/</Url></Meta>
				
					<Temp>68.0 F (20.0 C)</Temp>
				
					<Wind>North at 5.8mph</Wind></Weather>
				
					<ICAO>KSFO</ICAO>
				
					<City>San Francisco</City>
				
					<Status><Reason>No known delays for this airport.</Reason>
				
					<ClosureBegin></ClosureBegin>
				
					<EndTime></EndTime>
				
					<MinDelay></MinDelay>
				
					<AvgDelay></AvgDelay>
				
					<MaxDelay></MaxDelay>
				
					<ClosureEnd></ClosureEnd>
				
					<Trend></Trend>
				
					<Type></Type></Status>
				</AirportStatus> 
'''.strip()

In [131]:
# test1 (ORD) has <Delay>true</Delay>, so its MinDelay/MaxDelay texts are expected.
# test2 (SFO) has <Delay>false</Delay>, so None is expected.
assert_equal(is_delayed(test1), ('16 minutes', '30 minutes'))
assert_equal(is_delayed(test2), None)

In [132]:
# Query the FAA airport-status service for each of the 20 busiest airports
# and print whether it currently reports a delay.

# The response format is requested through the query string. It is the same
# for every airport, so it is built once, outside the loop.
payload = {
    'format': 'application/xml'
}

for airport in top20.index:
    url = 'http://services.faa.gov/airport/status/{}'.format(airport)
    # params=payload actually sends the format request (it was previously
    # built but never used); the timeout keeps one stalled request from
    # hanging the whole cell.
    r = requests.get(url, params=payload, timeout=10)
    # Fail on an HTTP error here rather than trying to parse an error page.
    r.raise_for_status()
    delay = is_delayed(r.text)
    if delay is None:
        print('{} is not delayed.'.format(airport))
    elif len(delay) >= 2 and delay[0] and delay[1]:
        print('{} is delayed by {} to {}.'.format(airport, delay[0], delay[1]))
    else:
        # The airport is flagged as delayed but the feed gives no min/max
        # range; avoid printing "delayed by None to None".
        print('{} is delayed (delay range not reported).'.format(airport))

ORD is not delayed.
DFW is not delayed.
ATL is not delayed.
LAX is not delayed.
PHX is not delayed.
STL is not delayed.
DTW is not delayed.
MSP is not delayed.
LAS is not delayed.
BOS is not delayed.
DEN is not delayed.
IAH is not delayed.
CLT is not delayed.
SFO is not delayed.
EWR is delayed by None to None.
PHL is not delayed.
LGA is not delayed.
PIT is not delayed.
SEA is not delayed.
BWI is not delayed.
