# Airline Delays
## Data Wrangling for Capstone 2

I downloaded the data below on airline departures/delays from the US Department of Transportation. It covers flights from/to North Carolina in the period Jan-Jun 2022.

See source here: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FGK&QO_fu146_anzr=b0-gvzr

In [None]:
import pandas as pd

In [66]:
# let's first take a peek at what our data looks like!

df_Jan = pd.read_csv('Jan_2022.csv')
df_Jan.head()

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_NM,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,5,3,1/5/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,100.0,76.0,431.0,,,,,
1,1,1,6,4,1/6/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,1.0,120.0,,,431.0,,,,,
2,1,1,7,5,1/7/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,1.0,120.0,,,431.0,,,,,
3,1,1,10,1,1/10/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,112.0,89.0,431.0,,,,,
4,1,1,11,2,1/11/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,137.0,81.0,431.0,0.0,0.0,17.0,0.0,63.0


In [67]:
df_Feb = pd.read_csv('Feb_2022.csv')
df_Mar = pd.read_csv('Mar_2022.csv')
df_Apr = pd.read_csv('Apr_2022.csv')
df_May = pd.read_csv('May_2022.csv')
df_Jun = pd.read_csv('Jun_2022.csv')

In [68]:
# we'll want to concat our dataframes together
# to get Jan-Jun 2022

df = df_Jan.append(df_Feb).append(df_Mar).append(df_Apr).append(df_May).append(df_Jun)
df

# 291422 rows × 33 columns

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_NM,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,5,3,1/5/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,100.0,76.0,431.0,,,,,
1,1,1,6,4,1/6/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,1.0,120.0,,,431.0,,,,,
2,1,1,7,5,1/7/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,1.0,120.0,,,431.0,,,,,
3,1,1,10,1,1/10/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,112.0,89.0,431.0,,,,,
4,1,1,11,2,1/11/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,137.0,81.0,431.0,0.0,0.0,17.0,0.0,63.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51803,2,6,1,3,6/1/2022 12:00:00 AM,YX,3424,CLT,"Charlotte, NC",North Carolina,...,0.0,122.0,108.0,76.0,529.0,,,,,
51804,2,6,1,3,6/1/2022 12:00:00 AM,YX,3418,EWR,"Newark, NJ",New Jersey,...,0.0,108.0,113.0,77.0,416.0,,,,,
51805,2,6,1,3,6/1/2022 12:00:00 AM,YX,3417,EWR,"Newark, NJ",New Jersey,...,1.0,108.0,,,445.0,,,,,
51806,2,6,1,3,6/1/2022 12:00:00 AM,YX,3410,CLT,"Charlotte, NC",North Carolina,...,1.0,132.0,,,529.0,,,,,


In [69]:
# what does the first row look like? 
# also we get to see the columns

df.iloc[0]

QUARTER                                   1
MONTH                                     1
DAY_OF_MONTH                              5
DAY_OF_WEEK                               3
FL_DATE                1/5/2022 12:00:00 AM
OP_UNIQUE_CARRIER                        YX
OP_CARRIER_FL_NUM                      4903
ORIGIN                                  LGA
ORIGIN_CITY_NAME               New York, NY
ORIGIN_STATE_NM                    New York
DEST                                    RDU
DEST_CITY_NAME           Raleigh/Durham, NC
DEST_STATE_NM                North Carolina
CRS_DEP_TIME                            605
DEP_TIME                                608
DEP_DELAY_NEW                             3
DEP_DEL15                                 0
DEP_DELAY_GROUP                           0
CRS_ARR_TIME                            805
ARR_TIME                                748
ARR_DELAY_NEW                             0
ARR_DEL15                                 0
ARR_DELAY_GROUP                 

In [72]:
# let's simplify some of the column names

df = df.rename(columns={'OP_UNIQUE_CARRIER':'CARRIER', 
                        'OP_CARRIER_FL_NUM':'FL_NUM', 'ORIGIN_CITY_NAME':'ORIGIN_CITY',
                       'ORIGIN_STATE_NM':'ORIGIN_STATE', 'DEST_CITY_NAME':'DEST_CITY',
                       'DEST_STATE_NM':'DEST_STATE', 'DEP_DELAY_NEW':'DEP_DELAY',
                       'ARR_DELAY_NEW':'ARR_DELAY'})
df.head()

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY,ORIGIN_STATE,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,5,3,1/5/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,100.0,76.0,431.0,,,,,
1,1,1,6,4,1/6/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,1.0,120.0,,,431.0,,,,,
2,1,1,7,5,1/7/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,1.0,120.0,,,431.0,,,,,
3,1,1,10,1,1/10/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,112.0,89.0,431.0,,,,,
4,1,1,11,2,1/11/2022 12:00:00 AM,YX,4903,LGA,"New York, NY",New York,...,0.0,120.0,137.0,81.0,431.0,0.0,0.0,17.0,0.0,63.0


In [74]:
# we need to specify that some of the column data
# is categorical, not numerical

## NB! I'm not going to download as many columns next time. 
## No taxi out, wheels off, wheels on, taxi in
## Also no cancellation code, distance_group
## ALSO NB: carrier delay, weather, nas, security, and late aircraft delay
## are NOT categorical but numeric! They measure the delay in min

df['QUARTER'] = df['QUARTER'].astype('category')
df['MONTH'] = df['MONTH'].astype('category')
df['DAY_OF_MONTH'] = df['DAY_OF_MONTH'].astype('category')
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype('category')
df['FL_DATE'] = df['FL_DATE'].astype('datetime64') # not sure if this one works
df['FL_NUM'] = df['FL_NUM'].astype('category')
df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].astype('datetime64[ns]')
df['DEP_TIME'] = df['DEP_TIME'].astype('datetime64[ns]')
df['DEP_DEL15'] = df['DEP_DEL15'].astype('category')
df['DEP_DELAY_GROUP'] = df['DEP_DELAY_GROUP'].astype('category')
df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].astype('datetime64[ns]')
df['ARR_TIME'] = df['ARR_TIME'].astype('datetime64[ns]')
df['ARR_DEL15'] = df['ARR_DEL15'].astype('category')
df['ARR_DELAY_GROUP'] = df['ARR_DELAY_GROUP'].astype('category')
df['CANCELLED'] = df['CANCELLED'].astype('category')

In [78]:
# let's see how many flights are cancelled
# group by month, and day of the week

df[df['CANCELLED']==1].groupby(['MONTH', 'DAY_OF_WEEK']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,QUARTER,DAY_OF_MONTH,FL_DATE,CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY,ORIGIN_STATE,DEST,DEST_CITY,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
MONTH,DAY_OF_WEEK,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,783,783,783,783,783,783,783,783,783,783,...,783,783,0,0,783,0,0,0,0,0
1,2,108,108,108,108,108,108,108,108,108,108,...,108,108,0,0,108,0,0,0,0,0
1,3,70,70,70,70,70,70,70,70,70,70,...,70,70,0,0,70,0,0,0,0,0
1,4,162,162,162,162,162,162,162,162,162,162,...,162,162,0,0,162,0,0,0,0,0
1,5,548,548,548,548,548,548,548,548,548,548,...,548,548,0,0,548,0,0,0,0,0
1,6,719,719,719,719,719,719,719,719,719,719,...,719,719,0,0,719,0,0,0,0,0
1,7,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,...,1725,1725,0,0,1725,0,0,0,0,0
2,1,69,69,69,69,69,69,69,69,69,69,...,69,69,0,0,69,0,0,0,0,0
2,2,29,29,29,29,29,29,29,29,29,29,...,29,29,0,0,29,0,0,0,0,0
2,3,154,154,154,154,154,154,154,154,154,154,...,154,154,0,0,154,0,0,0,0,0


In [81]:
# let's see how many flights are cancelled
# now group by airline

df[df['CANCELLED']==1].groupby(['CARRIER']).count()

Unnamed: 0_level_0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,FL_NUM,ORIGIN,ORIGIN_CITY,ORIGIN_STATE,DEST,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
CARRIER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9E,630,630,630,630,630,630,630,630,630,630,...,630,630,0,0,630,0,0,0,0,0
AA,4014,4014,4014,4014,4014,4014,4014,4014,4014,4014,...,4014,4014,0,0,4014,0,0,0,0,0
AS,29,29,29,29,29,29,29,29,29,29,...,29,29,0,0,29,0,0,0,0,0
B6,233,233,233,233,233,233,233,233,233,233,...,233,233,0,0,233,0,0,0,0,0
DL,316,316,316,316,316,316,316,316,316,316,...,316,316,0,0,316,0,0,0,0,0
F9,107,107,107,107,107,107,107,107,107,107,...,107,107,0,0,107,0,0,0,0,0
G4,283,283,283,283,283,283,283,283,283,283,...,283,283,0,0,283,0,0,0,0,0
G7,107,107,107,107,107,107,107,107,107,107,...,107,107,0,0,107,0,0,0,0,0
MQ,200,200,200,200,200,200,200,200,200,200,...,200,200,0,0,200,0,0,0,0,0
NK,79,79,79,79,79,79,79,79,79,79,...,79,79,0,0,79,0,0,0,0,0


In [86]:
# what about percentage of flights cancelled
# group by airline?

df[df['CANCELLED']==1].groupby(['CARRIER']).count()['QUARTER']/df.groupby(['CARRIER']).count()['QUARTER']

# looks like G4=Allegiant Air has the highest percentage of cancelled flights, 6.9%
# followed by AS=Alaska Air, 6.8% and B6=JetBlue Airways, 6.7%

CARRIER
9E    0.049350
AA    0.038691
AS    0.068235
B6    0.066533
C5         NaN
DL    0.015829
F9    0.029714
G4    0.069126
G7    0.056614
MQ    0.028649
NK    0.042111
OH    0.032638
OO    0.019201
PT    0.026513
UA    0.033681
WN    0.033679
YV    0.044331
YX    0.053298
ZW    0.038003
Name: QUARTER, dtype: float64

In [87]:
# let's see how many flights have a departure delay of more than 15 minutes
# group by month, and day of the week

df[df['DEP_DEL15']==1].groupby(['MONTH', 'DAY_OF_WEEK']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,QUARTER,DAY_OF_MONTH,FL_DATE,CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY,ORIGIN_STATE,DEST,DEST_CITY,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
MONTH,DAY_OF_WEEK,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,1602,1602,1602,1602,1602,1602,1602,1602,1602,1602,...,1602,1602,1596,1596,1602,1333,1333,1333,1333,1333
1,2,677,677,677,677,677,677,677,677,677,677,...,677,677,677,677,677,528,528,528,528,528
1,3,595,595,595,595,595,595,595,595,595,595,...,595,595,593,593,595,453,453,453,453,453
1,4,706,706,706,706,706,706,706,706,706,706,...,706,706,705,705,706,565,565,565,565,565
1,5,1017,1017,1017,1017,1017,1017,1017,1017,1017,1017,...,1017,1017,1001,1001,1017,801,801,801,801,801
1,6,1075,1075,1075,1075,1075,1075,1075,1075,1075,1075,...,1075,1075,1073,1073,1075,840,840,840,840,840
1,7,1459,1459,1459,1459,1459,1459,1459,1459,1459,1459,...,1459,1459,1453,1453,1459,1194,1194,1194,1194,1194
2,1,1108,1108,1108,1108,1108,1108,1108,1108,1108,1108,...,1108,1108,1107,1107,1108,913,913,913,913,913
2,2,949,949,949,949,949,949,949,949,949,949,...,949,949,948,948,949,794,794,794,794,794
2,3,630,630,630,630,630,630,630,630,630,630,...,630,630,628,628,630,498,498,498,498,498


In [88]:
# let's see how many flights have an arrival delay of more than 15 minutes
# now group by airline

df[df['ARR_DEL15']==1].groupby(['CARRIER']).count()

Unnamed: 0_level_0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,FL_NUM,ORIGIN,ORIGIN_CITY,ORIGIN_STATE,DEST,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
CARRIER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9E,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
AA,20574,20574,20574,20574,20574,20574,20574,20574,20574,20574,...,20574,20574,20574,20574,20574,20574,20574,20574,20574,20574
AS,80,80,80,80,80,80,80,80,80,80,...,80,80,80,80,80,80,80,80,80,80
B6,1045,1045,1045,1045,1045,1045,1045,1045,1045,1045,...,1045,1045,1045,1045,1045,1045,1045,1045,1045,1045
C5,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
DL,2811,2811,2811,2811,2811,2811,2811,2811,2811,2811,...,2811,2811,2811,2811,2811,2811,2811,2811,2811,2811
F9,1048,1048,1048,1048,1048,1048,1048,1048,1048,1048,...,1048,1048,1048,1048,1048,1048,1048,1048,1048,1048
G4,1320,1320,1320,1320,1320,1320,1320,1320,1320,1320,...,1320,1320,1320,1320,1320,1320,1320,1320,1320,1320
G7,456,456,456,456,456,456,456,456,456,456,...,456,456,456,456,456,456,456,456,456,456
MQ,1121,1121,1121,1121,1121,1121,1121,1121,1121,1121,...,1121,1121,1121,1121,1121,1121,1121,1121,1121,1121


In [89]:
# again we want to see the percentage of flights delayed
# by airline

df[df['ARR_DEL15']==1].groupby(['CARRIER']).count()['QUARTER']/df.groupby(['CARRIER']).count()['QUARTER']

# the worst offenders for flights delayed by more than 15 min
# are G4=Allegiant Air, 3.2%, NK=Spirit Airlines, 3.2%, B6=JetBlue, 3%

CARRIER
9E    0.169199
AA    0.198311
AS    0.188235
B6    0.298401
C5    0.238095
DL    0.140810
F9    0.291030
G4    0.322423
G7    0.241270
MQ    0.160579
NK    0.317164
OH    0.168986
OO    0.173845
PT    0.139687
UA    0.215370
WN    0.236678
YV    0.158067
YX    0.194997
ZW    0.138968
Name: QUARTER, dtype: float64

In [97]:
# how about the average arrival delay in minutes by airline?

df.groupby('CARRIER')['ARR_DELAY'].mean()

# B6=JetBlue has the highest average arrival delay, 27 min
# followed by NK=Spirit at 25 min

CARRIER
9E    15.071836
AA    15.836026
AS     8.305556
B6    27.174885
C5    15.428571
DL    11.213266
F9    22.012038
G4    23.609268
G7    21.601461
MQ    10.034758
NK    25.181920
OH    13.470728
OO    12.679215
PT    12.145907
UA    17.155419
WN    13.157574
YV    16.789714
YX    15.320613
ZW    10.884956
Name: ARR_DELAY, dtype: float64

In [106]:
NC_airports = df[df['ORIGIN_STATE']=='North Carolina']['ORIGIN'].unique()
NC_airports

array(['RDU', 'GSO', 'CLT', 'AVL', 'ILM', 'FAY', 'OAJ', 'PGV', 'USA',
       'EWN'], dtype=object)

In [113]:
# here's a subset of the dataframe
# containing only flights that leave from NC

df_NC = df[df['ORIGIN'].isin(NC_airports)]
df_NC

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY,ORIGIN_STATE,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
19,1,1,4,2,2022-01-04,YX,4903,RDU,"Raleigh/Durham, NC",North Carolina,...,0.0,113.0,99.0,67.0,431.0,,,,,
20,1,1,5,3,2022-01-05,YX,4903,RDU,"Raleigh/Durham, NC",North Carolina,...,0.0,113.0,100.0,68.0,431.0,0.0,0.0,43.0,0.0,0.0
21,1,1,6,4,2022-01-06,YX,4903,RDU,"Raleigh/Durham, NC",North Carolina,...,1.0,113.0,,,431.0,,,,,
22,1,1,7,5,2022-01-07,YX,4903,RDU,"Raleigh/Durham, NC",North Carolina,...,1.0,113.0,,,431.0,,,,,
23,1,1,8,6,2022-01-08,YX,4903,RDU,"Raleigh/Durham, NC",North Carolina,...,0.0,113.0,106.0,72.0,431.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51799,2,6,1,3,2022-06-01,YX,3453,CLT,"Charlotte, NC",North Carolina,...,0.0,147.0,132.0,92.0,599.0,,,,,
51800,2,6,1,3,2022-06-01,YX,3430,CLT,"Charlotte, NC",North Carolina,...,1.0,129.0,,,529.0,,,,,
51802,2,6,1,3,2022-06-01,YX,3427,CLT,"Charlotte, NC",North Carolina,...,0.0,130.0,136.0,82.0,529.0,,,,,
51803,2,6,1,3,2022-06-01,YX,3424,CLT,"Charlotte, NC",North Carolina,...,0.0,122.0,108.0,76.0,529.0,,,,,


In [114]:
# what's the average arrival delay
# for this subsetted dataframe 
# containing only flights that leave from NC?

df_NC.groupby('CARRIER')['ARR_DELAY'].mean()

# the average delay time is pretty similar
# B6=JetBlue still leads at 27.6 min,
# NK=Spirit, 26.5 min

CARRIER
9E    16.522607
AA    15.081922
AS     9.469697
B6    27.619398
C5    17.809524
DL    11.281008
F9    23.246560
G4    23.487599
G7    23.564877
MQ    10.230254
NK    26.456180
OH    12.550028
OO    13.556263
PT    10.707997
UA    15.673086
WN    10.414552
YV    16.241985
YX    15.780025
ZW    10.454653
Name: ARR_DELAY, dtype: float64