# EXPLORATORY DATA ANALYSIS (EPD)

In [2]:
import pandas as pd
import scipy.stats as st
import plotly.express as px

# IMPORTING AND SELECTING DATA

In [3]:
flights = pd.read_csv('C:/Users/danfv/OneDrive/Knowledge/Courses/Data Analysis/LHL - Data Analysis Bootcamp/lighthouse-data-notes/W6 Midterm Project/Midterm_Project/Data/Flights - Sample 50000 rand.csv').sort_values('fl_date')
flights = flights.drop('no_name', axis=1)
flights['total_delay'] = flights['carrier_delay'] + flights['weather_delay'] + flights['nas_delay'] + flights['security_delay'] + flights['late_aircraft_delay']

## Training data overview:

In [4]:
# print(flights_1000_rand.shape)
print(flights.columns)
print(flights.columns.shape)

pd.set_option('display.max_columns', None)
flights.head()

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'total_delay'],
      dtype='object')
(42,)


Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,total_delay
27163,2018-01-01,DL,DL,DL,260,DL,N964DN,260,14869,SLC,"Salt Lake City, UT",13487,MSP,"Minneapolis, MN",1715,1717.0,2.0,26.0,1743.0,2053.0,3.0,2057,2056.0,-1.0,0,,0,N,162,159.0,130.0,1,991,,,,,,,,,
7553,2018-01-01,UA,UA,UA,2117,UA,N12109,2117,12892,LAX,"Los Angeles, CA",11618,EWR,"Newark, NJ",1200,1150.0,-10.0,18.0,1208.0,1944.0,7.0,2020,1951.0,-29.0,0,,0,N,320,301.0,276.0,1,2454,,,,,,,,,
39374,2018-01-01,WN,WN,WN,1508,WN,N8306H,1508,13232,MDW,"Chicago, IL",11057,CLT,"Charlotte, NC",1335,1344.0,9.0,15.0,1359.0,1618.0,5.0,1625,1623.0,-2.0,0,,0,N,110,99.0,79.0,1,584,,,,,,,,,
27211,2018-01-01,NK,NK,NK,967,NK,N639NK,967,11042,CLE,"Cleveland, OH",14635,RSW,"Fort Myers, FL",1520,1520.0,0.0,42.0,1602.0,1824.0,5.0,1805,1829.0,24.0,0,,0,N,165,189.0,142.0,1,1025,0.0,0.0,24.0,0.0,0.0,,,,24.0
39196,2018-01-01,NK,NK,NK,467,NK,N525NK,467,10721,BOS,"Boston, MA",13495,MSY,"New Orleans, LA",1420,1414.0,-6.0,16.0,1430.0,1707.0,21.0,1715,1728.0,13.0,0,,0,N,235,254.0,217.0,1,1368,,,,,,,,,


In [5]:
pd.reset_option('max_columns')

## Training/Testing Features:

In [6]:
# Features in training vs testing data
flights_set = set(flights.columns)

# testing data to set:
flights_test = pd.read_csv('C:/Users/danfv/OneDrive/Knowledge/Courses/Data Analysis/LHL - Data Analysis Bootcamp/lighthouse-data-notes/W6 Midterm Project/Midterm_Project/Data/Flights_test - Sample 50000 rand.csv').sort_values('fl_date')
flights_test_set = set(flights_test.columns)

In [7]:
print('Shared features:', flights_set & flights_test_set, sep="\n")
print('In train but not test:', flights_set - flights_test_set, sep="\n")
print('In test but not train empty?:', len(flights_test_set - flights_set) == 0)

Shared features:
{'dest_city_name', 'origin_airport_id', 'distance', 'crs_dep_time', 'dest', 'mkt_carrier', 'dest_airport_id', 'fl_date', 'op_unique_carrier', 'branded_code_share', 'dup', 'op_carrier_fl_num', 'tail_num', 'origin', 'crs_elapsed_time', 'flights', 'crs_arr_time', 'mkt_unique_carrier', 'origin_city_name', 'mkt_carrier_fl_num'}
In train but not test:
{'cancelled', 'late_aircraft_delay', 'security_delay', 'carrier_delay', 'first_dep_time', 'total_add_gtime', 'taxi_out', 'arr_delay', 'actual_elapsed_time', 'arr_time', 'total_delay', 'wheels_on', 'diverted', 'cancellation_code', 'nas_delay', 'weather_delay', 'wheels_off', 'air_time', 'dep_time', 'longest_add_gtime', 'taxi_in', 'dep_delay'}
In test but not train empty?: True


- FEATURES NOT PRESENT IN THE TESTING DATA CANNOT BE USED FOR MODELING OR FEATURE ENGINEERING

## Categorical features:

In [8]:
print('Different dtypes in df:', flights.dtypes.unique())
print(flights.select_dtypes(include='object').nunique().sort_values(ascending=False))


Different dtypes in df: [dtype('O') dtype('int64') dtype('float64')]
tail_num              6029
fl_date                730
dest                   366
origin                 365
dest_city_name         360
origin_city_name       358
op_unique_carrier       28
branded_code_share      16
mkt_carrier             11
mkt_unique_carrier      11
cancellation_code        3
dup                      1
dtype: int64


In [9]:
# Categorical that are not in testing data:
flights_categ_set = set(flights.select_dtypes(include='object').nunique().sort_values(ascending=False).index)
print('Also in test data:', flights_test_set & flights_categ_set, sep="\n")
print('Categorical in train but no test:', flights_categ_set - flights_test_set, sep="\n")

Also in test data:
{'dup', 'dest_city_name', 'dest', 'mkt_carrier', 'tail_num', 'mkt_unique_carrier', 'origin', 'origin_city_name', 'fl_date', 'op_unique_carrier', 'branded_code_share'}
Categorical in train but no test:
{'cancellation_code'}


### Browsing categorical unique values:

In [10]:
# narrow down the unique values to look:
categorical_to_browse = flights.select_dtypes(include='object').nunique().sort_values(ascending=False)[-6:].drop('cancellation_code')
categorical_to_browse

op_unique_carrier     28
branded_code_share    16
mkt_carrier           11
mkt_unique_carrier    11
dup                    1
dtype: int64

In [11]:
categorical_dict = {}
for feature in categorical_to_browse.index:
    categorical_dict[feature] = flights[feature].unique()

categorical_dict

{'op_unique_carrier': array(['DL', 'UA', 'WN', 'NK', 'OH', 'AA', 'MQ', 'OO', '9E', 'PT', 'C5',
        'AS', 'YV', 'G7', 'F9', 'B6', 'AX', 'YX', 'ZW', 'EV', 'CP', 'QX',
        'HA', 'VX', 'G4', '9K', 'EM', 'KS'], dtype=object),
 'branded_code_share': array(['DL', 'UA', 'WN', 'NK', 'AA_CODESHARE', 'AA', 'DL_CODESHARE',
        'UA_CODESHARE', 'AS', 'F9', 'B6', 'AS_CODESHARE', 'HA', 'VX', 'G4',
        'HA_CODESHARE'], dtype=object),
 'mkt_carrier': array(['DL', 'UA', 'WN', 'NK', 'AA', 'AS', 'F9', 'B6', 'HA', 'VX', 'G4'],
       dtype=object),
 'mkt_unique_carrier': array(['DL', 'UA', 'WN', 'NK', 'AA', 'AS', 'F9', 'B6', 'HA', 'VX', 'G4'],
       dtype=object),
 'dup': array(['N'], dtype=object)}

## Initial dropped features (not in testing):

In [12]:
# pretty print off for horizontal lists
%pprint

Pretty printing has been turned OFF


In [13]:
# Initial drop list
drop_list = []
drop_list.extend(list(flights_set - flights_test_set))
drop_list.remove('total_delay')
drop_list


['cancelled', 'late_aircraft_delay', 'security_delay', 'carrier_delay', 'first_dep_time', 'total_add_gtime', 'taxi_out', 'arr_delay', 'actual_elapsed_time', 'arr_time', 'wheels_on', 'diverted', 'cancellation_code', 'nas_delay', 'weather_delay', 'wheels_off', 'air_time', 'dep_time', 'longest_add_gtime', 'taxi_in', 'dep_delay']

In [33]:
# DF with columns we can use:
flight_distilled1 = flights.drop(drop_list, axis=1)

## Null Values:

In [34]:
# Null in the db with the variables we will have
null_num = flight_distilled1.isnull().sum().sort_values(ascending=False)
null_perc = (flight_distilled1.isnull().sum()/flight_distilled1.isnull().count()).sort_values(ascending=False)
null_flights = pd.concat([null_num, null_perc], axis=1, keys=['Null #', 'Null %'])
null_flights[null_flights['Null #'] != 0]

Unnamed: 0,Null #,Null %
total_delay,40504,0.81008
tail_num,140,0.0028


We can:
- Replace null with 0 in total_delay
- Delete the observations with null tail_num

In [35]:
# filling null in total_delay
flight_distilled1 = flight_distilled1.fillna(value={'total_delay' : 0})
# dropping the tail_num nulls
flight_distilled1 = flight_distilled1.dropna()
flight_distilled1

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,total_delay
27163,2018-01-01,DL,DL,DL,260,DL,N964DN,260,14869,SLC,...,13487,MSP,"Minneapolis, MN",1715,2057,N,162,1,991,0.0
7553,2018-01-01,UA,UA,UA,2117,UA,N12109,2117,12892,LAX,...,11618,EWR,"Newark, NJ",1200,2020,N,320,1,2454,0.0
39374,2018-01-01,WN,WN,WN,1508,WN,N8306H,1508,13232,MDW,...,11057,CLT,"Charlotte, NC",1335,1625,N,110,1,584,0.0
27211,2018-01-01,NK,NK,NK,967,NK,N639NK,967,11042,CLE,...,14635,RSW,"Fort Myers, FL",1520,1805,N,165,1,1025,24.0
39196,2018-01-01,NK,NK,NK,467,NK,N525NK,467,10721,BOS,...,13495,MSY,"New Orleans, LA",1420,1715,N,235,1,1368,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19861,2019-12-31,AS,AS,AS,1969,AS,N628VA,1969,14679,SAN,...,14771,SFO,"San Francisco, CA",2040,2219,N,99,1,447,0.0
22602,2019-12-31,DL,DL_CODESHARE,DL,5502,9E,N8839E,5502,13230,MDT,...,11433,DTW,"Detroit, MI",540,738,N,118,1,371,0.0
8196,2019-12-31,UA,UA_CODESHARE,UA,5445,OO,N962SW,5445,11603,EUG,...,14771,SFO,"San Francisco, CA",1439,1624,N,105,1,451,0.0
4016,2019-12-31,WN,WN,WN,3985,WN,N7731A,3985,11292,DEN,...,13198,MCI,"Kansas City, MO",925,1200,N,95,1,533,0.0


## Categorical Variables Analysis:

In [38]:
flight_distilled1.select_dtypes(include='object')

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,op_unique_carrier,tail_num,origin,origin_city_name,dest,dest_city_name,dup
27163,2018-01-01,DL,DL,DL,DL,N964DN,SLC,"Salt Lake City, UT",MSP,"Minneapolis, MN",N
7553,2018-01-01,UA,UA,UA,UA,N12109,LAX,"Los Angeles, CA",EWR,"Newark, NJ",N
39374,2018-01-01,WN,WN,WN,WN,N8306H,MDW,"Chicago, IL",CLT,"Charlotte, NC",N
27211,2018-01-01,NK,NK,NK,NK,N639NK,CLE,"Cleveland, OH",RSW,"Fort Myers, FL",N
39196,2018-01-01,NK,NK,NK,NK,N525NK,BOS,"Boston, MA",MSY,"New Orleans, LA",N
...,...,...,...,...,...,...,...,...,...,...,...
19861,2019-12-31,AS,AS,AS,AS,N628VA,SAN,"San Diego, CA",SFO,"San Francisco, CA",N
22602,2019-12-31,DL,DL_CODESHARE,DL,9E,N8839E,MDT,"Harrisburg, PA",DTW,"Detroit, MI",N
8196,2019-12-31,UA,UA_CODESHARE,UA,OO,N962SW,EUG,"Eugene, OR",SFO,"San Francisco, CA",N
4016,2019-12-31,WN,WN,WN,WN,N7731A,DEN,"Denver, CO",MCI,"Kansas City, MO",N


In [39]:
flight_distilled1.select_dtypes(include='object').nunique().sort_values(ascending=False)

tail_num              6029
fl_date                730
dest                   366
origin                 365
dest_city_name         360
origin_city_name       358
op_unique_carrier       28
branded_code_share      16
mkt_carrier             11
mkt_unique_carrier      11
dup                      1
dtype: int64

### Binning dates

In [47]:
# per season
flight_distilled1['Season'] = ((pd.DatetimeIndex(flight_distilled1['fl_date']).month)%12 // 3 + 1)

In [52]:
# weekeday
flight_distilled1['Weekday'] = pd.DatetimeIndex(flight_distilled1['fl_date']).weekday

In [74]:
# flying season
# https://www.planestats.com/season_2015oct

def is_peak_season(date):
    '''
    Return a number indicating flying season.

        Paraneters:
            x(int): month number 1-12
        
        Returns:
            int: 0 for low season, 1 for medium, 2 for high
    '''
    
    if date >= 1 and date <= 2:
        return 0
    elif date >= 3 and date <= 5:
        return 1
    elif date >= 6 and date <= 8:
        return 2
    elif date >= 9 and date <= 11:
        return 0
    else:
        return 1

is_peak_season(12)

flight_distilled1['month'] = pd.DatetimeIndex(flight_distilled1['fl_date']).month
flight_distilled1['peak_season'] = flight_distilled1['month'].apply(lambda x: is_peak_season(x))

In [75]:
flight_distilled1

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_arr_time,dup,crs_elapsed_time,flights,distance,total_delay,Season,Weekday,month,peak_season
27163,2018-01-01,DL,DL,DL,260,DL,N964DN,260,14869,SLC,...,2057,N,162,1,991,0.0,1,0,1,0
7553,2018-01-01,UA,UA,UA,2117,UA,N12109,2117,12892,LAX,...,2020,N,320,1,2454,0.0,1,0,1,0
39374,2018-01-01,WN,WN,WN,1508,WN,N8306H,1508,13232,MDW,...,1625,N,110,1,584,0.0,1,0,1,0
27211,2018-01-01,NK,NK,NK,967,NK,N639NK,967,11042,CLE,...,1805,N,165,1,1025,24.0,1,0,1,0
39196,2018-01-01,NK,NK,NK,467,NK,N525NK,467,10721,BOS,...,1715,N,235,1,1368,0.0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19861,2019-12-31,AS,AS,AS,1969,AS,N628VA,1969,14679,SAN,...,2219,N,99,1,447,0.0,1,1,12,1
22602,2019-12-31,DL,DL_CODESHARE,DL,5502,9E,N8839E,5502,13230,MDT,...,738,N,118,1,371,0.0,1,1,12,1
8196,2019-12-31,UA,UA_CODESHARE,UA,5445,OO,N962SW,5445,11603,EUG,...,1624,N,105,1,451,0.0,1,1,12,1
4016,2019-12-31,WN,WN,WN,3985,WN,N7731A,3985,11292,DEN,...,1200,N,95,1,533,0.0,1,1,12,1


In [77]:
# holidays
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

dr = pd.date_range(start='2018-01-01', end='2019-12-31')
df = pd.DataFrame()
df['Date'] = dr

cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

flight_distilled1['holiday'] = flight_distilled1['fl_date'].isin(holidays)
flight_distilled1

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [85]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

dr = pd.date_range(start='2015-07-01', end='2015-07-31')
df = pd.DataFrame()
df['Date'] = dr

cal = calendar()
cal
# holidays = cal.holidays(start='2015-07-01', end='2015-07-31')
# holidays
# # df['Holiday'] = df['Date'].isin(holidays)

<pandas.tseries.holiday.USFederalHolidayCalendar object at 0x00000299C23FE820>

# ANALYZING FEATURES THAT ARE NOT PRESENT IN THE DATA

## Looking at delay features:

In [None]:
# Are all  delayed features independent? YES
flights_delayed = flights[['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay','late_aircraft_delay']]
flights_delayed_filled = flights_delayed[flights_delayed['carrier_delay'].isnull() == False]
# Different kinds of delays are not mutually exclusive
flights_delayed_filled[(flights_delayed_filled['carrier_delay'] != 0) & (flights_delayed_filled['late_aircraft_delay'] != 0)]

Unnamed: 0,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
27579,1.0,0.0,0.0,0.0,17.0
48280,1.0,0.0,3.0,0.0,183.0
29965,20.0,0.0,0.0,0.0,28.0
18507,15.0,0.0,0.0,0.0,17.0
40234,5.0,0.0,0.0,0.0,72.0
...,...,...,...,...,...
1771,5.0,0.0,2.0,0.0,12.0
14975,6.0,0.0,1.0,0.0,9.0
32397,10.0,0.0,7.0,0.0,38.0
49163,127.0,0.0,1.0,0.0,22.0


- Carrier Delay: due to circumstances within the airline's control.
- Late Aircraft Delay: due to the late arrival of the same aircraft at a previous airport

## Cancelled vs. Delays:

In [None]:
# Do cancelled flights have delayed metrics?
print("Cancelled flights are not delayed:")
flights[(flights['cancelled'] == 1) & (flights['late_aircraft_delay'].isnull() == False)].empty
flights[(flights['cancellation_code'].isnull() == False) & (flights['late_aircraft_delay'].isnull() == False)].empty

Cancelled flights are not delayed:


True

## Ground time vs. Delays:

In [None]:
# Ground times and delays:
# Some gorund times don't imply delays
flights[(flights['total_add_gtime'].isnull() == False)]
# Some delays didn't have any ground time
flights[(flights['total_delay'].isnull() == False) & (flights['total_add_gtime'].isnull() == True)]

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,total_delay
27211,2018-01-01,NK,NK,NK,967,NK,N639NK,967,11042,CLE,"Cleveland, OH",14635,RSW,"Fort Myers, FL",1520,1520.0,0.0,42.0,1602.0,1824.0,5.0,1805,1829.0,24.0,0,,0,N,165,189.0,142.0,1,1025,0.0,0.0,24.0,0.0,0.0,,,,24.0
39098,2018-01-01,WN,WN,WN,401,WN,N449WN,401,10529,BDL,"Hartford, CT",13204,MCO,"Orlando, FL",545,601.0,16.0,16.0,617.0,857.0,4.0,845,901.0,16.0,0,,0,N,180,180.0,160.0,1,1050,16.0,0.0,0.0,0.0,0.0,,,,16.0
27579,2018-01-01,AA,AA,AA,447,AA,N559UW,447,14107,PHX,"Phoenix, AZ",12892,LAX,"Los Angeles, CA",1100,1139.0,39.0,15.0,1154.0,1157.0,11.0,1150,1208.0,18.0,0,,0,N,110,89.0,63.0,1,370,1.0,0.0,0.0,0.0,17.0,,,,18.0
48280,2018-01-01,WN,WN,WN,5859,WN,N283WN,5859,13796,OAK,"Oakland, CA",14679,SAN,"San Diego, CA",1730,2034.0,184.0,12.0,2046.0,2157.0,5.0,1855,2202.0,187.0,0,,0,N,85,88.0,71.0,1,446,1.0,0.0,3.0,0.0,183.0,,,,187.0
38518,2018-01-01,AA,AA_CODESHARE,AA,4029,MQ,N648AE,4029,11298,DFW,"Dallas/Fort Worth, TX",10627,BIS,"Bismarck/Mandan, ND",1900,2004.0,64.0,13.0,2017.0,2238.0,6.0,2150,2244.0,54.0,0,,0,N,170,160.0,141.0,1,977,54.0,0.0,0.0,0.0,0.0,,,,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42702,2019-12-31,AS,AS,AS,266,AS,N282AK,266,14771,SFO,"San Francisco, CA",11618,EWR,"Newark, NJ",935,1118.0,103.0,20.0,1138.0,1941.0,7.0,1805,1948.0,103.0,0,,0,N,330,330.0,303.0,1,2565,103.0,0.0,0.0,0.0,0.0,,,,103.0
34547,2019-12-31,UA,UA_CODESHARE,UA,4518,G7,N543GJ,4518,11003,CID,"Cedar Rapids/Iowa City, IA",13930,ORD,"Chicago, IL",1515,1641.0,86.0,12.0,1653.0,1738.0,11.0,1635,1749.0,74.0,0,,0,N,80,68.0,45.0,1,196,0.0,0.0,0.0,0.0,74.0,,,,74.0
8097,2019-12-31,UA,UA,UA,1182,UA,N17719,1182,12441,JAC,"Jackson, WY",13930,ORD,"Chicago, IL",1307,1343.0,36.0,25.0,1408.0,1731.0,2.0,1707,1733.0,26.0,0,,0,N,180,170.0,143.0,1,1162,0.0,0.0,6.0,0.0,20.0,,,,26.0
42133,2019-12-31,UA,UA,UA,613,UA,N75851,613,12892,LAX,"Los Angeles, CA",13930,ORD,"Chicago, IL",1023,1200.0,97.0,20.0,1220.0,1801.0,21.0,1626,1822.0,116.0,0,,0,N,243,262.0,221.0,1,1744,0.0,0.0,19.0,0.0,97.0,,,,116.0


## Columns to drop:
- cancelled: cancelled flights are not delayed
- cancellation_code: ""

## Columns with 0s in Nan:
- total_add_gtime:

## Observations to filter:
- Filter out cancelled flights

## Columns to create:
- Total delay

In [None]:
nan_with_zeroes = ['total_add_gtime']