In [69]:
# Importing all necessary libraries
# munging imports
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# modeling imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# Logistic regression visualization utility function
def generate_logreg_vis(beta=-.05):
    
    # draw feature values and use beta to calculate predicted probability
    # of positive class for each feature value
    x = np.random.uniform(low=-1, high=1, size=150) * 100
    p = 1 / (1 + np.exp(-(beta*x))) # logistic regression function
    
    # plot predicted probs against the feature 
    plt.scatter(x, p)
    plt.title('Probability of Default vs. Credit Rating Score')
    plt.ylabel('Predicted Probability of Default')
    plt.xlabel('Credit Rating Score')
    
    # 50% probability threshold
    plt.gca().axhline(.5,c='k',ls='--',lw=1)


# Flight Delay Classification Model Building

This notebook outlines the code pull in the data that will later be used to create a classification model for flights being delayed.

### Data

The main dataset was pulled from [kaggle](https://www.kaggle.com/yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018/code "2008-2019 Domestic Flight Data"). 

Additional data will need to be added, including:
* During holiday season?
* Weather conditions
* Distance to destination

#### Pulling in kaggle data

In [70]:
#read them in 
# kag1=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/Flight Kaggle Data/2014.csv')
# kag2=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/Flight Kaggle Data/2015.csv')
kag3=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/Flight Kaggle Data/2016.csv')
kag4=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/Flight Kaggle Data/2017.csv')
kag5=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/Flight Kaggle Data/2018.csv')

##### I want to test if flight delays have gone down over the years with improved technology. It might not be reasonable to use data from varying years if flight delay times are going down by the year. The following shows the average arrival delay time by year, clearly there has not been an explicit decline, so we will use all the data. 

In [71]:
#
print ('Averages:')
# print(f'2014 Avg Delay:   {round(kag1.ARR_DELAY.mean(),1)} minutes')
# print(f'2015 Avg Delay:   {round(kag2.ARR_DELAY.mean(),1)} minutes')
print(f'2016 Avg Delay:   {round(kag3.ARR_DELAY.mean(),1)} minutes')
print(f'2017 Avg Delay:   {round(kag4.ARR_DELAY.mean(),1)} minutes')
print(f'2018 Avg Delay:   {round(kag5.ARR_DELAY.mean(),1)} minutes')

Averages:
2016 Avg Delay:   3.5 minutes
2017 Avg Delay:   4.3 minutes
2018 Avg Delay:   5.0 minutes


In [72]:
#concat all into one dataframe
frames= [#kag1,kag2,
         kag3,kag4,kag5]
df= pd.concat(frames)

Cleaning the dataset by renaming columns, & clarifying other metrics like airline.

In [73]:
df.rename(columns={
    'FL_DATE':'DATE',
    'OP_CARRIER':'AIRLINE',
    'OP_CARRIER_FL_NUM':'FLIGHT_NUM',
    'ORIGIN':'ORIGIN',
    'DEST':'DEST',
    'CRS_DEP_TIME':'CRS_DEP_TIME',
    'DEP_TIME':'DEP_TIME',
    'DEP_DELAY':'DEP_DELAY',
    'TAXI_OUT':'TAXI_OUT',
    'WHEELS_OFF':'WHEELS_OFF',
    'TAXI_IN':'TAXI_IN',
    'CRS_ARR_TIME':'CRS_ARR_TIME',
    'ARR_TIME':'ARR_TIME',
    'ARR_DELAY':'ARR_DELAY',
    'CANCELLED':'CANCELLED',
    'CANCELLATION_CODE':'CANCELLATION_CODE',
    'DIVERTED':'DIVERTED',
    'CRS_ELAPSED_TIME':'CRS_ELAPSED_TIME',
    'ACTUAL_ELAPSED_TIME':'ELAPSED_TIME',
    'AIR_TIME':'AIR_TIME',
    'DISTANCE':'DISTANCE',
    'CARRIER_DELAY':'CARRIER_DELAY',
    'WEATHER_DELAY':'WEATHER_DELAY',
    'NAS_DELAY':'NAS_DELAY',
    'SECURITY_DELAY':'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY':'LATE_AIRCRAFT_DELAY'
}, inplace=True)

In [74]:
df['AIRLINE'].replace({
    'UA':'United Airlines',
    'AS':'Alaska Airlines',
    '9E':'Endeavor Air',
    'B6':'JetBlue Airways',
    'EV':'ExpressJet',
    'F9':'Frontier Airlines',
    'G4':'Allegiant Air',
    'HA':'Hawaiian Airlines',
    'MQ':'Envoy Air',
    'NK':'Spirit Airlines',
    'OH':'PSA Airlines',
    'OO':'SkyWest Airlines',
    'VX':'Virgin America',
    'WN':'Southwest Airlines',
    'YV':'Mesa Airline',
    'YX':'Republic Airways',
    'AA':'American Airlines',
    'DL':'Delta Airlines'
},inplace=True)

Drop uneeded columns.

In [76]:
df.drop(columns=['TAXI_OUT','WHEELS_OFF','WHEELS_ON','TAXI_IN',
                 'ARR_TIME','CANCELLATION_CODE','DIVERTED',
                 'ELAPSED_TIME','AIR_TIME','CARRIER_DELAY',
                 'WEATHER_DELAY','NAS_DELAY','SECURITY_DELAY',
                 'LATE_AIRCRAFT_DELAY','Unnamed: 27'
                ],inplace=True)

Add a datetime column.

In [77]:
pd.to_datetime(df.DATE)

0         2016-01-01
1         2016-01-01
2         2016-01-01
3         2016-01-01
4         2016-01-01
             ...    
7213441   2018-12-31
7213442   2018-12-31
7213443   2018-12-31
7213444   2018-12-31
7213445   2018-12-31
Name: DATE, Length: 18505725, dtype: datetime64[ns]

Add seperate month,year, day of week columns

In [78]:
#adding month and day of week columns
df['MONTH'] = pd.to_datetime(df['DATE']).dt.month
df['DAYOFWEEK'] = pd.to_datetime(df['DATE']).dt.weekday

Pull in and Add Airport Data

In [79]:
#make dataframe from airport info csv
airport_info=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/airport_info.csv')

In [80]:
airport_info.head()

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,country_name,iso_country,...,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords,score,last_updated
0,3632,KLAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125.0,,United States,US,...,Los Angeles,1,KLAX,LAX,LAX,https://www.flylax.com/,https://en.wikipedia.org/wiki/Los_Angeles_Inte...,,1335475,2020-04-26T22:37:22+00:00
1,3754,KORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,672.0,,United States,US,...,Chicago,1,KORD,ORD,ORD,https://www.flychicago.com/ohare/home/pages/de...,https://en.wikipedia.org/wiki/O'Hare_Internati...,"CHI, Orchard Place",1503175,2018-09-16T02:35:35+00:00
2,3622,KJFK,large_airport,John F Kennedy International Airport,40.639801,-73.7789,13.0,,United States,US,...,New York,1,KJFK,JFK,JFK,https://www.jfkairport.com/,https://en.wikipedia.org/wiki/John_F._Kennedy_...,"Manhattan, New York City, NYC, Idlewild",1052075,2021-04-10T17:02:47+00:00
3,3384,KATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026.0,,United States,US,...,Atlanta,1,KATL,ATL,ATL,http://www.atlanta-airport.com/,https://en.wikipedia.org/wiki/Hartsfield–Jacks...,,2002475,2018-09-19T14:50:01+00:00
4,3878,KSFO,large_airport,San Francisco International Airport,37.618999,-122.375,13.0,,United States,US,...,San Francisco,1,KSFO,SFO,SFO,http://www.flysfo.com/,https://en.wikipedia.org/wiki/San_Francisco_In...,"QSF, QBA",1112475,2008-06-13T14:30:04+00:00


In [81]:
#clean airport dataset
airport_info.drop(['id','name','continent','country_name','elevation_ft','iso_country','scheduled_service','gps_code','home_link','wikipedia_link','keywords','score','last_updated','iso_region','local_region','region_name','elevation_ft'],inplace=True, axis=1)

In [82]:
#changing names in type column
airport_info['type'].replace({
    'large_airport':'large',
    'medium_airport':'medium',
    'small_airport':'small',
    'closed':'closed',
    'seaplane_base':'seaplane_base',
    'heliport':'heliport',
    'balloonport':'balloonport'
},inplace=True)

In [83]:
#rename iata_code as origin & local_code as dest.
airport_info.rename({'iata_code':'ORIGIN','local_code':'DEST'},inplace=True,axis=1)

In [84]:
df=df.merge(airport_info.drop(columns=['DEST']),how='left',on='ORIGIN')

In [85]:
df=df.merge(airport_info.drop(columns=['ORIGIN']),how='left',on='DEST')

In [86]:
df.rename({'latitude_deg_x':'origin_lat',
           'longitude_deg_x':'origin_lon',
           'municipality_x':'municipality1',
    'latitude_deg_y':'dest_lat',
           'longitude_deg_y':'dest_lon',
           'municipality_y':'municipality2',
           'type_x':'origin_type',
           'type_y':'dest_type'
            },axis=1,inplace=True)

In [87]:
holiday_dates=['2014-01-01',#New Years Day,
               '2014-12-31' ,#New Years Eve,
               '2014-01-20',#MLK Day(changes),
               '2014-02-17',#President's day(changes),
               '2014-04-20',#Easter Sunday (changes),
               '2014-04-19',#day before easter sunday(changes),
               '2014-04-21',#day after easter sunday(changes),
               '2014-05-26',#memorial day(changes),
               '2014-07-04',#4th of july,
               '2014-08-01',#labor day(changes),
               '2014-11-11',#Vets day,
               '2014-11-27',#thanksgiving day(changes),
               '2014-11-26',#day after thanksgiving(changes),
               '2014-11-24',#day before thanksgiving(changes),
               '2014-12-25',#christmas day,
               '2014-12-24',#christmas eve,
               #2015
               '2015-01-01',#New Years Day,
               '2015-12-31', #New Years Eve,
               '2015-01-19',#MLK Day(changes),
               '2015-02-16',#President's day(changes),
               '2015-04-05',#Easter Sunday (changes),
               '2015-04-04',#day before easter sunday(changes),
               '2015-04-06',#day after easter sunday(changes),
               '2015-05-25',#memorial day(changes),
               '2015-07-04',#4th of july,
               '2015-08-07',#labor day(changes),
               '2015-11-11',#Vets day,
               '2015-11-26',#thanksgiving day(changes),
               '2015-11-27',#day after thanksgiving(changes),
               '2015-11-25',#day before thanksgiving(changes),
               '2015-12-25',#christmas day,
               '2015-12-24',#christmas eve,
               #2016
               '2016-01-01',#New Years Day,
               '2016-12-31', #New Years Eve,
               '2016-01-18',#MLK Day(changes),
               '2016-03-27',#President's day(changes),
               '2016-04-20',#Easter Sunday (changes),
               '2016-04-19',#day before easter sunday(changes),
               '2016-04-21',#day after easter sunday(changes),
               '2016-05-30',#memorial day(changes),
               '2016-07-04',#4th of july,
               '2016-08-05',#labor day(changes),
               '2016-11-11',#Vets day,
               '2016-11-24',#thanksgiving day(changes),
               '2016-11-25',#day after thanksgiving(changes),
               '2016-11-23',#day before thanksgiving(changes),
               '2016-12-25',#christmas day,
               '2016-12-24',#christmas eve,
               #2017
               '2017-01-01',#New Years Day,
               '2017-12-31', #New Years Eve,
               '2017-01-16',#MLK Day(changes),
               '2017-02-20',#President's day(changes),
               '2017-04-16',#Easter Sunday (changes),
               '2017-04-15',#day before easter sunday(changes),
               '2017-04-17',#day after easter sunday(changes),
               '2017-05-29',#memorial day(changes),
               '2017-07-04',#4th of july,
               '2017-08-04',#labor day(changes),
               '2017-11-11',#Vets day,
               '2017-11-23',#thanksgiving day(changes),
               '2017-11-24',#day after thanksgiving(changes),
               '2017-11-22',#day before thanksgiving(changes),
               '2017-12-25',#christmas day,
               '2017-12-24',#christmas eve,
               #2018
               '2018-01-01',#New Years Day,
               '2018-12-31', #New Years Eve,
               '2018-01-15',#MLK Day(changes),
               '2018-02-19',#President's day(changes),
               '2018-04-01',#Easter Sunday (changes),
               '2018-03-31',#day before easter sunday(changes),
               '2018-04-02',#day after easter sunday(changes),
               '2018-05-28',#memorial day(changes),
               '2018-07-04',#4th of july,
               '2018-08-03',#labor day(changes),
               '2018-11-11',#Vets day,
               '2018-11-22',#thanksgiving day(changes),
               '2018-11-23',#day after thanksgiving(changes),
               '2018-11-21',#day before thanksgiving(changes),
               '2018-12-25',#christmas day,
               '2018-12-24']#christmas eve

Add Column for "during holiday season or not" ['holiday_szn']

In [88]:
#create the new column using the list above of dates included in the holiday season!
df['holiday_szn'] = df['DATE'].isin(holiday_dates)

In [89]:
# df.info()
proper_len=[4]

Drop NAs and infinite values

In [90]:
#making a column to say if sched dept time listed is in correct formatted time
df['sch_len']= df['CRS_DEP_TIME'].astype(str).str.len()
df['proper']= df['sch_len'].isin(proper_len)

In [91]:
#dropping len column
df.drop(columns=['sch_len'],inplace=True)

In [92]:
#dropping all values with False in our proper column 
    #proper is false
index_names = df[ df['proper'] == False ].index
df.drop(index_names, inplace = True)

Now for proper scheduled arrival times

In [93]:
#making a column to say if sched arr time listed is in correct formatted time
df['sch_len']= df['CRS_ARR_TIME'].astype(str).str.len()
df['proper']= df['sch_len'].isin(proper_len)

In [94]:
#dropping len column
df.drop(columns=['sch_len'],inplace=True)

In [95]:
#dropping all values with False in our proper column 
    #proper is false
index_names = df[ df['proper'] == False ].index
df.drop(index_names, inplace = True)

In [96]:
df.drop(columns=['proper'],inplace=True)

Now we need to take the 2 time columns and make hours columns: one for hour of scheduled departure, and two for hour of scheduled arrival. Make hour columns for scheduled departing hour and scheduled arrival hour.

In [97]:
df['DEP_HOUR'] = df['CRS_DEP_TIME'].astype(str).apply(lambda s:s[:2])
df['ARR_HOUR'] = df['CRS_ARR_TIME'].astype(str).apply(lambda s:s[:2])

Drop the extra ident column and rename the first one to AirportCode so we can later use it to merge in weather data.

In [99]:
df.drop(columns=['ident_x'],inplace=True)
df.rename({'ident_y':'AirportCode'},axis=1,inplace=True)

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_DELAY,...,origin_lon,municipality1,ident_y,dest_type,dest_lat,dest_lon,municipality2,holiday_szn,DEP_HOUR,ARR_HOUR
0,2016-01-01,Delta Airlines,1248,DTW,LAX,1935,1935.0,0.0,2144,-24.0,...,-83.353401,Detroit,KLAX,large,33.942501,-118.407997,Los Angeles,True,19,21
1,2016-01-01,Delta Airlines,1251,ATL,GRR,2125,2130.0,5.0,2321,-2.0,...,-84.428101,Atlanta,KGRR,medium,42.880798,-85.522797,Grand Rapids,True,21,23
3,2016-01-01,Delta Airlines,1255,SLC,ATL,1656,1700.0,4.0,2229,-16.0,...,-111.979746,Salt Lake City,KATL,large,33.636700,-84.428101,Atlanta,True,16,22
5,2016-01-01,Delta Airlines,1257,ATL,BNA,1233,1356.0,83.0,1239,83.0,...,-84.428101,Atlanta,KBNA,large,36.124500,-86.678200,Nashville,True,12,12
6,2016-01-01,Delta Airlines,1257,BNA,ATL,1320,1446.0,86.0,1530,74.0,...,-86.678200,Nashville,KATL,large,33.636700,-84.428101,Atlanta,True,13,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18505720,2018-12-31,American Airlines,1815,DCA,CLT,1534,1530.0,-4.0,1714,-5.0,...,-77.037697,Washington,KCLT,large,35.214001,-80.943100,Charlotte,True,15,17
18505721,2018-12-31,American Airlines,1816,CLT,DFW,1751,1757.0,6.0,1952,1.0,...,-80.943100,Charlotte,KDFW,large,32.896801,-97.038002,Dallas-Fort Worth,True,17,19
18505722,2018-12-31,American Airlines,1817,CLT,MEM,2015,2010.0,-5.0,2107,11.0,...,-80.943100,Charlotte,KMEM,large,35.042400,-89.976700,Memphis,True,20,21
18505723,2018-12-31,American Airlines,1818,CLT,RDU,1300,1323.0,23.0,1350,14.0,...,-80.943100,Charlotte,KRDU,large,35.877602,-78.787498,Raleigh/Durham,True,13,13


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12477222 entries, 0 to 18505724
Data columns (total 27 columns):
 #   Column            Dtype  
---  ------            -----  
 0   DATE              object 
 1   AIRLINE           object 
 2   FLIGHT_NUM        int64  
 3   ORIGIN            object 
 4   DEST              object 
 5   CRS_DEP_TIME      int64  
 6   DEP_TIME          float64
 7   DEP_DELAY         float64
 8   CRS_ARR_TIME      int64  
 9   ARR_DELAY         float64
 10  CANCELLED         float64
 11  CRS_ELAPSED_TIME  float64
 12  DISTANCE          float64
 13  MONTH             int64  
 14  DAYOFWEEK         int64  
 15  origin_type       object 
 16  origin_lat        float64
 17  origin_lon        float64
 18  municipality1     object 
 19  AirportCode       object 
 20  dest_type         object 
 21  dest_lat          float64
 22  dest_lon          float64
 23  municipality2     object 
 24  holiday_szn       bool   
 25  DEP_HOUR          object 
 26  ARR_HOUR    

We have 12,477,222 entries to work with now. 

Save this as a csv and pull it up in a new notebook (Initial Data Pulling 2). From that second notebook, we will pull this created csv file and use it to try and pull in weather data and flight price data if possible.

In [105]:
df.to_csv('/Users/mehikapatel/Flights_Project/FlightsDataAfterNB1')