In [1]:
import pandas as pd
import numpy as np

In [2]:
!ls newdata

2007.csv        carriers.csv    planedata.csv
airports.csv    df.csv          spark-warehouse


In [None]:
flights = pd.read_csv('newdata/2007.csv')

In [None]:
airports = pd.read_csv('newdata/airports.csv')

In [None]:
carriers = pd.read_csv('newdata/carriers.csv')

In [None]:
planes = pd.read_csv('newdata/planedata.csv')

In [None]:
# Sanity check: display the flights that are flagged as cancelled
# (Cancelled == 1) yet still carry a positive arrival delay.
flights[(flights['Cancelled']==1)&(flights['ArrDelay']>0)]

In [None]:
flights.head()

In [None]:
airports.head()

In [None]:
flights['Origin'].head()

In [None]:
airports[airports['iata'] == 'ORD']

In [None]:
#match airports and flights on IATA

In [None]:
flights['TailNum'].head()

In [None]:
planes.columns

In [None]:
planes['tailnum'].head()

In [None]:
#match flights and planes on tailnum

In [None]:
flights.columns

In [None]:
flights[['ArrDelay', 'DepDelay', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].head(10)

In [None]:
#Target variable: arrival delay

In [None]:
# Attach airport attributes to each flight by matching the flight's 'Origin'
# code against the 'iata' column of the airports table.
# NOTE(review): how='inner' discards every flight whose Origin has no matching
# row in airports -- confirm this is acceptable for the counts computed later.
fl1 = flights.merge(airports, how='inner', left_on='Origin', right_on='iata')

In [None]:
# Give the airport attributes pulled in by the origin-side merge an 'Origin'
# prefix, so they cannot be confused with the destination-side attributes.
origin_column_names = {
    'airport': 'OriginAirport',
    'city': 'OriginCity',
    'state': 'OriginState',
    'country': 'OriginCountry',
    'lat': 'OriginLat',
    'long': 'OriginLong',
}
fl1 = fl1.rename(columns=origin_column_names)

In [None]:
# Remove the join key copied in from airports; after the inner merge it holds
# the same code as 'Origin'.
# Rebinding with errors='ignore' (rather than inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
fl1 = fl1.drop(columns=['iata'], errors='ignore')

In [None]:
fl1.columns

In [None]:
fl2 = fl1.merge(airports, how='inner', left_on='Dest', right_on='iata')

In [None]:
# Give the airport attributes pulled in by the destination-side merge a 'Dest'
# prefix, mirroring the 'Origin' prefix used for the origin-side attributes.
dest_column_names = {
    'airport': 'DestAirport',
    'city': 'DestCity',
    'state': 'DestState',
    'country': 'DestCountry',
    'lat': 'DestLat',
    'long': 'DestLong',
}
fl2 = fl2.rename(columns=dest_column_names)

In [None]:
fl2.columns

In [None]:
# Remove the join key copied in from airports; after the inner merge it holds
# the same code as 'Dest'.
# Rebinding with errors='ignore' (rather than inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
fl2 = fl2.drop(columns=['iata'], errors='ignore')

In [None]:
# Attach aircraft attributes by matching the flight's 'TailNum' against the
# planes table's 'tailnum'. Because the key columns have different names, the
# merged frame keeps both of them.
# NOTE(review): how='inner' discards flights whose tail number has no row in
# planes, which lowers every flight count derived from df -- confirm intended.
df = fl2.merge(planes, how='inner', left_on='TailNum', right_on='tailnum')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Rename the aircraft columns (and 'UniqueCarrier') to the notebook's
# CamelCase naming scheme.
# NOTE(review): df already has a 'TailNum' column from the flights table, so
# mapping 'tailnum' -> 'TailNum' leaves two columns with the same label;
# selecting df['TailNum'] afterwards returns both. Later cells appear to
# compensate for this (a fourth column named 'Drop'), so change with care.
# NOTE(review): 'PlaneIssueData' looks like a typo for 'PlaneIssueDate' --
# confirm before renaming, since the label is written to the exported CSV.
df.rename(columns={'status':'PlaneStatus', 'tailnum':'TailNum', 'type':'PlaneOwnership', 
                  'engine_type':'Engine', 'model':'PlaneModel', 'issue_date':'PlaneIssueData',
                  'manufacturer':'Manufacturer', 'year':'PlaneYear', 'aircraft_type':'AircraftType',
                   'UniqueCarrier':'Carrier'
                  }, inplace=True)

In [None]:
df.columns

In [None]:
df.PlaneOwnership.unique()

In [None]:
df['OriginCityIata'] = df['OriginCity'] +'-'+df['Origin']

In [None]:
#Carriers

In [None]:
carriers.head()

In [None]:
len(carriers.Description.unique())

In [None]:
# Translate two-letter carrier codes into display names. Codes that are not
# listed (and missing values) fall back to 'Other'.
# A lookup table plus a vectorised Series.map replaces the long elif chain
# that was evaluated once per row in a Python-level loop.
CARRIER_NAMES = {
    "AS": "Alaska",
    "AA": "American",
    "US": "USAirways",
    "DL": "Delta",
    "NK": "Spirit",
    "UA": "United",
    "HA": "Hawaiian",
    "B6": "JetBlue",
    "OO": "SkyWest",
    "EV": "ExpressJet",
    "MQ": "AmericanEagle",
    "F9": "Frontier",
    "WN": "SouthWest",
    "VX": "VirginAmerica",
}

# Kept as a plain list named `lst`, one entry per row of df in row order,
# because the following cell assigns it to df['CarrierName'].
lst = df['Carrier'].map(CARRIER_NAMES).fillna('Other').tolist()

In [None]:
df['CarrierName'] = lst

In [None]:
#Carriers file seems to be adding little value so skipping it for now

In [None]:
#df.filter(regex='Orig*', axis=1)

In [None]:
df.dtypes

In [None]:
#Creating some features that will certainly be useful:

In [None]:
# Build a 'Year-Month-DayofMonth' label for each flight by concatenating the
# three columns as strings. The result is a string column without zero
# padding, not a datetime.
# NOTE(review): use pd.to_datetime if real date arithmetic is needed later.
df['FlightTimeStamp'] = df['Year'].astype(str) + '-' + df['Month'].astype(str) + '-' + df['DayofMonth'].astype(str)

In [None]:
#Converting departure and arrival time:

In [None]:
df['DepTime'].head()

In [None]:
#Looks as if this is really hour and minute time
print(df['DepTime'].max())
print(df['DepTime'].min())

In [None]:
print(df['ArrTime'].max())
print(df['ArrTime'].min())

In [None]:
#df['DepTime'] = df['ArrTime'].fillna(9999)

In [None]:
df['DepTime'].head()

In [None]:
df.shape

In [None]:
#Removing Null Target Variable Values:

In [None]:
# Number of rows with a missing arrival time. Series.sum() counts the True
# values in vectorised code; the builtin sum() walked the column one element
# at a time in Python.
df['ArrTime'].isnull().sum()

In [None]:
# Number of rows with a missing departure time. Series.sum() counts the True
# values in vectorised code; the builtin sum() walked the column one element
# at a time in Python.
df['DepTime'].isnull().sum()

In [None]:
# Keep only the rows that have an arrival time recorded.
# NOTE(review): the notebook names ArrDelay as the target variable, but the
# filter is on ArrTime -- confirm that no rows with a missing ArrDelay survive,
# or filter on 'ArrDelay' instead.
df = df.dropna(subset=['ArrTime'], how='any')

Data Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

Which Airport had the most departures in 2007?

In [None]:
np.arange(10)[::-1]

In [None]:
from matplotlib import cm

# Bar chart of the ten most frequent origin airports ("City-IATA" labels).
# `kind` is passed by keyword: current pandas releases raise TypeError when
# Series.plot() receives positional arguments.
top10_origins = df['OriginCityIata'].value_counts().head(10)
ax = top10_origins.plot(kind='bar', title='Top 10 Airports by Flight Counts in 2007',
                        figsize=(15,8), color=cm.OrRd_r(np.linspace(.01,3,50)), rot=45)
ax.set(xlabel='Origin', ylabel='Flights Count')
plt.show()

In [None]:
dist = df['OriginCityIata'].value_counts().head(30)

In [None]:
# Flight counts per (origin airport, carrier), restricted to the 30 busiest
# origins held in `dist` and to nine named carriers. The count is the number
# of non-null 'TailNum' values in each group.
# NOTE(review): df appears to hold two columns labelled 'TailNum' (one from
# flights, one renamed from planes), in which case this selection returns
# both and the result has two count columns -- verify with df.columns.
df_temp = df[(df['OriginCityIata'].isin(dist.index))&(df['CarrierName'].isin(['SouthWest', 'United', 'JetBlue', 
    'SkyWest', 'USAirways', 'Alaska', 'Delta', 'American', 'AmericanEagle']))]\
             [['OriginCityIata','CarrierName','TailNum']].groupby(['OriginCityIata','CarrierName']).count().reset_index()

In [None]:
# Relabel the four columns positionally; the last one is named 'Drop' so it
# can be removed in the next cell.
# NOTE(review): this assumes df_temp has exactly four columns, presumably the
# two grouping keys plus two 'TailNum' counts caused by the duplicate
# 'TailNum' label in df. The assignment raises ValueError if the column count
# ever differs -- confirm before de-duplicating df.
df_temp.columns = ['OriginCityIata','CarrierName','TailNum','Drop']

In [None]:
df_temp = df_temp.drop('Drop',axis=1)

In [None]:
df_temp = df_temp.sort_values('TailNum', ascending=False)

In [None]:
df_pivot = df_temp.pivot(index='OriginCityIata', columns='CarrierName', values='TailNum')

In [None]:
df_pivot.plot(kind='bar', stacked=True, figsize=(15,8), title='Flights Count by Airline', rot=45)\
                    .set(xlabel='Origin City', ylabel='Flights Count')
plt.show()

In [None]:
# Count departures per origin airport, keyed by the bare IATA code so that the
# frame can be joined against airports['iata'] in the following cell.
# Counting the 'OriginCityIata' labels ("City-CODE") instead would leave every
# airport attribute (lat, long, city, state) NaN after that join, because no
# such label equals a plain code. As a side effect, airports with a missing
# city name are now counted too.
# rename_axis/reset_index(name=...) fixes the column labels explicitly rather
# than relying on the default names produced by reset_index().
top_airports = df['Origin'].value_counts().rename_axis('IATA').reset_index(name='FlightsCount')

In [None]:
# Left-join the airport metadata onto the per-airport counts, then keep the
# first 100 rows (value_counts() delivers the counts in descending order).
airport_details = top_airports.merge(airports, how='left', left_on='IATA', right_on='iata')
top_airports = airport_details.head(100)

In [None]:
import plotly.graph_objects as go

In [None]:
top_airports.columns

In [None]:
top_airports['text'] = top_airports['iata'] + ' - ' + top_airports['city'] + ', ' + top_airports['state'] \
                                + ' - ' + 'Departures: ' + top_airports['FlightsCount'].astype(str)

In [None]:
# Scatter map with one marker per airport in top_airports: positioned by
# long/lat, coloured by departure count on the 'Reds' scale, with the
# pre-built 'text' column attached to each marker.
fig = go.Figure(data=go.Scattergeo(
        locationmode = 'USA-states',
        lon = top_airports['long'],
        lat = top_airports['lat'],
        text = top_airports['text'],
        mode = 'markers',
        #hoverinfo="text",
        marker = dict(
            size = 8,
            opacity = 0.8,
            reversescale = False,
            autocolorscale = False,
            symbol = 'circle',
            line = dict(
                width=1,
                # Was 'rgba(102, 102, 102)': an rgba() colour needs a fourth
                # (alpha) component, so the three-value form is malformed.
                # rgb() expresses the same opaque grey correctly.
                color='rgb(102, 102, 102)'
            ),
            colorscale = 'Reds',
            cmin = 0,
            color = top_airports['FlightsCount'],
            cmax = top_airports['FlightsCount'].max(),
            colorbar_title="Departing flights<br> 2007"
        )))

In [None]:
# Set the figure title and the geo styling (USA scope, Albers USA projection,
# light land colour, thin grey state and country borders), then render it.
fig.update_layout(title = 'Departures from US Top 100 Airports in 2007<br>(Hover over for airport names)',
        geo = dict(
            scope='usa',
            projection_type='albers usa',
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5),)
fig.show()

In [None]:
#Bonus: reasons for flight delays - which column had a delay

In [None]:
df.columns

In [None]:
df_temp = df[(df['OriginCityIata'].isin(dist.index))&(df['Carrier'].isin(['AA','US','UA','DL','NW']))]\
             [['OriginCityIata','CarrierDelay','WeatherDelay', 'NASDelay', 'SecurityDelay', 
               'LateAircraftDelay']].groupby(['OriginCityIata']).sum().reset_index()

In [None]:
df_temp.plot(kind='bar', figsize=(16,8)).set_xticklabels(df_temp['OriginCityIata'].unique(), rotation=45)
plt.show()

In [None]:
# Total minutes per delay reason for each carrier name, restricted to the 30
# busiest origins in `dist` and to carrier codes AA, US, UA, DL and NW.
# NOTE(review): 'NW' has no entry in the carrier-name mapping used to build
# 'CarrierName', so its rows are grouped under 'Other' in this result --
# confirm whether a dedicated name for NW is wanted in the chart.
df_temp = df[(df['OriginCityIata'].isin(dist.index))&(df['Carrier'].isin(['AA','US','UA','DL','NW']))]\
             [['CarrierName','CarrierDelay','WeatherDelay', 'NASDelay', 'SecurityDelay', 
               'LateAircraftDelay']].groupby(['CarrierName']).sum().reset_index()

In [None]:
df_temp.plot(kind='bar', figsize=(16,8), title='Delay Reasons for Major US Airlines in 2007', rot=45)\
            .set(xlabel='Major US Airline', ylabel='Total Minutes Delayed',
                 xticklabels=df_temp['CarrierName'].unique())

plt.show()

Total Number of Flights per Carrier

In [None]:
df.CarrierName.unique()

In [None]:
# Bar chart of the number of flights per carrier name.
# `kind` is passed by keyword: current pandas releases raise TypeError when
# Series.plot() receives positional arguments.
# `ax` is bound to the Axes returned by plot(); chaining .set() onto the
# assignment bound it to the list that .set() returns, so Axes attributes
# such as ax.title were unavailable.
carrier_counts = df['CarrierName'].value_counts()
ax = carrier_counts.plot(kind='bar', title='Total Number of Flights in 2007', figsize=(16,8),
                         color=cm.OrRd_r(np.linspace(.01,3,50)), rot=45)
ax.set(xlabel='Major US Carrier', ylabel='Flights Count')

#ax.title.set_size(10)

plt.show()

Determine which carrier had the worst on-time performance overall

In [None]:
#on time defined as arr delay <=0

In [None]:
#Carrier performance can be defined in different ways: average performance? distribution of late arrivals?
#distribution of all arrivals? Will build all of them and compare.

In [None]:
#Building DF with stats used for analysis

In [None]:
#per carrier total number of flights late / total number of flights

In [None]:
# Number of late arrivals (ArrDelay > 0) per carrier, largest first.
late_arrivals = df.loc[df['ArrDelay'] > 0, ['CarrierName', 'ArrDelay']]
late_counts = late_arrivals.groupby('CarrierName').count()
late_counts = late_counts.sort_values('ArrDelay', ascending=False).reset_index()
flights_delayed_bycarrier = late_counts.rename(columns={'ArrDelay': 'DelayedCount'})

In [None]:
# Number of flights with a recorded ArrDelay per carrier, largest first.
arrivals = df.loc[:, ['CarrierName', 'ArrDelay']]
arrival_counts = arrivals.groupby('CarrierName').count()
arrival_counts = arrival_counts.sort_values('ArrDelay', ascending=False).reset_index()
flights_count = arrival_counts.rename(columns={'ArrDelay': 'FlightsCount'})

In [None]:
delays = flights_delayed_bycarrier.merge(flights_count, how='inner', on='CarrierName')

In [None]:
delays['PercentageFlightsDelayed'] = delays['DelayedCount']/delays['FlightsCount']

In [None]:
minutes_delayed_bycarrier = df[df['ArrDelay']>0][['CarrierName','ArrDelay']].groupby('CarrierName')\
        .sum().sort_values('ArrDelay', ascending=False).reset_index().rename(columns={'ArrDelay':'MinutesDelayed'})

In [None]:
minutes_flown_bycarrier = df[df['ActualElapsedTime']>0][['CarrierName','ActualElapsedTime']].groupby('CarrierName')\
        .sum().sort_values('ActualElapsedTime', ascending=False).reset_index().rename(columns={'ActualElapsedTime':'MinutesFlown'})

In [None]:
# Add the per-carrier delayed minutes and flown minutes to the summary table.
# Any 'MinutesDelayed' / 'MinutesFlown' columns left over from a previous run
# of this cell are dropped first; otherwise a re-run would produce '_x'/'_y'
# suffixed duplicates and the ratio computed in the next cell would fail.
delays = delays.drop(columns=['MinutesDelayed', 'MinutesFlown'], errors='ignore')\
                    .merge(minutes_delayed_bycarrier, how='inner', on='CarrierName')\
                    .merge(minutes_flown_bycarrier, how='inner', on='CarrierName')

In [None]:
delays['PercentageMinutesDelayed'] = delays['MinutesDelayed']/delays['MinutesFlown']

In [None]:
# Share of flights arriving late per carrier, worst first.
# The carrier name is made the index before plotting so that every bar is
# labelled with its own carrier. Taking the tick labels from the unsorted
# `delays` frame attached the names in a different order than the sorted
# bars, mislabelling the carriers.
flights_delayed_share = delays.sort_values('PercentageFlightsDelayed', ascending=False)\
        .set_index('CarrierName')['PercentageFlightsDelayed']
flights_delayed_share.plot(kind='bar', title='Percentage Flights Delayed by US Carrier in 2007', figsize=(16,8),
                           color=cm.OrRd_r(np.linspace(.01,3,50)), rot=45)\
        .set(xlabel='US Carrier', ylabel='Flights Delayed %')
plt.show()

In [None]:
# Ratio of delayed minutes to flown minutes per carrier, worst first.
# The carrier name is made the index before plotting so that every bar is
# labelled with its own carrier. Taking the tick labels from the unsorted
# `delays` frame attached the names in a different order than the sorted
# bars, mislabelling the carriers.
minutes_delayed_share = delays.sort_values('PercentageMinutesDelayed', ascending=False)\
        .set_index('CarrierName')['PercentageMinutesDelayed']
minutes_delayed_share.plot(kind='bar', title='Minutes Delayed to Flown Percentage in 2007', figsize=(16,8),
                           color=cm.OrRd_r(np.linspace(.01,3,50)), rot=45)\
        .set(xlabel='US Carrier', ylabel='Percentage Minutes Delayed')
plt.show()

In [None]:
#Minutes late by Airport

In [None]:
dist = df['OriginCityIata'].value_counts().head(30)

In [None]:
# Sum of ArrDelay per (origin airport, carrier), restricted to the 30 busiest
# origins held in `dist` and to nine named carriers.
# NOTE(review): unlike the per-carrier 'MinutesDelayed' figure, no
# ArrDelay > 0 filter is applied here, so any zero or negative ArrDelay
# values are included in the sums -- confirm that is intended.
df_temp = df[(df['OriginCityIata'].isin(dist.index))&(df['CarrierName'].isin(['SouthWest', 'United', 'JetBlue', 
    'SkyWest', 'USAirways', 'Alaska', 'Delta', 'American', 'AmericanEagle']))]\
             [['OriginCityIata','CarrierName','ArrDelay']].groupby(['OriginCityIata','CarrierName']).sum().reset_index()

In [None]:
df_temp = df_temp.sort_values('ArrDelay', ascending=False)

In [None]:
df_pivot = df_temp.pivot(index='OriginCityIata', columns='CarrierName', values='ArrDelay')

In [None]:
df_pivot.plot(kind='bar', stacked=True, figsize=(15,8), title='Sum of Minute Delays by Airline by Airport', rot=45)\
                    .set(xlabel='Origin Airport', ylabel='Delays Sum')
plt.show()

In [None]:
df[['ActualElapsedTime', 'CRSElapsedTime', 'ArrDelay', 'CRSArrTime', 'ArrTime', 'CRSDepTime','DepTime']].head()

In [None]:
delays.columns

In [None]:
#Ability to Catch Up (even though a flight is late, it may make up time en route) - TODO: not implemented yet

In [None]:
# Persist the merged and cleaned frame to newdata/df.csv.
# With the default arguments the row index is written as an extra, unnamed
# first column.
# NOTE(review): pass index=False if readers of the file do not need it.
df.to_csv('newdata/df.csv')

In [None]:
#del df