In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import calendar
plt.style.use('ggplot')
import seaborn as sns
import datetime
pd.set_option('display.max_columns', 500)
from scipy.stats import ttest_ind
#from mpl_toolkits.basemap import Basemap
#from geopy.geocoders import Nominatim

# Default figure size (width, height) passed as `figsize` to the plt.subplots calls in this notebook.
FIG_SIZE = (15,5)

[Top reasons for delays](http://www.mro-network.com/maintenance-repair-overhaul/top-5-reasons-flight-delays/gallery?slide=5)
* late arriving aircraft
* air carrier delays
* heavy traffic volume
* extreme weather
* security lines


In [None]:
# Choose which extract of the flights data to read.
size = 'medium' # ['small','medium','large']
flights_path = "flights_{0}.csv".format(size)
flights = pd.read_csv(flights_path)
flights["Date"] = pd.to_datetime(flights["Date"])

# Quick summary: row count, column count and whether any cell is missing.
n, m = flights.shape
any_nans = flights.isnull().values.any()
summary_template = "n observations = {0:,}. n features = {1}. Any NANs = {2}"
print(summary_template.format(n, m, any_nans))
flights.head(5)

### Get year and month of flight

In [None]:
# Derive calendar month and year from the flight date and place them directly
# after the first column (resulting order: <first column>, year, month, ...).
# An existing copy of either column is dropped first, so re-running this cell
# does not make DataFrame.insert fail with "already exists".
flight_dates = pd.DatetimeIndex(flights['Date'])
for date_part in ("month", "year"):  # "year" is inserted last so it ends up before "month"
    if date_part in flights.columns:
        flights = flights.drop(date_part, axis=1)
    flights.insert(loc=1, column=date_part, value=getattr(flight_dates, date_part))
flights[["Date","year","month"]].head(20)

### Log-transform delays, as they are highly skewed (focus on departure delays as opposed to arrival delays)

In [None]:
# Keep only flights whose departure delay lies between the 1st and 99th
# percentiles (both ends inclusive), i.e. trim the extreme values.
# keep_range must be computed here: the filter below needs it, and it was
# previously only defined in a commented-out line.
keep_range = np.percentile(flights['Departure_delay'],[1,99]) # extreme delays
print(keep_range) # originally noted as roughly -12 mins to 2hrs20 -- confirm on the current extract
#long_delays = flights[flights['Departure_delay']>keep_range[1]]
# .copy() gives an independent frame, so later cells can add columns to
# `flights` without pandas' SettingWithCopyWarning.
flights = flights[flights['Departure_delay'].between(keep_range[0],keep_range[1])].copy() #(remove outliers)
# Negative (early) departures are still present at this point; show a summary
# of the trimmed delays instead of dumping the whole column.
flights['Departure_delay'].describe()

In [None]:
# Histogram of log departure delays. The logarithm is only defined for
# strictly positive values, so flights with a delay <= 0 are left out here;
# otherwise np.log would produce NaN / -inf, which cannot be binned.
positive_delays = flights.loc[flights['Departure_delay'] > 0, 'Departure_delay']
plt.hist(np.log(positive_delays))
plt.xlabel("log(Departure delay)"); plt.ylabel("Number of flights")
plt.show()

### Bin Departure delays 

In [None]:
# Cut departure delays into three labelled categories. pd.cut uses
# right-closed intervals by default: (-15, 15], (15, 45] and (45, 240].
delay_bin_edges = (-15,15,45,240)
delay_bin_labels = ("No delay","Short delay","Long Delay")
flights['Departure_delay_bin'] = pd.cut(
    flights['Departure_delay'], bins=delay_bin_edges, labels=delay_bin_labels
)
flights['Departure_delay_bin'].value_counts()

### What is the trend over time in number of flights?

In [None]:
# Number of flights per calendar year and per calendar month (months are
# pooled across all years).
flights_by_year = flights.groupby('year').size()
flights_by_month = flights.groupby('month').size()
months = flights_by_month.index
months_names = [calendar.month_abbr[mon] for mon in months]
width=1.0

f, (ax1, ax2) = plt.subplots(1, 2,figsize=FIG_SIZE)
ax1.plot(flights_by_year)
ax1.set_xlabel("Year"); ax1.set_ylabel("Total number of flights")

# Bars are explicitly centred on the month number and the ticks are put at the
# same positions, so each label sits under its bar regardless of matplotlib's
# default bar alignment. Tick positions are set before tick labels so that the
# labels are attached to the final tick locations.
ax2.bar(months,flights_by_month,width=width,align='center')
ax2.set_xticks(list(months))
ax2.set_xticklabels(months_names)
ax2.set_xlabel("Month"); ax2.set_ylabel("Total number of flights")
plt.show()

* Flights by year shows decrease in total annual flights after 2006. Possibly due to financial crash.
* Flights by month differences might possibly be related to US holidays.

### What is the trend over time in delay times?

In [None]:
# Mean and median departure delay per year and per calendar month.
departure_delay_year = flights[["Departure_delay","year"]].groupby("year").agg(["mean","median"])
departure_delay_month = flights[["Departure_delay","month"]].groupby('month').agg(["mean","median"])

# Month positions and labels are taken from this cell's own result, so the
# plot does not depend on variables left over from another cell.
delay_months = departure_delay_month.index.values
delay_month_names = [calendar.month_abbr[mon] for mon in delay_months]

f, (ax1, ax2) = plt.subplots(1, 2,figsize=FIG_SIZE)
ax1.plot(departure_delay_year)
ax1.legend(("Mean departure delay","Median departure delay"))
ax1.set_ylabel("Departure delay time"); ax1.set_xlabel("Year")

# Two centre-aligned bars per month: mean at the month number, median shifted
# right by one bar width. The tick goes at the midpoint of the pair.
width = 0.25
monthly_delay = departure_delay_month["Departure_delay"]
ax2.bar(delay_months,monthly_delay["mean"].values,width=width,color='red',align='center',label="mean")
ax2.bar(delay_months+width,monthly_delay["median"].values,width=width,align='center',label="median")
# Set tick positions first, then their labels.
ax2.set_xticks(delay_months + width / 2)
ax2.set_xticklabels(delay_month_names)
ax2.set_xlabel("Month"); ax2.set_ylabel("Departure delay time")
ax2.legend()
plt.show()

* Delays seem to be correlated with overall traffic, e.g. delays are more likely to be long in high-traffic months such as June and July, as well as in high-traffic years such as 2006.
* Delays are also higher in the winter months, as expected.
* Mean >> median implies positively skewed delays, i.e. there are a few very long delays.

### Delays are arguably related to weather, which is related to location, so we should see a relationship between latitude ("north-south") and delay time

In [None]:
# Label a flight 'north' when its Latitude is strictly greater than 40,
# otherwise 'south'.
is_north = flights['Latitude'] > 40
flights['latitude_bin'] = np.where(is_north, 'north', 'south')

In [None]:
def plot_two_dist(dist1,dist2,title_append=''):
    """Overlay kernel density estimates of two samples and mark their means.

    Parameters
    ----------
    dist1, dist2 : dict
        Each has a 'values' entry (an object exposing ``.size`` and
        ``.mean()``, e.g. a pandas Series) and a 'name' entry used in the
        legend and in the title. ``dist1`` is drawn in red, ``dist2`` in blue.
    title_append : str, optional
        Text appended to the end of the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    dist1_n, dist2_n = dist1['values'].size, dist2['values'].size
    f, ax1 = plt.subplots(1, 1,figsize=FIG_SIZE)
    for dist, colour in ((dist1, 'red'), (dist2, 'blue')):
        values = dist['values']
        if values.size > 1:
            # A density estimate needs at least two observations.
            sns.kdeplot(values,ax=ax1,label=dist['name'],bw=4,color=colour)
        if values.size > 0:
            # axvline spans the full axes height whatever the y-scale is. A
            # data-coordinate line from 0 to 1 would stretch the y-axis far
            # beyond the density values and flatten the curves.
            ax1.axvline(values.mean(),color=colour)
    plt.title("Counts for {0}={1}, {2}={3}. {4}".format(dist1['name'],dist1_n,
                                                   dist2['name'],dist2_n,
                                                     title_append))
    plt.legend()
    plt.show()

In [None]:
# Restrict to January and December flights, then compare the departure
# delays of the 'north' and 'south' latitude groups.
flights_winter = flights[flights['month'].isin([1,12])] # winter months
winter_is_north = flights_winter['latitude_bin'] == 'north'
winter_is_south = flights_winter['latitude_bin'] == 'south'
north_delays_winter = flights_winter.loc[winter_is_north, 'Departure_delay']
south_delays_winter = flights_winter.loc[winter_is_south, 'Departure_delay']
dist1 = dict(values=north_delays_winter, name='North')
dist2 = dict(values=south_delays_winter, name='South')
plot_two_dist(dist1, dist2)


* no real difference in departure delays. possibly need specific weather information

### Load weather, look at wind/snow by date and airport (departure airport)

In [None]:
# Read the weather table, parse its Date column, and rename "airport" to
# "Departure_Airport" (the column name used as a merge key with the flights).
weather = (
    pd.read_csv("weather.csv")
    .assign(Date=lambda raw: pd.to_datetime(raw["Date"]))
    .rename(columns={"airport": "Departure_Airport"})
)

### If snow and wind do not change (i.e. std == 0) within a day at each airport, then we can merge the flight and weather data and keep any one of the time values

In [None]:
# Standard deviation of snow and wind within each (Date, Departure_Airport)
# group. A value > 0 anywhere would mean the readings change during a day.
weather_by_date_airport = weather[['snow','wind',"Date","Departure_Airport"]].groupby(["Date","Departure_Airport"]).agg(['std'])
varies_within_day = bool((weather_by_date_airport > 0).values.any())
# Show the result explicitly: a bare expression in the middle of a cell is
# never displayed.
print("snow/wind vary within a day at some airport: {0}".format(varies_within_day))

# When the check above prints False, the time of day carries no extra snow/wind
# information, so it is dropped and the repeated rows are collapsed.
# errors='ignore' plus reassignment (instead of inplace=True) keeps the cell
# safe to re-run after 'time' has already been removed.
weather = weather.drop('time',axis=1,errors='ignore').drop_duplicates()

### now merge flight data with weather

In [None]:
# Left join: every flight is kept and weather columns are attached by date and
# departure airport. indicator=True adds a `_merge` column recording whether a
# matching weather row was found.
# NOTE(review): if `weather` still holds more than one row for a
# (Date, Departure_Airport) pair, the matching flights are repeated -- confirm.
flights_weather = pd.merge(
    flights,
    weather,
    how='left',
    on=["Date","Departure_Airport"],
    indicator=True,
)

### assume null = 0. i.e. no entry has been entered into dataset if no snow or wind

In [None]:
# Treat missing snow/wind readings as zero, then draw a count histogram of
# each variable side by side (wind on the left, snow on the right).
weather_columns = ['snow','wind']
flights_weather[weather_columns] = flights_weather[weather_columns].fillna(0)
f, (ax1,ax2) = plt.subplots(1, 2,figsize=FIG_SIZE)
for column, upper_edge, axis in (('wind', 15, ax1), ('snow', 5, ax2)):
    sns.distplot(flights_weather[column],bins=np.arange(0,upper_edge,1),ax=axis,label=column,kde=False)
plt.show()

* Bin wind into NONE, less than 6 and greater than 6. 
* Bin snow into YES/NO

### Check delay distribution by whether or not snow was present

In [None]:
# For a range of snow thresholds, compare the departure-delay distribution of
# flights above the threshold with that of all other flights.
for snow_value in range(1,12,2):
    # Single-bracket selection yields a boolean Series. The previous
    # double-bracket form produced a one-column DataFrame, which is not what a
    # column assignment should receive.
    flights_weather['snow_bool'] = flights_weather['snow'] > snow_value
    above_threshold = flights_weather['snow_bool']
    no_snow = flights_weather.loc[~above_threshold, 'Departure_delay']
    snow = flights_weather.loc[above_threshold, 'Departure_delay']
    dist1={'values':no_snow,'name':'No snow'}
    dist2={'values':snow,'name':'Snow'}
    title = 'snow var > {0}'.format(snow_value)
    plot_two_dist(dist1,dist2,title_append=title)
    

* small difference in distributions which gets greater the higher the threshold of 'snow_bool'. Number of events gets very small however.

### Check delay distribution by wind

In [None]:
# For a range of wind thresholds, compare the departure-delay distribution of
# flights above the threshold with that of all other flights.
for wind_value in range(1,20,4):
    flights_weather['wind_bin'] = flights_weather['wind'] > wind_value
    windy = flights_weather['wind_bin']
    no_wind = flights_weather.loc[~windy, 'Departure_delay']
    wind = flights_weather.loc[windy, 'Departure_delay']
    dist1 = dict(values=no_wind, name='No wind')
    dist2 = dict(values=wind, name='Wind')
    title = 'Wind thresh > {0}'.format(wind_value)
    plot_two_dist(dist1, dist2, title)

### Ice or no ice (sub zero)

In [None]:
# Histogram of the recorded temperatures. Missing values (flights without a
# temperature reading) are dropped so they cannot break the binning.
plt.hist(flights_weather['temperature'].dropna())
plt.xlabel("temperature"); plt.ylabel("Number of flights")
plt.show()

* Must be in Fahrenheit

In [None]:
# Impute missing temperatures with the overall mean, assigning the result back
# to the column. Calling fillna(..., inplace=True) on the attribute-accessed
# Series is a chained in-place update and is not guaranteed to modify the
# frame (it does not under pandas Copy-on-Write).
mean_temperature = flights_weather['temperature'].mean()
flights_weather['temperature'] = flights_weather['temperature'].fillna(mean_temperature) # assume mean temperature for missing
# Flag freezing conditions after imputation, so the flag and the temperature
# column always agree.
flights_weather['sub_zero'] = flights_weather['temperature'] < 32 #0C = 32F

In [None]:
# Split departure delays by the sub_zero flag and compare the two
# distributions. "Ice" is only a label for sub-zero temperature; actual ice
# is not recorded.
below_freezing = flights_weather['sub_zero']
no_ice = flights_weather.loc[~below_freezing, 'Departure_delay']
ice = flights_weather.loc[below_freezing, 'Departure_delay']
dist1 = dict(values=no_ice, name='No ice')
dist2 = dict(values=ice, name='Ice')
plot_two_dist(dist1, dist2)

* "Ice" is quite a rare event, but it looks like it increases delays

### Flight delays by airport

In [None]:
# Display the column names of the merged flights + weather frame.
flights_weather.columns

In [None]:
# Mean, count and standard deviation of departure delay per departure state.
# Only the numeric Departure_delay column is aggregated: the categorical
# Departure_delay_bin has no meaningful mean/std and makes the aggregation
# fail on pandas versions that no longer drop such columns silently.
# The result keeps two-level columns, e.g. state_delay['Departure_delay']['mean'].
state_delay = flights_weather[["Departure_delay","Departure_State"]].groupby("Departure_State").agg(['mean','count','std'])

In [None]:
# Per-state number of flights (x) and mean departure delay (y) as arrays.
departure_delay_stats = state_delay['Departure_delay']
x = departure_delay_stats['count'].values
y = departure_delay_stats['mean'].values

In [None]:
# Mean departure delay against number of flights, one point per departure
# state. plt.scatter needs both coordinates; the y values were missing.
plt.scatter(x, y)
plt.xlabel("Number of flights"); plt.ylabel("Mean departure delay")
plt.show()