# Introduction

In [1]:
# setting directory
import os
os.chdir("/Users/nuraisha/Desktop/chasingflights")

In [2]:
import pandas as pd
Y2007 = pd.read_csv("2007.csv", encoding='latin-1')

In [None]:
# importing data; lesser data due to limited ram
import pandas as pd
Y1995 = pd.read_csv("1995.csv", encoding='latin-1')
Y1996 = pd.read_csv("1996.csv", encoding='latin-1')

Y2000 = pd.read_csv("2000.csv", encoding='latin-1')
Y2001 = pd.read_csv("2001.csv", encoding='latin-1')
Y2002 = pd.read_csv("2002.csv", encoding='latin-1')

Y2006 = pd.read_csv("2006.csv", encoding='latin-1')
Y2007 = pd.read_csv("2007.csv", encoding='latin-1')

In [None]:
# collating data into one
database = pd.concat([Y1995, Y1996, Y2000, Y2001, Y2002, Y2006, Y2007])

In [None]:
# exploratory data analysis (EDA)
database.info()
database.head(n=5)

In [None]:
# removing columns
# not using 24:28 due to NaN values in earlier years
database.drop(database.columns[[11,12,13,18,19,20,
                                22,24,25,26,27,28]], 
              axis=1, inplace=True)

# removing NA values
database = database.dropna()

#checking for NA values
database.isnull().values.any()

# Optimal Schedule

In [None]:
# filtering the data
nondel_flights = database[(database['ArrDelay']<= 0) & 
                          (database['DepDelay']<= 0)]

# selecting relevant columns
nondel_sched = nondel_flights[['Month', 'DayOfWeek', 'CRSDepTime']]

# finding frequency of each sched
schedfreq = nondel_sched.value_counts().reset_index(name='Frequency')

# finding top 3 values
schedfreq.nlargest(3, 'Frequency')

# Efficiency of Older Planes

In [None]:
# filtering the data
del_flights = database[(database['ArrDelay'] > 15) | 
                       (database['DepDelay'] > 15)]

# finding total delay
del_flights['TotalDelay'] = del_flights[['ArrDelay', 'DepDelay']].sum(axis=1)

# finding average delay per year
avgdel_year = del_flights.groupby('Year')['TotalDelay'].mean().reset_index().rename(columns={'TotalDelay' : 'AvgTotalDelay'})
avgdel_year

# graph
from matplotlib import pyplot as plt
avgdel_year.plot(y='AvgTotalDelay', use_index = True, c = 'red', xticks = avgdel_year.index)
plt.title('Average Duration of Delay Over Time')
plt.show()

# Flight Destinations

In [None]:
# filter data
flightpath = database[(database['Cancelled'] == 0) & 
                      (database['Diverted'] == 0)]


In [None]:
# finding frequency of paths
freq_flights = flightpath.groupby(['Origin','Dest','Year']).size().reset_index(name='counts')
freq_flights['Flight'] = freq_flights['Origin'] +"-"+ freq_flights['Dest']
freq_flights

In [None]:
# new column for total observation each year
totalobs_year = pd.DataFrame(flightpath[['Year']].value_counts().reset_index(name='total'))
newdf = pd.merge(freq_flights, totalobs_year, on = 'Year', how = "left")
#finding proportion
newdf['proportion'] = newdf['counts']/newdf['total']

In [None]:
# finding the list of top 30 flight paths
# find frequency of each path
sumflights = flightpath [['Origin', 'Dest']].value_counts().reset_index(name='counts')
sumflights['Flight'] = sumflights['Origin'] +"-"+ sumflights['Dest']
# find top 30
toppaths = list((sumflights.nlargest(30, 'counts'))['Flight'])
freqtoppaths = newdf.query('Flight in @toppaths')

In [None]:
# pivot table
pivot = freqtoppaths.pivot_table(index=['Flight'], columns = 'Year', values = 'proportion')
# graph
from matplotlib import pyplot as plt
import numpy as np
plt.pcolor(pivot, cmap = 'Blues')
plt.yticks(np.arange(len(pivot.index.values)),
          labels = pivot.index.values)
plt.xticks(np.arange(len(pivot.columns.values)),
          labels = pivot.columns.values)
plt.title("Frequency of Top 30 Domestic Flight Paths")
plt.colorbar()
plt.show()

# cascading failure

In [None]:
# data
del2007 = Y2007[(Y2007['ArrDelay'] > 15) | 
                       (Y2007['DepDelay'] > 15)]

In [None]:
# random sample
random_seed = 222
del2007.sample(n=1, random_state = random_seed)

In [None]:
E1 = Y2007[(Y2007['FlightNum'] == 1442) & (Y2007['TailNum'] == 'N987DL')
          & (Y2007['Month'] == 1) & (Y2007['DayofMonth'] == 28)]
E1

In [None]:
act_arrtime = E1[['Dest', 'ArrTime']].rename(columns={'Dest' : 'Airport', 
                                               'ArrTime' : 'Time'})
act_deptime = E1[['Origin', 'DepTime']].rename(columns={'Origin' : 'Airport',
                                                 'DepTime' : 'Time'})
act_time = (pd.concat([act_arrtime, act_deptime])).sort_values(by=['Time'])

In [None]:
est_arrtime = E1[['Dest', 'CRSArrTime']].rename(columns={'Dest' : 'Airport', 
                                               'CRSArrTime' : 'Time'})
est_deptime = E1[['Origin', 'CRSDepTime']].rename(columns={'Origin' : 'Airport',
                                                 'CRSDepTime' : 'Time'})
est_time = (pd.concat([est_arrtime, est_deptime])).sort_values(by=['Time'])

In [None]:
#graph
plt.plot(act_time['Time'], act_time['Airport'], linestyle = '-',
         marker = 'o', color = 'red', label='Actual Time')
plt.plot(est_time['Time'], est_time['Airport'], linestyle = '-',
         marker = 'o', color = 'green', label='Estimated Time')
plt.title('Example of a cascading failure')
plt.legend()
plt.show()

# predicting delays

In [3]:
import numpy as np # work with arrays
Y2007.drop(Y2007.columns[[8,9,10,16,17,22]], 
              axis=1, inplace=True)
Y2007 = Y2007.dropna()
Y2007['Delay'] = np.where((Y2007['ArrDelay'] > 15) | (Y2007['DepDelay'] > 15), 1, 0)
sample = Y2007.sample(n=10000, replace=False)

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression,chi2
target = sample['Delay']
features = sample.loc[:, sample.columns != 'Delay']

selected_features = []
selector = SelectKBest(f_regression, k = 'all')
selector.fit_transform(features, target)

supports = selector.get_support()

print(supports)
print(features.columns)

for support, feature in zip(supports, features.columns):

  if (support == True):
    selected_features.append(feature)

print('Selected features are: ', selected_features)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]
Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'ActualElapsedTime', 'CRSElapsedTime',
       'AirTime', 'ArrDelay', 'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
       'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')
Selected features are:  ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.3, random_state = 13)

In [6]:
from sklearn.preprocessing import StandardScaler
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
#(1) K Nearest Neighbour
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 5)
KNN.fit(X_train, Y_train)

Y_pred = KNN.predict(X_test)

accuracy_KNN = round(metrics.accuracy_score(Y_test, Y_pred)*100, 2)
print('Accuracy of KNN is ', accuracy_KNN)

Accuracy of KNN is  90.63
