*This Jupyter Notebook was created by Manuel Klein and belongs to the final project of the Data Science Bootcamp from neuefische Hamburg.*

# US Flight Delay Analysis and Prediction
## Flight Delay Prediction Demo

This Jupyter Notebook contains a function that requests some user input about a future flight and predicts the amount of delay in minutes for that particular flight.

## 1. Demo Preparation

### 1.1 Adjusting Jupyter Notebook Settings

In [1]:
# Adjusting the Jupyter Notebook window width
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Markdown tables left alignment and markdown table cell content left alignment:

In [3]:
%%html
<style> 
/* Float markdown tables to the left and left-align the text of their cells, headers and rows */
table {float:left}
table td, table th, table tr {text-align:left !important;}
</style>


<a id='anchor_12'></a>

### 1.2 Importing necessary libraries

In [4]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import folium
#from mpl_toolkits.basemap import Basemap
#import plotly as py
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from numpy.random import normal
from scipy.stats import norm

In [5]:
# Raise the pandas display limits so that .head() etc. show up to
# 50 columns and 400 rows before truncating.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 400


<a id='anchor_13'></a>

### 1.3 Importing the data

In [6]:
# Main flight dataframe. The path uses backslashes throughout, consistent with
# the other pickle paths in this notebook (it previously mixed '\' and '/').
df = pd.read_pickle(r'C:\Project_Data_NF\dfendofstep5.pkl')

In [7]:
# Flight routes (departure/arrival coordinates) drawn as lines on the map in section 2
df_flightroutes = pd.read_pickle(
    r'C:\Project_Data_NF\df_flightroutes.pkl'
)

In [8]:
# Per-airport data (coordinates, number of flights, share of delayed flights) for the map circles
df_geo_airports = pd.read_pickle(
    r'C:\Project_Data_NF\df_geo_airports.pkl'
)

In [9]:
# Cached demo dataframe; the file is written by the df_demo.to_pickle cell in section 1.5
df_demo = pd.read_pickle(
    r'C:\Project_Data_NF\df_demo.pkl'
)

In [10]:
# Cached training features; the file is written by the X_train.to_pickle cell in section 1.6
X_train = pd.read_pickle(
    r'C:\Project_Data_NF\X_train.pkl'
)

In [11]:
# Cached training target; the file is written by the y_train.to_pickle cell in section 1.6
y_train = pd.read_pickle(
    r'C:\Project_Data_NF\y_train.pkl'
)

### 1.4 Creating additional features for geo plotting

In [12]:
def radius_scaler(no_flights):
    """Map a number of flights to a circle-radius scale factor.

    The factor grows by order of magnitude:
    <100 -> 0.2, <1,000 -> 0.4, <10,000 -> 0.6, <100,000 -> 0.9, otherwise 1.3.
    """
    # (exclusive upper bound, scale factor), in ascending order
    tiers = ((100, 0.2), (1000, 0.4), (10000, 0.6), (100000, 0.9))
    for upper_bound, scale in tiers:
        if no_flights < upper_bound:
            return scale
    return 1.3

In [13]:
def color_scaler(percentage_delayed_flights):
    """Map a share of delayed flights (a fraction, e.g. 0.25) to a map colour.

    <0.2 -> green, <0.3 -> light green, <0.35 -> gold, <0.5 -> orange,
    everything else -> red.
    """
    share = percentage_delayed_flights
    # Each guard only needs the upper bound: the lower bound is implied
    # by the preceding guards not having matched.
    if share < 0.2:
        return '#1a9641'
    if share < 0.3:
        return '#a6d96a'
    if share < 0.35:
        return 'gold'
    if share < 0.5:
        return '#fdae61'
    return '#d7191c'

In [14]:
# Circle-radius scale factor per airport, derived from its number of flights
df_geo_airports['flights_scales'] = df_geo_airports['no_of_flights'].map(radius_scaler)

In [15]:
# Circle colour per airport, derived from its share of delayed flights
df_geo_airports['flights_color_scales'] = df_geo_airports['percentage_delayed'].map(color_scaler)

### 1.5 Creating demo dataframe

In [None]:
# Extracting the month (as a zero-padded string such as '05') from the date into a separate column
df['FL_MONTH'] = df['FL_DATE'].map(lambda x: x.strftime('%m'))
# Translating weekday numbers (0-6) into weekday names. The result is assigned
# back to the column: calling replace(..., inplace=True) on the selected column
# modifies a temporary object under pandas copy-on-write and would leave df unchanged.
df['FL_DAYOFWEEK'] = df['FL_DAYOFWEEK'].replace({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})

In [None]:
# Columns needed by the demo: model features, airport names for the prompts,
# and the target ARR_DELAY.
df_demo = df.loc[:, [
    'AIRLINE_NAME',
    'FL_MONTH',
    'FL_DAYOFWEEK',
    'DEP_TIME_PLANNED',
    'TRAVEL_DURATION_PLANNED',
    'DISTANCE',
    'DEP_AIRPORT_NAME',
    'DEP_LAT',
    'DEP_LONG',
    'ARR_AIRPORT_NAME',
    'ARR_LAT',
    'ARR_LONG',
    'ARR_DELAY',
]]

In [None]:
# Persist the demo dataframe; the loading cell in section 1.3 reads this file
df_demo.to_pickle(
    r'C:\Project_Data_NF\df_demo.pkl'
)

### 1.6 Preparing dataframes for the model

In [16]:
# RUN THIS
# New dataframes for machine learning purposes

# Complete dataset: target, full feature frame (keeps the airport names that
# the demo prompts validate against) and the model features without the names.
y = df_demo['ARR_DELAY']
X = df_demo.drop(columns=['ARR_DELAY'])
X_model = X.drop(columns=['DEP_AIRPORT_NAME', 'ARR_AIRPORT_NAME'])

# Small reproducible sample of the frame for testing
df_sample = df_demo.sample(n=40000, random_state=56)
y_sample = df_sample['ARR_DELAY']
X_sample = df_sample.drop(columns=['ARR_DELAY', 'DEP_AIRPORT_NAME', 'ARR_AIRPORT_NAME'])

In [None]:
# Preview the first rows of the model features (before dummy encoding)
X_model.head()

In [None]:
# Column dtypes and non-null counts of the model features
X_model.info()

In [None]:
# One-hot encode the categorical/object features of the sample dataset
# (the first level of each feature is dropped)
X_sample = pd.get_dummies(data=X_sample, drop_first=True)

In [None]:
# One-hot encode the categorical/object features of the complete dataset
# (the first level of each feature is dropped)
X_model = pd.get_dummies(data=X_model, drop_first=True)

In [None]:
# Shape of the full feature frame.
# NOTE(review): X is the un-encoded frame that still contains the airport names;
# X_model.shape may have been intended here - confirm.
X.shape

In [None]:
# Shape of the sample feature frame
X_sample.shape

In [None]:
# Standardise the sample features and rebuild a dataframe with the original column names
std_scale = preprocessing.StandardScaler()
std_scale.fit(X_sample)
df_train_test_sample = std_scale.transform(X_sample)
df_scale_sample = pd.DataFrame(df_train_test_sample, columns=X_sample.columns)
df_scale_sample.head()

In [None]:
# Standardise the complete feature set and rebuild a dataframe with the original column names
std_scale = preprocessing.StandardScaler()
df_train_test = std_scale.fit_transform(X_model)
df_scale = pd.DataFrame(df_train_test, columns=X_model.columns)
df_scale.head()

In [None]:
# 80/20 train/test split of the scaled sample dataset
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(
    df_scale_sample,
    y_sample,
    test_size=0.2,
    random_state=23,
)

In [None]:
# 80/20 train/test split of the scaled complete dataset
X_train, X_test, y_train, y_test = train_test_split(
    df_scale,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Persist the training features; the loading cell in section 1.3 reads this file
X_train.to_pickle(
    r'C:\Project_Data_NF\X_train.pkl'
)

In [None]:
# Persist the training target; the loading cell in section 1.3 reads this file
y_train.to_pickle(
    r'C:\Project_Data_NF\y_train.pkl'
)

### Setting up the model

In [17]:
# RUN THIS
# Train a plain linear regression on the training split; demo_input() uses this fitted model
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [None]:
# Print the fitted parameters: the intercept first, then the coefficients (slopes)
for fitted_parameter in (regressor.intercept_, regressor.coef_):
    print(fitted_parameter)

In [None]:
# Predict delays for the test split. X_test is only created by the
# train/test split cell in section 1.6 - it is not loaded from a pickle.
y_pred = regressor.predict(X=X_test)

In [None]:
# Error metrics of the predictions on the test split
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

In [None]:
# Side-by-side comparison of actual and predicted delays on the test split
df_results = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
df_results

In [None]:
# First 100 actual/predicted pairs
df_results.head(100)

### 1.7 Demo Input Function

In [18]:
def demo_input():
    """Interactively collect flight details and print the predicted arrival delay.

    Prompts for departure airport, arrival airport, airline and departure
    date/time. Airports and airline are validated against the notebook-level
    frame ``X``. A single feature row is built from the answers, encoded and
    scaled together with ``X``, and passed to the notebook-level ``regressor``.

    Returns:
        ' ' after a prediction has been printed, or None when no flights
        exist between the two chosen airports.
    """
    def ask_until_valid(prompt, is_valid, complaint):
        """Repeat the prompt until is_valid(answer) is true, then return the answer."""
        while True:
            answer = input(prompt)
            if is_valid(answer):
                return answer
            print(complaint)

    # Build the lookup sets once instead of scanning the columns on every attempt.
    known_departures = set(X['DEP_AIRPORT_NAME'].unique())
    known_arrivals = set(X['ARR_AIRPORT_NAME'].unique())

    departure_airport = ask_until_valid(
        'Please enter departure airport name: ',
        lambda name: name in known_departures,
        'Airport unknown. Please enter other airport name.')
    arrival_airport = ask_until_valid(
        'Please enter arrival airport name: ',
        lambda name: name in known_arrivals,
        'Airport unknown. Please enter other airport name.')

    # All recorded flights on the requested route; filtered once and reused below.
    route = X[(X['DEP_AIRPORT_NAME'] == departure_airport) & (X['ARR_AIRPORT_NAME'] == arrival_airport)]
    if route.empty:
        print('Fligtroute does not exist.')
        return

    # value_counts() sorts by frequency, so index[0] is the most common value on the route.
    estimated_flighttime = int(route['TRAVEL_DURATION_PLANNED'].value_counts().index[0])
    distance = int(route['DISTANCE'].value_counts().index[0])
    flighthours, flightminutes = divmod(estimated_flighttime, 60)
    print(' ')
    print('Flight distance:', distance, 'Miles')
    print('Flight duration:', flighthours, 'hours,', flightminutes, 'minutes')
    airline_list = sorted(route['AIRLINE_NAME'].unique())
    print('\n', 'Available airlines for your flightroute: \n', *airline_list, sep="\n")
    print(' ')

    desired_airline = ask_until_valid(
        'Which airline do you want to fly with? ',
        lambda name: name in airline_list,
        'Please choose one of the listed airlines.')

    while True:
        departure_datetime = input("""Please enter departure date and time in the format 'YYYY-MM-DD HH:MM': """)
        try:
            departure_datetime_object = datetime.datetime.strptime(departure_datetime, '%Y-%m-%d %H:%M')
        except ValueError:
            # Malformed input used to abort the whole demo with a traceback; ask again instead.
            print("Invalid format. Please use 'YYYY-MM-DD HH:MM'.")
            continue
        if departure_datetime_object < datetime.datetime.now():
            print('Date is in the past. Please enter valid date.')
            continue
        break

    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Use one existing flight of the chosen airline on this route as a template
    # (keeps distance, coordinates, planned duration) and overwrite the
    # date/time-dependent features. .copy() makes the row independent of X so
    # the assignments below do not write into a view of it.
    X_pred = route[route['AIRLINE_NAME'] == desired_airline].iloc[:1].copy()
    X_pred['FL_MONTH'] = departure_datetime_object.strftime('%m')
    X_pred['FL_DAYOFWEEK'] = weekday_names[departure_datetime_object.weekday()]
    X_pred['DEP_TIME_PLANNED'] = int(departure_datetime_object.strftime('%H%M'))

    # Append the new row to the full frame so that dummy encoding yields the
    # full set of category columns, then scale and take the last row back out.
    # NOTE(review): the scaler is re-fitted here on X plus the new row instead of
    # reusing the scaler fitted when X_train was created - confirm this is intended.
    X2 = pd.concat([X, X_pred], ignore_index=True)
    X2 = X2.drop(columns=['DEP_AIRPORT_NAME', 'ARR_AIRPORT_NAME'])
    X2 = pd.get_dummies(X2, drop_first=True)
    std_scale = preprocessing.StandardScaler().fit(X2)
    df_scale = pd.DataFrame(std_scale.transform(X2), columns=X2.columns)
    X2 = df_scale.iloc[-1:]
    y_sample_pred = regressor.predict(X2)
    print(' ')
    print('Estimated delay:', int(round(y_sample_pred[0])), 'minutes')

    return ' '


<a id='anchor_335'></a>

## 2. Flight Delay Prediction Demo

In [19]:
# Plotting all airports and the flight routes of all flights between 2016 and 2018
some_map = folium.Map(location=[df_geo_airports['DEP_LAT'].mean(),
                                df_geo_airports['DEP_LONG'].mean()],
                      zoom_start=5)

# One circle per airport: radius from 'flights_scales', colour from 'flights_color_scales'.
# Iterating over the columns directly avoids building a row Series five times per airport.
for lat, lon, airport_name, scale, colour in zip(df_geo_airports['DEP_LAT'],
                                                 df_geo_airports['DEP_LONG'],
                                                 df_geo_airports['DEP_AIRPORT_NAME'],
                                                 df_geo_airports['flights_scales'],
                                                 df_geo_airports['flights_color_scales']):
    folium.Circle(location=[lat, lon], popup=airport_name, radius=scale * 15000,
                  color=colour, fill=True, fill_color=colour).add_to(some_map)

# One thin line per flight route. Coordinates are read as scalars; the previous
# float(<1-element array>) conversion is deprecated in recent NumPy versions.
for dep_lat, dep_long, arr_lat, arr_long in zip(df_flightroutes['DEP_LAT'],
                                                df_flightroutes['DEP_LONG'],
                                                df_flightroutes['ARR_LAT'],
                                                df_flightroutes['ARR_LONG']):
    folium.PolyLine(locations=[[float(dep_lat), float(dep_long)], [float(arr_lat), float(arr_long)]],
                    color='steelblue', weight=0.7, opacity=0.3).add_to(some_map)

folium.TileLayer('cartodbpositron').add_to(some_map)
some_map

In [20]:
# Plotting airport locations of all US passenger airports
# Diameter represents number of departures between 2016 and 2018 (the larger the higher)
# Color represents percentage of delayed flights between 2016 and 2018:
#   green        = <20%
#   bright green = >=20% to <30%
#   yellow       = >=30% to <35%
#   orange       = >=35% to <50%
#   red          = >=50%

map_center = [df_geo_airports['DEP_LAT'].mean(), df_geo_airports['DEP_LONG'].mean()]
some_map = folium.Map(location=map_center, zoom_start=5)

for _, airport in df_geo_airports.iterrows():
    circle_colour = airport['flights_color_scales']
    airport_circle = folium.Circle(
        location=[airport['DEP_LAT'], airport['DEP_LONG']],
        popup=airport['DEP_AIRPORT_NAME'],
        radius=airport['flights_scales']*15000,
        color=circle_colour,
        fill=True,
        fill_color=circle_colour,
    )
    airport_circle.add_to(some_map)

folium.TileLayer('cartodbpositron').add_to(some_map)
some_map

In [24]:
# Run the interactive demo. demo_input() returns ' ' after printing a prediction
# and None when the chosen route does not exist.
output = demo_input()

Please enter departure airport name: Washington Dulles International Airport
Please enter arrival airport name: John F Kennedy International Airport
 
Flight distance: 228 Miles
Flight duration: 1 hours, 15 minutes


Available airlines for your flightroute: 

Endeavor Air
JetBlue Airways
 
Which airline do you want to fly with? Endeavor Air
Please enter departure date and time in the format 'YYYY-MM-DD HH:MM': 2020-05-09 07:00


  return self.partial_fit(X, y)


 
Estimated delay: -3 minutes
