___

In [17]:
# Import libraries to be used

# Directories/Files management
import os.path

# Timing
import time

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and may freeze the kernel
import missingno as msno
from collections import Counter

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning:
# - Model selection:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, KFold, cross_val_score, StratifiedKFold, \
                                    GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance

# - Basic classes for custom-made transformers:
from sklearn.base import BaseEstimator, TransformerMixin

# - Transformers:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# - Pipeline:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# - Models:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, plot_importance, plot_tree

# - Metrics:
from sklearn.metrics import fbeta_score, f1_score, recall_score, precision_score, accuracy_score, \
                            confusion_matrix, classification_report, roc_curve, precision_recall_curve, \
                            roc_auc_score, average_precision_score, plot_roc_curve, plot_precision_recall_curve

# Objects storage:
from joblib import dump, load
import pickle

In [18]:
t0 = time.perf_counter() 

In [19]:
# Detect Operating System running and manage paths accordingly

root = os.getcwd()
if os.name == 'nt': # Windows
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    print("Running on Ubuntu.")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\Desktop\TFM\notebooks\frontend


___

# Load the dataset

In [20]:
cols = [
    'MONTH',
#     'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER',
#     'TAIL_NUM',
    'ORIGIN',
#     'ORIGIN_CITY_NAME',
#     'ORIGIN_STATE_ABR',
#     'ORIGIN_STATE_NM',
#     'WBAN_Origin_OTP',
    'DEST',
#     'DEST_CITY_NAME',
#     'DEST_STATE_ABR',
#     'DEST_STATE_NM',
#     'WBAN_Dest_OTP',
#     'CRS_DEP_TIME',
#     'DEP_TIME',
#     'DEP_DELAY',
#     'DEP_DEL15',
    'DEP_TIME_hour',
#     'TAXI_OUT',
#     'TAXI_IN',
    'TAXI_OUT_median',
    'TAXI_IN_median',
#     'CRS_ARR_TIME',
#     'ARR_TIME',
#     'ARR_DELAY',
    'ARR_DEL15', # → Target !!
    'ARR_TIME_hour',
#     'CANCELLED',
    'CRS_ELAPSED_TIME',
    'DISTANCE',
#     'DISTANCE_GROUP',
#     'CARRIER_DELAY',
#     'WEATHER_DELAY',
#     'NAS_DELAY',
#     'SECURITY_DELAY',
#     'LATE_AIRCRAFT_DELAY',
    
#     'STATION_Origin',
#     'WMO_Origin',
#     'WBAN_Origin_LCD',
    'LATITUDE_Origin',
    'LONGITUDE_Origin',
    'HourlyAltimeterSetting_Origin',
    'HourlyDryBulbTemperature_Origin',
    'HourlyPrecipitation_Origin',
    'HourlyRelativeHumidity_Origin',
    'HourlySkyConditions_Origin',
    'HourlyVisibility_Origin',
#     'HourlyWindDirection_Origin',
    'HourlyWindGustSpeed_Origin',
    'HourlyWindSpeed_Origin',
#     'REM_Origin',
#     'STATION_Dest',
#     'WMO_Dest',
#     'WBAN_Dest_LCD',
    'LATITUDE_Dest',
    'LONGITUDE_Dest',
    'HourlyAltimeterSetting_Dest',
    'HourlyDryBulbTemperature_Dest',
    'HourlyPrecipitation_Dest',
    'HourlyRelativeHumidity_Dest',
    'HourlySkyConditions_Dest',
    'HourlyVisibility_Dest',
#     'HourlyWindDirection_Dest',
    'HourlyWindGustSpeed_Dest',
    'HourlyWindSpeed_Dest',
#     'REM_Dest'
]

In [21]:
cols_dtypes = {
    'MONTH' : 'string',
#     'DAY_OF_MONTH' : 'string',
    'DAY_OF_WEEK' : 'string',
    'OP_UNIQUE_CARRIER' : 'string',
#     'TAIL_NUM' : 'string',
    'ORIGIN' : 'string',
#     'ORIGIN_CITY_NAME' : 'string',
#     'ORIGIN_STATE_ABR' : 'string',
#     'ORIGIN_STATE_NM' : 'string',
#     'WBAN_Origin_OTP' : 'string',
    'DEST' : 'string',
#     'DEST_CITY_NAME' : 'string',
#     'DEST_STATE_ABR' : 'string',
#     'DEST_STATE_NM' : 'string',
#     'WBAN_Dest_OTP' : 'string',
#     'CRS_DEP_TIME' : 'string',
#     'DEP_TIME' : 'string',
#     'DEP_DELAY' : 'int32',
#     'DEP_DEL15' : 'int32',
    'DEP_TIME_hour' : 'string',
#     'TAXI_OUT' : 'int32',
#     'TAXI_IN' : 'int32',
    'TAXI_OUT_median' : 'int32',
    'TAXI_IN_median' : 'int32',
#     'CRS_ARR_TIME' : 'string',
#     'ARR_TIME' : 'string',
#     'ARR_DELAY' : 'int32',
    'ARR_DEL15' : 'int32', # → Target !!
    'ARR_TIME_hour' : 'string',
#     'CANCELLED' : 'string',
    'CRS_ELAPSED_TIME' : 'int32',
    'DISTANCE' : 'int32',
#     'DISTANCE_GROUP' : 'string',
#     'CARRIER_DELAY' : 'int32',
#     'WEATHER_DELAY' : 'int32',
#     'NAS_DELAY' : 'int32',
#     'SECURITY_DELAY' : 'int32',
#     'LATE_AIRCRAFT_DELAY' : 'int32',

#     'STATION_Origin' : 'string',
#     'WMO_Origin' : 'string',
#     'WBAN_Origin_LCD' : 'string',
    'LATITUDE_Origin' : 'float64',
    'LONGITUDE_Origin' : 'float64',
    'HourlyAltimeterSetting_Origin' : 'float64',
    'HourlyDryBulbTemperature_Origin' : 'int32',
    'HourlyPrecipitation_Origin' : 'float64',
    'HourlyRelativeHumidity_Origin' : 'int32',
    'HourlySkyConditions_Origin' : 'string',
    'HourlyVisibility_Origin' : 'int32',
#     'HourlyWindDirection_Origin' : 'string',
    'HourlyWindGustSpeed_Origin' : 'int32',
    'HourlyWindSpeed_Origin' : 'int32',
#     'REM_Origin' : 'string',
#     'STATION_Dest' : 'string',
#     'WMO_Dest' : 'string',
#     'WBAN_Dest_LCD' : 'string',
    'LATITUDE_Dest' : 'float64',
    'LONGITUDE_Dest' : 'float64',
    'HourlyAltimeterSetting_Dest' : 'float64',
    'HourlyDryBulbTemperature_Dest' : 'int32',
    'HourlyPrecipitation_Dest' : 'float64',
    'HourlyRelativeHumidity_Dest' : 'int32',
    'HourlySkyConditions_Dest' : 'string',
    'HourlyVisibility_Dest' : 'int32',
#     'HourlyWindDirection_Dest' : 'string',
    'HourlyWindGustSpeed_Dest' : 'int32',
    'HourlyWindSpeed_Dest' : 'int32',
#     'REM_Dest' : 'string',
}

___

## 1. Load the dataset

In [22]:
%%time

input_folder = '../../data/output/us_dot-noaa/'
file_name = "3_otp_lcd_2019.csv"

df = pd.read_csv(input_folder + file_name,
                 encoding='latin1',
#                      nrows=1e5,
                 usecols=cols,
                 dtype=cols_dtypes
                )
df.sample(5)

Wall time: 1min 38s


Unnamed: 0,MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,LATITUDE_Dest,LONGITUDE_Dest,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,HourlyWindGustSpeed_Dest,HourlyWindSpeed_Dest
302212,9,6,YX,DTW,DCA,6,17,6,0,7,97,405,42.2313,-83.3308,29.98,63,0.0,87,OVC,10,0,8,38.8472,-77.03454,29.89,66,0.0,81,OVC,9,0,0
7164184,5,6,HA,HNL,KOA,7,14,5,0,8,47,163,21.324,-157.9294,30.03,77,0.0,74,FEW,10,0,5,19.73556,-156.04889,30.01,79,0.0,69,FEW,10,0,6
2546656,3,5,AA,ORD,SFO,17,17,6,0,19,286,1846,41.96019,-87.93162,29.95,46,0.0,61,OVC,10,0,10,37.6197,-122.3647,30.29,53,0.0,80,FEW,10,0,18
5169198,6,4,9E,MSN,MSP,9,14,6,1,10,71,228,43.1405,-89.3452,30.14,84,0.0,53,SCT,10,0,7,44.8831,-93.2289,30.09,67,0.02,87,OVC,8,0,5
1577251,7,2,OH,ECP,CLT,7,13,13,0,10,107,437,30.349,-85.788,30.05,85,0.0,65,CLR,10,0,10,35.2236,-80.9552,30.02,89,0.0,50,OVC,9,0,6


# Prepare user input fields

### `OP_UNIQUE_CARRIER`

In [110]:
carriers_dict = {
                    '9E' : '[9E] Endeavor Air Inc.',
                    'AA' : '[AA] American Airlines Inc.',
                    'AS' : '[AS] Alaska Airlines Inc.',
                    'B6' : '[B6] JetBlue Airways',
                    'DL' : '[DL] Delta Air Lines Inc.',
                    'EV' : '[EV] ExpressJet Airlines LLC',
                    'F9' : '[F9] Frontier Airlines Inc.',
                    'G4' : '[G4] Allegiant Air',
                    'HA' : '[HA] Hawaiian Airlines Inc.',
                    'MQ' : '[MQ] Envoy Air',
                    'NK' : '[NK] Spirit Air Lines',
                    'OH' : '[OH] PSA Airlines Inc.',
                    'OO' : '[OO] SkyWest Airlines Inc.',
                    'UA' : '[UA] United Air Lines Inc.',
                    'WN' : '[WN] Southwest Airlines Co.',
                    'YV' : '[YV] Mesa Airlines Inc.',
                    'YX' : '[YX] Republic Airline'
               }
with open("dict_mappers/carriers_dict.pkl", "wb") as f:
    pickle.dump(carriers_dict, f)
carriers_dict

{'9E': '[9E] Endeavor Air Inc.',
 'AA': '[AA] American Airlines Inc.',
 'AS': '[AS] Alaska Airlines Inc.',
 'B6': '[B6] JetBlue Airways',
 'DL': '[DL] Delta Air Lines Inc.',
 'EV': '[EV] ExpressJet Airlines LLC',
 'F9': '[F9] Frontier Airlines Inc.',
 'G4': '[G4] Allegiant Air',
 'HA': '[HA] Hawaiian Airlines Inc.',
 'MQ': '[MQ] Envoy Air',
 'NK': '[NK] Spirit Air Lines',
 'OH': '[OH] PSA Airlines Inc.',
 'OO': '[OO] SkyWest Airlines Inc.',
 'UA': '[UA] United Air Lines Inc.',
 'WN': '[WN] Southwest Airlines Co.',
 'YV': '[YV] Mesa Airlines Inc.',
 'YX': '[YX] Republic Airline'}

In [42]:
carriers = df['OP_UNIQUE_CARRIER'].value_counts().index

with open("dict_mappers/carriers_sorted_list.pkl", "wb") as f:
    pickle.dump(carriers, f)
carriers

Index(['WN', 'DL', 'AA', 'OO', 'UA', 'YX', 'MQ', 'B6', 'OH', 'AS', '9E', 'YV',
       'NK', 'F9', 'EV', 'G4', 'HA'],
      dtype='object')

### `ORIGIN`

#### Carrier current origins

In [72]:
carrierOrigins_dict = {}
for carrier in df['OP_UNIQUE_CARRIER'].unique():
    carrierOrigins_dict[carrier] = sorted(df.loc[df['OP_UNIQUE_CARRIER'] == carrier, 'ORIGIN'].unique())
with open("dict_mappers/carrierOrigins_dict.pkl", "wb") as f:
    pickle.dump(carrierOrigins_dict, f)
carrierOrigins_dict

{'9E': ['ABE',
  'ACK',
  'AEX',
  'AGS',
  'ALB',
  'ATL',
  'ATW',
  'AUS',
  'AVL',
  'AVP',
  'AZO',
  'BDL',
  'BGR',
  'BHM',
  'BIS',
  'BMI',
  'BNA',
  'BOS',
  'BTR',
  'BTV',
  'BUF',
  'BWI',
  'CAE',
  'CAK',
  'CHA',
  'CHO',
  'CHS',
  'CID',
  'CLE',
  'CLT',
  'CMH',
  'CRW',
  'CSG',
  'CVG',
  'CWA',
  'DAY',
  'DCA',
  'DEN',
  'DFW',
  'DSM',
  'DTW',
  'ELM',
  'EVV',
  'EWN',
  'EWR',
  'EYW',
  'FAR',
  'FAY',
  'FLL',
  'FNT',
  'FSD',
  'FSM',
  'FWA',
  'GNV',
  'GPT',
  'GRB',
  'GRR',
  'GSO',
  'GSP',
  'GTF',
  'GTR',
  'HPN',
  'HSV',
  'IAD',
  'IAH',
  'ILM',
  'IND',
  'JAN',
  'JAX',
  'JFK',
  'LAN',
  'LEX',
  'LFT',
  'LGA',
  'LIT',
  'LNK',
  'MBS',
  'MCI',
  'MCO',
  'MDT',
  'MDW',
  'MEM',
  'MGM',
  'MHT',
  'MIA',
  'MKE',
  'MLI',
  'MOB',
  'MOT',
  'MSN',
  'MSP',
  'MSY',
  'MVY',
  'MYR',
  'OAJ',
  'OMA',
  'ORD',
  'ORF',
  'PBI',
  'PHF',
  'PHL',
  'PIA',
  'PIT',
  'PVD',
  'PWM',
  'RAP',
  'RDU',
  'RIC',
  'ROA',
  'ROC',
  'R

In [None]:
    destType = st.radio('Departure airport', options=['Currently operated by the A/L', 'All airports'], index=0, key=1)
    if destType == 'Currently operated by the A/L':
        with open(root + "dict_mappers/carrierDests_dict.pkl", "rb") as f:
            carrierDests_dict = pickle.load(f)
            dests = carrierDests_dict[carrier]
        dest = st.selectbox('Dest', dests)
    elif destType == 'All airports':
    with open(root + "dict_mappers/dests_sorted_list.pkl", "rb") as f:
        dests = pickle.load(f)
    dest = st.selectbox('Destination', dests)

#### All airports

In [44]:
origins = sorted(df['ORIGIN'].unique())

with open("dict_mappers/origins_sorted_list.pkl", "wb") as f:
    pickle.dump(origins, f)
origins

['ABE',
 'ABI',
 'ABQ',
 'ABR',
 'ABY',
 'ACK',
 'ACT',
 'ACV',
 'ACY',
 'ADK',
 'ADQ',
 'AEX',
 'AGS',
 'AKN',
 'ALB',
 'ALO',
 'AMA',
 'ANC',
 'APN',
 'ART',
 'ASE',
 'ATL',
 'ATW',
 'ATY',
 'AUS',
 'AVL',
 'AVP',
 'AZO',
 'BDL',
 'BET',
 'BFF',
 'BFL',
 'BFM',
 'BGM',
 'BGR',
 'BHM',
 'BIL',
 'BIS',
 'BJI',
 'BLI',
 'BLV',
 'BMI',
 'BNA',
 'BOI',
 'BOS',
 'BPT',
 'BQK',
 'BQN',
 'BRD',
 'BRO',
 'BRW',
 'BTM',
 'BTR',
 'BTV',
 'BUF',
 'BUR',
 'BWI',
 'BZN',
 'CAE',
 'CAK',
 'CDC',
 'CDV',
 'CGI',
 'CHA',
 'CHO',
 'CHS',
 'CID',
 'CIU',
 'CKB',
 'CLE',
 'CLL',
 'CLT',
 'CMH',
 'CMI',
 'CMX',
 'CNY',
 'COD',
 'COS',
 'COU',
 'CPR',
 'CRP',
 'CRW',
 'CSG',
 'CVG',
 'CWA',
 'CYS',
 'DAB',
 'DAL',
 'DAY',
 'DBQ',
 'DCA',
 'DEN',
 'DFW',
 'DLG',
 'DLH',
 'DRO',
 'DRT',
 'DSM',
 'DTW',
 'DVL',
 'EAR',
 'EAU',
 'ECP',
 'EGE',
 'EKO',
 'ELM',
 'ELP',
 'ERI',
 'EUG',
 'EVV',
 'EWN',
 'EWR',
 'EYW',
 'FAI',
 'FAR',
 'FAT',
 'FAY',
 'FLG',
 'FLL',
 'FNT',
 'FSD',
 'FSM',
 'FWA',
 'GCC',
 'GCK',


### `DEST`

#### Carrier current destinations

In [73]:
carrierDests_dict = {}
for carrier in df['OP_UNIQUE_CARRIER'].unique():
    carrierDests_dict[carrier] = sorted(df.loc[df['OP_UNIQUE_CARRIER'] == carrier, 'DEST'].unique())
with open("dict_mappers/carrierDests_dict.pkl", "wb") as f:
    pickle.dump(carrierDests_dict, f)
carrierDests_dict

{'9E': ['ABE',
  'ACK',
  'AEX',
  'AGS',
  'ALB',
  'ATL',
  'ATW',
  'AUS',
  'AVL',
  'AVP',
  'AZO',
  'BDL',
  'BGR',
  'BHM',
  'BIS',
  'BMI',
  'BNA',
  'BOS',
  'BTR',
  'BTV',
  'BUF',
  'BWI',
  'CAE',
  'CAK',
  'CHA',
  'CHO',
  'CHS',
  'CID',
  'CLE',
  'CLT',
  'CMH',
  'CRW',
  'CSG',
  'CVG',
  'CWA',
  'DAY',
  'DCA',
  'DEN',
  'DFW',
  'DSM',
  'DTW',
  'ELM',
  'EVV',
  'EWN',
  'EWR',
  'EYW',
  'FAR',
  'FAY',
  'FLL',
  'FNT',
  'FSD',
  'FSM',
  'FWA',
  'GNV',
  'GPT',
  'GRB',
  'GRR',
  'GSO',
  'GSP',
  'GTF',
  'GTR',
  'HPN',
  'HSV',
  'IAD',
  'IAH',
  'ILM',
  'IND',
  'JAN',
  'JAX',
  'JFK',
  'LAN',
  'LEX',
  'LFT',
  'LGA',
  'LIT',
  'LNK',
  'MBS',
  'MCI',
  'MCO',
  'MDT',
  'MDW',
  'MEM',
  'MGM',
  'MHT',
  'MIA',
  'MKE',
  'MLI',
  'MOB',
  'MOT',
  'MSN',
  'MSP',
  'MSY',
  'MVY',
  'MYR',
  'OAJ',
  'OMA',
  'ORD',
  'ORF',
  'PBI',
  'PHF',
  'PHL',
  'PIA',
  'PIT',
  'PVD',
  'PWM',
  'RAP',
  'RDU',
  'RIC',
  'ROA',
  'ROC',
  'R

#### All airports

In [46]:
dests = sorted(df['DEST'].unique())

with open("dict_mappers/dests_sorted_list.pkl", "wb") as f:
    pickle.dump(dests, f)
dests

['ABE',
 'ABI',
 'ABQ',
 'ABR',
 'ABY',
 'ACK',
 'ACT',
 'ACV',
 'ACY',
 'ADK',
 'ADQ',
 'AEX',
 'AGS',
 'AKN',
 'ALB',
 'ALO',
 'AMA',
 'ANC',
 'APN',
 'ART',
 'ASE',
 'ATL',
 'ATW',
 'ATY',
 'AUS',
 'AVL',
 'AVP',
 'AZO',
 'BDL',
 'BET',
 'BFF',
 'BFL',
 'BFM',
 'BGM',
 'BGR',
 'BHM',
 'BIL',
 'BIS',
 'BJI',
 'BLI',
 'BLV',
 'BMI',
 'BNA',
 'BOI',
 'BOS',
 'BPT',
 'BQK',
 'BQN',
 'BRD',
 'BRO',
 'BRW',
 'BTM',
 'BTR',
 'BTV',
 'BUF',
 'BUR',
 'BWI',
 'BZN',
 'CAE',
 'CAK',
 'CDC',
 'CDV',
 'CGI',
 'CHA',
 'CHO',
 'CHS',
 'CID',
 'CIU',
 'CKB',
 'CLE',
 'CLL',
 'CLT',
 'CMH',
 'CMI',
 'CMX',
 'CNY',
 'COD',
 'COS',
 'COU',
 'CPR',
 'CRP',
 'CRW',
 'CSG',
 'CVG',
 'CWA',
 'CYS',
 'DAB',
 'DAL',
 'DAY',
 'DBQ',
 'DCA',
 'DEN',
 'DFW',
 'DLG',
 'DLH',
 'DRO',
 'DRT',
 'DSM',
 'DTW',
 'DVL',
 'EAR',
 'EAU',
 'ECP',
 'EGE',
 'EKO',
 'ELM',
 'ELP',
 'ERI',
 'EUG',
 'EVV',
 'EWN',
 'EWR',
 'EYW',
 'FAI',
 'FAR',
 'FAT',
 'FAY',
 'FLG',
 'FLL',
 'FNT',
 'FSD',
 'FSM',
 'FWA',
 'GCC',
 'GCK',


### `DEP_TIME_hour`

In [48]:
depTimeHours = df['DEP_TIME_hour'].unique()

with open("dict_mappers/depTimeHours_list.pkl", "wb") as f:
    pickle.dump(depTimeHours, f)
depTimeHours

<StringArray>
['11', '10',  '9',  '8',  '7',  '0',  '6', '12', '23', '17', '18', '20', '16',
 '15', '13', '14', '19', '22', '21']
Length: 19, dtype: string

### `ARR_TIME_hour`

#### `arrTimeHour_dict`: known routes
Based on the most frequent value *(mode)* for the combination of: 'ORIGIN', 'DEST', 'OP_UNIQUE_CARRIER' and 'DEP_TIME_hour'

In [109]:
df2 = df[['ORIGIN', 'DEST', 'OP_UNIQUE_CARRIER', 'DEP_TIME_hour', 'ARR_TIME_hour']].copy()
df2['key'] = df2['ORIGIN'] + '_' + df2['DEST'] + '_'  + df2['OP_UNIQUE_CARRIER'] + '_'  + df2['DEP_TIME_hour']
df3 = df2.sort_values(by='key', ascending=True)[['key', 'ARR_TIME_hour']]
grp = df3.groupby('key')['ARR_TIME_hour'].agg(lambda x : x.mode()[0]) # .reset_index()
arrTimeHour_dict = dict(zip(grp.index, grp.values))
with open("dict_mappers/arrTimeHour_dict.pkl", "wb") as f:
    pickle.dump(arrTimeHour_dict, f)
arrTimeHour_dict

{'ABE_ATL_9E_13': '15',
 'ABE_ATL_9E_14': '17',
 'ABE_ATL_9E_15': '17',
 'ABE_ATL_9E_17': '20',
 'ABE_ATL_9E_18': '20',
 'ABE_ATL_9E_6': '8',
 'ABE_ATL_DL_17': '20',
 'ABE_ATL_DL_6': '8',
 'ABE_BNA_G4_13': '14',
 'ABE_BNA_G4_14': '15',
 'ABE_BNA_G4_17': '18',
 'ABE_BNA_G4_18': '19',
 'ABE_BNA_G4_19': '20',
 'ABE_CLT_OH_0': '7',
 'ABE_CLT_OH_11': '13',
 'ABE_CLT_OH_16': '19',
 'ABE_CLT_OH_18': '21',
 'ABE_CLT_OH_19': '21',
 'ABE_CLT_OH_6': '8',
 'ABE_CLT_OH_8': '10',
 'ABE_DTW_OO_0': '7',
 'ABE_DTW_OO_12': '14',
 'ABE_DTW_OO_13': '15',
 'ABE_DTW_OO_17': '19',
 'ABE_DTW_OO_6': '7',
 'ABE_DTW_OO_9': '11',
 'ABE_FLL_G4_10': '13',
 'ABE_FLL_G4_11': '13',
 'ABE_FLL_G4_14': '16',
 'ABE_FLL_G4_17': '20',
 'ABE_FLL_G4_18': '21',
 'ABE_FLL_G4_19': '22',
 'ABE_FLL_G4_9': '12',
 'ABE_MYR_G4_11': '12',
 'ABE_MYR_G4_12': '13',
 'ABE_MYR_G4_14': '16',
 'ABE_MYR_G4_15': '16',
 'ABE_MYR_G4_16': '18',
 'ABE_MYR_G4_17': '19',
 'ABE_MYR_G4_18': '20',
 'ABE_MYR_G4_20': '22',
 'ABE_MYR_G4_8': '9',
 'ABE_ORD

In [97]:
#  'BOS_ATL_B6_9': '12',
df[(df['ORIGIN'] == 'BOS') & (df['DEST'] == 'ATL') & \
   (df['OP_UNIQUE_CARRIER'] == 'B6') & (df['DEP_TIME_hour'] == '9') & \
   (df['MONTH'] == '02') & (df['DAY_OF_WEEK'] == '1')]['ARR_TIME_hour'].value_counts()

12    2
11    1
Name: ARR_TIME_hour, dtype: Int64

In [88]:
arrTimeHour_dict['BOS_ATL_B6_9']

'12'

#### `arrTimeHour_dict_2`: unknown routes
- This will be used only in case the first dictionary does not contain the user selected combination of: 'ORIGIN', 'DEST', 'OP_UNIQUE_CARRIER' and 'DEP_TIME_hour'  
- This new dictionary is based on the most frequent value *(mode)* for the combination of: 'ORIGIN' and 'DEST'

In [137]:
grp = df.groupby(['ORIGIN', 'DEST'], as_index=False)['CRS_ELAPSED_TIME'].agg(lambda x : x.mode()[0])
arrTimeHour_dict_2 = dict(zip(grp['ORIGIN'] + '_' + grp['DEST'],
                              (grp['CRS_ELAPSED_TIME'] / 60).round(decimals=0).astype('int')))
with open("dict_mappers/arrTimeHour_dict_2.pkl", "wb") as f:
    pickle.dump(arrTimeHour_dict_2, f)
arrTimeHour_dict_2

{'ABE_ATL': 2,
 'ABE_BNA': 2,
 'ABE_CLT': 2,
 'ABE_DTW': 2,
 'ABE_FLL': 3,
 'ABE_MYR': 2,
 'ABE_ORD': 2,
 'ABE_PGD': 3,
 'ABE_PIE': 3,
 'ABE_SAV': 2,
 'ABE_SFB': 2,
 'ABI_DFW': 1,
 'ABQ_ATL': 3,
 'ABQ_AUS': 2,
 'ABQ_BWI': 4,
 'ABQ_DAL': 2,
 'ABQ_DEN': 1,
 'ABQ_DFW': 2,
 'ABQ_HOU': 2,
 'ABQ_IAH': 2,
 'ABQ_JFK': 4,
 'ABQ_LAS': 2,
 'ABQ_LAX': 2,
 'ABQ_MCI': 2,
 'ABQ_MCO': 4,
 'ABQ_MDW': 3,
 'ABQ_MSP': 3,
 'ABQ_OAK': 2,
 'ABQ_ORD': 3,
 'ABQ_PDX': 3,
 'ABQ_PHX': 1,
 'ABQ_SAN': 2,
 'ABQ_SAT': 2,
 'ABQ_SEA': 3,
 'ABQ_SFO': 3,
 'ABQ_SJC': 2,
 'ABQ_SLC': 2,
 'ABR_MSP': 1,
 'ABY_ATL': 1,
 'ACK_BOS': 1,
 'ACK_CLT': 3,
 'ACK_DCA': 2,
 'ACK_HPN': 1,
 'ACK_JFK': 1,
 'ACK_LGA': 1,
 'ACT_DFW': 1,
 'ACV_DEN': 3,
 'ACV_LAX': 2,
 'ACV_SFO': 1,
 'ACY_ATL': 2,
 'ACY_FLL': 3,
 'ACY_MCO': 3,
 'ACY_MYR': 2,
 'ACY_PBI': 3,
 'ACY_RSW': 3,
 'ACY_TPA': 2,
 'ADK_ANC': 3,
 'ADQ_ANC': 1,
 'AEX_ATL': 2,
 'AEX_DFW': 1,
 'AEX_IAH': 1,
 'AGS_ATL': 1,
 'AGS_CLT': 1,
 'AGS_DCA': 2,
 'AGS_DFW': 3,
 'AGS_LGA': 2,
 'AGS_MIA'

### `TAXI_OUT`

In [8]:
df['ORIGIN'] + '_' + df['OP_UNIQUE_CARRIER']

0          TYS_9E
1          SRQ_DL
2          DTW_WN
3          DTW_DL
4          LGA_WN
            ...  
7200048    JNU_AS
7200049    JNU_AS
7200050    JNU_AS
7200051    JNU_AS
7200052    JNU_AS
Length: 7200053, dtype: string

In [107]:
col = 'TAXI_OUT_median'
cols = ['ORIGIN', 'OP_UNIQUE_CARRIER']
taxi_out_df = df.drop_duplicates(subset=cols, keep='first')
taxi_out_dict = dict(zip(taxi_out_df[cols[0]] + '_' + taxi_out_df[cols[1]], taxi_out_df[col]))
with open("dict_mappers/taxi_out_dict.pkl", "wb") as f:
    pickle.dump(taxi_out_dict, f)
taxi_out_dict

{'TYS_9E': 15,
 'SRQ_DL': 11,
 'DTW_WN': 11,
 'DTW_DL': 16,
 'LGA_WN': 20,
 'LGA_YX': 25,
 'LGA_DL': 22,
 'ROA_OO': 16,
 'TPA_DL': 13,
 'RDU_WN': 9,
 'RDU_DL': 15,
 'BOS_B6': 16,
 'BOS_DL': 19,
 'MSP_DL': 14,
 'JAX_DL': 13,
 'IND_WN': 10,
 'IND_DL': 13,
 'BTR_9E': 15,
 'TLH_9E': 14,
 'GSP_WN': 10,
 'DAY_DL': 13,
 'MEM_DL': 13,
 'CMH_WN': 9,
 'CMH_DL': 11,
 'SAT_WN': 8,
 'DFW_AA': 16,
 'BWI_DL': 13,
 'CAE_9E': 16,
 'CLE_DL': 12,
 'DEN_OO': 19,
 'DEN_DL': 14,
 'FLL_DL': 16,
 'LAS_WN': 12,
 'LAS_DL': 14,
 'MCI_WN': 9,
 'MCO_DL': 16,
 'PBI_DL': 14,
 'PHL_AA': 20,
 'PHL_DL': 14,
 'PHX_WN': 10,
 'PNS_DL': 15,
 'RSW_DL': 12,
 'SLC_DL': 16,
 'STL_WN': 9,
 'STL_DL': 11,
 'TUS_DL': 12,
 'CLT_DL': 18,
 'DCA_WN': 13,
 'DCA_DL': 15,
 'LEX_9E': 16,
 'XNA_OO': 16,
 'GPT_9E': 16,
 'IAH_YV': 17,
 'IAH_DL': 13,
 'TYS_OO': 14,
 'LGA_OO': 23,
 'LGA_UA': 19,
 'ORF_OO': 14,
 'RDU_AA': 17,
 'MSP_YX': 18,
 'MSP_AA': 16,
 'PIT_MQ': 13,
 'GSP_EV': 16,
 'AUS_UA': 14,
 'CMH_MQ': 14,
 'BDL_AA': 13,
 'DFW_YX': 17,


In [39]:
df[(df['ORIGIN'] == 'PSG') & (df['OP_UNIQUE_CARRIER'] == 'AS')]['TAXI_OUT_median'].unique()

array([9])

### `TAXI_IN`

In [40]:
df['DEST'] + '_' + df['OP_UNIQUE_CARRIER']

0          ATL_9E
1          ATL_DL
2          ATL_WN
3          ATL_DL
4          ATL_WN
            ...  
7200048    GST_AS
7200049    GST_AS
7200050    GST_AS
7200051    GST_AS
7200052    GST_AS
Length: 7200053, dtype: string

In [108]:
col = 'TAXI_IN_median'
cols = ['DEST', 'OP_UNIQUE_CARRIER']
taxi_in_df = df.drop_duplicates(subset=cols, keep='first')
taxi_in_dict = dict(zip(taxi_in_df[cols[0]] + '_' + taxi_in_df[cols[1]], taxi_in_df[col]))
with open("dict_mappers/taxi_in_dict.pkl", "wb") as f:
    pickle.dump(taxi_in_dict, f)
taxi_in_dict

{'ATL_9E': 8,
 'ATL_DL': 8,
 'ATL_WN': 7,
 'ATL_YX': 9,
 'ATL_OO': 8,
 'ATL_B6': 9,
 'ATL_AA': 9,
 'ATL_YV': 8,
 'ORD_OO': 13,
 'ORD_UA': 11,
 'ORD_AA': 14,
 'ORD_YX': 12,
 'ORD_MQ': 13,
 'ORD_EV': 13,
 'ORD_AS': 14,
 'ORD_F9': 13,
 'ORD_DL': 11,
 'IAH_EV': 8,
 'IAH_UA': 7,
 'IAH_YX': 8,
 'IAH_YV': 8,
 'IAH_AA': 8,
 'IAH_OO': 9,
 'IAH_NK': 9,
 'ATL_F9': 10,
 'ATL_UA': 7,
 'ATL_NK': 10,
 'CLT_OH': 13,
 'CLT_AA': 9,
 'CLT_F9': 10,
 'CLT_YX': 11,
 'CLT_OO': 12,
 'CLT_MQ': 15,
 'ORD_B6': 15,
 'ORD_NK': 13,
 'MSP_9E': 6,
 'MSP_NK': 6,
 'MSP_UA': 4,
 'MSP_YX': 5,
 'MSP_DL': 5,
 'MSP_F9': 7,
 'MSP_OO': 6,
 'FLL_G4': 8,
 'FLL_NK': 7,
 'FLL_B6': 6,
 'FLL_WN': 7,
 'FLL_UA': 7,
 'FLL_AA': 6,
 'FLL_DL': 7,
 'PGD_G4': 6,
 'DTW_9E': 8,
 'DTW_MQ': 9,
 'DTW_DL': 7,
 'DTW_NK': 10,
 'DTW_OO': 9,
 'DTW_YX': 9,
 'DTW_YV': 8,
 'DTW_WN': 7,
 'DTW_F9': 10,
 'MSP_B6': 7,
 'ORD_OH': 10,
 'DCA_OH': 4,
 'DCA_OO': 4,
 'DCA_B6': 4,
 'DCA_YX': 6,
 'DCA_AA': 5,
 'IAH_DL': 7,
 'ATL_MQ': 7,
 'ATL_OH': 8,
 'LGA_9E': 8,

In [42]:
df[(df['DEST'] == 'IAH') & (df['OP_UNIQUE_CARRIER'] == 'UA')]['TAXI_IN_median'].unique()

array([7])

### `HourlySkyConditions`

In [156]:
sky_dict = {
            'CLR' : '[CLR] Clear sky',
            'FEW' : '[FEW] Few clouds',
            'SCT' : '[SCT] Scattered clouds',
            'BKN' : '[BKN] Broken clouds',
            'OVC' : '[OVC] Overcast',
            'VV' : '[VV] Obscured sky'
           }
with open("dict_mappers/sky_dict.pkl", "wb") as f:
    pickle.dump(sky_dict, f)
sky_dict

{'CLR': '[CLR] Clear sky',
 'FEW': '[FEW] Few clouds',
 'SCT': '[SCT] Scattered clouds',
 'BKN': '[BKN] Broken clouds',
 'OVC': '[OVC] Overcast',
 'VV': '[VV] Obscured sky'}

### `DISTANCE`

In [7]:
# Haversine formula : determines the great-circle distance between two points on a sphere given their longitudes and latitudes

from math import cos, asin, sqrt, pi

def distance(lat1, lon1, lat2, lon2):
    p = pi/180
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    return 12742 * asin(sqrt(a))

In [20]:
airports_df = df.drop_duplicates(subset=['ORIGIN'], keep='first')[['ORIGIN', 'LATITUDE_Origin', 'LONGITUDE_Origin']]
airports_df = airports_df.round(decimals=4)
airports_df

Unnamed: 0,ORIGIN,LATITUDE_Origin,LONGITUDE_Origin
0,TYS,35.8181,-83.9858
1,SRQ,27.4014,-82.5586
2,DTW,42.2313,-83.3308
4,LGA,40.7794,-73.8804
7,ROA,37.3169,-79.9741
...,...,...,...
6621341,WRG,56.4732,-132.3874
6621359,PSG,56.8055,-132.9372
6622180,DLG,59.0500,-158.5167
7193994,YAK,59.5120,-139.6712


In [22]:
airportCoordinates_dict = dict(zip(airports_df['ORIGIN'], zip(airports_df['LATITUDE_Origin'], airports_df['LONGITUDE_Origin'])))
airportCoordinates_dict

{'TYS': (35.8181, -83.9858),
 'SRQ': (27.4014, -82.5586),
 'DTW': (42.2313, -83.3308),
 'LGA': (40.7794, -73.8804),
 'ROA': (37.3169, -79.9741),
 'TPA': (27.9619, -82.5403),
 'RDU': (35.8923, -78.7819),
 'BOS': (42.3606, -71.0097),
 'MSP': (44.8831, -93.2289),
 'JAX': (30.495, -81.6936),
 'IND': (39.7252, -86.2817),
 'BTR': (30.5372, -91.1469),
 'TLH': (30.3931, -84.3533),
 'GSP': (34.9061, -82.2125),
 'DAY': (39.9064, -84.2185),
 'MEM': (35.0564, -89.9865),
 'CMH': (39.9907, -82.877),
 'SAT': (29.5443, -98.4839),
 'DFW': (32.8978, -97.0189),
 'BWI': (39.1733, -76.684),
 'CAE': (33.9423, -81.118),
 'CLE': (41.4057, -81.852),
 'DEN': (39.8328, -104.6575),
 'FLL': (26.0788, -80.1622),
 'LAS': (36.0719, -115.1634),
 'MCI': (39.2972, -94.7306),
 'MCO': (28.4339, -81.325),
 'PBI': (26.6847, -80.0994),
 'PHL': (39.8733, -75.2268),
 'PHX': (33.4277, -112.0038),
 'PNS': (30.4781, -87.1869),
 'RSW': (26.5361, -81.755),
 'SLC': (40.7781, -111.9694),
 'STL': (38.7525, -90.3736),
 'TUS': (32.1313,

In [53]:
distance_dict = {}
for airport in airportCoordinates_dict.keys():
    lat1, lon1 = airportCoordinates_dict[airport][0], airportCoordinates_dict[airport][1]
    for aprt, coordinates in airportCoordinates_dict.items():
        lat2, lon2 = airportCoordinates_dict[aprt][0], airportCoordinates_dict[aprt][1]
        dist = int(round(distance(lat1, lon1, lat2, lon2)) * 0.621371) # km to miles (mi)
        distance_dict[airport + '_' + aprt] = dist

In [54]:
# Quick check to validate results:
distance_dict['TYS_ATL'], distance_dict['SRQ_ATL'], distance_dict['DTW_ATL']

(153, 444, 597)

In [55]:
with open("dict_mappers/distance_dict.pkl", "wb") as f:
    pickle.dump(distance_dict, f)
distance_dict

{'TYS_TYS': 0,
 'TYS_SRQ': 587,
 'TYS_DTW': 444,
 'TYS_LGA': 645,
 'TYS_ROA': 245,
 'TYS_TPA': 549,
 'TYS_RDU': 291,
 'TYS_BOS': 828,
 'TYS_MSP': 792,
 'TYS_JAX': 390,
 'TYS_IND': 297,
 'TYS_BTR': 551,
 'TYS_TLH': 375,
 'TYS_GSP': 118,
 'TYS_DAY': 282,
 'TYS_MEM': 341,
 'TYS_CMH': 294,
 'TYS_SAT': 946,
 'TYS_DFW': 769,
 'TYS_BWI': 462,
 'TYS_CAE': 208,
 'TYS_CLE': 402,
 'TYS_DEN': 1158,
 'TYS_FLL': 709,
 'TYS_LAS': 1736,
 'TYS_MCI': 635,
 'TYS_MCO': 533,
 'TYS_PBI': 671,
 'TYS_PHL': 553,
 'TYS_PHX': 1596,
 'TYS_PNS': 412,
 'TYS_RSW': 654,
 'TYS_SLC': 1548,
 'TYS_STL': 405,
 'TYS_TUS': 1560,
 'TYS_CLT': 175,
 'TYS_DCA': 435,
 'TYS_LEX': 157,
 'TYS_XNA': 576,
 'TYS_GPT': 475,
 'TYS_IAH': 772,
 'TYS_ORF': 439,
 'TYS_PIT': 382,
 'TYS_AUS': 882,
 'TYS_BDL': 739,
 'TYS_OMA': 746,
 'TYS_SAV': 300,
 'TYS_LAX': 1944,
 'TYS_PDX': 2109,
 'TYS_SAN': 1898,
 'TYS_SEA': 2110,
 'TYS_SFO': 2114,
 'TYS_SNA': 1919,
 'TYS_BNA': 152,
 'TYS_PIA': 455,
 'TYS_MIA': 726,
 'TYS_ATL': 153,
 'TYS_SYR': 655,
 'TYS

### `LATITUDE` / `LONGITUDE`

In [32]:
latitude = df.drop_duplicates(subset=['ORIGIN'], keep='first')[['ORIGIN', 'LATITUDE_Origin', 'LONGITUDE_Origin']]
latitude_dict = dict(zip(latitude['ORIGIN'], latitude['LATITUDE_Origin']))
with open("dict_mappers/latitude_dict.pkl", "wb") as f:
    pickle.dump(latitude_dict, f)
latitude_dict

{'TYS': 35.8181,
 'SRQ': 27.40139000000001,
 'DTW': 42.2313,
 'LGA': 40.77944,
 'ROA': 37.3169,
 'TPA': 27.96194,
 'RDU': 35.8923,
 'BOS': 42.3606,
 'MSP': 44.8831,
 'JAX': 30.495,
 'IND': 39.72517,
 'BTR': 30.5372,
 'TLH': 30.39306,
 'GSP': 34.90611,
 'DAY': 39.9064,
 'MEM': 35.0564,
 'CMH': 39.9907,
 'SAT': 29.5443,
 'DFW': 32.8978,
 'BWI': 39.1733,
 'CAE': 33.94228,
 'CLE': 41.4057,
 'DEN': 39.8328,
 'FLL': 26.07875,
 'LAS': 36.0719,
 'MCI': 39.2972,
 'MCO': 28.4339,
 'PBI': 26.6847,
 'PHL': 39.87327,
 'PHX': 33.4277,
 'PNS': 30.47806,
 'RSW': 26.53611,
 'SLC': 40.7781,
 'STL': 38.7525,
 'TUS': 32.1313,
 'CLT': 35.2236,
 'DCA': 38.8472,
 'LEX': 38.0408,
 'XNA': 36.28333,
 'GPT': 30.4119,
 'IAH': 29.98,
 'ORF': 36.9033,
 'PIT': 40.4846,
 'AUS': 30.3208,
 'BDL': 41.9375,
 'OMA': 41.3102,
 'SAV': 32.13133,
 'LAX': 33.938,
 'PDX': 45.5958,
 'SAN': 32.7336,
 'SEA': 47.4444,
 'SFO': 37.6197,
 'SNA': 33.68,
 'BNA': 36.11889,
 'PIA': 40.6675,
 'MIA': 25.7881,
 'ATL': 33.6301,
 'SYR': 43.111

In [33]:
longitude = df.drop_duplicates(subset=['ORIGIN'], keep='first')[['ORIGIN', 'LATITUDE_Origin', 'LONGITUDE_Origin']]
longitude_dict = dict(zip(longitude['ORIGIN'], longitude['LONGITUDE_Origin']))
with open("dict_mappers/longitude_dict.pkl", "wb") as f:
    pickle.dump(longitude_dict, f)
longitude_dict

{'TYS': -83.9858,
 'SRQ': -82.55861,
 'DTW': -83.3308,
 'LGA': -73.88035,
 'ROA': -79.9741,
 'TPA': -82.5403,
 'RDU': -78.7819,
 'BOS': -71.0097,
 'MSP': -93.2289,
 'JAX': -81.6936,
 'IND': -86.28168000000002,
 'BTR': -91.1469,
 'TLH': -84.35333,
 'GSP': -82.21255,
 'DAY': -84.2185,
 'MEM': -89.9865,
 'CMH': -82.87700000000002,
 'SAT': -98.4839,
 'DFW': -97.0189,
 'BWI': -76.684,
 'CAE': -81.11803,
 'CLE': -81.852,
 'DEN': -104.6575,
 'FLL': -80.16216999999997,
 'LAS': -115.1634,
 'MCI': -94.7306,
 'MCO': -81.325,
 'PBI': -80.0994,
 'PHL': -75.22678,
 'PHX': -112.0038,
 'PNS': -87.18694,
 'RSW': -81.755,
 'SLC': -111.9694,
 'STL': -90.3736,
 'TUS': -110.9552,
 'CLT': -80.9552,
 'DCA': -77.03454,
 'LEX': -84.6058,
 'XNA': -94.3,
 'GPT': -89.0808,
 'IAH': -95.36,
 'ORF': -76.1922,
 'PIT': -80.2144,
 'AUS': -97.7604,
 'BDL': -72.6819,
 'OMA': -95.8991,
 'SAV': -81.20237,
 'LAX': -118.3888,
 'PDX': -122.6093,
 'SAN': -117.1831,
 'SEA': -122.3138,
 'SFO': -122.3647,
 'SNA': -117.86639,
 'BN

### Weather Forecast API

**Openweather NWP model**

Openweather NWP (numerical weather prediction) allows to calculate weather data for any location. We use proprietary convolutional neural network that collects and processes wide range of data sources to cover any location and consider the local nuances of climate. ML technology allows us to reach resolution about 500 m and very high accuracy between 90% and 100% with inaccuracy about 1%. Amongst data sources we feed to the NWP are 82,000 weather stations spread globally; national meteorological agencies (NOAA, Environment Canada, Met Office, etc.), radars, weather satellites.

In [50]:
import requests

# Get/Build the url:
coord_API_endpoint = "http://api.openweathermap.org/data/2.5/onecall?"
join_key = "&appid="
API_key = "b51fbb9d87131aabef6d7c2cd42b128e"
units = "&units=imperial"
exclude = "&exclude=current,minutely,daily,alerts"

lat, lon = 33.63, -84.45
lat_lon = "lat=" + str(round(lat, 2))+ "&lon=" + str(round(lon, 2))

url = coord_API_endpoint + lat_lon + exclude + join_key + API_key + units
print(url)

# Instantiate the request:
res = requests.get(url)
# Retrieve the corresponding JSON data:
data = res.json()

data['hourly']

http://api.openweathermap.org/data/2.5/onecall?lat=33.63&lon=-84.45&exclude=current,minutely,daily,alerts&appid=b51fbb9d87131aabef6d7c2cd42b128e&units=imperial


[{'dt': 1620054000,
  'temp': 68.88,
  'feels_like': 69.66,
  'pressure': 1012,
  'humidity': 89,
  'dew_point': 65.52,
  'uvi': 2.05,
  'clouds': 92,
  'visibility': 10000,
  'wind_speed': 9.98,
  'wind_deg': 207,
  'wind_gust': 27.07,
  'weather': [{'id': 500,
    'main': 'Rain',
    'description': 'light rain',
    'icon': '10d'}],
  'pop': 1,
  'rain': {'1h': 0.55}},
 {'dt': 1620057600,
  'temp': 68.81,
  'feels_like': 69.53,
  'pressure': 1012,
  'humidity': 88,
  'dew_point': 65.12,
  'uvi': 4.69,
  'clouds': 90,
  'visibility': 10000,
  'wind_speed': 10.38,
  'wind_deg': 213,
  'wind_gust': 24.87,
  'weather': [{'id': 500,
    'main': 'Rain',
    'description': 'light rain',
    'icon': '10d'}],
  'pop': 1,
  'rain': {'1h': 0.21}},
 {'dt': 1620061200,
  'temp': 69.78,
  'feels_like': 70.56,
  'pressure': 1012,
  'humidity': 87,
  'dew_point': 65.73,
  'uvi': 5.57,
  'clouds': 92,
  'visibility': 10000,
  'wind_speed': 13.67,
  'wind_deg': 213,
  'wind_gust': 24.81,
  'weather': 

In [171]:
import requests

# Get/Build the url:
coord_API_endpoint = "http://api.openweathermap.org/data/2.5/onecall?"
join_key = "&appid="
API_key = "b51fbb9d87131aabef6d7c2cd42b128e"
units = "&units=imperial"
exclude = "&exclude=current,minutely,daily,alerts"

lat, lon = 33.63, -84.45
lat_lon = "lat=" + str(round(lat, 2))+ "&lon=" + str(round(lon, 2))

url = coord_API_endpoint + lat_lon + exclude + join_key + API_key + units


forecast_json_data = requests.get(url).json()
df_predictions = pd.DataFrame()


# Creating empty lists
days = []
hours = []
pressures = []
temperatures = []
precipitations = []
relHumidities = []
skyConditions = []
visibilities = []
windGusts = []
winds = []


# Loop Through the JSON
for num_forecasts in forecast_json_data['hourly']: # Hourly forecast for next 48 hours
    days.append(datetime.datetime.fromtimestamp(num_forecasts['dt']).strftime('%Y-%m-%d'))
    hours.append(int(datetime.datetime.fromtimestamp(num_forecasts['dt']).strftime('%H')))
    pressures.append(round(num_forecasts['pressure'] * 0.029529983071445, 2)) # hPa to inHg
    temperatures.append(int(round(num_forecasts['temp'], 0))) # imperial: Fahrenheit
    try: # "Where available"
        precipitations.append(round(num_forecasts['rain']['1h'] * 0.03937007874015748, 2)) # mm to in
    except KeyError:
        precipitations.append(0)
    relHumidities.append(num_forecasts['humidity']) # Humidity, %
    skyConditions.append(num_forecasts['clouds']) # Cloudiness, %
    visibilities.append(int(round(num_forecasts['visibility'] * 0.0006213712, 0))) # m to mi
    try: # "Where available"
        windGusts.append(int(round(num_forecasts['wind_gust'] * 0.8689762, 0))) # mi/h to kt
    except KeyError:
        windGusts.append(0)
    winds.append(int(round(num_forecasts['wind_speed'] * 0.8689762, 0))) # mi/h to kt

# Put data into a dataframe

def skycond(x):
    if x == 0:
        return 'CLR'
    elif x < 2/8 * 100:
        return 'FEW'
    elif x < 4/8 * 100:
        return 'SCT'
    elif x < 7/8 * 100:
        return 'BKN'
    elif x <= 8/8 * 100:
        return 'OVC'

skyConditions = list(map(skycond, skyConditions))

df_predictions['day'] = days
df_predictions['hour'] = [str(hour) for hour in hours]
df_predictions['pressure_inHg'] = pressures
df_predictions['temperature_F'] = temperatures
df_predictions['precipitation_in'] = precipitations
df_predictions['relHumidity_%'] = relHumidities
df_predictions['skyCondition'] = skyConditions
df_predictions['visibility_mi'] = visibilities
df_predictions['windGust_kt'] = windGusts
df_predictions['wind_kt'] = winds

df_predictions

Unnamed: 0,day,hour,pressure_inHg,temperature_F,precipitation_in,relHumidity_%,skyCondition,visibility_mi,windGust_kt,wind_kt
0,2021-05-03,19,29.85,72,0.05,87,OVC,6,22,12
1,2021-05-03,20,29.85,71,0.0,88,OVC,6,20,10
2,2021-05-03,21,29.85,72,0.0,86,BKN,6,19,11
3,2021-05-03,22,29.85,73,0.0,83,BKN,6,19,10
4,2021-05-03,23,29.83,75,0.02,79,BKN,6,20,10
5,2021-05-04,0,29.83,76,0.01,77,BKN,6,20,9
6,2021-05-04,1,29.83,75,0.01,77,BKN,6,21,9
7,2021-05-04,2,29.83,74,0.0,79,BKN,6,22,8
8,2021-05-04,3,29.85,73,0.0,80,OVC,6,23,6
9,2021-05-04,4,29.88,71,0.0,85,OVC,6,21,5


In [172]:
def weather_forecast(lat=33.63, lon=-84.45):
    
    # Get/Build the url:
    coord_API_endpoint = "http://api.openweathermap.org/data/2.5/onecall?"
    join_key = "&appid="
    API_key = "b51fbb9d87131aabef6d7c2cd42b128e"
    units = "&units=imperial"
    exclude = "&exclude=current,minutely,daily,alerts"

    lat_lon = "lat=" + str(round(lat, 2))+ "&lon=" + str(round(lon, 2))

    url = coord_API_endpoint + lat_lon + exclude + join_key + API_key + units

    forecast_json_data = requests.get(url).json()
    df_predictions = pd.DataFrame()

    # Creating empty lists
    days = []
    hours = []
    pressures = []
    temperatures = []
    precipitations = []
    relHumidities = []
    skyConditions = []
    visibilities = []
    windGusts = []
    winds = []

    # Loop Through the JSON
    for num_forecasts in forecast_json_data['hourly']: # Hourly forecast for next 48 hours
        days.append(datetime.datetime.fromtimestamp(num_forecasts['dt']).strftime('%Y-%m-%d'))
        hours.append(int(datetime.datetime.fromtimestamp(num_forecasts['dt']).strftime('%H')))
        pressures.append(round(num_forecasts['pressure'] * 0.029529983071445, 2)) # hPa to inHg
        temperatures.append(int(round(num_forecasts['temp'], 0))) # imperial: Fahrenheit
        try: # "Where available"
            precipitations.append(round(num_forecasts['rain']['1h'] * 0.03937007874015748, 2)) # mm to in
        except KeyError:
            precipitations.append(0)
        relHumidities.append(num_forecasts['humidity']) # Humidity, %
        skyConditions.append(num_forecasts['clouds']) # Cloudiness, %
        visibilities.append(int(round(num_forecasts['visibility'] * 0.0006213712, 0))) # m to mi
        try: # "Where available"
            windGusts.append(int(round(num_forecasts['wind_gust'] * 0.8689762, 0))) # mi/h to kt
        except KeyError:
            windGusts.append(0)
        winds.append(int(round(num_forecasts['wind_speed'] * 0.8689762, 0))) # mi/h to kt

    # Put data into a dataframe

    def skycond(x):
        if x == 0:
            return 'CLR'
        elif x < 2/8 * 100:
            return 'FEW'
        elif x < 4/8 * 100:
            return 'SCT'
        elif x < 7/8 * 100:
            return 'BKN'
        elif x <= 8/8 * 100:
            return 'OVC'

    skyConditions = list(map(skycond, skyConditions))

    df_predictions['day'] = days
    df_predictions['hour'] = [str(hour) for hour in hours]
    df_predictions['pressure_inHg'] = pressures
    df_predictions['temperature_F'] = temperatures
    df_predictions['precipitation_in'] = precipitations
    df_predictions['relHumidity_%'] = relHumidities
    df_predictions['skyCondition'] = skyConditions
    df_predictions['visibility_mi'] = visibilities
    df_predictions['windGust_kt'] = windGusts
    df_predictions['wind_kt'] = winds
    
    return df_predictions

In [173]:
weather_forecast(lat=33.63, lon=-84.45)

Unnamed: 0,day,hour,pressure_inHg,temperature_F,precipitation_in,relHumidity_%,skyCondition,visibility_mi,windGust_kt,wind_kt
0,2021-05-03,19,29.85,72,0.05,87,OVC,6,22,12
1,2021-05-03,20,29.85,71,0.0,88,OVC,6,20,10
2,2021-05-03,21,29.85,72,0.0,86,BKN,6,19,11
3,2021-05-03,22,29.85,73,0.0,83,BKN,6,19,10
4,2021-05-03,23,29.83,75,0.02,79,BKN,6,20,10
5,2021-05-04,0,29.83,76,0.01,77,BKN,6,20,9
6,2021-05-04,1,29.83,75,0.01,77,BKN,6,21,9
7,2021-05-04,2,29.83,74,0.0,79,BKN,6,22,8
8,2021-05-04,3,29.85,73,0.0,80,OVC,6,23,6
9,2021-05-04,4,29.88,71,0.0,85,OVC,6,21,5


### PRUEBAS !!!

In [24]:
with open("dict_mappers/arrTimeHour_dict.pkl", "rb") as f:
    arrTimeHour_dict = pickle.load(f)
arrTimeHour_dict

{'ABE_ATL_9E_13': '15',
 'ABE_ATL_9E_14': '17',
 'ABE_ATL_9E_15': '17',
 'ABE_ATL_9E_17': '20',
 'ABE_ATL_9E_18': '20',
 'ABE_ATL_9E_6': '8',
 'ABE_ATL_DL_17': '20',
 'ABE_ATL_DL_6': '8',
 'ABE_BNA_G4_13': '14',
 'ABE_BNA_G4_14': '15',
 'ABE_BNA_G4_17': '18',
 'ABE_BNA_G4_18': '19',
 'ABE_BNA_G4_19': '20',
 'ABE_CLT_OH_0': '7',
 'ABE_CLT_OH_11': '13',
 'ABE_CLT_OH_16': '19',
 'ABE_CLT_OH_18': '21',
 'ABE_CLT_OH_19': '21',
 'ABE_CLT_OH_6': '8',
 'ABE_CLT_OH_8': '10',
 'ABE_DTW_OO_0': '7',
 'ABE_DTW_OO_12': '14',
 'ABE_DTW_OO_13': '15',
 'ABE_DTW_OO_17': '19',
 'ABE_DTW_OO_6': '7',
 'ABE_DTW_OO_9': '11',
 'ABE_FLL_G4_10': '13',
 'ABE_FLL_G4_11': '13',
 'ABE_FLL_G4_14': '16',
 'ABE_FLL_G4_17': '20',
 'ABE_FLL_G4_18': '21',
 'ABE_FLL_G4_19': '22',
 'ABE_FLL_G4_9': '12',
 'ABE_MYR_G4_11': '12',
 'ABE_MYR_G4_12': '13',
 'ABE_MYR_G4_14': '16',
 'ABE_MYR_G4_15': '16',
 'ABE_MYR_G4_16': '18',
 'ABE_MYR_G4_17': '19',
 'ABE_MYR_G4_18': '20',
 'ABE_MYR_G4_20': '22',
 'ABE_MYR_G4_8': '9',
 'ABE_ORD

In [25]:
with open("dict_mappers/arrTimeHour_dict_2.pkl", "rb") as f:
    arrTimeHour_dict_2 = pickle.load(f)
arrTimeHour_dict_2

{'ABE_ATL': 2,
 'ABE_BNA': 2,
 'ABE_CLT': 2,
 'ABE_DTW': 2,
 'ABE_FLL': 3,
 'ABE_MYR': 2,
 'ABE_ORD': 2,
 'ABE_PGD': 3,
 'ABE_PIE': 3,
 'ABE_SAV': 2,
 'ABE_SFB': 2,
 'ABI_DFW': 1,
 'ABQ_ATL': 3,
 'ABQ_AUS': 2,
 'ABQ_BWI': 4,
 'ABQ_DAL': 2,
 'ABQ_DEN': 1,
 'ABQ_DFW': 2,
 'ABQ_HOU': 2,
 'ABQ_IAH': 2,
 'ABQ_JFK': 4,
 'ABQ_LAS': 2,
 'ABQ_LAX': 2,
 'ABQ_MCI': 2,
 'ABQ_MCO': 4,
 'ABQ_MDW': 3,
 'ABQ_MSP': 3,
 'ABQ_OAK': 2,
 'ABQ_ORD': 3,
 'ABQ_PDX': 3,
 'ABQ_PHX': 1,
 'ABQ_SAN': 2,
 'ABQ_SAT': 2,
 'ABQ_SEA': 3,
 'ABQ_SFO': 3,
 'ABQ_SJC': 2,
 'ABQ_SLC': 2,
 'ABR_MSP': 1,
 'ABY_ATL': 1,
 'ACK_BOS': 1,
 'ACK_CLT': 3,
 'ACK_DCA': 2,
 'ACK_HPN': 1,
 'ACK_JFK': 1,
 'ACK_LGA': 1,
 'ACT_DFW': 1,
 'ACV_DEN': 3,
 'ACV_LAX': 2,
 'ACV_SFO': 1,
 'ACY_ATL': 2,
 'ACY_FLL': 3,
 'ACY_MCO': 3,
 'ACY_MYR': 2,
 'ACY_PBI': 3,
 'ACY_RSW': 3,
 'ACY_TPA': 2,
 'ADK_ANC': 3,
 'ADQ_ANC': 1,
 'AEX_ATL': 2,
 'AEX_DFW': 1,
 'AEX_IAH': 1,
 'AGS_ATL': 1,
 'AGS_CLT': 1,
 'AGS_DCA': 2,
 'AGS_DFW': 3,
 'AGS_LGA': 2,
 'AGS_MIA'

# Generate `Streamlit` file

In [3]:
%%writefile flight_delay_predictor.py

# Import libraries to be used

# Directories/Files management
import os

# Timing
import time
import datetime

# Objects storage:
import joblib
import pickle

# Online data retrieval:
import requests

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and may freeze the kernel
import missingno as msno
from collections import Counter

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning:
# - Model selection:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, KFold, cross_val_score, StratifiedKFold, \
                                    GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance

# - Basic classes for custom-made transformers:
from sklearn.base import BaseEstimator, TransformerMixin

# - Transformers:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# - Pipeline:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# - Models:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, plot_importance, plot_tree

# - Metrics:
from sklearn.metrics import fbeta_score, f1_score, recall_score, precision_score, accuracy_score, \
                            confusion_matrix, classification_report, roc_curve, precision_recall_curve, \
                            roc_auc_score, average_precision_score, plot_roc_curve, plot_precision_recall_curve

# Model interpretability:
import shap

# Frontend:
import streamlit as st
import streamlit.components.v1 as components


# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

# Define the dataset columns:
cols = [
    'MONTH',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER',
    'ORIGIN',
    'DEST',
    'DEP_TIME_hour',
    'TAXI_OUT_median',
    'TAXI_IN_median',
    'ARR_DEL15', # → Target !!
    'ARR_TIME_hour',
    'DISTANCE',
    'LATITUDE_Origin',
    'LONGITUDE_Origin',
    'HourlyAltimeterSetting_Origin',
    'HourlyDryBulbTemperature_Origin',
    'HourlyPrecipitation_Origin',
    'HourlyRelativeHumidity_Origin',
    'HourlySkyConditions_Origin',
    'HourlyVisibility_Origin',
    'HourlyWindGustSpeed_Origin',
    'HourlyWindSpeed_Origin',
    'LATITUDE_Dest',
    'LONGITUDE_Dest',
    'HourlyAltimeterSetting_Dest',
    'HourlyDryBulbTemperature_Dest',
    'HourlyPrecipitation_Dest',
    'HourlyRelativeHumidity_Dest',
    'HourlySkyConditions_Dest',
    'HourlyVisibility_Dest',
    'HourlyWindGustSpeed_Dest',
    'HourlyWindSpeed_Dest',
]

cols_dtypes = {
    'MONTH' : 'string',
    'DAY_OF_WEEK' : 'string',
    'OP_UNIQUE_CARRIER' : 'string',
    'ORIGIN' : 'string',
    'DEST' : 'string',
    'DEP_TIME_hour' : 'string',
    'TAXI_OUT_median' : 'int32',
    'TAXI_IN_median' : 'int32',
    'ARR_DEL15' : 'int32', # → Target !!
    'ARR_TIME_hour' : 'string',
    'DISTANCE' : 'int32',
    'LATITUDE_Origin' : 'float64',
    'LONGITUDE_Origin' : 'float64',
    'HourlyAltimeterSetting_Origin' : 'float64',
    'HourlyDryBulbTemperature_Origin' : 'int32',
    'HourlyPrecipitation_Origin' : 'float64',
    'HourlyRelativeHumidity_Origin' : 'int32',
    'HourlySkyConditions_Origin' : 'string',
    'HourlyVisibility_Origin' : 'int32',
    'HourlyWindGustSpeed_Origin' : 'int32',
    'HourlyWindSpeed_Origin' : 'int32',
    'LATITUDE_Dest' : 'float64',
    'LONGITUDE_Dest' : 'float64',
    'HourlyAltimeterSetting_Dest' : 'float64',
    'HourlyDryBulbTemperature_Dest' : 'int32',
    'HourlyPrecipitation_Dest' : 'float64',
    'HourlyRelativeHumidity_Dest' : 'int32',
    'HourlySkyConditions_Dest' : 'string',
    'HourlyVisibility_Dest' : 'int32',
    'HourlyWindGustSpeed_Dest' : 'int32',
    'HourlyWindSpeed_Dest' : 'int32',
}

# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

# # Load the data:
# @st.cache
# def load_data():
#     input_folder = '../../data/output/us_dot-noaa/'
#     file_name = "3_otp_lcd_2019.csv"

#     df = pd.read_csv(input_folder + file_name,
#                      encoding='latin1',
#     #                      nrows=1e5,
#                      usecols=cols,
#                      dtype=cols_dtypes
#                     )
#     X = df.drop(['ARR_DEL15'], axis=1)
#     y = df['ARR_DEL15']
#     return df, X, y

# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

# Load the model:
# @st.cache
def load_model(path=""):
    model = joblib.load(path)
    return model
    
# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

def frontend_appearance():
    """
    Design frontend appearance
    """

    # frontend elements of the web page 
    html_temp = """ 
    <div style ="background-color:SteelBlue;padding:13px"> 
    <h1 style ="color:white;text-align:center;">Flight Delay Forecaster</h1> 
    </div> 
    """   

    # display the frontend aspect
    st.markdown(html_temp, unsafe_allow_html = True)

# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------  

@st.cache
def weather_forecast(lat, lon):
    
    # Get/Build the url:
    coord_API_endpoint = "http://api.openweathermap.org/data/2.5/onecall?"
    join_key = "&appid="
    API_key = "b51fbb9d87131aabef6d7c2cd42b128e"
    units = "&units=imperial"
    exclude = "&exclude=current,minutely,daily,alerts"

    lat_lon = "lat=" + str(round(lat, 2))+ "&lon=" + str(round(lon, 2))

    url = coord_API_endpoint + lat_lon + exclude + join_key + API_key + units

    forecast_json_data = requests.get(url).json()
    df_predictions = pd.DataFrame()

    # Creating empty lists
    days = []
    hours = []
    pressures = []
    temperatures = []
    precipitations = []
    relHumidities = []
    skyConditions = []
    visibilities = []
    windGusts = []
    winds = []

    # Loop Through the JSON
    for num_forecasts in forecast_json_data['hourly']: # Hourly forecast for next 48 hours
        days.append(datetime.datetime.fromtimestamp(num_forecasts['dt']).strftime('%Y-%m-%d'))
        hours.append(int(datetime.datetime.fromtimestamp(num_forecasts['dt']).strftime('%H')))
        pressures.append(round(num_forecasts['pressure'] * 0.029529983071445, 2)) # hPa to inHg
        temperatures.append(int(round(num_forecasts['temp'], 0))) # imperial: Fahrenheit
        try: # "Where available"
            precipitations.append(round(num_forecasts['rain']['1h'] * 0.03937007874015748, 2)) # mm to in
        except KeyError:
            precipitations.append(0)
        relHumidities.append(num_forecasts['humidity']) # Humidity, %
        skyConditions.append(num_forecasts['clouds']) # Cloudiness, %
        visibilities.append(int(round(num_forecasts['visibility'] * 0.0006213712, 0))) # m to mi
        try: # "Where available"
            windGusts.append(int(round(num_forecasts['wind_gust'] * 0.8689762, 0))) # mi/h to kt
        except KeyError:
            windGusts.append(0)
        winds.append(int(round(num_forecasts['wind_speed'] * 0.8689762, 0))) # mi/h to kt

    # Put data into a dataframe

    def skycond(x):
        if x == 0:
            return 'CLR'
        elif x < 2/8 * 100:
            return 'FEW'
        elif x < 4/8 * 100:
            return 'SCT'
        elif x < 7/8 * 100:
            return 'BKN'
        elif x <= 8/8 * 100:
            return 'OVC'

    skyConditions = list(map(skycond, skyConditions))

    df_predictions['day'] = days
    df_predictions['hour'] = [str(hour) for hour in hours]
    df_predictions['pressure_inHg'] = pressures
    df_predictions['temperature_F'] = temperatures
    df_predictions['precipitation_in'] = precipitations
    df_predictions['relHumidity_%'] = relHumidities
    df_predictions['skyCondition'] = skyConditions
    df_predictions['visibility_mi'] = visibilities
    df_predictions['windGust_kt'] = windGusts
    df_predictions['wind_kt'] = winds
    
    return df_predictions, url    
       
# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

# def user_inputs(df):
def user_inputs(root):
    """
    Define user input fields
    """
    
    # Create user input fields:

    
# FLIGHT DATA

    st.markdown('---')
    st.title('Flight data')
#     st.markdown('<p style="text-align: center;">Flight data</p>')
#     st.markdown(""" 
#         <div style ="background-color:#DFE9FB;padding:3px"> 
#         <h4 style ="color:black;text-align:center;">FLIGHT DATA</h4> 
#         </div> 
#         """  , unsafe_allow_html = True)
    
    
# 1) CARRIER:
    st.subheader('Carrier')
    
    # Carrier:
    with open(root + "dict_mappers/carriers_dict.pkl", "rb") as f:
        carriers_dict = pickle.load(f)
#     carrier = st.selectbox('Carrier', df['OP_UNIQUE_CARRIER'].value_counts().index, format_func = carriers_dict.get)
    with open(root + "dict_mappers/carriers_sorted_list.pkl", "rb") as f:
        carriers = pickle.load(f)
    carrier = st.selectbox('Carrier', carriers, format_func = carriers_dict.get)
    
# 2) ORIGIN:
    st.subheader('Origin')
    
    # Origin:
    originType = st.radio('Departure airport', options=['Currently operated by the A/L', 'All airports'], index=0, key=1)
    if originType == 'Currently operated by the A/L':
        with open(root + "dict_mappers/carrierOrigins_dict.pkl", "rb") as f:
            carrierOrigins_dict = pickle.load(f)
            origins = carrierOrigins_dict[carrier]
        origin = st.selectbox('Origin', origins)
    elif originType == 'All airports':
        with open(root + "dict_mappers/origins_sorted_list.pkl", "rb") as f:
            origins = pickle.load(f)
        origin = st.selectbox('Origin', origins)

    # Latitude / Longitude - ORIGIN:
    with open(root + "dict_mappers/latitude_dict.pkl", "rb") as f:
        latitude_dict = pickle.load(f)
    with open(root + "dict_mappers/longitude_dict.pkl", "rb") as f:
        longitude_dict = pickle.load(f)
    col1, col2, = st.beta_columns(2)
    with col1:
        st.text('ORIGIN - Latitude')
        latitudeOrigin = st.markdown(('{:8.5f}').format((latitude_dict[origin])))
        latitudeOrigin = latitude_dict[origin] # So that the model can use it in the proper format
    with col2:
        st.text('ORIGIN - Longitude')
        longitudeOrigin = st.markdown(('{:8.5f}').format((longitude_dict[origin])))
        longitudeOrigin = longitude_dict[origin] # So that the model can use it in the proper format
    # Taxi-out:
    with open(root + "dict_mappers/taxi_out_dict.pkl", "rb") as f:
        taxi_out_dict = pickle.load(f)          
    try:
        taxiout = st.slider('Taxi-out time [min] (*)',
                            min_value=0, max_value=40, value=taxi_out_dict[origin + '_' + carrier], step=1)
        st.markdown("(\*) *Default value is set to the median for the combination of: Origin and Carrier*")
    except KeyError:
        taxiout = st.slider('Taxi-out time [min] (*)', min_value=0, max_value=40, value=15, step=1)
        st.markdown("(\*) *The selected Carrier has not flown from Origin before. Please select a value*")
    
    
# 3) DESTINATION:
    st.subheader('Destination')
    
    # Destination:
    destType = st.radio('Departure airport', options=['Currently operated by the A/L', 'All airports'], index=0, key=2)
    if destType == 'Currently operated by the A/L':
        with open(root + "dict_mappers/carrierDests_dict.pkl", "rb") as f:
            carrierDests_dict = pickle.load(f)
            dests = carrierDests_dict[carrier]
        dest = st.selectbox('Dest', dests)
    elif destType == 'All airports':
        with open(root + "dict_mappers/dests_sorted_list.pkl", "rb") as f:
            dests = pickle.load(f)
        dest = st.selectbox('Destination', dests)

    # Latitude / Longitude - DESTINATION:
    col3, col4, = st.beta_columns(2)
    with col3:
        st.text('DESTINATION - Latitude')
        latitudeDest = st.markdown(('{:8.5f}').format((latitude_dict[dest])))
        latitudeDest = latitude_dict[dest] # So that the model can use it in the proper format
    with col4:
        st.text('DESTINATION - Longitude')
        longitudeDest = st.markdown(('{:8.5f}').format((longitude_dict[dest])))
        longitudeDest = longitude_dict[dest] # So that the model can use it in the proper format
    # Taxi-in:
    with open(root + "dict_mappers/taxi_in_dict.pkl", "rb") as f:
        taxi_in_dict = pickle.load(f)          
    try:
        taxiin = st.slider('Taxi-in time [min] (*)',
                            min_value=0, max_value=20, value=taxi_in_dict[dest + '_' + carrier], step=1)
        st.markdown("(\*) *Default value is set to the median for the combination of: Destination and Carrier*")
    except KeyError:
        taxiin = st.slider('Taxi-in time [min] (*)', min_value=0, max_value=20, value=6, step=1)
        st.markdown("(\*) *The selected Carrier has not flown to Destination before. Please select a value*")

        
# 4) TIME:
    st.subheader('Time')
    st.write("Current time:", datetime.datetime.now().strftime("%Y-%m-%d | %H:%M:%S"),
             "({})".format(datetime.datetime.now(datetime.timezone.utc).astimezone().tzinfo))
    
    col5, col6, col7 = st.beta_columns(3)    
    with col5:
        # Date:
        fdate = st.date_input("Flight date", value=datetime.date.today(),
                              min_value=datetime.date(2019, 1, 1), max_value=datetime.date(2021, 12, 31))
        fmonth = str(fdate.month)
        fweekday = str(fdate.isoweekday())    
    with col6:
        # Departure time:
#         deptime = st.selectbox('Departure time hour', list(map(str, sorted([int(hour) for hour in df['DEP_TIME_hour'].unique()]))))    
        with open(root + "dict_mappers/depTimeHours_list.pkl", "rb") as f:
            depTimeHours = pickle.load(f)
        deptime = st.selectbox('Departure time hour', list(map(str, sorted([int(hour) for hour in depTimeHours]))))    
    with col7:
        # Arrival time:
        with open(root + "dict_mappers/arrTimeHour_dict.pkl", "rb") as f:
            arrTimeHour_dict = pickle.load(f)
        with open(root + "dict_mappers/arrTimeHour_dict_2.pkl", "rb") as f:
            arrTimeHour_dict_2 = pickle.load(f)
        try:
            arrtime = st.number_input('Arrival  time hour (*)', min_value=0,
                                      value=int(arrTimeHour_dict[origin + '_' + dest + '_' + carrier + '_' + deptime]),
                                      max_value=23, step=1)
            arrtime = str(arrtime) # So that the model can use it in the proper format
            st.markdown("(\*) *Default value is set by the combination of: Origin, Destination, Carrier and Departure time hour*")
        except KeyError:
            try:
                arrtime = st.number_input('Arrival  time hour (*)', min_value=0,
                                          value=int(deptime) + int(arrTimeHour_dict_2[origin + '_' + dest]),
                                          max_value=23, step=1)
                arrtime = str(arrtime) # So that the model can use it in the proper format
                st.markdown("(\*) *The selected combination of Origin, Destination, Carrier and Departure time hour has not been flown before*")
                st.markdown("*Therefore, a default value has been set based on the combination of: Origin and Destination*")
            except KeyError:
                arrtime = st.number_input('Arrival  time hour (*)', min_value=0, value=0, max_value=23, step=1)
                arrtime = str(arrtime) # So that the model can use it in the proper format
                st.markdown("(\*) *The selected combination of Origin and Destination has not been flown before*")        

    # Distance:    
    with open(root + "dict_mappers/distance_dict.pkl", "rb") as f:
        distance_dict = pickle.load(f) 
    try:
        if distance_dict[origin + '_' + dest]:
            pass
    except KeyError:
        distance = st.slider('Distance covered [mi] (*)', min_value=0, max_value=6000, value=600, step=50)
        st.markdown("(\*) *The selected route (Origin-Destination) has not been flown before. Please select a value*")
    else: 
        st.markdown('Distance covered [mi]')
        distance = st.markdown(distance_dict[origin + '_' + dest])
        distance = distance_dict[origin + '_' + dest] # So that the model can use it in the proper format

# METEOROLOGICAL DATA

    st.markdown('---')
    st.title('Meteorological data')

    col8, col9, col10 = st.beta_columns([3, 1, 3])
    
    with col8:
    # 1) ORIGIN:
        st.subheader('Origin')     
        
       
        df_predictions, url = weather_forecast(latitudeOrigin, longitudeOrigin)
        flight_forecast = df_predictions[(df_predictions['day'] == str(fdate)) & (df_predictions['hour'] == deptime)]

        if len(flight_forecast) > 0:
            st.success("""*Weather forecast is available for the
                           departure. Therefore, meteorological
                           inputs have been defaulted accordingly.  
                           Powered by [OpenWeather]({})*""".format(url))
            altset_def = flight_forecast['pressure_inHg'].iloc[0]
            temp_def = int(flight_forecast['temperature_F'].iloc[0])
            precip_def = float(flight_forecast['precipitation_in'].iloc[0])
            relHumid_def = flight_forecast['relHumidity_%'].iloc[0]
            skyCond_def_dict = {'CLR': 0, 'FEW': 1, 'SCT': 2, 'BKN': 3, 'OVC': 4}
            skyCond_def = skyCond_def_dict[flight_forecast['skyCondition'].iloc[0]]
            visibility_def = int(flight_forecast['visibility_mi'].iloc[0])
            gust_def = int(flight_forecast['windGust_kt'].iloc[0])
            wind_def = int(flight_forecast['wind_kt'].iloc[0])
                
        else:
            st.warning("""*Unfortunately, no weather prediction is
                           available for the selected flight. Predictions
                           are only available for the next 48h.*""")
            altset_def = 29.92
            temp_def = 59
            precip_def = 0.
            relHumid_def = 60
            skyCond_def = 0
            visibility_def = 10
            gust_def = 0
            wind_def = 8
            
        # Altimeter setting - ORIGIN: 
        altsetOrigin = st.number_input('Altimeter setting [inHg]', min_value=27., value=altset_def,
                                       max_value=32., step=0.01, key=1)

        # Temperature - ORIGIN:
        tempTypeOrigin = st.radio('Temperature unit', options=['ºF', 'ºC'], index=0, key=1)
        if tempTypeOrigin == 'ºF':
            tempOrigin = st.slider('Temperature [ºF]', min_value=-50, max_value=130, value=temp_def, step=1, key=1)
        elif tempTypeOrigin == 'ºC':
            tempOrigin = st.slider('Temperature [ºC]', min_value=-50, max_value=50,
                                   value=int((temp_def - 32) / 1.8), step=1, key=1)
            tempOrigin = int(1.8 * tempOrigin + 32) # Convert Celsius to Fahrenheit to properly feed the model
            
        # Hourly precipitation - ORIGIN:
        precipOrigin = st.number_input('Hourly precipitation [in]', min_value=0.,
                                       value=precip_def, max_value=30., step=0.01, key=1)

        # Relative humidity - ORIGIN:    
        relhumOrigin = st.number_input('Relative humidity [%]', min_value=0,
                                       value=relHumid_def, max_value=100, step=1, key=1)

        # Sky condtions - ORIGIN: 
        with open(root + "dict_mappers/sky_dict.pkl", "rb") as f:
            sky_dict = pickle.load(f)
        skyOrigin = st.selectbox('Sky conditions', options=list(sky_dict.keys()),
                                 index=skyCond_def, format_func = sky_dict.get, key=1)

        # Visibility - ORIGIN:
        visibOrigin = st.number_input('Visibility [mi]', min_value=0,
                                       value=visibility_def, max_value=100, step=1, key=1)

        # Wind gust speed - ORIGIN:
        gustOrigin = st.slider('Wind gust speed [mph]', min_value=0, max_value=40, value=gust_def, step=1, key=1)

        # Wind speed - ORIGIN:
        windOrigin = st.slider('Wind speed [mph]', min_value=0, max_value=40, value=wind_def, step=1, key=1)

    with col10:
    # 2) DESTINATION:
        st.subheader('Destination')     

        df_predictions, url = weather_forecast(latitudeDest, longitudeDest)
        if int(arrtime) < int(deptime): # Late night flight arriving on the following day
            flight_forecast = df_predictions[(df_predictions['day'] == str(fdate + datetime.timedelta(days=1))) \
                                           & (df_predictions['hour'] == arrtime)]
        else:
            flight_forecast = df_predictions[(df_predictions['day'] == str(fdate)) & (df_predictions['hour'] == arrtime)]

        if len(flight_forecast) > 0:
            st.success("""*Weather forecast is available for the
                           arrival. Therefore, meteorological
                           inputs have been defaulted accordingly.  
                           Powered by [OpenWeather]({})*""".format(url))
            altset_def = flight_forecast['pressure_inHg'].iloc[0]
            temp_def = int(flight_forecast['temperature_F'].iloc[0])
            precip_def = float(flight_forecast['precipitation_in'].iloc[0])
            relHumid_def = flight_forecast['relHumidity_%'].iloc[0]
            skyCond_def_dict = {'CLR': 0, 'FEW': 1, 'SCT': 2, 'BKN': 3, 'OVC': 4}
            skyCond_def = skyCond_def_dict[flight_forecast['skyCondition'].iloc[0]]
            visibility_def = int(flight_forecast['visibility_mi'].iloc[0])
            gust_def = int(flight_forecast['windGust_kt'].iloc[0])
            wind_def = int(flight_forecast['wind_kt'].iloc[0])
                
        else:
            st.warning("""*Unfortunately, no weather prediction is
                           available for the selected flight. Predictions
                           are only available for the next 48h.*""")
            altset_def = 29.92
            temp_def = 59
            precip_def = 0.
            relHumid_def = 60
            skyCond_def = 0
            visibility_def = 10
            gust_def = 0
            wind_def = 8
            
        # Altimeter setting - DEST: 
        altsetDest = st.number_input('Altimeter setting [inHg]', min_value=27., value=altset_def,
                                       max_value=32., step=0.01, key=2)

        # Temperature - DEST:
        tempTypeDest = st.radio('Temperature unit', options=['ºF', 'ºC'], index=0, key=2)
        if tempTypeDest == 'ºF':
            tempDest = st.slider('Temperature [ºF]', min_value=-50, max_value=130, value=temp_def, step=1, key=2)
        elif tempTypeDest == 'ºC':
            tempDest = st.slider('Temperature [ºC]', min_value=-50, max_value=50,
                                   value=int((temp_def - 32) / 1.8), step=1, key=2)
            tempDest = int(1.8 * tempDest + 32) # Convert Celsius to Fahrenheit to properly feed the model
            
        # Hourly precipitation - DEST:
        precipDest = st.number_input('Hourly precipitation [in]', min_value=0.,
                                       value=precip_def, max_value=30., step=0.01, key=2)

        # Relative humidity - DEST:    
        relhumDest = st.number_input('Relative humidity [%]', min_value=0,
                                       value=relHumid_def, max_value=100, step=1, key=2)

        # Sky condtions - DEST: 
        with open(root + "dict_mappers/sky_dict.pkl", "rb") as f:
            sky_dict = pickle.load(f)
        skyDest = st.selectbox('Sky conditions', options=list(sky_dict.keys()),
                                 index=skyCond_def, format_func = sky_dict.get, key=2)

        # Visibility - DEST:
        visibDest = st.number_input('Visibility [mi]', min_value=0,
                                       value=visibility_def, max_value=100, step=1, key=2)

        # Wind gust speed - DEST:
        gustDest = st.slider('Wind gust speed [mph]', min_value=0, max_value=40, value=gust_def, step=1, key=2)

        # Wind speed - DEST:
        windDest = st.slider('Wind speed [mph]', min_value=0, max_value=40, value=wind_def, step=1, key=2)
    

    user_inputs = [
        fmonth, fweekday, carrier, origin, dest, deptime, taxiout, taxiin, arrtime, distance,                
        altsetOrigin, tempOrigin, precipOrigin,
        relhumOrigin, skyOrigin, visibOrigin, gustOrigin, windOrigin,
        altsetDest, tempDest, precipDest,
        relhumDest, skyDest, visibDest, gustDest, windDest
    ]

    
    return user_inputs

# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

def prediction(model, X_test):  
    # Making predictions: 
    prediction = model.predict(X_test)
    score = model.predict_proba(X_test)[0, 0]
    if prediction == 0:
        result = 'ON-TIME'
    else:
        result = 'DELAYED'
    return result, score

# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------

@st.cache(hash_funcs={shap.explainers._tree.Tree: hash})
def load_shap_explainer(filename='', X_test_transformed=pd.DataFrame()):   
    # Load the explainer file (instead of generating it so as to save time):
    explainer = joblib.load(filename)
    shap_values = explainer.shap_values(X_test_transformed)
    return explainer, shap_values

def st_shap(plot, height=None):
    shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>"
    components.html(shap_html, height=height) 

# ------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------


if __name__=='__main__': 
    
#     root = "/app/tfm_kschool/frontend/" # Used for deployment on Streamlit Sharing platform
    root = "" # Used for running local tests
    
    # Let the user know the data is loading and load the data:
#     df, X, y = load_data()

    # Load the model:
#     pipe = load_model(path="XGBoost_pipeline_model.joblib.dat")
    pipe = load_model(path=root + "XGBoost_pipeline_model.joblib.dat")
    transformer = pipe[:-1]
    model = pipe.named_steps['clf']
    
    # Load the general HMI framework:
    col1, col2, col3 = st.beta_columns([0.1,1.25,0.1])
    with col1:
        st.write("")
    with col2:
        st.image(root + 'logo3.jpeg')
    with col3:
        st.write("")
        
#     frontend_appearance()
    
    # Load the input fields:
#     inputs = user_inputs(df)
    inputs = user_inputs(root)
    
    # Generate an array based on user input values thal will be fed into the model:
    dismissed_cols = [
                      'ARR_DEL15',
                      'LATITUDE_Origin',
                      'LONGITUDE_Origin',
                      'LATITUDE_Dest',
                      'LONGITUDE_Dest'
                    ]
    X_test = pd.DataFrame(
            data=np.array(inputs)[np.newaxis], # Kind of transpose the resulting array from the 'inputs' list
            columns=[col for col in cols if col not in dismissed_cols]
        )
    cols_dtypes_frontend = cols_dtypes.copy()
    for col in dismissed_cols:
        del cols_dtypes_frontend[col]
    X_test = X_test.astype(cols_dtypes_frontend)

    # Indicate numerical and categorical features:
    num_attribs = X_test.select_dtypes('number').columns
    cat_attribs = X_test.select_dtypes(['string', 'category', 'object']).columns
    # Transform categorical variables:
    X_test_categTransformed_df = pd.DataFrame(transformer.transform(X_test)[:, 0:9], columns=cat_attribs)
    # Concatenate categorical transformed features with 'as-is' numerical features:
    X_test_transformed = pd.concat([X_test_categTransformed_df, X_test[num_attribs]], axis=1)
    X_test_transformed = X_test_transformed[X_test.columns]
       
    # When 'Predict' button is clicked, make the prediction and store it: 
    st.markdown('---')
    col11, col12, col13 = st.beta_columns([3, 1, 3])
    if col12.button("Predict"):
        # Calculate prediction:
        result = prediction(pipe, X_test)[0]
        score = prediction(pipe, X_test)[1]
        if result == 'ON-TIME':
            st.success('The flight is predicted to be **{}** ({:5.2f}%)'.format(result, 100*score))
        elif result == 'DELAYED':
            st.error('The flight is predicted to be **{}** ({:5.2f}%)'.format(result, 100*(1-score)))
        
        # SHAP values and force plot:
        with st.beta_expander("See explanatory details for this flight prediction"):
            st.write("""
                Below a *SHAP force plot* explains the contribution of each variable to the model's prediction.  
                - Red features are *forcing* the prediction to **DELAY**.
                - On the contrary, blue variables drive the prediction to **ON-TIME**.
            """)
            explainer, shap_values = load_shap_explainer(root + 'shap_treeExplainer.bz2', X_test_transformed)
            st_shap(shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:], link='logit'))
            
            # -----------

            shap.decision_plot(base_value=explainer.expected_value, shap_values=shap_values[0],
                              features=X_test.iloc[0,:], link='logit', feature_display_range=slice(None, -X_test.shape[1]-1, -1),
                              return_objects=True, show=False, y_demarc_color='#00172b')
            fig = plt.gcf()
            ax = plt.gca()
            fig.patch.set_facecolor('#00172b')
            ax.set_facecolor('#00172b')
            ax.set_xlabel('Probability', fontsize=16, color='white')
            ax.tick_params(axis='both', colors='white')
            ax.grid(axis='both', color='white', linestyle='-', linewidth=0.25)
            for ln in ax.lines:
                ln.set_linewidth(3)
            for text in ax.texts:
                text.set_color('white')
                text.set_alpha(0.75)
            st.pyplot(fig)
            
            # -----------
    
#     # SHAP values general overview:
#     shapSummary = st.checkbox(label="SHAP Summary Plot")
#     if shapSummary:
#         st.image(root + 'shap_summaryPlot.png')

       

Overwriting flight_delay_predictor.py


# Altair plots [OPTIONAL]

In [26]:
import altair as alt
from vega_datasets import data

# Since these data are each more than 5,000 rows we'll import from the URLs
airports = data.airports.url
flights_airport = data.flights_airport.url

states = alt.topo_feature(data.us_10m.url, feature="states")

# Create mouseover selection
select_city = alt.selection_single(
    on="mouseover", nearest=True, fields=["origin"], empty="none"
)

# Define which attributes to lookup from airports.csv
lookup_data = alt.LookupData(
    airports, key="iata", fields=["state", "latitude", "longitude"]
)

background = alt.Chart(states).mark_geoshape(
    fill="lightgray",
    stroke="white"
).properties(
    width=750,
    height=500
).project("albersUsa")

connections = alt.Chart(flights_airport).mark_rule(opacity=0.35).encode(
    latitude="latitude:Q",
    longitude="longitude:Q",
    latitude2="lat2:Q",
    longitude2="lon2:Q"
).transform_lookup(
    lookup="origin",
    from_=lookup_data
).transform_lookup(
    lookup="destination",
    from_=lookup_data,
    as_=["state", "lat2", "lon2"]
).transform_filter(
    select_city
)

points = alt.Chart(flights_airport).mark_circle().encode(
    latitude="latitude:Q",
    longitude="longitude:Q",
    size=alt.Size("routes:Q", scale=alt.Scale(range=[0, 1000]), legend=None),
    order=alt.Order("routes:Q", sort="descending"),
    tooltip=["origin:N", "routes:Q"]
).transform_aggregate(
    routes="count()",
    groupby=["origin"]
).transform_lookup(
    lookup="origin",
    from_=lookup_data
).transform_filter(
    (alt.datum.state != "PR") & (alt.datum.state != "VI")
).add_selection(
    select_city
)

(background + connections + points).configure_view(stroke=None)


___