In [1]:
# Import libraries to be used

# Directories/Files management
import os.path

# Timing
import time

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and may freeze the kernel
import missingno as msno

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

In [2]:
# Reference timestamp; the last cell subtracts it from a later reading to report the elapsed time.
t0 = time.perf_counter() 

In [3]:
# Report the operating-system family the kernel runs on and record the current
# working directory (relative data paths used in this notebook resolve against it).

root = os.getcwd()
if os.name == 'nt': # Windows
    print("Running on Windows.")
elif os.name == 'posix': # Any Unix-like system (Linux distributions, macOS, ...), not only Ubuntu
    print("Running on a POSIX system (Linux/macOS).")
else: # Any other value reported by os.name
    print("Running on an unrecognised operating system ('" + os.name + "').")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\Desktop\TFM\notebooks


___

# Get the data

## LCD clean file (2019)

Relevant information about the LCD data can be found in the official [NOAA archive](https://www.ncei.noaa.gov/data/local-climatological-data/doc/).  

### Import file

#### Define file path

In [4]:
# Location of the post-processed LCD file.
output_folder = '../data/output/noaa/lcd_all/'
file_name = "lcd_all_postprocessed.csv"

# Explicit dtype per column. Identifier-like and date/hour columns are read as
# text ('string') rather than being parsed as numbers.
lcd_dtypes = dict(
    STATION='string',
    LATITUDE='float64',
    LONGITUDE='float64',
    WMO='string',
    WBAN='string',
    DATE='string',
    Date='string',
    Hour='string',
    HourlyAltimeterSetting='float32',
    HourlyDryBulbTemperature='int32',
    HourlyPrecipitation='float32',
    HourlyRelativeHumidity='int32',
    HourlySkyConditions='string',
    HourlyVisibility='int32',
    HourlyWindDirection='string',
    HourlyWindGustSpeed='int32',
    HourlyWindSpeed='int32',
    REM='string',
)

lcd = pd.read_csv(output_folder + file_name, encoding='latin1', dtype=lcd_dtypes)

In [5]:
# Visual sanity check: display five randomly chosen rows of the LCD table.
lcd.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,LATITUDE,LONGITUDE,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,REM
1980145,72476593013,724765,93013,2019-12-29 05:45:00,2019-12-29,5,38.50583,-107.89889,30.040001,15,0.0,84,OVC,3,140,0,6,MET11312/29/19 05:45:02 SPECI KMTJ 291245Z 140...
1682118,72551014939,725510,14939,2019-01-04 11:54:00,2019-01-04,11,40.8508,-96.7475,29.860001,47,0.0,54,CLR,10,0,0,0,MET10901/04/19 11:54:02 METAR KLNK 041754Z 000...
1895778,72248613942,722486,13942,2019-06-02 22:53:00,2019-06-02,22,32.5155,-92.0405,29.940001,74,0.0,88,CLR,9,0,0,0,MET08906/02/19 22:53:02 METAR KMLU 030453Z 000...
1701319,72643014920,726430,14920,2019-03-16 10:53:00,2019-03-16,10,43.8788,-91.2527,30.360001,32,0.0,59,CLR,10,330,0,6,MET09603/16/19 10:53:02 METAR KLSE 161653Z 330...
1166898,72257603902,722576,3902,2019-10-13 16:58:00,2019-10-13,16,31.06667,-97.83333,29.940001,75,0.0,45,OVC,8,130,0,9,MET09410/13/19 16:58:02 METAR KGRK 132258Z 130...


In [6]:
# Build the join key 'WBAN_year-month-day_Hour' for every LCD record.
# Year, month and day are taken from fixed character positions of 'Date'
# (0-3, 5-6 and 8-9 respectively).
date_text = lcd['Date']
lcd_year = date_text.str.slice(0, 4)
lcd_month = date_text.str.slice(5, 7)
lcd_day = date_text.str.slice(8, 10)
lcd_date = lcd_year + '-' + lcd_month + '-' + lcd_day
lcd_id = lcd['WBAN'] + '_' + lcd_date + '_' + lcd['Hour']
lcd['LCD_id'] = lcd_id
lcd.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,LATITUDE,LONGITUDE,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,REM,LCD_id
19993,72365023050,723650,23050,2019-04-14 13:52:00,2019-04-14,13,35.0419,-106.6155,29.940001,69,0.0,17,SCT,10,100,0,7,MET10504/14/19 13:52:02 METAR KABQ 142052Z 100...,23050_2019-04-14_13
169573,72622794790,726227,94790,2019-06-05 09:56:00,2019-06-05,9,43.98867,-76.02623,29.799999,67,0.0,73,FEW,10,350,0,3,MET09906/05/19 09:56:02 METAR KART 051456Z 350...,94790_2019-06-05_9
192576,72645704825,726457,4825,2019-01-20 02:15:00,2019-01-20,2,44.26667,-88.51667,30.219999,3,0.0,60,CLR,10,0,0,0,MET07501/20/19 02:15:02 METAR KATW 200815Z 000...,04825_2019-01-20_2
1620279,72240003937,722400,3937,2019-11-29 05:53:00,2019-11-29,5,30.12472,-93.22833,30.209999,64,0.0,90,OVC,10,80,0,8,MET11111/29/19 05:53:02 METAR KLCH 291153Z 080...,03937_2019-11-29_5
2475792,72210812894,722108,12894,2019-10-15 18:53:00,2019-10-15,18,26.53611,-81.755,29.969999,81,0.0,74,BKN,10,270,0,5,MET11610/15/19 18:53:02 METAR KRSW 152353Z 270...,12894_2019-10-15_18


## OTP dataset

Additional information on the meaning of each column can be found on the [BTS Transtats website](https://www.transtats.bts.gov/homepage.asp):
1. Go to: *Data Finder / By Mode / Aviation*
2. Hit: *Airline On-Time Performance Data*
3. Open: *Reporting Carrier On-Time Performance (1987-present)*

### Import OTP dataset

In [7]:
# Columns to load from the OTP file. They are written as topical groups;
# concatenating the groups yields the columns in their original order.
cols = (
    # flight date, carrier and aircraft
    ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM']
    # origin airport
    + ['ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_NM', 'WBAN_Origin']
    # destination airport
    + ['DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_NM', 'WBAN_Dest']
    # departure
    + ['CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'DEP_DEL15', 'DEP_TIME_hour']
    # taxi times
    + ['TAXI_OUT', 'TAXI_IN', 'TAXI_OUT_median', 'TAXI_IN_median']
    # arrival
    + ['CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'ARR_DEL15', 'ARR_TIME_hour']
    # flight summary
    + ['CANCELLED', 'CRS_ELAPSED_TIME', 'DISTANCE', 'DISTANCE_GROUP']
    # delay breakdown
    + ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
)

In [8]:
# dtype to use for each OTP column when the file is read.
# Keys are listed in the same order as the column list above.
cols_dtypes = dict(
    MONTH='string',
    DAY_OF_MONTH='string',
    DAY_OF_WEEK='category',
    OP_UNIQUE_CARRIER='category',
    TAIL_NUM='string',
    ORIGIN='category',
    ORIGIN_CITY_NAME='string',
    ORIGIN_STATE_ABR='category',
    ORIGIN_STATE_NM='category',
    WBAN_Origin='string',
    DEST='category',
    DEST_CITY_NAME='string',
    DEST_STATE_ABR='category',
    DEST_STATE_NM='category',
    WBAN_Dest='string',
    CRS_DEP_TIME='string',
    DEP_TIME='string',
    DEP_DELAY='int32',
    DEP_DEL15='int32',
    DEP_TIME_hour='string',
    TAXI_OUT='int32',
    TAXI_IN='int32',
    TAXI_OUT_median='int32',
    TAXI_IN_median='int32',
    CRS_ARR_TIME='string',
    ARR_TIME='string',
    ARR_DELAY='int32',
    ARR_DEL15='int32',  # target variable
    ARR_TIME_hour='string',
    CANCELLED='string',
    CRS_ELAPSED_TIME='int32',
    DISTANCE='int32',
    DISTANCE_GROUP='category',
    CARRIER_DELAY='int32',
    WEATHER_DELAY='int32',
    NAS_DELAY='int32',
    SECURITY_DELAY='int32',
    LATE_AIRCRAFT_DELAY='int32',
)

In [9]:
# Location of the OTP file.
output_folder = '../data/output/us_dot/'
file_name = "2_otp_2019_wban.csv"

# Read only the columns listed in `cols`, each with the dtype declared in `cols_dtypes`.
otp_read_options = dict(encoding='latin1', usecols=cols, dtype=cols_dtypes)
otp = pd.read_csv(output_folder + file_name, **otp_read_options)

### Adapt OTP dataset format

In [10]:
# WBAN codes are not always stored with 5 digits (leading zeros were lost in an
# earlier conversion from 'int' dtype), and months/days may have a single digit.
# Left-pad them with zeros so they can be combined into the same key layout as
# the LCD table.
# `.str.zfill` is vectorised, leaves missing values as missing instead of raising
# on `len()`, and pads any value shorter than the target width (not only those
# exactly one character short).
for column, width in {'WBAN_Origin': 5, 'WBAN_Dest': 5, 'MONTH': 2, 'DAY_OF_MONTH': 2}.items():
    otp[column] = otp[column].str.zfill(width)

## OTP-LCD merge

### 1st merge: `Origin`

In [11]:
# Origin merge: attach to every flight the LCD record whose key matches the
# origin station (WBAN_Origin), the flight date and the departure hour.
#
# `lcd` itself is NOT renamed: a renamed copy is merged instead, so re-running
# this cell does not keep appending '_Origin' to the LCD column names.
lcd_original_cols = lcd.columns  # un-suffixed LCD column names, reused by the Dest merge
lcd_Origin = lcd.set_axis(lcd_original_cols.map(lambda x: str(x) + '_Origin'), axis=1)
# Same layout as the LCD key: WBAN_2019-month-day_hour
otp_id_Origin = otp['WBAN_Origin'] + '_' + '2019-' + otp['MONTH'] + '-' + otp['DAY_OF_MONTH'] + '_' + otp['DEP_TIME_hour']
otp['LCD_id_Origin'] = otp_id_Origin
# Inner join: flights without a matching LCD key are dropped. Column names present
# in both frames (other than the key) receive the '_OTP' / '_LCD' suffixes.
# NOTE(review): assumes each LCD key appears only once; duplicated keys would
# duplicate flights - confirm against the LCD post-processing step.
otp_lcd_Origin = otp.merge(lcd_Origin, how='inner', on='LCD_id_Origin', suffixes=['_OTP', '_LCD'])

In [12]:
# Visual sanity check: display one random row of the Origin-merged table.
otp_lcd_Origin.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin_OTP,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,LCD_id_Origin,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,DATE_Origin,Date_Origin,Hour_Origin,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindDirection_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,REM_Origin
6688016,12,11,3,F9,N354FR,RSW,"Fort Myers, FL",FL,Florida,12894,BUF,"Buffalo, NY",NY,New York,14733,942,949,7,0,9,14,8,14,7,1242,1226,-16,0,12,0,180,1144,5,0,0,0,0,0,12894_2019-12-11_9,72210812894,722108,12894,2019-12-11 09:02:00,2019-12-11,9,26.53611,-81.755,30.190001,71,0.0,100,BKN,1,80,0,5,MET09912/11/19 09:02:02 SPECI KRSW 111402Z 080...


### 2nd merge: `Dest`

In [13]:
# Dest merge: attach to every flight the LCD record whose key matches the
# destination station (WBAN_Dest), the flight date and the arrival hour.
#
# As in the Origin merge, a renamed copy of `lcd` is merged so that `lcd` itself
# is not modified. The new names are derived from `lcd_original_cols` (the
# un-suffixed names captured in the Origin-merge cell), not from `lcd.columns`.
lcd_Dest = lcd.set_axis(lcd_original_cols.map(lambda x: str(x) + '_Dest'), axis=1)
# NOTE(review): the key combines MONTH/DAY_OF_MONTH with ARR_TIME_hour. If those
# two columns hold the departure date, flights landing after midnight would be
# paired with the previous day's weather - confirm.
otp_id_Dest = otp_lcd_Origin['WBAN_Dest'] + '_' + '2019-' + otp_lcd_Origin['MONTH'] + '-' \
              + otp_lcd_Origin['DAY_OF_MONTH'] + '_' + otp_lcd_Origin['ARR_TIME_hour']
otp_lcd_Origin['LCD_id_Dest'] = otp_id_Dest
# Inner join: flights without a matching LCD key are dropped. Column names present
# in both frames (other than the key) receive the '_OTP' / '_LCD' suffixes.
otp_lcd = otp_lcd_Origin.merge(lcd_Dest, how='inner', on='LCD_id_Dest', suffixes=['_OTP', '_LCD'])

In [14]:
# Visual sanity check: display one random row of the fully merged table.
otp_lcd.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin_OTP,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest_OTP,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,LCD_id_Origin,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,DATE_Origin,Date_Origin,Hour_Origin,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindDirection_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,REM_Origin,LCD_id_Dest,STATION_Dest,WMO_Dest,WBAN_Dest_LCD,DATE_Dest,Date_Dest,Hour_Dest,LATITUDE_Dest,LONGITUDE_Dest,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,HourlyWindDirection_Dest,HourlyWindGustSpeed_Dest,HourlyWindSpeed_Dest,REM_Dest
5907243,11,7,4,YV,N86334,IAH,"Houston, TX",TX,Texas,12960,ELP,"El Paso, TX",TX,Texas,23044,1808,1831,23,1,18,39,6,17,3,1917,1959,42,1,19,0,129,667,3,42,0,0,0,0,12960_2019-11-07_18,72243012960,722430,12960,2019-11-07 18:00:00,2019-11-07,18,29.98,-95.36,30.165001,66,0.0,90,OVC,6,330,0,17,SYN09272243 12361 83315 10189 20172 30169 4021...,23044_2019-11-07_19,72270023044,722700,23044,2019-11-07 19:51:00,2019-11-07,19,31.81111,-106.37583,30.34,54,0.0,59,OVC,10,70,0,7,MET10611/07/19 19:51:01 METAR KELP 080251Z 070...


### Final points before exporting

In [15]:
# Compare the number of rows before and after the two inner merges.
n_flights_otp = len(otp)
n_flights_merged = len(otp_lcd)
pct_dropped = (n_flights_otp - n_flights_merged) * 100 / n_flights_otp
print(f"OTP dataset number of flights : {n_flights_otp}")
print(f"OTP-LCD dataset number of flights : {n_flights_merged} ({pct_dropped:4.2f}% dropped)")

OTP dataset number of flights : 7208372
OTP-LCD dataset number of flights : 7200053 (0.12% dropped)


In [16]:
# Drop redundant columns:
drop_cols = ['LCD_id_Origin', 'DATE_Origin', 'Date_Origin', 'Hour_Origin', 'LCD_id_Dest', 'DATE_Dest', 'Date_Dest', 'Hour_Dest']
otp_lcd.drop(drop_cols, axis=1, inplace=True)
otp_lcd.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin_OTP,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest_OTP,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindDirection_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,REM_Origin,STATION_Dest,WMO_Dest,WBAN_Dest_LCD,LATITUDE_Dest,LONGITUDE_Dest,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,HourlyWindDirection_Dest,HourlyWindGustSpeed_Dest,HourlyWindSpeed_Dest,REM_Dest
5579800,3,14,4,B6,N517JB,BDL,"Hartford, CT",CT,Connecticut,14740,SJU,"San Juan, PR",PR,Puerto Rico,11641,600,553,-7,0,6,20,6,13,5,951,919,-32,0,9,0,231,1666,7,0,0,0,0,0,72508014740,725080,14740,41.9375,-72.6819,30.280001,38,0.0,70,OVC,10,0,0,0,MET11903/14/19 06:51:02 METAR KBDL 141151Z 000...,78526011641,785260,11641,18.4325,-66.0108,30.08,78,0.0,76,FEW,10,350,0,8,MET10603/14/19 09:56:02 METAR TJSJ 141356Z 350...


___

## Export resulting DF into CSV file

In [17]:
# Write the merged table to CSV, but only if the target file does not exist yet.
# An existing file is never overwritten: delete it manually to regenerate it.
output_folder = '../data/output/us_dot-noaa/'
file_name = "3_otp_lcd_2019.csv"
output_path = os.path.join(output_folder, file_name)

# Create the output folder when it is missing (listing a non-existent folder
# would otherwise raise FileNotFoundError on a fresh checkout).
os.makedirs(output_folder, exist_ok=True)

if not os.path.isfile(output_path):
    # Save such DataFrame into a CSV file (only once):
    otp_lcd.to_csv(output_path,
                   index=False,
                   encoding='latin1')
    print("File '" + file_name + "' has been generated.")
else:
    print("File '" + file_name + "' already exists.\nNo file has been generated (previous one remains).")

File '3_otp_lcd_2019.csv' has been generated.


___

In [18]:
# Elapsed time, in seconds, since `t0` was taken.
t1 = time.perf_counter() - t0
# Round the total once and then split it, so a remainder such as 59.6 s carries
# into the minutes instead of being printed as "60s".
elapsed_hours, leftover_seconds = divmod(round(t1), 3600)
elapsed_minutes, elapsed_seconds = divmod(leftover_seconds, 60)
print("Time elapsed: {:2d}h {:2d}min {:2d}s".format(elapsed_hours, elapsed_minutes, elapsed_seconds))

Time elapsed:  0h 10min  5s


___