In [1]:
# Import libraries to be used

# Directories/Files management
import os.path

# Timing
import time

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and may freeze the kernel
import missingno as msno

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

In [2]:
# Start a wall-clock timer; the last cell subtracts t0 from a later perf_counter() reading.
t0 = time.perf_counter() 

In [3]:
# Report the operating-system family and the notebook's working directory.
# os.name only distinguishes OS *families*: 'posix' is returned on every
# Unix-like host (any Linux distribution, macOS, ...), so it cannot be used to
# claim a specific distribution such as Ubuntu.
root = os.getcwd()
if os.name == 'nt': # Windows
    print("Running on Windows.")
elif os.name == 'posix': # Linux, macOS and other Unix-like systems
    print("Running on a POSIX system (Linux/macOS).")
else: # Anything else is reported explicitly instead of staying silent
    print("Running on an unrecognized OS family: " + os.name)
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\Desktop\TFM\notebooks


___

# Get the data

## LCD clean file (2019)

Documentation for the LCD (Local Climatological Data) dataset is available in the official [NOAA archive](https://www.ncei.noaa.gov/data/local-climatological-data/doc/).

### Import file

#### Define file path

In [11]:
# Location of the post-processed LCD (weather observations) CSV.
output_folder = '../data/output/noaa/lcd_all/'
file_name = "lcd_all_postprocessed.csv"

# Load the file with an explicit dtype for every listed column.
# The mapping is assembled per dtype; read_csv resolves dtypes by column name,
# so the order of the keys does not matter.
lcd = pd.read_csv(
    f"{output_folder}{file_name}",
    encoding='latin1',
    dtype={
        # Columns read as text
        **dict.fromkeys(['STATION', 'WMO', 'WBAN', 'DATE', 'Date', 'Hour',
                         'HourlySkyConditions', 'HourlyWindDirection', 'REM'], 'string'),
        # Station coordinates
        **dict.fromkeys(['LATITUDE', 'LONGITUDE'], 'float64'),
        # Fractional measurements
        **dict.fromkeys(['HourlyAltimeterSetting', 'HourlyPrecipitation'], 'float32'),
        # Whole-number measurements
        **dict.fromkeys(['HourlyDryBulbTemperature', 'HourlyRelativeHumidity', 'HourlyVisibility',
                         'HourlyWindGustSpeed', 'HourlyWindSpeed'], 'int32'),
    },
)

In [12]:
# Peek at five randomly drawn rows (no seed is set, so the rows differ between runs).
lcd.sample(n=5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,LATITUDE,LONGITUDE,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,REM
2054338,70200026617,702000,26617,2019-07-01 06:53:00,2019-07-01,6,64.5111,-165.44,30.27,46,0.0,89,OVC,9,220,0,11,MET10907/01/19 06:53:02 METAR PAOM 011553Z 220...
29266,72659014929,726590,14929,2019-05-05 22:53:00,2019-05-05,22,45.4433,-98.413,30.040001,48,0.0,46,OVC,10,10,0,13,MET09405/05/19 22:53:02 METAR KABR 060453Z 010...
1146754,72645014898,726450,14898,2019-06-21 22:53:00,2019-06-21,22,44.4794,-88.1366,30.07,60,0.0,62,CLR,10,60,0,7,MET09006/21/19 22:53:02 METAR KGRB 220453Z 060...
1261612,72745594931,727455,94931,2019-08-30 10:53:00,2019-08-30,10,47.38639,-92.83889,30.16,64,0.0,43,SCT,10,310,0,11,MET09308/30/19 10:53:02 METAR KHIB 301653Z 310...
2530602,72289793206,722897,93206,2019-01-18 02:56:00,2019-01-18,2,35.23722,-120.64139,30.26,52,0.0,97,VV,0,130,0,3,MET09601/18/19 02:56:02 METAR KSBP 181056Z 130...


In [13]:
# Build the hourly join key '<WBAN>_<year>-<month>-<day>_<Hour>' for every LCD row.
# The three date parts are cut by character position out of the 'Date' text column.
lcd_year, lcd_month, lcd_day = (
    lcd['Date'].str.slice(begin, end) for begin, end in ((0, 4), (5, 7), (8, 10))
)
lcd_id = lcd['WBAN'] + '_' + (lcd_year + '-' + lcd_month + '-' + lcd_day) + '_' + lcd['Hour']
# Store the key on the frame so it can serve as the merge column later on.
lcd['LCD_id'] = lcd_id
lcd.sample(n=5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,LATITUDE,LONGITUDE,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,REM,LCD_id
780012,72547094908,725470,94908,2019-07-01 09:53:00,2019-07-01,9,42.39778,-90.70361,30.040001,76,0.0,85,CLR,10,220,0,8,MET09707/01/19 09:53:02 METAR KDBQ 011553Z 220...,94908_2019-07-01_9
824850,72745014913,727450,14913,2019-09-06 05:05:00,2019-09-06,5,46.8369,-92.1833,30.049999,58,0.0,81,BKN,10,340,0,9,MET09809/06/19 05:05:02 SPECI KDLH 061105Z 340...,14913_2019-09-06_5
858839,72537094847,725370,94847,2019-07-24 21:53:00,2019-07-24,21,42.2313,-83.3308,30.139999,72,0.0,53,CLR,10,140,0,3,MET089METAR KDTW 250253Z 14003KT 10SM SCT060 S...,94847_2019-07-24_21
571715,72208013880,722080,13880,2019-09-05 03:10:00,2019-09-05,3,32.89943,-80.04075,29.52,73,0.0,94,OVC,1,0,0,29,MET14009/05/19 03:10:02 SPECI KCHS 050810Z ...,13880_2019-09-05_3
819383,72745014913,727450,14913,2019-01-20 22:55:00,2019-01-20,22,46.8369,-92.1833,30.379999,-2,0.0,75,FEW,10,180,0,7,MET09501/20/19 22:55:02 METAR KDLH 210455Z 180...,14913_2019-01-20_22


## OTP dataset

Additional information on the meaning of each column can be found on the [BTS Transtats website](https://www.transtats.bts.gov/homepage.asp):
1. Go to: *Data Finder / By Mode / Aviation*
2. Hit: *Airline On-Time Performance Data*
3. Open: *Reporting Carrier On-Time Performance (1987-present)*

### Import OTP dataset

In [14]:
# Columns to read from the OTP CSV (passed to read_csv through `usecols`).
cols = [
    # Calendar, carrier and aircraft
    'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM',
    # Origin airport
    'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_NM', 'WBAN_Origin',
    # Destination airport
    'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_NM', 'WBAN_Dest',
    # Departure
    'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'DEP_DEL15', 'DEP_TIME_hour',
    # Taxiing
    'TAXI_OUT', 'TAXI_IN', 'TAXI_OUT_median', 'TAXI_IN_median',
    # Arrival
    'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'ARR_DEL15', 'ARR_TIME_hour',
    # Flight summary
    'CANCELLED', 'CRS_ELAPSED_TIME', 'DISTANCE', 'DISTANCE_GROUP',
    # Delay breakdown
    'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
]

In [15]:
# dtype requested for each OTP column when the CSV is read (passed to read_csv
# through `dtype`). Keys are listed in the same order as the entries of `cols`.
cols_dtypes = dict(
    MONTH='string',
    DAY_OF_MONTH='string',
    DAY_OF_WEEK='category',
    OP_UNIQUE_CARRIER='category',
    TAIL_NUM='string',
    ORIGIN='category',
    ORIGIN_CITY_NAME='string',
    ORIGIN_STATE_ABR='category',
    ORIGIN_STATE_NM='category',
    WBAN_Origin='string',
    DEST='category',
    DEST_CITY_NAME='string',
    DEST_STATE_ABR='category',
    DEST_STATE_NM='category',
    WBAN_Dest='string',
    CRS_DEP_TIME='string',
    DEP_TIME='string',
    DEP_DELAY='int32',
    DEP_DEL15='int32',
    DEP_TIME_hour='string',
    TAXI_OUT='int32',
    TAXI_IN='int32',
    TAXI_OUT_median='int32',
    TAXI_IN_median='int32',
    CRS_ARR_TIME='string',
    ARR_TIME='string',
    ARR_DELAY='int32',
    ARR_DEL15='int32',  # -> Target !!
    ARR_TIME_hour='string',
    CANCELLED='string',
    CRS_ELAPSED_TIME='int32',
    DISTANCE='int32',
    DISTANCE_GROUP='category',
    CARRIER_DELAY='int32',
    WEATHER_DELAY='int32',
    NAS_DELAY='int32',
    SECURITY_DELAY='int32',
    LATE_AIRCRAFT_DELAY='int32',
)

In [16]:
# Location of the OTP (flights) CSV.
output_folder = '../data/output/us_dot/'
file_name = "2_otp_2019_wban.csv"

# Read only the columns named in `cols`, typed as declared in `cols_dtypes`.
otp = pd.read_csv(
    f"{output_folder}{file_name}",
    usecols=cols,
    dtype=cols_dtypes,
    encoding='latin1',
)

### Adapt OTP dataset format

In [17]:
# It is observed that WBAN format is not always in 5-digit format due to conversion from 'int' dtype,
# so left-pad the codes with zeros up to 5 characters.
# The vectorised .str.zfill is used instead of a per-row lambda: it pads codes that
# lost more than one leading zero and passes missing values through as missing,
# where calling len() on a missing value would raise a TypeError.
for c in ['WBAN_Origin', 'WBAN_Dest']:
    otp[c] = otp[c].str.zfill(5)
# Likewise, convert months and days into 2-digit format:
for d in ['MONTH', 'DAY_OF_MONTH']:
    otp[d] = otp[d].str.zfill(2)

## OTP-LCD merge

### 1st merge: `Origin`

In [18]:
# Snapshot of the LCD column names; the 'Dest' merge cell reads this variable as well.
lcd_original_cols = lcd.columns
# Origin merge:
# Key layout: '<WBAN>_2019-<MM>-<DD>_<departure hour>', matching the 'LCD_id' column built on `lcd`.
otp_id_Origin = otp['WBAN_Origin'] + '_' + '2019-' + otp['MONTH'] + '-' + otp['DAY_OF_MONTH'] + '_' + otp['DEP_TIME_hour']
otp['LCD_id_Origin'] = otp_id_Origin
# The '_Origin' labels are applied to a relabelled frame handed straight to merge();
# `lcd` itself is no longer renamed here, so re-running this cell does not stack
# suffixes ('..._Origin_Origin') on the shared frame.
otp_lcd_Origin = otp.merge(
    lcd.set_axis(lcd_original_cols.map(lambda x: str(x) + '_Origin'), axis=1),
    how='inner',  # flights without a matching LCD row are dropped
    on='LCD_id_Origin',
    suffixes=['_OTP', '_LCD'],
)

In [19]:
# Show one randomly drawn row of the Origin-merged frame.
otp_lcd_Origin.sample(n=1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin_OTP,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,LCD_id_Origin,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,DATE_Origin,Date_Origin,Hour_Origin,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindDirection_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,REM_Origin
4017243,3,9,6,AA,N163AA,MIA,"Miami, FL",FL,Florida,12839,MCO,"Orlando, FL",FL,Florida,12815,655,647,-8,0,6,17,5,16,8,806,747,-19,0,8,0,71,192,1,0,0,0,0,0,12839_2019-03-09_6,72202012839,722020,12839,2019-03-09 06:53:00,2019-03-09,6,25.7881,-80.3169,30.09,71,0.0,90,BKN,10,80,0,3,MET12903/09/19 06:53:02 METAR KMIA 091153Z 080...


### 2nd merge: `Dest`

In [20]:
# Dest merge:
# Key layout: '<WBAN>_2019-<MM>-<DD>_<arrival hour>', matching the 'LCD_id' column built on `lcd`.
# NOTE(review): the key reuses MONTH/DAY_OF_MONTH together with ARR_TIME_hour. If those
# two columns hold the departure date, a flight landing after midnight is paired with
# the previous day's observation at the destination -- TODO confirm against the OTP schema.
otp_id_Dest = otp_lcd_Origin['WBAN_Dest'] + '_' + '2019-' + otp_lcd_Origin['MONTH'] + '-' \
              + otp_lcd_Origin['DAY_OF_MONTH'] + '_' + otp_lcd_Origin['ARR_TIME_hour']
otp_lcd_Origin['LCD_id_Dest'] = otp_id_Dest
# The '_Dest' labels are derived from the `lcd_original_cols` snapshot and applied to a
# relabelled frame handed straight to merge(); `lcd` itself is not renamed by this
# cell, so the result does not depend on what the frame's columns are currently called.
otp_lcd = otp_lcd_Origin.merge(
    lcd.set_axis(lcd_original_cols.map(lambda x: str(x) + '_Dest'), axis=1),
    how='inner',  # flights without a matching LCD row are dropped
    on='LCD_id_Dest',
    suffixes=['_OTP', '_LCD'],
)

In [21]:
# Show one randomly drawn row of the fully merged (Origin + Dest) frame.
otp_lcd.sample(n=1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin_OTP,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest_OTP,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,LCD_id_Origin,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,DATE_Origin,Date_Origin,Hour_Origin,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindDirection_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,REM_Origin,LCD_id_Dest,STATION_Dest,WMO_Dest,WBAN_Dest_LCD,DATE_Dest,Date_Dest,Hour_Dest,LATITUDE_Dest,LONGITUDE_Dest,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,HourlyWindDirection_Dest,HourlyWindGustSpeed_Dest,HourlyWindSpeed_Dest,REM_Dest
5426058,12,21,6,WN,N8559Q,BWI,"Baltimore, MD",MD,Maryland,93721,SEA,"Seattle, WA",WA,Washington,24233,1545,1608,23,1,15,12,9,11,8,1855,1859,4,0,18,0,370,2335,10,0,0,0,0,0,93721_2019-12-21_15,72406093721,724060,93721,2019-12-21 15:54:00,2019-12-21,15,39.1733,-76.684,30.49,37,0.0,52,BKN,10,0,0,0,MET11912/21/19 15:54:02 METAR KBWI 212054Z 000...,24233_2019-12-21_18,72793024233,727930,24233,2019-12-21 18:31:00,2019-12-21,18,47.4444,-122.3138,29.85,43,0.0,93,OVC,2,140,0,7,MET11212/21/19 18:31:01 SPECI KSEA 220231Z 140...


### Final points before exporting

In [22]:
# Report the row count before and after both merges, plus the relative difference.
# NOTE(review): the percentage is derived from row counts only, so it equals the share of
# lost flights only if neither merge duplicated rows -- presumably 'LCD_id' is unique; verify.
print(f"OTP dataset number of flights : {len(otp)}")
print(f"OTP-LCD dataset number of flights : {len(otp_lcd)} "
      f"({(len(otp) - len(otp_lcd)) * 100 / len(otp):4.2f}% dropped)")

OTP dataset number of flights : 7208372
OTP-LCD dataset number of flights : 7200053 (0.12% dropped)


In [23]:
# Drop redundant columns (merge keys and the date/hour fields they were built from):
drop_cols = ['LCD_id_Origin', 'DATE_Origin', 'Date_Origin', 'Hour_Origin', 'LCD_id_Dest', 'DATE_Dest', 'Date_Dest', 'Hour_Dest']
# Rebind instead of dropping in place, and skip labels that are already gone, so the
# cell can be re-run without raising KeyError on the columns removed by the first run.
otp_lcd = otp_lcd.drop(columns=drop_cols, errors='ignore')
otp_lcd.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin_OTP,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest_OTP,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,LATITUDE_Origin,LONGITUDE_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyWindDirection_Origin,HourlyWindGustSpeed_Origin,HourlyWindSpeed_Origin,REM_Origin,STATION_Dest,WMO_Dest,WBAN_Dest_LCD,LATITUDE_Dest,LONGITUDE_Dest,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,HourlyWindDirection_Dest,HourlyWindGustSpeed_Dest,HourlyWindSpeed_Dest,REM_Dest
4492884,7,25,4,WN,N273WN,SAN,"San Diego, CA",CA,California,23188,SJC,"San Jose, CA",CA,California,23293,1745,1747,2,0,17,10,3,12,3,1905,1856,-9,0,19,0,80,417,2,0,0,0,0,0,72290023188,722900,23188,32.7336,-117.1831,29.93,76,0.0,69,SCT,10,200,0,7,MET10707/25/19 17:51:01 METAR KSAN 260151Z 200...,72494523293,724945,23293,37.3591,-121.924,29.93,68,0.0,87,FEW,10,320,0,9,MET10907/25/19 19:53:02 METAR KSJC 260353Z 320...


___

## Export resulting DF into CSV file

In [24]:
# Destination of the merged OTP-LCD dataset.
output_folder = '../data/output/us_dot-noaa/'
file_name = "3_otp_lcd_2019.csv"

# Make sure the destination folder exists; listing or writing into a missing folder would raise.
os.makedirs(output_folder, exist_ok=True)

# Ask the filesystem directly whether the target exists rather than scanning a directory
# listing: this follows the filesystem's own name-matching rules (e.g. case-insensitive
# names on Windows) and does not read the whole folder.
if not os.path.exists(output_folder + file_name):
    # Save such DataFrame into a CSV file (only once):
    otp_lcd.to_csv(output_folder + file_name,
                   index=False,
                   encoding='latin1')
    print("File '" + file_name + "' has been generated.")
else:
    # An existing file is never overwritten; delete it manually to force a fresh export.
    print("File '" + file_name + "' already exists.\nNo file has been generated (previous one remains).")

File '3_otp_lcd_2019.csv' has been generated.


___

In [25]:
# Time elapsed since t0 was recorded. perf_counter() is a wall-clock style timer
# (it includes time spent sleeping or waiting), so this is not CPU time.
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # wall-clock seconds elapsed (floating point)

Time elapsed:  2955.1426988999997


___