In [1]:
# Import libraries to be used

# Directories/Files management
import os.path

# Timing
import time

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and may freeze the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

In [2]:
t0 = time.perf_counter() 

In [None]:
# Detect Operating System running and manage paths accordingly

root = os.getcwd()
if os.name == 'nt': # Windows
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    print("Running on Ubuntu.")
print("root path\t", root)

___

# Get the data

### OTP (2019)

Let's first check which airports are present in the OTP dataset, and later compare them to those appearing in the WBAN database. Ideally, all the airports contained in the OTP dataset should appear in the WBAN database. This would mean that there presumably might be a weather station at each of those airports.

In [3]:
cols = ['ORIGIN', 'DEST']
cols_dtypes = {'ORIGIN' : 'category', 'DEST' : 'category'}

In [4]:
%%time

otp = pd.read_csv('../data/output/us_dot/1_otp_2019_raw.csv',
                  encoding='latin1',
                  usecols=cols,
                  dtype=cols_dtypes)

Wall time: 18.4 s


In [5]:
otp

Unnamed: 0,ORIGIN,DEST
0,TYS,ATL
1,TYS,ATL
2,ATL,SGF
3,ATL,SGF
4,ATL,SGF
...,...,...
7268227,MCO,SWF
7268228,DCA,BOS
7268229,PHL,BOS
7268230,BOS,SJU


In [6]:
otp_iatas = otp['ORIGIN'].unique()
otp_iatas

['TYS', 'ATL', 'SGF', 'SRQ', 'DTW', ..., 'AKN', 'DLG', 'HYA', 'PGV', 'XWA']
Length: 360
Categories (360, object): ['TYS', 'ATL', 'SGF', 'SRQ', ..., 'DLG', 'HYA', 'PGV', 'XWA']

### WBAN list ("MASTER-STN-HIST.txt")

The reader may find the [file](http://www1.ncdc.noaa.gov/pub/data/inventories/MASTER-STN-HIST.TXT) in the NOAA official website, together with relevant info to explore its content.  
*Please note that the hosting server is occasionally down.*

#### Import the source TXT file and clean it

In [7]:
new_file = []
with open ('../data/source/noaa/wban/MASTER-STN-HIST.txt', 'r') as f:
    next(f) # Skip the first row, since it's a header
    i = 0
    for line in f.readlines():
        # Break each line in accordance with the MASTER STATION HISTORY INFORMATION FILE documentation: 
        HOMR_ID = line[0:8]
        wban = line[22:27]
        WMO = line[28:33]
        FAA_LOC_ID = line[34:38]
        NWS_LOC_ID = line[39:44]
        ICAO_ID = line[45:49]
        COUNTRY = line[50:70]
        TIME_ZONE = line[105:110]
        HIST_wban_NAME = line[142:172]
        new_line = [HOMR_ID, wban, WMO, FAA_LOC_ID, NWS_LOC_ID, ICAO_ID, COUNTRY, TIME_ZONE, HIST_wban_NAME]
        new_line_2 = []
        for field in new_line:
            field = field.strip()
            new_line_2.append(field)
        new_line_2 = '^'.join(new_line_2)
        new_file.append(new_line_2)

#         i += 1
#         if i == 15:
#             break
            
new_file[:10]

['10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000158^^^^^^UNITED STATES^+9^GUSTAVUS']

In [8]:
with open('../data/output/noaa/wban/MASTER-STN-HIST_preprocessed.csv', 'w') as f:
    headers = ['HOMR_ID', 'WBAN', 'WMO', 'FAA_LOC_ID', 'NWS_LOC_ID', 'ICAO_ID', 'COUNTRY', 'TIME_ZONE', 'HIST_WBAN_NAME']
    f.write('^'.join(headers))
    f.write('\n')
    f.write('\n'.join(new_file))

___

#### Import the recently generated CSV file and start exploring it

In [9]:
wban = pd.read_csv('../data/output/noaa/wban/MASTER-STN-HIST_preprocessed.csv',
                   sep='^',
                   encoding='latin1',
                   low_memory = False)
wban

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178012,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178013,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178014,30125303,,,,,,UNITED STATES,5.0,CAMDEN


In [10]:
# For this study, only stations containing both WBAN will be useful:
wban.drop(wban[wban['WBAN'].isna()].index, axis=0, inplace=True)
# However, several empty values are found for FAA_LOC_ID:
wban.drop(wban[wban['FAA_LOC_ID'].isna()].index, axis=0, inplace=True)
# And many others come in the 4-character long way:
wban.drop(wban[wban['FAA_LOC_ID'].str.len() > 3].index, axis=0, inplace=True)
# Replace NaNs:
wban.fillna({'WMO': 999, 'TIME_ZONE': 999}, inplace=True)
wban.fillna({'NWS_LOC_ID': '', 'ICAO_ID': ''}, inplace=True)
# Drop duplicates:
wban = wban.sort_values(by=['FAA_LOC_ID', 'WBAN'], ascending=True)
wban.drop_duplicates('WBAN', keep='last', inplace=True)
wban.sort_index(inplace=True)
wban = wban.sort_values(by=['FAA_LOC_ID', 'WMO', 'WBAN'], ascending=True)
wban.drop_duplicates('FAA_LOC_ID', keep='last', inplace=True)
wban.sort_index(inplace=True)
# Cast numeric columns type to 'int64':
num_cols = wban.select_dtypes('number').columns
wban[num_cols] = wban[num_cols].astype('int64')
# Generate a unique identifier based on several elements:
wban['UniqueID'] = wban['WMO'].apply(lambda x: str(x)) + '0' \
                   + wban['WBAN'].apply(lambda x: str(x)) #\
#                    + '_' + wban['FAA_LOC_ID'].apply(lambda x: str(x))

wban

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME,UniqueID
8,10000001,24285,999,ONP,ONP,KONP,UNITED STATES,8,NEWPORT MUNI AP,999024285
30,10000239,26489,999,5MK,5MK,,UNITED STATES,9,MCKINLEY NP AP,999026489
32,10000239,46403,999,INR,INR,PAIN,UNITED STATES,9,MCKINLEY NP AP,999046403
40,10000355,25335,70362,SGY,SGY,PAGY,UNITED STATES,9,SKAGWAY AP,70362025335
50,10000485,53864,999,EET,EET,KEET,UNITED STATES,6,ALABASTER SHELBY CO AP,999053864
...,...,...,...,...,...,...,...,...,...,...
177224,30121168,64779,999,MQE,,KMQE,UNITED STATES,5,BLUE HILL ASOS,999064779
177234,30121192,94099,999,XWA,XWA,KXWA,UNITED STATES,6,WILLISTON AP,999094099
177253,30121791,63904,999,T74,,KT74,UNITED STATES,6,TAYLOR MUNI AP,999063904
177259,30121800,43110,999,LNQ,,KLNQ,UNITED STATES,6,BOOTHVILLE HELIPORT,999043110


In [11]:
pd.set_option('display.max_rows', 10)
wban_otp = wban[wban['FAA_LOC_ID'].isin(otp_iatas)]
wban_otp

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME,UniqueID
168,10001177,93193,72389,FAT,FAT,KFAT,UNITED STATES,8,FRESNO YOSEMITE INTL,72389093193
241,10001399,24257,72592,RDD,RDD,KRDD,UNITED STATES,8,REDDING MUNI AP,72592024257
266,10001444,23188,72290,SAN,SAN,KSAN,UNITED STATES,8,SAN DIEGO INTL AP,72290023188
301,10001448,23234,72494,SFO,SFO,KSFO,UNITED STATES,8,SAN FRANCISCO INTL AP,72494023234
308,10001454,93206,999,SBP,,KSBP,UNITED STATES,8,SAN LUIS OBISPO AP,999093206
...,...,...,...,...,...,...,...,...,...,...
175156,30015485,73805,999,ECP,ECP,KECP,UNITED STATES,6,NW FLORIDA BEACHES INTL AP,999073805
175499,30015673,12838,999,MLB,MLB,KMLB,UNITED STATES,5,MELBOURNE INTL AP,999012838
175661,30071738,94030,999,VEL,,KVEL,UNITED STATES,7,VERNAL MUNI AP,999094030
176305,30083422,398,999,PSE,,TJPS,PUERTO RICO,4,MERCEDITA AP,9990398


In [12]:
wban.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2762 entries, 8 to 177274
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   HOMR_ID         2762 non-null   int64 
 1   WBAN            2762 non-null   int64 
 2   WMO             2762 non-null   int64 
 3   FAA_LOC_ID      2762 non-null   object
 4   NWS_LOC_ID      2762 non-null   object
 5   ICAO_ID         2762 non-null   object
 6   COUNTRY         2762 non-null   object
 7   TIME_ZONE       2762 non-null   int64 
 8   HIST_WBAN_NAME  2762 non-null   object
 9   UniqueID        2762 non-null   object
dtypes: int64(4), object(6)
memory usage: 237.4+ KB


In [13]:
wban_iatas = wban['FAA_LOC_ID'].unique()
print(wban['FAA_LOC_ID'].nunique())
wban_iatas

2762


array(['ONP', '5MK', 'INR', ..., 'T74', 'LNQ', 'SXF'], dtype=object)

In [14]:
disjointed_iatas = []
for iata in otp_iatas:
    if iata not in wban_iatas:
        disjointed_iatas.append(iata)
print(len(disjointed_iatas))
disjointed_iatas

9


['AZA', 'USA', 'FCA', 'SCE', 'PAH', 'SPN', 'HHH', 'OGS', 'BKG']

In [15]:
for iata in disjointed_iatas:
    print("{} : {} flights ({:4.2f}%)".format(iata,
                                         len(otp[otp['ORIGIN'] == iata]),
                                         len(otp[otp['ORIGIN'] == iata]) / len(otp) * 100))

AZA : 5665 flights (0.08%)
USA : 1352 flights (0.02%)
FCA : 3015 flights (0.04%)
SCE : 1764 flights (0.02%)
PAH : 1222 flights (0.02%)
SPN : 471 flights (0.01%)
HHH : 2180 flights (0.03%)
OGS : 502 flights (0.01%)
BKG : 221 flights (0.00%)


With this quick check, we have ensured that almost every airport appearing in the OTP dataset is also present in the WBAN database. In other words, we now know that each airport has its corresponding meteorological station.

For those few ones that are not present (listed above), it is fair to simply drop them. This can be performed without the risk of losing model's prediction capability, considering that they account altogether for less than 0.3% of the complete dataset.

#### Generate the resulting CSV file for later use

In [16]:
wban.to_csv('../data/output/noaa/wban/MASTER-STN-HIST_postprocessed.csv',
            index=False,
            sep='^',
            encoding='latin1')

___

### OTP-WBAN merge: IATA check

In [17]:
pd.set_option('display.max_rows', 400)

In [18]:
otp_wban = otp[['ORIGIN']].groupby('ORIGIN').count().reset_index() \
                          .merge(wban[['WBAN', 'WMO', 'FAA_LOC_ID', 'UniqueID']],
                                 how='inner', left_on='ORIGIN', right_on='FAA_LOC_ID')
otp_wban

Unnamed: 0,ORIGIN,WBAN,WMO,FAA_LOC_ID,UniqueID
0,ABE,14737,72517,ABE,72517014737
1,ABQ,23050,72365,ABQ,72365023050
2,AEX,93915,74754,AEX,74754093915
3,ALB,14735,72518,ALB,72518014735
4,AMA,23047,72363,AMA,72363023047
5,ATL,13874,72219,ATL,72219013874
6,ATW,4825,999,ATW,99904825
7,AUS,13958,72254,AUS,72254013958
8,AVL,3812,72315,AVL,7231503812
9,BDL,14740,72508,BDL,72508014740


___

### Local Climatological Data (LCD) → 2019 Master file

Source: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/

#### Import the generated CSV files and start exploring them

In [19]:
pd.set_option('display.max_rows', 10)

In [20]:
tables = pd.read_html("https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/")
lcd_files = tables[0].loc[2:].copy()
lcd_files

Unnamed: 0,Name,Last modified,Size,Description
2,01001099999.csv,2020-10-24 23:45,4.2M,
3,01001499999.csv,2020-10-25 00:04,3.2M,
4,01002099999.csv,2020-10-24 23:44,4.0M,
5,01003099999.csv,2020-10-25 00:08,4.3M,
6,01006099999.csv,2020-10-25 00:53,4.1M,
...,...,...,...,...
13464,A0735700182.csv,2020-10-24 23:26,14M,
13465,A0735900240.csv,2020-10-24 23:41,17M,
13466,A5125500445.csv,2020-10-24 23:33,13M,
13467,A5125600451.csv,2020-10-24 23:32,14M,


In [21]:
lcd_files.drop(labels=['Last modified', 'Size', 'Description'], axis=1, inplace=True)
lcd_files['WBAN'] = lcd_files.loc[:, 'Name'].str.slice(-9, -4)
lcd_files.drop(lcd_files[lcd_files['WBAN'] == '99999'].index, inplace=True)
lcd_files['WBAN'].dropna(inplace=True)
lcd_files.dropna(how='all', axis=0, inplace=True)
lcd_files['WBAN'] = lcd_files['WBAN'].astype('int64')
lcd_files

Unnamed: 0,Name,WBAN
816,03577035046.csv,35046
817,03583035034.csv,35034
910,04018016201.csv,16201
1605,08449013025.csv,13025
2531,16289034113.csv,34113
...,...,...
13463,A0735500241.csv,241
13464,A0735700182.csv,182
13465,A0735900240.csv,240
13466,A5125500445.csv,445


In [22]:
otp_wban.head()

Unnamed: 0,ORIGIN,WBAN,WMO,FAA_LOC_ID,UniqueID
0,ABE,14737,72517,ABE,72517014737
1,ABQ,23050,72365,ABQ,72365023050
2,AEX,93915,74754,AEX,74754093915
3,ALB,14735,72518,ALB,72518014735
4,AMA,23047,72363,AMA,72363023047


In [23]:
lcd_wban = lcd_files.merge(otp_wban[['WBAN', 'WMO', 'FAA_LOC_ID', 'UniqueID']], how='right', left_on='WBAN', right_on='WBAN')
lcd_wban

Unnamed: 0,Name,WBAN,WMO,FAA_LOC_ID,UniqueID
0,72517014737.csv,14737,72517,ABE,72517014737
1,72365023050.csv,23050,72365,ABQ,72365023050
2,74754093915.csv,93915,74754,AEX,74754093915
3,72518014735.csv,14735,72518,ALB,72518014735
4,72363023047.csv,23047,72363,AMA,72363023047
...,...,...,...,...,...
346,70321025513.csv,25513,70321,DLG,70321025513
347,70367025322.csv,25322,70367,GST,70367025322
348,72506794720.csv,94720,999,HYA,999094720
349,72306513783.csv,13783,999,PGV,999013783


In [24]:
missing_stations = lcd_wban[lcd_wban['Name'].isna()]['FAA_LOC_ID'].to_list()
missing_stations

['HRL', 'IAG', 'DHN', 'ESC', 'PSE', 'SLN', 'STS', 'YUM', 'XWA']

In [25]:
for iata in missing_stations:
    print("{} : {} flights ({:4.2f}%)".format(iata,
                                         len(otp[otp['ORIGIN'] == iata]),
                                         len(otp[otp['ORIGIN'] == iata]) / len(otp) * 100))

HRL : 4370 flights (0.06%)
IAG : 830 flights (0.01%)
DHN : 1371 flights (0.02%)
ESC : 597 flights (0.01%)
PSE : 812 flights (0.01%)
SLN : 892 flights (0.01%)
STS : 2613 flights (0.04%)
YUM : 1909 flights (0.03%)
XWA : 204 flights (0.00%)


Once again, with this quick check we have ensured that almost every airport appearing in the OTP dataset is also present in the LCD database. In other words, we now know that each airport has its corresponding meteorological station's readings available.

For those few ones that are not present (listed above), it is fair to simply drop them. This can be performed without the risk of losing model's prediction capability, considering that they account altogether for less than 0.2% of the complete dataset.

In [26]:
lcd_wban_clean = lcd_files.merge(otp_wban[['WBAN', 'WMO', 'FAA_LOC_ID']], how='inner', left_on='WBAN', right_on='WBAN')
lcd_wban_clean

Unnamed: 0,Name,WBAN,WMO,FAA_LOC_ID
0,70026027502.csv,27502,70026,BRW
1,70063727406.csv,27406,999,SCC
2,70133026616.csv,26616,70133,OTZ
3,70200026617.csv,26617,70200,OME
4,70219026615.csv,26615,70219,BET
...,...,...,...,...
337,91190022516.csv,22516,91190,OGG
338,91197521510.csv,21510,999,KOA
339,91212041415.csv,41415,91212,GUM
340,91285021504.csv,21504,91285,ITO


In [27]:
wban_available = dict(zip(lcd_wban_clean['WBAN'].to_list(), lcd_wban_clean['Name'].to_list()))
wban_available.items()

dict_items([(27502, '70026027502.csv'), (27406, '70063727406.csv'), (26616, '70133026616.csv'), (26617, '70200026617.csv'), (26615, '70219026615.csv'), (26411, '70261026411.csv'), (26451, '70273026451.csv'), (26410, '70296026410.csv'), (25513, '70321025513.csv'), (25503, '70326025503.csv'), (25501, '70350025501.csv'), (25339, '70361025339.csv'), (25322, '70367025322.csv'), (25333, '70371025333.csv'), (25309, '70381025309.csv'), (25329, '70386025329.csv'), (25338, '70387025338.csv'), (25325, '70395025325.csv'), (25704, '70454025704.csv'), (73805, '72073573805.csv'), (12836, '72201012836.csv'), (12839, '72202012839.csv'), (12844, '72203012844.csv'), (12812, '72203412812.csv'), (12838, '72204012838.csv'), (12815, '72205012815.csv'), (12854, '72205712854.csv'), (13889, '72206013889.csv'), (3822, '72207003822.csv'), (13880, '72208013880.csv'), (12894, '72210812894.csv'), (12842, '72211012842.csv'), (12871, '72211512871.csv'), (12873, '72211612873.csv'), (53883, '72213653883.csv'), (93805, '

#### Run only the first time to download every individual LCD file (year 2019) corresponding to each WBAN station

In [28]:
import urllib.request
import time

# Root URL to download the files:
base_url = 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/'
# Storage folder:
lcds_folder = '../data/source/noaa/lcd_individual_files/' 

i = 1

for wban, wban_csv_name in wban_available.items():
    iata = lcd_wban_clean.loc[lcd_wban_clean['WBAN'] == wban, 'FAA_LOC_ID'].values[0]
    file_name = iata + "_" + str(wban) + ".csv"
    # Check if the file has already been downloaded:
    if file_name in os.listdir(lcds_folder):
        print("File '" + file_name + "' already exists.\nNo file has been generated (previous one remains).")
        continue
    else:
        if os.listdir(lcds_folder) == []:
            print('Beginning files download with urllib...')
        file_path = lcds_folder + file_name
        url = base_url + wban_csv_name
        print(i, ") IATA:", iata, "/ WBAN:", wban, "/ Link:", url)
        urllib.request.urlretrieve(url, file_path)
    #     time.sleep(5)
        i += 1
    #     if i == 3:
    #         break

File 'BRW_27502.csv' already exists.
No file has been generated (previous one remains).
File 'SCC_27406.csv' already exists.
No file has been generated (previous one remains).
File 'OTZ_26616.csv' already exists.
No file has been generated (previous one remains).
File 'OME_26617.csv' already exists.
No file has been generated (previous one remains).
File 'BET_26615.csv' already exists.
No file has been generated (previous one remains).
File 'FAI_26411.csv' already exists.
No file has been generated (previous one remains).
File 'ANC_26451.csv' already exists.
No file has been generated (previous one remains).
File 'CDV_26410.csv' already exists.
No file has been generated (previous one remains).
File 'DLG_25513.csv' already exists.
No file has been generated (previous one remains).
File 'AKN_25503.csv' already exists.
No file has been generated (previous one remains).
File 'ADQ_25501.csv' already exists.
No file has been generated (previous one remains).
File 'YAK_25339.csv' already exi

File 'IAD_93738.csv' already exists.
No file has been generated (previous one remains).
File 'DCA_13743.csv' already exists.
No file has been generated (previous one remains).
File 'BWI_93721.csv' already exists.
No file has been generated (previous one remains).
File 'HGR_93706.csv' already exists.
No file has been generated (previous one remains).
File 'ACY_93730.csv' already exists.
No file has been generated (previous one remains).
File 'PHL_13739.csv' already exists.
No file has been generated (previous one remains).
File 'TTN_14792.csv' already exists.
No file has been generated (previous one remains).
File 'LYH_13733.csv' already exists.
No file has been generated (previous one remains).
File 'SHD_93760.csv' already exists.
No file has been generated (previous one remains).
File 'ROA_13741.csv' already exists.
No file has been generated (previous one remains).
File 'LWB_53801.csv' already exists.
No file has been generated (previous one remains).
File 'CRW_13866.csv' already exi

No file has been generated (previous one remains).
File 'SLC_24127.csv' already exists.
No file has been generated (previous one remains).
File 'PVU_24174.csv' already exists.
No file has been generated (previous one remains).
File 'RKS_24027.csv' already exists.
No file has been generated (previous one remains).
File 'OGD_24126.csv' already exists.
No file has been generated (previous one remains).
File 'JAC_24166.csv' already exists.
No file has been generated (previous one remains).
File 'PIH_24156.csv' already exists.
No file has been generated (previous one remains).
File 'IDA_24145.csv' already exists.
No file has been generated (previous one remains).
File 'EKO_24121.csv' already exists.
No file has been generated (previous one remains).
File 'SUN_94161.csv' already exists.
No file has been generated (previous one remains).
File 'TWF_94178.csv' already exists.
No file has been generated (previous one remains).
File 'RDD_24257.csv' already exists.
No file has been generated (prev

In [29]:
disjointed_iatas

['AZA', 'USA', 'FCA', 'SCE', 'PAH', 'SPN', 'HHH', 'OGS', 'BKG']

In [30]:
missing_stations

['HRL', 'IAG', 'DHN', 'ESC', 'PSE', 'SLN', 'STS', 'YUM', 'XWA']

In [31]:
missingwban_completeList = disjointed_iatas + missing_stations
missingwban_completeList

['AZA',
 'USA',
 'FCA',
 'SCE',
 'PAH',
 'SPN',
 'HHH',
 'OGS',
 'BKG',
 'HRL',
 'IAG',
 'DHN',
 'ESC',
 'PSE',
 'SLN',
 'STS',
 'YUM',
 'XWA']

To sum up, below is the list of those iata codes that appear in the otp dataset, but have no weather station:  
`['AZA', 'USA', 'FCA', 'SCE', 'PAH', 'SPN', 'HHH', 'OGS', 'BKG', 'DHN', 'ESC', 'HRL', 'IAG', 'PSE', 'SLN', 'STS', 'XWA','YUM']`

# Cross each IATA with its WBAN in the OTP dataset

### OTP (2019)

In [32]:
otp = pd.read_csv('../data/output/us_dot/1_otp_2019_raw.csv',
                  encoding='latin1',
                  low_memory = False)

In [33]:
otp_wban_Origin = otp.merge(lcd_wban_clean[['WBAN', 'FAA_LOC_ID']],
                            how='inner', left_on='ORIGIN', right_on='FAA_LOC_ID')
otp_wban_Origin.rename(columns={'WBAN': 'WBAN_Origin'}, inplace=True)
otp_wban_OriginDest = otp_wban_Origin.merge(lcd_wban_clean[['WBAN', 'FAA_LOC_ID']],
                                            how='inner', left_on='DEST', right_on='FAA_LOC_ID')
otp_wban_OriginDest.rename(columns={'WBAN': 'WBAN_Dest'}, inplace=True)
rearranged_cols = [
       'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM',
       'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_NM', 'WBAN_Origin',
       'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_NM', 'WBAN_Dest',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'DEP_DEL15', 'DEP_TIME_hour', 'TAXI_OUT',
       'TAXI_IN', 'TAXI_OUT_median', 'TAXI_IN_median', 'CRS_ARR_TIME', 'ARR_TIME',
       'ARR_DELAY', 'ARR_DEL15', 'ARR_TIME_hour', 'CANCELLED','CRS_ELAPSED_TIME', 'DISTANCE', 'DISTANCE_GROUP',
       'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'
]

otp_wban_OriginDest = otp_wban_OriginDest[rearranged_cols]
otp_wban_OriginDest

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,3,4,9E,N195PQ,TYS,"Knoxville, TN",TN,Tennessee,13891,ATL,"Atlanta, GA",GA,Georgia,13874,1140,1205,25,1,11,30,4,15,8,1250,1315,25,1,12,0,70,152,1,0,0,0,0,25
1,1,4,5,9E,N919XJ,TYS,"Knoxville, TN",TN,Tennessee,13891,ATL,"Atlanta, GA",GA,Georgia,13874,1140,1250,70,1,11,35,9,15,8,1250,1412,82,1,12,0,70,152,1,0,0,12,0,70
2,1,1,2,9E,N606LR,TYS,"Knoxville, TN",TN,Tennessee,13891,ATL,"Atlanta, GA",GA,Georgia,13874,706,659,-7,0,7,13,4,15,8,813,749,-24,0,8,0,67,152,1,0,0,0,0,0
3,1,2,3,9E,N920XJ,TYS,"Knoxville, TN",TN,Tennessee,13891,ATL,"Atlanta, GA",GA,Georgia,13874,706,700,-6,0,7,17,4,15,8,815,753,-22,0,8,0,69,152,1,0,0,0,0,0
4,1,3,4,9E,N329PQ,TYS,"Knoxville, TN",TN,Tennessee,13891,ATL,"Atlanta, GA",GA,Georgia,13874,706,703,-3,0,7,19,5,15,8,815,801,-14,0,8,0,69,152,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7208367,8,6,2,AS,N619AS,JNU,"Juneau, AK",AK,Alaska,25309,GST,"Gustavus, AK",AK,Alaska,25322,1700,1700,0,0,17,12,6,12,6,1735,1730,-5,0,17,0,35,41,1,0,0,0,0,0
7208368,8,23,5,AS,N614AS,JNU,"Juneau, AK",AK,Alaska,25309,GST,"Gustavus, AK",AK,Alaska,25322,1700,1639,-21,0,17,12,6,12,6,1735,1710,-25,0,17,0,35,41,1,0,0,0,0,0
7208369,8,24,6,AS,N613AS,JNU,"Juneau, AK",AK,Alaska,25309,GST,"Gustavus, AK",AK,Alaska,25322,1700,1641,-19,0,17,11,6,12,6,1735,1717,-18,0,17,0,35,41,1,0,0,0,0,0
7208370,8,25,7,AS,N614AS,JNU,"Juneau, AK",AK,Alaska,25309,GST,"Gustavus, AK",AK,Alaska,25322,1700,1926,146,1,17,22,3,12,6,1735,2008,153,1,17,0,35,41,1,146,0,7,0,0


In [34]:
otp_wban_OriginDest[otp_wban_OriginDest['ORIGIN'].isin(missingwban_completeList) | \
                    otp_wban_OriginDest['DEST'].isin(missingwban_completeList)]

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,WBAN_Origin,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,WBAN_Dest,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,TAXI_OUT_median,TAXI_IN_median,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY


In [35]:
otp[otp['ORIGIN'].isin(missingwban_completeList) | \
    otp['DEST'].isin(missingwban_completeList)]

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_NM,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_NM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,DEP_TIME_hour,TAXI_OUT,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_hour,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,TAXI_OUT_median,TAXI_IN_median
2358,1,4,5,WN,N934WN,HOU,"Houston, TX",TX,Texas,HRL,"Harlingen/San Benito, TX",TX,Texas,1835,1854,19,1,18,8,2,1935,1947,12,0,19,0,60,277,2,0,0,0,0,0,9,2
2359,1,4,5,WN,N7748A,HOU,"Houston, TX",TX,Texas,HRL,"Harlingen/San Benito, TX",TX,Texas,1045,1044,-1,0,10,10,2,1150,1138,-12,0,11,0,65,277,2,0,0,0,0,0,9,2
2360,1,4,5,WN,N405WN,HOU,"Houston, TX",TX,Texas,HRL,"Harlingen/San Benito, TX",TX,Texas,2210,2254,44,1,22,13,2,2315,2350,35,1,23,0,65,277,2,5,0,0,0,30,9,2
2361,1,4,5,WN,N473WN,HOU,"Houston, TX",TX,Texas,HRL,"Harlingen/San Benito, TX",TX,Texas,805,800,-5,0,8,11,2,910,857,-13,0,9,0,65,277,2,0,0,0,0,0,9,2
2362,1,4,5,WN,N8728Q,HOU,"Houston, TX",TX,Texas,HRL,"Harlingen/San Benito, TX",TX,Texas,1340,1407,27,1,13,11,3,1440,1508,28,1,14,0,60,277,2,27,0,1,0,0,9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7268150,12,31,2,B6,N637JB,PSE,"Ponce, PR",PR,Puerto Rico,MCO,"Orlando, FL",FL,Florida,138,123,-15,0,0,11,4,340,315,-25,0,0,0,182,1179,5,0,0,0,0,0,12,8
7268187,12,31,2,B6,N958JB,JFK,"New York, NY",NY,New York,PSE,"Ponce, PR",PR,Puerto Rico,2059,2055,-4,0,20,16,4,159,134,-25,0,0,0,240,1617,7,0,0,0,0,0,20,5
7268188,12,31,2,B6,N958JB,PSE,"Ponce, PR",PR,Puerto Rico,JFK,"New York, NY",NY,New York,319,320,1,0,0,22,5,628,624,-4,0,6,0,249,1617,7,0,0,0,0,0,12,8
7268195,12,31,2,B6,N591JB,MCO,"Orlando, FL",FL,Florida,PSE,"Ponce, PR",PR,Puerto Rico,2359,8,9,0,23,18,7,344,352,8,0,0,0,165,1179,5,0,0,0,0,0,15,5


In [36]:
(len(otp) - len(otp_wban_OriginDest)) == len(otp[otp['ORIGIN'].isin(missingwban_completeList) | \
                                                 otp['DEST'].isin(missingwban_completeList)])

True

From this it can be deduced that the result of these merge operations is a new dataset in which only rows corresponding to the previously identified missing WBAN stations are dropped. Just as expected, which is a good sign.

In [37]:
print("Number of discarded flights: {} ({:5.3f}%)".format((len(otp) - len(otp_wban_OriginDest)),
                                                           100 * (len(otp) - len(otp_wban_OriginDest)) / len(otp)))

Number of discarded flights: 59860 (0.824%)


As previously calculated, the dismissed flights account for less than 1% of the entire dataset. In return, this will enable us to allocate the applicable weather conditions to each flight. This data could potentially greatly improve the model, so it seems reasonable to just drop them.

In [38]:
%%time

otp_wban_OriginDest.to_csv('../data/output/us_dot/2_otp_2019_wban.csv',
                           index=False,
                           encoding='latin1')

Wall time: 1min 35s


In [39]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

Time elapsed:  185.61764889999998


___

___