In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # Not needed so far

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
# Start the notebook-wide timer; the last cell subtracts this value from a later reading.
t0 = time.perf_counter() 

In [3]:
# Detect the operating system and select the matching project root directory.
# NOTE(review): both roots are machine-specific absolute paths; consider making
# the root configurable if the notebook has to run on another machine.

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Any Unix-like system (Linux, macOS), not only Ubuntu
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
else:
    # Fail with a clear message: without this branch `root` would stay
    # undefined and the print below would raise an unrelated NameError.
    raise OSError(f"Unsupported operating system: os.name={os.name!r}")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.transtats.bts.gov/Fields.asp?Table_ID=236&SYS_Table_Name=T_ONTIME_REPORTING&User_Table_Name=Reporting%20Carrier%20On-Time%20Performance%20(1987-present)&Year_Info=1&First_Year=1987&Last_Year=2020&Rate_Info=0&Frequency=Monthly&Data_Frequency=Annual,Quarterly,Monthly).

___

# 2. Get the data

### OTP (2019)

Let's first check which airports are present in the OTP dataset, and later compare them to those appearing in the WBAN database. Ideally, all the airports contained in the OTP dataset should appear in the WBAN database. This would suggest that there is a weather station at each of those airports.

In [4]:
# Columns to load from the OTP file (passed as `usecols` when reading it).
cols = [
    'ORIGIN',
    'DEST',
]

In [5]:
# Path of the preprocessed 2019 OTP CSV, built from the project root.
otp_path_parts = ("Output_Data", "US_DoT", "AL_OTP_MVP_Preprocessed_19_v2_clean.csv")
preprocessed_input_csv_path = os.path.join(root, *otp_path_parts)
preprocessed_input_csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\US_DoT\\AL_OTP_MVP_Preprocessed_19_v2_clean.csv'

In [6]:
# Both airport-code columns are stored as pandas categoricals.
cols_dtypes = dict.fromkeys(('ORIGIN', 'DEST'), 'category')

In [7]:
%%time

OTP = pd.read_csv(preprocessed_input_csv_path,
                  encoding='latin1',
                  usecols=cols,
                  low_memory = False)

Wall time: 24.1 s


In [8]:
# Apply the dtype mapping from `cols_dtypes`, then select the columns in the
# order given by `cols` (one chained expression instead of two rebinds).
OTP = OTP.astype(cols_dtypes)[cols]
OTP

Unnamed: 0,ORIGIN,DEST
0,TYS,ATL
1,TYS,ATL
2,ATL,SGF
3,ATL,SGF
4,ATL,SGF
...,...,...
7268227,MCO,SWF
7268228,DCA,BOS
7268229,PHL,BOS
7268230,BOS,SJU


In [9]:
# Distinct airport codes found in the ORIGIN column of the OTP data.
# NOTE(review): only ORIGIN is inspected; an airport that appears solely as a
# DEST would not be listed here — confirm this is intended.
OTP_IATAs = OTP.loc[:, 'ORIGIN'].unique()
OTP_IATAs

['TYS', 'ATL', 'SGF', 'SRQ', 'DTW', ..., 'AKN', 'DLG', 'HYA', 'PGV', 'XWA']
Length: 360
Categories (360, object): ['TYS', 'ATL', 'SGF', 'SRQ', ..., 'DLG', 'HYA', 'PGV', 'XWA']

### WBAN list ("MASTER-STN-HIST.txt")

#### Import the source TXT file and clean it

In [10]:
# Path of the raw NOAA station-history text file, built from the project root.
wban_txt_parts = ("Raw_Data", "NOAA", "WBAN", "MASTER-STN-HIST.txt")
txt_path = os.path.join(root, *wban_txt_parts)

txt_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Raw_Data\\NOAA\\WBAN\\MASTER-STN-HIST.txt'

In [11]:
# Convert the fixed-width station-history file into '^'-delimited rows.
# Each entry maps an output field to the character span it occupies on a line;
# the order of the entries is the order of the fields in every output row.
field_slices = {
    'HOMR_ID': slice(0, 8),
    'WBAN': slice(22, 27),
    'WMO': slice(28, 33),
    'FAA_LOC_ID': slice(34, 38),
    'NWS_LOC_ID': slice(39, 44),
    'ICAO_ID': slice(45, 49),
    'COUNTRY': slice(50, 70),
    'TIME_ZONE': slice(105, 110),
    'HIST_WBAN_NAME': slice(142, 172),
}

# NOTE(review): no encoding is specified, so the platform default is used here,
# while the generated CSV is later read back as 'latin1' — confirm the source
# file's encoding and make reader and writer agree.
with open(txt_path, 'r') as f:
    next(f, None) # Skip the first row, since it's a header (tolerates an empty file)
    # Iterate the file lazily instead of loading every line with readlines();
    # each field is cut out by position and stripped of its padding.
    new_file = ['^'.join(line[span].strip() for span in field_slices.values())
                for line in f]

new_file[:10]

['10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000158^^^^^^UNITED STATES^+9^GUSTAVUS']

In [12]:
# Destination of the '^'-delimited CSV generated from the station-history file.
wban_csv_parts = ("Output_Data", "NOAA", "MASTER-STN-HIST_preprocessed.csv")
csv_output_path = os.path.join(root, *wban_csv_parts)

csv_output_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [13]:
# Write the header row followed by the parsed station rows, using '^' as the
# field separator and '\n' between rows (no newline after the last row).
headers = ['HOMR_ID', 'WBAN', 'WMO', 'FAA_LOC_ID', 'NWS_LOC_ID', 'ICAO_ID', 'COUNTRY', 'TIME_ZONE', 'HIST_WBAN_NAME']
with open(csv_output_path, 'w') as f:
    header_row = '^'.join(headers)
    f.write(header_row + '\n' + '\n'.join(new_file))

___

#### Import the recently generated CSV file and start exploring it

In [14]:
# Path of the CSV generated from the station-history file, built in two steps:
# the NOAA output directory first, then the file inside it.
noaa_output_dir = os.path.join(root, "Output_Data", "NOAA")
csv_path = os.path.join(noaa_output_dir, "MASTER-STN-HIST_preprocessed.csv")

csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [15]:
# Load the '^'-delimited station table into a DataFrame.
# NOTE(review): dtypes are inferred, so the ID columns come back as numbers
# (see the displayed output); any leading zeros in those IDs would be lost —
# confirm that is acceptable.
WBAN = pd.read_csv(csv_path, sep='^', encoding='latin1', low_memory=False)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178012,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178013,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178014,30125303,,,,,,UNITED STATES,5.0,CAMDEN


In [16]:
# Remove rows whose airport codes are shorter than expected
# (FAA_LOC_ID under 3 characters, ICAO_ID under 4). Rows where the code is
# missing are kept at this point, since the comparison is False for NaN.
faa_too_short = WBAN['FAA_LOC_ID'].str.len().lt(3)
WBAN.drop(index=WBAN.loc[faa_too_short].index, inplace=True)
# The ICAO mask is computed after the first drop, on the remaining rows.
icao_too_short = WBAN['ICAO_ID'].str.len().lt(4)
WBAN.drop(index=WBAN.loc[icao_too_short].index, inplace=True)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178012,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178013,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178014,30125303,,,,,,UNITED STATES,5.0,CAMDEN


In [17]:
# Keep only fully populated rows: any row with at least one missing value is removed in place.
WBAN.dropna(axis='index', how='any', inplace=True)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
13,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
14,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
15,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
16,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
19,10000202,25325.0,70395.0,KTN,KTN,PAKT,UNITED STATES,9.0,KETCHIKAN INTL AP
...,...,...,...,...,...,...,...,...,...
168723,30000752,53868.0,72427.0,OQT,OQT,KOQT,UNITED STATES,5.0,OAK RIDGE ASOS
171069,30001692,93874.0,74757.0,GPT,GPT,KGPT,UNITED STATES,6.0,GULFPORT - BILOXI AP
171070,30001692,93874.0,74757.0,GPT,GPT,KGPT,UNITED STATES,6.0,GULFPORT - BILOXI AP
175241,30015538,26512.0,70246.0,MHM,MHM,PAMH,UNITED STATES,9.0,MINCHUMINA AP


In [18]:
# Convert every numeric column to int64 (they were read as floats, as the
# displayed output above shows).
numeric_cols = WBAN.select_dtypes('number').columns
WBAN[numeric_cols] = WBAN[numeric_cols].astype('int64')
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
13,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
14,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
15,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
16,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
19,10000202,25325,70395,KTN,KTN,PAKT,UNITED STATES,9,KETCHIKAN INTL AP
...,...,...,...,...,...,...,...,...,...
168723,30000752,53868,72427,OQT,OQT,KOQT,UNITED STATES,5,OAK RIDGE ASOS
171069,30001692,93874,74757,GPT,GPT,KGPT,UNITED STATES,6,GULFPORT - BILOXI AP
171070,30001692,93874,74757,GPT,GPT,KGPT,UNITED STATES,6,GULFPORT - BILOXI AP
175241,30015538,26512,70246,MHM,MHM,PAMH,UNITED STATES,9,MINCHUMINA AP


In [19]:
# Keep only the last four characters of each ICAO identifier.
WBAN['ICAO_ID'] = WBAN['ICAO_ID'].str.slice(start=-4)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
13,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
14,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
15,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
16,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
19,10000202,25325,70395,KTN,KTN,PAKT,UNITED STATES,9,KETCHIKAN INTL AP
...,...,...,...,...,...,...,...,...,...
168723,30000752,53868,72427,OQT,OQT,KOQT,UNITED STATES,5,OAK RIDGE ASOS
171069,30001692,93874,74757,GPT,GPT,KGPT,UNITED STATES,6,GULFPORT - BILOXI AP
171070,30001692,93874,74757,GPT,GPT,KGPT,UNITED STATES,6,GULFPORT - BILOXI AP
175241,30015538,26512,70246,MHM,MHM,PAMH,UNITED STATES,9,MINCHUMINA AP


In [20]:
# Print how many distinct ICAO codes remain, then show them.
icao_ids = WBAN['ICAO_ID']
print(icao_ids.nunique())
WBAN_ICAOs = icao_ids.unique()
WBAN_ICAOs

405


array(['PAGS', 'PAKT', 'PAGY', 'PAWG', 'KEET', 'KIGM', 'KAVX', 'KFAT',
       'KRBL', 'KRDD', 'KSAN', 'KSFO', 'KGJT', 'KLIC', 'KILG', 'KAAF',
       'KEYW', 'KPNS', 'KTLH', 'KVRB', 'KFFC', 'KSAV', 'KAYS', 'KLWS',
       'KORD', 'KICT', 'KSDF', 'KNHZ', 'KLAN', 'KMQT', 'KMKG', 'KSTJ',
       'KSGF', 'KJAN', 'KHKY', 'KFAR', 'KISN', 'KELY', 'KDSV', 'KSYR',
       'KILN', 'KOKC', 'KPHL', 'KDAL', 'TISX', 'KBTV', 'KEPH', 'KHTS',
       'KLND', 'KDEN', 'KPKF', 'KEKA', 'KMOB', 'KMGM', 'KBHM', 'KHSV',
       'KLZK', 'KFSM', 'KHRO', 'KDUG', 'KFHU', 'KTUS', 'KYUM', 'KPHX',
       'KINW', 'KNJK', 'KNSI', 'KLGB', 'KLAX', 'KNTD', 'KMWS', 'KSBA',
       'KSDB', 'KSMX', 'KBFL', 'KNID', 'KNLC', 'KSNS', 'KBIH', 'KNUQ',
       'KHWD', 'KOAK', 'KSCK', 'KCCR', 'KSUU', 'KSAC', 'KTRK', 'KALS',
       'KPUB', 'KFCS', 'KCOS', 'KCAG', 'KBDR', 'KBDL', 'KMIA', 'KFLL',
       'KPBI', 'KTPA', 'KMCO', 'KDAB', 'KGNV', 'KCRG', 'KVPS', 'KJAX',
       'KAMG', 'KMCN', 'KAGS', 'KATL', 'KAHN', 'KPIH', 'KBOI', 'KSLO',
      

In [21]:
# Print how many distinct FAA location codes remain, then show them.
# NOTE(review): these FAA_LOC_ID values are treated as IATA codes by the
# variable name — confirm the two coincide for the airports of interest.
faa_loc_ids = WBAN['FAA_LOC_ID']
print(faa_loc_ids.nunique())
WBAN_IATAs = faa_loc_ids.unique()
WBAN_IATAs

405


array(['GST', 'KTN', 'SGY', 'WRG', 'EET', 'IGM', 'AVX', 'FAT', 'RBL',
       'RDD', 'SAN', 'SFO', 'GJT', 'LIC', 'ILG', 'AAF', 'EYW', 'PNS',
       'TLH', 'VRB', 'FFC', 'SAV', 'AYS', 'LWS', 'ORD', 'ICT', 'SDF',
       'NHZ', 'LAN', 'MQT', 'MKG', 'STJ', 'SGF', 'JAN', 'HKY', 'FAR',
       'ISN', 'ELY', 'DSV', 'SYR', 'ILN', 'OKC', 'PHL', 'DAL', 'STX',
       'BTV', 'EPH', 'HTS', 'LND', 'DEN', 'PKF', 'EKA', 'MOB', 'MGM',
       'BHM', 'HSV', 'LZK', 'FSM', 'HRO', 'DUG', 'FHU', 'TUS', 'YUM',
       'PHX', 'INW', 'NJK', 'NSI', 'LGB', 'LAX', 'NTD', 'MWS', 'SBA',
       'SDB', 'SMX', 'BFL', 'NID', 'NLC', 'SNS', 'BIH', 'NUQ', 'HWD',
       'OAK', 'SCK', 'CCR', 'SUU', 'SAC', 'TRK', 'ALS', 'PUB', 'FCS',
       'COS', 'CAG', 'BDR', 'BDL', 'MIA', 'FLL', 'PBI', 'TPA', 'MCO',
       'DAB', 'GNV', 'CRG', 'VPS', 'JAX', 'AMG', 'MCN', 'AGS', 'ATL',
       'AHN', 'PIH', 'BOI', 'SLO', 'SPI', 'UIN', 'PIA', 'MLI', 'MDW',
       'RFD', 'EVV', 'IND', 'FWA', 'SBN', 'BRL', 'DSM', 'CID', 'DBQ',
       'SUX', 'ALO',

In [22]:
# List the OTP airport codes that have no matching code in the WBAN table.
# The comparison has to walk the OTP codes and test each one against the WBAN
# codes; looping over the (initially empty) result list itself never executes
# and therefore always reports 0 missing airports.
WBAN_IATA_set = set(WBAN_IATAs) # set membership is O(1) per lookup
disjointed_IATAs = [IATA for IATA in OTP_IATAs if IATA not in WBAN_IATA_set]
print(len(disjointed_IATAs))
disjointed_IATAs

0


[]

**This quick check is meant to confirm that every airport appearing in the OTP dataset is also present in the WBAN database, i.e. that each airport has its corresponding meteorological station. The conclusion holds only if the list above is empty after each OTP airport code has actually been compared against the WBAN codes.**

In [23]:
# Save the cleaned station table next to the preprocessed one, keeping the
# same '^' separator and latin1 encoding and omitting the index column.
postprocessed_parts = ("Output_Data", "NOAA", "MASTER-STN-HIST_postprocessed.csv")
output_csv_path = os.path.join(root, *postprocessed_parts)

WBAN.to_csv(output_csv_path, sep='^', index=False, encoding='latin1')

___

### Local Climatological Data (LCD) > 2019 Master file

Source: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/

In [24]:
# Parse every HTML table on the 2019 LCD directory page (requires network access).
LCD_index_url = 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/'
LCD_tables = pd.read_html(LCD_index_url)

URLError: <urlopen error [WinError 10060] Se produjo un error durante el intento de conexión ya que la parte conectada no respondió adecuadamente tras un periodo de tiempo, o bien se produjo un error en la conexión establecida ya que el host conectado no ha podido responder>

In [None]:
# Take the first parsed table and keep index labels 2 through len-2
# (`.loc` slicing includes both endpoints).
# NOTE(review): presumably this trims non-data rows at the top and bottom of
# the directory listing — confirm against the page.
LCD_listing = LCD_tables[0]
last_label = len(LCD_listing) - 2
LCD = LCD_listing.loc[2:last_label, :]
LCD

#### Run only the first time to generate the global CLEAN file (year 2019)

In [None]:
# Total run time since `t0` was recorded near the top of the notebook.
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # Wall-clock seconds elapsed (floating point); perf_counter does not measure CPU time

___

___