In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # Not needed so far

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
# Reference timestamp; the last cell subtracts it to report the total run time
t0 = time.perf_counter() 

In [3]:
# Detect the running operating system and pick the matching project root.
# NOTE: os.name is 'posix' on any Unix-like system, not only on Ubuntu.

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
else:
    # Fail with a clear message instead of a NameError on `root` further down.
    raise RuntimeError("Unsupported operating system (os.name = {!r}): "
                       "cannot determine the project root path.".format(os.name))
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.transtats.bts.gov/Fields.asp?Table_ID=236&SYS_Table_Name=T_ONTIME_REPORTING&User_Table_Name=Reporting%20Carrier%20On-Time%20Performance%20(1987-present)&Year_Info=1&First_Year=1987&Last_Year=2020&Rate_Info=0&Frequency=Monthly&Data_Frequency=Annual,Quarterly,Monthly).

___

# 2. Get the data

### OTP (2019)

Let's first check which airports are present in the OTP dataset, and then compare them with those appearing in the WBAN database. Ideally, every airport in the OTP dataset should also appear in the WBAN database, which would suggest that a weather station is available at each of those airports.

In [4]:
# Columns to load from the OTP file, in display order
cols = ["ORIGIN", "DEST"]

In [5]:
# Preprocessed 2019 OTP file, located under the project root
preprocessed_input_csv_path = os.path.join(
    root, "Output_Data", "US_DoT", "AL_OTP_MVP_Preprocessed_19_v2_clean.csv"
)
preprocessed_input_csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\US_DoT\\AL_OTP_MVP_Preprocessed_19_v2_clean.csv'

In [6]:
# Both airport-code columns are stored as categoricals
cols_dtypes = dict.fromkeys(('ORIGIN', 'DEST'), 'category')

In [7]:
%%time

# Load only the airport-code columns and parse them directly as categoricals.
# Passing `dtype` here avoids building millions of Python string objects per
# column only to cast them to 'category' afterwards.
OTP = pd.read_csv(preprocessed_input_csv_path,
                  encoding='latin1',
                  usecols=cols,
                  dtype=cols_dtypes,
                  low_memory = False)

Wall time: 49.2 s


In [8]:
# Apply the dtype mapping defined earlier, then arrange the columns in 'cols' order
OTP = OTP.astype(cols_dtypes)[cols]
OTP

Unnamed: 0,ORIGIN,DEST
0,TYS,ATL
1,TYS,ATL
2,ATL,SGF
3,ATL,SGF
4,ATL,SGF
...,...,...
7268227,MCO,SWF
7268228,DCA,BOS
7268229,PHL,BOS
7268230,BOS,SJU


In [9]:
# Distinct origin airport codes in the OTP data (destinations are not considered here)
OTP_IATAs = OTP.loc[:, 'ORIGIN'].unique()
OTP_IATAs

['TYS', 'ATL', 'SGF', 'SRQ', 'DTW', ..., 'AKN', 'DLG', 'HYA', 'PGV', 'XWA']
Length: 360
Categories (360, object): ['TYS', 'ATL', 'SGF', 'SRQ', ..., 'DLG', 'HYA', 'PGV', 'XWA']

### WBAN list ("MASTER-STN-HIST.txt")

#### Import the source TXT file and clean it

In [10]:
# Raw NOAA station-history text file
txt_path = os.path.join(root, "Raw_Data", "NOAA", "WBAN", "MASTER-STN-HIST.txt")

txt_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Raw_Data\\NOAA\\WBAN\\MASTER-STN-HIST.txt'

In [11]:
# Character offsets (start, end) of the fields kept from each fixed-width record,
# listed in output-column order.
field_slices = [
    (0, 8),      # HOMR_ID
    (22, 27),    # WBAN
    (28, 33),    # WMO
    (34, 38),    # FAA_LOC_ID
    (39, 44),    # NWS_LOC_ID
    (45, 49),    # ICAO_ID
    (50, 70),    # COUNTRY
    (105, 110),  # TIME_ZONE
    (142, 172),  # HIST_WBAN_NAME
]

# Convert every record into a '^'-delimited string with whitespace-trimmed fields.
# NOTE(review): the file is opened with the platform default encoding, while the
# generated CSV is later read as latin1 — confirm the source file is plain ASCII.
new_file = []
with open(txt_path, 'r') as f:
    next(f, None)  # Skip the header row; the default avoids StopIteration on an empty file
    for line in f:  # Stream line by line instead of loading the whole file with readlines()
        fields = [line[start:end].strip() for start, end in field_slices]
        new_file.append('^'.join(fields))

new_file[:10]

['10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000158^^^^^^UNITED STATES^+9^GUSTAVUS']

In [12]:
# Destination of the '^'-delimited version of the station list
csv_output_path = os.path.join(
    root, "Output_Data", "NOAA", "MASTER-STN-HIST_preprocessed.csv"
)

csv_output_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [13]:
# Write the header row followed by every parsed record, one per line
# (no trailing newline after the last record).
with open(csv_output_path, 'w') as f:
    headers = ['HOMR_ID', 'WBAN', 'WMO', 'FAA_LOC_ID', 'NWS_LOC_ID', 'ICAO_ID', 'COUNTRY', 'TIME_ZONE', 'HIST_WBAN_NAME']
    f.write('^'.join(headers) + '\n' + '\n'.join(new_file))

___

#### Import the recently generated CSV file and start exploring it

In [14]:
# Path of the '^'-delimited station list to explore
csv_path = os.path.join(
    root, "Output_Data", "NOAA", "MASTER-STN-HIST_preprocessed.csv"
)

csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [15]:
# Load the '^'-delimited station list into a DataFrame
wban_read_options = dict(sep='^', encoding='latin1', low_memory=False)
WBAN = pd.read_csv(csv_path, **wban_read_options)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178012,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178013,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178014,30125303,,,,,,UNITED STATES,5.0,CAMDEN


In [16]:
# Many rows repeat the same identifiers: keep the first row of each combination
station_id_cols = ['HOMR_ID', 'WBAN', 'WMO', 'FAA_LOC_ID', 'ICAO_ID']
WBAN.drop_duplicates(subset=station_id_cols, keep='first', inplace=True)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
5,10000001,24285.0,,ONP,ONP,KONP,UNITED STATES,8.0,NEWPORT MUNI AP
9,10000158,,,,,,UNITED STATES,9.0,GUSTAVUS
11,10000158,25322.0,,,,,UNITED STATES,9.0,GUSTAVUS AP
13,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
...,...,...,...,...,...,...,...,...,...
178007,30122585,48717.0,,,,,SAUDI ARABIA,-3.0,DHAHRAN
178008,30122586,48718.0,,,,,SAUDI ARABIA,-3.0,HAFR ELBATIN
178009,30125300,,,,JEFS2,,UNITED STATES,5.0,JEFFERSON 0.1NW
178010,30125302,,,,FGNA3,,UNITED STATES,7.0,FLAGSTAFF 14N


In [17]:
# Remove rows whose airport codes are shorter than expected
# (FAA codes under 3 characters, then ICAO codes under 4 characters).
faa_too_short = WBAN['FAA_LOC_ID'].str.len() < 3
WBAN.drop(index=WBAN[faa_too_short].index, inplace=True)
icao_too_short = WBAN['ICAO_ID'].str.len() < 4
WBAN.drop(index=WBAN[icao_too_short].index, inplace=True)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
5,10000001,24285.0,,ONP,ONP,KONP,UNITED STATES,8.0,NEWPORT MUNI AP
9,10000158,,,,,,UNITED STATES,9.0,GUSTAVUS
11,10000158,25322.0,,,,,UNITED STATES,9.0,GUSTAVUS AP
13,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
...,...,...,...,...,...,...,...,...,...
178007,30122585,48717.0,,,,,SAUDI ARABIA,-3.0,DHAHRAN
178008,30122586,48718.0,,,,,SAUDI ARABIA,-3.0,HAFR ELBATIN
178009,30125300,,,,JEFS2,,UNITED STATES,5.0,JEFFERSON 0.1NW
178010,30125302,,,,FGNA3,,UNITED STATES,7.0,FLAGSTAFF 14N


In [18]:
# Keep only the last four characters of each ICAO identifier
WBAN['ICAO_ID'] = WBAN['ICAO_ID'].str.slice(start=-4)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
5,10000001,24285.0,,ONP,ONP,KONP,UNITED STATES,8.0,NEWPORT MUNI AP
9,10000158,,,,,,UNITED STATES,9.0,GUSTAVUS
11,10000158,25322.0,,,,,UNITED STATES,9.0,GUSTAVUS AP
13,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
...,...,...,...,...,...,...,...,...,...
178007,30122585,48717.0,,,,,SAUDI ARABIA,-3.0,DHAHRAN
178008,30122586,48718.0,,,,,SAUDI ARABIA,-3.0,HAFR ELBATIN
178009,30125300,,,,JEFS2,,UNITED STATES,5.0,JEFFERSON 0.1NW
178010,30125302,,,,FGNA3,,UNITED STATES,7.0,FLAGSTAFF 14N


In [19]:
# Fill missing values: 999 for the numeric columns, empty string for the text codes
missing_fill = {'WBAN': 999, 'WMO': 999, 'TIME_ZONE': 999,
                'FAA_LOC_ID': '', 'NWS_LOC_ID': '', 'ICAO_ID': ''}
WBAN = WBAN.fillna(value=missing_fill)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,999.0,,,,UNITED STATES,8.0,NEWPORT MUNI AP
5,10000001,24285.0,999.0,ONP,ONP,KONP,UNITED STATES,8.0,NEWPORT MUNI AP
9,10000158,999.0,999.0,,,,UNITED STATES,9.0,GUSTAVUS
11,10000158,25322.0,999.0,,,,UNITED STATES,9.0,GUSTAVUS AP
13,10000158,25322.0,70367.0,GST,GST,PAGS,UNITED STATES,9.0,GUSTAVUS AP
...,...,...,...,...,...,...,...,...,...
178007,30122585,48717.0,999.0,,,,SAUDI ARABIA,-3.0,DHAHRAN
178008,30122586,48718.0,999.0,,,,SAUDI ARABIA,-3.0,HAFR ELBATIN
178009,30125300,999.0,999.0,,JEFS2,,UNITED STATES,5.0,JEFFERSON 0.1NW
178010,30125302,999.0,999.0,,FGNA3,,UNITED STATES,7.0,FLAGSTAFF 14N


In [20]:
# Inspect column dtypes and non-null counts
WBAN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45604 entries, 0 to 178011
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HOMR_ID         45604 non-null  int64  
 1   WBAN            45604 non-null  float64
 2   WMO             45604 non-null  float64
 3   FAA_LOC_ID      45604 non-null  object 
 4   NWS_LOC_ID      45604 non-null  object 
 5   ICAO_ID         45604 non-null  object 
 6   COUNTRY         45604 non-null  object 
 7   TIME_ZONE       45604 non-null  float64
 8   HIST_WBAN_NAME  45604 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 4.7+ MB


In [21]:
# Cast every numeric column to 64-bit integers
num_cols = WBAN.select_dtypes('number').columns
WBAN = WBAN.astype({name: 'int64' for name in num_cols})
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285,999,,,,UNITED STATES,8,NEWPORT MUNI AP
5,10000001,24285,999,ONP,ONP,KONP,UNITED STATES,8,NEWPORT MUNI AP
9,10000158,999,999,,,,UNITED STATES,9,GUSTAVUS
11,10000158,25322,999,,,,UNITED STATES,9,GUSTAVUS AP
13,10000158,25322,70367,GST,GST,PAGS,UNITED STATES,9,GUSTAVUS AP
...,...,...,...,...,...,...,...,...,...
178007,30122585,48717,999,,,,SAUDI ARABIA,-3,DHAHRAN
178008,30122586,48718,999,,,,SAUDI ARABIA,-3,HAFR ELBATIN
178009,30125300,999,999,,JEFS2,,UNITED STATES,5,JEFFERSON 0.1NW
178010,30125302,999,999,,FGNA3,,UNITED STATES,7,FLAGSTAFF 14N


In [None]:
# Preview the cleaned station table. The original cell referenced the undefined
# lowercase name `wban`, which raises NameError and stops a Run All.
WBAN.head()

In [22]:
# Distinct ICAO identifiers in the station list, and how many there are
icao_codes = WBAN['ICAO_ID']
WBAN_ICAOs = icao_codes.unique()
print(icao_codes.nunique())
WBAN_ICAOs

2723


array(['', 'KONP', 'PAGS', ..., 'KT74', 'KLNQ', 'CWCJ'], dtype=object)

In [23]:
# Distinct FAA location identifiers in the station list, and how many there are
faa_codes = WBAN['FAA_LOC_ID']
WBAN_IATAs = faa_codes.unique()
print(faa_codes.nunique())
WBAN_IATAs

4136


array(['', 'ONP', 'GST', ..., 'T74', 'LNQ', 'SXF'], dtype=object)

In [24]:
# OTP origin codes that have no matching FAA_LOC_ID in the station list
disjointed_IATAs = [IATA for IATA in OTP_IATAs if IATA not in WBAN_IATAs]
print(len(disjointed_IATAs))
disjointed_IATAs

6


['AZA', 'USA', 'SCE', 'SPN', 'HHH', 'BKG']

In [25]:
# Report, for each unmatched airport, how many flights depart from it and its
# share of the dataset. The count is computed once per airport instead of
# filtering the full frame twice.
total_flights = len(OTP)
for IATA in disjointed_IATAs:
    n_flights = int((OTP['ORIGIN'] == IATA).sum())
    print("{} : {} flights ({:4.2f}%)".format(IATA,
                                              n_flights,
                                              n_flights / total_flights * 100))

AZA : 5665 flights (0.08%)
USA : 1352 flights (0.02%)
SCE : 1764 flights (0.02%)
SPN : 471 flights (0.01%)
HHH : 2180 flights (0.03%)
BKG : 221 flights (0.00%)


With this quick check, we have confirmed that almost every origin airport appearing in the OTP dataset is also present in the WBAN database. In other words, nearly all of them can be matched to a meteorological station.

For the few airports that are not present (listed above), it is fair to simply drop them. This can be done without compromising the model's predictive capability, since together they account for less than 0.2% of the complete dataset.

In [26]:
# Save the cleaned station list as a '^'-delimited CSV without the index column
output_csv_path = os.path.join(
    root, "Output_Data", "NOAA", "MASTER-STN-HIST_postprocessed.csv"
)

WBAN.to_csv(output_csv_path, sep='^', index=False, encoding='latin1')

___

### OTP-WBAN merge

In [27]:
# Attach station metadata to each flight by matching ORIGIN against FAA_LOC_ID.
# NOTE(review): the displayed result has more rows than OTP (10,172,334 vs 7,268,231),
# so some origin codes match several WBAN rows — deduplicate before relying on this join.
OTP.merge(WBAN, how='inner', left_on='ORIGIN', right_on='FAA_LOC_ID')

Unnamed: 0,ORIGIN,DEST,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,TYS,ATL,20018222,999,72326,TYS,TYS,KTYS,UNITED STATES,5,KNOXVILLE MCGHEE TYSON AP
1,TYS,ATL,20018222,13891,72326,TYS,TYS,KTYS,UNITED STATES,5,KNOXVILLE MCGHEE TYSON AP
2,TYS,ATL,20018222,999,72326,TYS,TYS,KTYS,UNITED STATES,5,KNOXVILLE MCGHEE TYSON AP
3,TYS,ATL,20018222,13891,72326,TYS,TYS,KTYS,UNITED STATES,5,KNOXVILLE MCGHEE TYSON AP
4,TYS,ATL,20018222,999,72326,TYS,TYS,KTYS,UNITED STATES,5,KNOXVILLE MCGHEE TYSON AP
...,...,...,...,...,...,...,...,...,...,...,...
10172330,XWA,MSP,30121192,94099,999,XWA,XWA,KXWA,UNITED STATES,6,WILLISTON AP
10172331,XWA,MSP,30121192,94099,999,XWA,XWA,KXWA,UNITED STATES,6,WILLISTON AP
10172332,XWA,MSP,30121192,94099,999,XWA,XWA,KXWA,UNITED STATES,6,WILLISTON AP
10172333,XWA,DEN,30121192,94099,999,XWA,XWA,KXWA,UNITED STATES,6,WILLISTON AP


1) Eliminar las filas sobrantes: es decir, solo debería haber una posible pareja de WBAN para cada WMO, y viceversa. Ahora mismo hay:

999	    72326  
13891	72326  
etc.

2) Descargar todos los archivos de la página de LCD siguiendo el formato:

https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/XXXXXYYYYYY.csv

Donde:
- XXXXX = WMO
- YYYYYY = WBAN

___

### Local Climatological Data (LCD) > 2019 Master file

Source: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/

#### Import the generated CSV files and start exploring them

In [14]:
# NOTE(review): this is the same MASTER-STN-HIST_preprocessed.csv path used in the
# WBAN section — presumably a placeholder until the LCD files exist; confirm.
csv_path = os.path.join(root,
                        "Output_Data",
                        "NOAA",
                        "MASTER-STN-HIST_preprocessed.csv")

csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [15]:
# NOTE(review): `LCD` is loaded from csv_path, which points at the station list
# (MASTER-STN-HIST_preprocessed.csv), not at Local Climatological Data — looks like
# a copy of the WBAN loading cell; confirm the intended source file.
LCD = pd.read_csv(csv_path,
                   sep='^',
                   encoding='latin1',
                   low_memory = False)
LCD

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178012,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178013,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178014,30125303,,,,,,UNITED STATES,5.0,CAMDEN


#### Run only the first time to generate the global CLEAN file (year 2019)

In [None]:
# Total time since t0 was recorded at the top of the notebook
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # Elapsed seconds (floating point); perf_counter measures elapsed time, not CPU time

___

___