In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # De momento no ha hecho falta 

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
t0 = time.perf_counter() 

In [3]:
# Detect Operating System running and manage paths accordingly

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.transtats.bts.gov/Fields.asp?Table_ID=236&SYS_Table_Name=T_ONTIME_REPORTING&User_Table_Name=Reporting%20Carrier%20On-Time%20Performance%20(1987-present)&Year_Info=1&First_Year=1987&Last_Year=2020&Rate_Info=0&Frequency=Monthly&Data_Frequency=Annual,Quarterly,Monthly).

___

# 2. Get the data

### OTP (2019)

Let's first check which airports are present in the OTP dataset, and later compare them to those appearing in the WBAN database. Ideally, all the airports contained in the OTP dataset should appear in the WBAN database. This would mean that there presumably might be a weather station at each of those airports.

In [4]:
cols = ['ORIGIN', 'DEST']

In [5]:
preprocessed_input_csv_path = os.path.join(root,
                                           "Output_Data",
                                           "US_DoT",
                                           "AL_OTP_MVP_Preprocessed_19_v2_clean.csv")
preprocessed_input_csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\US_DoT\\AL_OTP_MVP_Preprocessed_19_v2_clean.csv'

In [6]:
cols_dtypes = {'ORIGIN' : 'category', 'DEST' : 'category'}

In [7]:
%%time

OTP = pd.read_csv(preprocessed_input_csv_path,
                  encoding='latin1',
                  usecols=cols,
                  low_memory = False)

Wall time: 14.2 s


In [8]:
# Cast types in accordance to previously defined dictionary:
OTP = OTP.astype(cols_dtypes)
# Display DF columns according to 'cols' order:
OTP = OTP[cols]
OTP

Unnamed: 0,ORIGIN,DEST
0,TYS,ATL
1,TYS,ATL
2,ATL,SGF
3,ATL,SGF
4,ATL,SGF
...,...,...
7268227,MCO,SWF
7268228,DCA,BOS
7268229,PHL,BOS
7268230,BOS,SJU


In [9]:
OTP_IATAs = OTP['ORIGIN'].unique()
OTP_IATAs

['TYS', 'ATL', 'SGF', 'SRQ', 'DTW', ..., 'AKN', 'DLG', 'HYA', 'PGV', 'XWA']
Length: 360
Categories (360, object): ['TYS', 'ATL', 'SGF', 'SRQ', ..., 'DLG', 'HYA', 'PGV', 'XWA']

### WBAN list ("MASTER-STN-HIST.txt")

#### Import the source TXT file and clean it

In [10]:
txt_path = os.path.join(root,
                        "Raw_Data",
                        "NOAA",
                        "WBAN",
                        "MASTER-STN-HIST.txt")

txt_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Raw_Data\\NOAA\\WBAN\\MASTER-STN-HIST.txt'

In [12]:
new_file = []
with open (txt_path, 'r') as f:
    next(f) # Skip the first row, since it's a header
    i = 0
    for line in f.readlines():
        HOMR_ID = line[0:8]
        WBAN = line[22:27]
        WMO = line[28:33]
        FAA_LOC_ID = line[34:38]
        NWS_LOC_ID = line[39:44]
        ICAO_ID = line[45:49]
        COUNTRY = line[50:70]
        TIME_ZONE = line[105:110]
        HIST_WBAN_NAME = line[142:172]
        new_line = [HOMR_ID, WBAN, WMO, FAA_LOC_ID, NWS_LOC_ID, ICAO_ID, COUNTRY, TIME_ZONE, HIST_WBAN_NAME]
        new_line_2 = []
        for field in new_line:
            field = field.strip()
            new_line_2.append(field)
        new_line_2 = '^'.join(new_line_2)
        new_file.append(new_line_2)

#         i += 1
#         if i == 15:
#             break
            
new_file[:10]

['10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^^^^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000001^24285^^ONP^ONP^KONP^UNITED STATES^+8^NEWPORT MUNI AP',
 '10000158^^^^^^UNITED STATES^+9^GUSTAVUS']

In [13]:
csv_output_path = os.path.join(root,
                               "Output_Data",
                               "NOAA",
                               "MASTER-STN-HIST_preprocessed.csv")

csv_output_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [14]:
with open(csv_output_path, 'w') as f:
    headers = ['HOMR_ID', 'WBAN', 'WMO', 'FAA_LOC_ID', 'NWS_LOC_ID', 'ICAO_ID', 'COUNTRY', 'TIME_ZONE', 'HIST_WBAN_NAME']
    f.write('^'.join(headers))
    f.write('\n')
    f.write('\n'.join(new_file))    

___

#### Import the recently generated CSV file and start exploring it

In [321]:
csv_path = os.path.join(root,
                        "Output_Data",
                        "NOAA",
                        "MASTER-STN-HIST_preprocessed.csv")

csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\MASTER-STN-HIST_preprocessed.csv'

In [322]:
WBAN = pd.read_csv(csv_path,
                   sep='^',
                   encoding='latin1',
                   low_memory = False)
WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,UNITED STATES,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178012,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178013,30125303,,,,,,UNITED STATES,5.0,CAMDEN
178014,30125303,,,,,,UNITED STATES,5.0,CAMDEN


In [323]:
# For this study, only stations containing both WBAN will be useful:
WBAN.drop(WBAN[WBAN['WBAN'].isna()].index, axis=0, inplace=True)
# However, several empty values are found for FAA_LOC_ID:
WBAN.drop(WBAN[WBAN['FAA_LOC_ID'].isna()].index, axis=0, inplace=True)
# And many others come in the 4-character long way:
WBAN.drop(WBAN[WBAN['FAA_LOC_ID'].str.len() > 3].index, axis=0, inplace=True)
# Replace NaNs:
WBAN.fillna({'WMO':999, 'TIME_ZONE': 999}, inplace=True)
WBAN.fillna({'NWS_LOC_ID': '', 'ICAO_ID': ''}, inplace=True)
# Drop duplicates:
WBAN = WBAN.sort_values(by=['FAA_LOC_ID', 'WBAN'], ascending=True)
WBAN.drop_duplicates('WBAN', keep='last', inplace=True)
WBAN.sort_index(inplace=True)
WBAN = WBAN.sort_values(by=['FAA_LOC_ID', 'WMO', 'WBAN'], ascending=True)
WBAN.drop_duplicates('FAA_LOC_ID', keep='last', inplace=True)
WBAN.sort_index(inplace=True)
# Cast numeric columns type to 'int64':
num_cols = WBAN.select_dtypes('number').columns
WBAN[num_cols] = WBAN[num_cols].astype('int64')
# Generate a unique identifier based on several elements:
WBAN['UniqueID'] = WBAN['WMO'].apply(lambda x: str(x)) + '0' \
                   + WBAN['WBAN'].apply(lambda x: str(x)) #\
#                    + '_' + WBAN['FAA_LOC_ID'].apply(lambda x: str(x))

WBAN

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME,UniqueID
8,10000001,24285,999,ONP,ONP,KONP,UNITED STATES,8,NEWPORT MUNI AP,999024285
30,10000239,26489,999,5MK,5MK,,UNITED STATES,9,MCKINLEY NP AP,999026489
32,10000239,46403,999,INR,INR,PAIN,UNITED STATES,9,MCKINLEY NP AP,999046403
40,10000355,25335,70362,SGY,SGY,PAGY,UNITED STATES,9,SKAGWAY AP,70362025335
50,10000485,53864,999,EET,EET,KEET,UNITED STATES,6,ALABASTER SHELBY CO AP,999053864
...,...,...,...,...,...,...,...,...,...,...
177224,30121168,64779,999,MQE,,KMQE,UNITED STATES,5,BLUE HILL ASOS,999064779
177234,30121192,94099,999,XWA,XWA,KXWA,UNITED STATES,6,WILLISTON AP,999094099
177253,30121791,63904,999,T74,,KT74,UNITED STATES,6,TAYLOR MUNI AP,999063904
177259,30121800,43110,999,LNQ,,KLNQ,UNITED STATES,6,BOOTHVILLE HELIPORT,999043110


In [324]:
pd.set_option('display.max_rows', 10)
WBAN_OTP = WBAN[WBAN['FAA_LOC_ID'].isin(OTP_IATAs)]
WBAN_OTP

Unnamed: 0,HOMR_ID,WBAN,WMO,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTRY,TIME_ZONE,HIST_WBAN_NAME,UniqueID
168,10001177,93193,72389,FAT,FAT,KFAT,UNITED STATES,8,FRESNO YOSEMITE INTL,72389093193
241,10001399,24257,72592,RDD,RDD,KRDD,UNITED STATES,8,REDDING MUNI AP,72592024257
266,10001444,23188,72290,SAN,SAN,KSAN,UNITED STATES,8,SAN DIEGO INTL AP,72290023188
301,10001448,23234,72494,SFO,SFO,KSFO,UNITED STATES,8,SAN FRANCISCO INTL AP,72494023234
308,10001454,93206,999,SBP,,KSBP,UNITED STATES,8,SAN LUIS OBISPO AP,999093206
...,...,...,...,...,...,...,...,...,...,...
175156,30015485,73805,999,ECP,ECP,KECP,UNITED STATES,6,NW FLORIDA BEACHES INTL AP,999073805
175499,30015673,12838,999,MLB,MLB,KMLB,UNITED STATES,5,MELBOURNE INTL AP,999012838
175661,30071738,94030,999,VEL,,KVEL,UNITED STATES,7,VERNAL MUNI AP,999094030
176305,30083422,398,999,PSE,,TJPS,PUERTO RICO,4,MERCEDITA AP,9990398


In [325]:
WBAN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2762 entries, 8 to 177274
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   HOMR_ID         2762 non-null   int64 
 1   WBAN            2762 non-null   int64 
 2   WMO             2762 non-null   int64 
 3   FAA_LOC_ID      2762 non-null   object
 4   NWS_LOC_ID      2762 non-null   object
 5   ICAO_ID         2762 non-null   object
 6   COUNTRY         2762 non-null   object
 7   TIME_ZONE       2762 non-null   int64 
 8   HIST_WBAN_NAME  2762 non-null   object
 9   UniqueID        2762 non-null   object
dtypes: int64(4), object(6)
memory usage: 237.4+ KB


In [326]:
WBAN_IATAs = WBAN['FAA_LOC_ID'].unique()
print(WBAN['FAA_LOC_ID'].nunique())
WBAN_IATAs

2762


array(['ONP', '5MK', 'INR', ..., 'T74', 'LNQ', 'SXF'], dtype=object)

In [327]:
disjointed_IATAs = []
for IATA in OTP_IATAs:
    if IATA not in WBAN_IATAs:
        disjointed_IATAs.append(IATA)
print(len(disjointed_IATAs))
disjointed_IATAs

9


['AZA', 'USA', 'FCA', 'SCE', 'PAH', 'SPN', 'HHH', 'OGS', 'BKG']

In [328]:
for IATA in disjointed_IATAs:
    print("{} : {} flights ({:4.2f}%)".format(IATA,
                                         len(OTP[OTP['ORIGIN'] == IATA]),
                                         len(OTP[OTP['ORIGIN'] == IATA]) / len(OTP) * 100))

AZA : 5665 flights (0.08%)
USA : 1352 flights (0.02%)
FCA : 3015 flights (0.04%)
SCE : 1764 flights (0.02%)
PAH : 1222 flights (0.02%)
SPN : 471 flights (0.01%)
HHH : 2180 flights (0.03%)
OGS : 502 flights (0.01%)
BKG : 221 flights (0.00%)


With this quick check, we have ensured that almost every airport appearing in the OTP dataset is also present in the WBAN database. In other words, we now know that each airport has its corresponding meteorological station.

For those few ones that are not present (listed above), it is fair to simply drop them. This can be performed without the risk of losing model's prediction capability, considering that they account altogether for less than 0.3% of the complete dataset.

#### Generate the resulting CSV file for later use

In [329]:
output_csv_path = os.path.join(root,
                               "Output_Data",
                               "NOAA",
                               "MASTER-STN-HIST_postprocessed.csv")

WBAN.to_csv(path_or_buf=output_csv_path,
            index=False,
            sep='^',
            encoding='latin1')

___

### OTP-WBAN merge

In [339]:
pd.set_option('display.max_rows', 400)

In [342]:
OTP_WBAN = OTP[['ORIGIN']].groupby('ORIGIN').count().reset_index() \
                          .merge(WBAN[['WBAN', 'WMO', 'FAA_LOC_ID', 'UniqueID']],
                                 how='inner', left_on='ORIGIN', right_on='FAA_LOC_ID')
OTP_WBAN

Unnamed: 0,ORIGIN,WBAN,WMO,FAA_LOC_ID,UniqueID
0,ABE,14737,72517,ABE,72517014737
1,ABI,13962,72266,ABI,72266013962
2,ABQ,23050,72365,ABQ,72365023050
3,ABR,14929,72659,ABR,72659014929
4,ABY,13869,72216,ABY,72216013869
5,ACK,14756,999,ACK,999014756
6,ACT,13959,72256,ACT,72256013959
7,ACV,24283,999,ACV,999024283
8,ACY,93730,72407,ACY,72407093730
9,ADK,25704,70454,ADK,70454025704


# SIGUIENTES PASOS: 
### 1. Generar una lista con los nombres de los WMO-WBAN a descargar, y ejecturar la descarga en bucle automatizado → HECHO
### 2. Generar un nuevo notebook en el que se haga el merge OTP-WBAN y eliminar los 9 aeropuertos que no tienen su WBAN correspondiente

In [347]:
missingWMO_IDs = OTP_WBAN[OTP_WBAN['WMO'] == 999]['WBAN'].to_list()
complete_IDs = OTP_WBAN[OTP_WBAN['WMO'] != 999]['UniqueID'].to_list()
len(missingWMO_IDs), len(complete_IDs)

(126, 225)

In [348]:
missingWMO_IDs

[14756,
 24283,
 94790,
 93073,
 4825,
 14946,
 94815,
 13838,
 24217,
 13802,
 54831,
 53883,
 11603,
 94938,
 23152,
 24132,
 93129,
 93736,
 4869,
 3802,
 94870,
 93075,
 93842,
 94890,
 93843,
 93005,
 94928,
 14905,
 14991,
 73805,
 23063,
 24121,
 14748,
 93719,
 93740,
 3103,
 23064,
 3901,
 3902,
 53893,
 93007,
 94025,
 93706,
 94931,
 93034,
 94745,
 12996,
 14758,
 94720,
 3968,
 4738,
 24145,
 94893,
 94761,
 24166,
 13987,
 14919,
 21510,
 24022,
 3950,
 54735,
 23020,
 13812,
 13976,
 13963,
 12907,
 53801,
 14845,
 12959,
 3936,
 14710,
 12838,
 13942,
 3181,
 24013,
 23259,
 93013,
 94724,
 93753,
 24126,
 3102,
 24284,
 53803,
 24222,
 64776,
 12812,
 13783,
 93741,
 53808,
 12873,
 14841,
 23184,
 24163,
 398,
 4743,
 93138,
 24174,
 4803,
 24027,
 12894,
 23049,
 93206,
 27406,
 12854,
 23186,
 93760,
 23293,
 93998,
 93225,
 93184,
 12871,
 23274,
 11640,
 94161,
 14714,
 3965,
 14792,
 14850,
 94178,
 13977,
 13972,
 94030,
 93845,
 94163,
 53922,
 94099]

1) Eliminar las filas sobrantes: es decir, solo debería haber una posible pareja de WBAN para cada VMO, y viceversa. Ahora mismo hay:

999	    72326  
13891	72326  
etc.

2) Descargar todos los archivos de la página de LCD siguiendo el formato:

https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/XXXXXYYYYYY.csv

Donde:
- XXXXX = WMO
- YYYYYY = WBAN

___

### Local Climatological Data (LCD) > 2019 Master file

Source: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/

#### Import the generated CSV files and start exploring them

In [350]:
pd.set_option('display.max_rows', 10)

In [392]:
tables = pd.read_html("https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/")
LCD_files = tables[0].loc[2:].copy()
LCD_files

Unnamed: 0,Name,Last modified,Size,Description
2,01001099999.csv,2020-10-24 23:45,4.2M,
3,01001499999.csv,2020-10-25 00:04,3.2M,
4,01002099999.csv,2020-10-24 23:44,4.0M,
5,01003099999.csv,2020-10-25 00:08,4.3M,
6,01006099999.csv,2020-10-25 00:53,4.1M,
...,...,...,...,...
13464,A0735700182.csv,2020-10-24 23:26,14M,
13465,A0735900240.csv,2020-10-24 23:41,17M,
13466,A5125500445.csv,2020-10-24 23:33,13M,
13467,A5125600451.csv,2020-10-24 23:32,14M,


In [393]:
LCD_files.drop(labels=['Last modified', 'Size', 'Description'], axis=1, inplace=True)
LCD_files['WBAN'] = LCD_files.loc[:, 'Name'].str.slice(-9, -4)
LCD_files.drop(LCD_files[LCD_files['WBAN'] == '99999'].index, inplace=True)
LCD_files['WBAN'].dropna(inplace=True)
LCD_files.dropna(how='all', axis=0, inplace=True)
LCD_files['WBAN'] = LCD_files['WBAN'].astype('int64')
LCD_files

Unnamed: 0,Name,WBAN
816,03577035046.csv,35046
817,03583035034.csv,35034
910,04018016201.csv,16201
1605,08449013025.csv,13025
2531,16289034113.csv,34113
...,...,...
13463,A0735500241.csv,00241
13464,A0735700182.csv,00182
13465,A0735900240.csv,00240
13466,A5125500445.csv,00445


In [391]:
OTP_WBAN.head()

Unnamed: 0,ORIGIN,WBAN,WMO,FAA_LOC_ID,UniqueID
0,ABE,14737,72517,ABE,72517014737
1,ABI,13962,72266,ABI,72266013962
2,ABQ,23050,72365,ABQ,72365023050
3,ABR,14929,72659,ABR,72659014929
4,ABY,13869,72216,ABY,72216013869


In [402]:
LCD_WBAN = LCD_files.merge(OTP_WBAN[['WBAN', 'WMO', 'FAA_LOC_ID', 'UniqueID']], how='right', left_on='WBAN', right_on='WBAN')
LCD_WBAN

Unnamed: 0,Name,WBAN,WMO,FAA_LOC_ID,UniqueID
0,72517014737.csv,14737,72517,ABE,72517014737
1,72266013962.csv,13962,72266,ABI,72266013962
2,72365023050.csv,23050,72365,ABQ,72365023050
3,72659014929.csv,14929,72659,ABR,72659014929
4,72216013869.csv,13869,72216,ABY,72216013869
...,...,...,...,...,...
346,72676494163.csv,94163,999,WYS,999094163
347,72343653922.csv,53922,999,XNA,999053922
348,,94099,999,XWA,999094099
349,70361025339.csv,25339,70361,YAK,70361025339


In [406]:
missing_stations = LCD_WBAN[LCD_WBAN['Name'].isna()]['FAA_LOC_ID'].to_list()
for IATA in missing_stations:
    print("{} : {} flights ({:4.2f}%)".format(IATA,
                                         len(OTP[OTP['ORIGIN'] == IATA]),
                                         len(OTP[OTP['ORIGIN'] == IATA]) / len(OTP) * 100))

DHN : 1371 flights (0.02%)
ESC : 597 flights (0.01%)
HRL : 4370 flights (0.06%)
IAG : 830 flights (0.01%)
PSE : 812 flights (0.01%)
SLN : 892 flights (0.01%)
STS : 2613 flights (0.04%)
XWA : 204 flights (0.00%)
YUM : 1909 flights (0.03%)


Once again, with this quick check we have ensured that almost every airport appearing in the OTP dataset is also present in the LCD database. In other words, we now know that each airport has its corresponding meteorological station's readings available.

For those few ones that are not present (listed above), it is fair to simply drop them. This can be performed without the risk of losing model's prediction capability, considering that they account altogether for less than 0.2% of the complete dataset.

In [407]:
LCD_WBAN_clean = LCD_files.merge(OTP_WBAN[['WBAN', 'WMO', 'FAA_LOC_ID']], how='inner', left_on='WBAN', right_on='WBAN')
LCD_WBAN_clean

Unnamed: 0,Name,WBAN,WMO,FAA_LOC_ID
0,70026027502.csv,27502,70026,BRW
1,70063727406.csv,27406,999,SCC
2,70133026616.csv,26616,70133,OTZ
3,70200026617.csv,26617,70200,OME
4,70219026615.csv,26615,70219,BET
...,...,...,...,...
337,91190022516.csv,22516,91190,OGG
338,91197521510.csv,21510,999,KOA
339,91212041415.csv,41415,91212,GUM
340,91285021504.csv,21504,91285,ITO


In [427]:
WBAN_available = dict(zip(LCD_WBAN_clean['WBAN'].to_list(), LCD_WBAN_clean['Name'].to_list()))
WBAN_available.items()

dict_items([(27502, '70026027502.csv'), (27406, '70063727406.csv'), (26616, '70133026616.csv'), (26617, '70200026617.csv'), (26615, '70219026615.csv'), (26411, '70261026411.csv'), (26451, '70273026451.csv'), (26410, '70296026410.csv'), (25513, '70321025513.csv'), (25503, '70326025503.csv'), (25501, '70350025501.csv'), (25339, '70361025339.csv'), (25322, '70367025322.csv'), (25333, '70371025333.csv'), (25309, '70381025309.csv'), (25329, '70386025329.csv'), (25338, '70387025338.csv'), (25325, '70395025325.csv'), (25704, '70454025704.csv'), (73805, '72073573805.csv'), (12836, '72201012836.csv'), (12839, '72202012839.csv'), (12844, '72203012844.csv'), (12812, '72203412812.csv'), (12838, '72204012838.csv'), (12815, '72205012815.csv'), (12854, '72205712854.csv'), (13889, '72206013889.csv'), (3822, '72207003822.csv'), (13880, '72208013880.csv'), (12894, '72210812894.csv'), (12842, '72211012842.csv'), (12871, '72211512871.csv'), (12873, '72211612873.csv'), (53883, '72213653883.csv'), (93805, '

#### Run only the first time to download every individual LCD file (year 2019) corresponding to each WBAN station

In [414]:
output_csv_path_base = os.path.join(root,
                                    "Output_Data",
                                    "NOAA",
                                    "LCD_files")
output_csv_path_base

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\LCD_files'

In [445]:
import urllib.request
import time

base_url = 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/'

i = 1
print('Beginning files download with urllib...')
for WBAN, WBAN_csv_name in WBAN_available.items():
    IATA = LCD_WBAN_clean.loc[LCD_WBAN_clean['WBAN'] == WBAN, 'FAA_LOC_ID'].values[0]
    csv_name = IATA + "_" + str(WBAN) + ".csv"
    csv_path = os.path.join(output_csv_path_base, csv_name)
    url = base_url + WBAN_csv_name
    print(i, ") IATA:", IATA, "/ WBAN:", WBAN, "/ Link:", url)
    urllib.request.urlretrieve(url, csv_path)
#     time.sleep(5)
    i += 1
#     if i == 3:
#         break

Beginning files download with urllib...
1 ) IATA: BRW / WBAN: 27502 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70026027502.csv
2 ) IATA: SCC / WBAN: 27406 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70063727406.csv
3 ) IATA: OTZ / WBAN: 26616 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70133026616.csv
4 ) IATA: OME / WBAN: 26617 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70200026617.csv
5 ) IATA: BET / WBAN: 26615 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70219026615.csv
6 ) IATA: FAI / WBAN: 26411 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70261026411.csv
7 ) IATA: ANC / WBAN: 26451 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70273026451.csv
8 ) IATA: CDV / WBAN: 26410 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/70296026

68 ) IATA: ACT / WBAN: 13959 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72256013959.csv
69 ) IATA: GRK / WBAN: 3902 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72257603902.csv
70 ) IATA: DAL / WBAN: 13960 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72258013960.csv
71 ) IATA: DFW / WBAN: 3927 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72259003927.csv
72 ) IATA: DRT / WBAN: 22010 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72261022010.csv
73 ) IATA: SJT / WBAN: 23034 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72263023034.csv
74 ) IATA: MAF / WBAN: 23023 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72265023023.csv
75 ) IATA: ABI / WBAN: 13962 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72266013962.csv
76 ) IATA: LBB / WBAN: 230

135 ) IATA: DCA / WBAN: 13743 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72405013743.csv
136 ) IATA: BWI / WBAN: 93721 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72406093721.csv
137 ) IATA: HGR / WBAN: 93706 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72406693706.csv
138 ) IATA: ACY / WBAN: 93730 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72407093730.csv
139 ) IATA: PHL / WBAN: 13739 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72408013739.csv
140 ) IATA: TTN / WBAN: 14792 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72409514792.csv
141 ) IATA: LYH / WBAN: 13733 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72410013733.csv
142 ) IATA: SHD / WBAN: 93760 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72410593760.csv
143 ) IATA: ROA 

202 ) IATA: BGM / WBAN: 4725 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72515004725.csv
203 ) IATA: ITH / WBAN: 94761 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72515594761.csv
204 ) IATA: ELM / WBAN: 14748 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72515614748.csv
205 ) IATA: ABE / WBAN: 14737 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72517014737.csv
206 ) IATA: ALB / WBAN: 14735 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72518014735.csv
207 ) IATA: SYR / WBAN: 14771 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72519014771.csv
208 ) IATA: PIT / WBAN: 94823 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72520094823.csv
209 ) IATA: LBE / WBAN: 54735 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72520754735.csv
210 ) IATA: CAK /

269 ) IATA: MKE / WBAN: 14839 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72640014839.csv
270 ) IATA: MSN / WBAN: 14837 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72641014837.csv
271 ) IATA: LSE / WBAN: 14920 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72643014920.csv
272 ) IATA: EAU / WBAN: 14991 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72643514991.csv
273 ) IATA: RST / WBAN: 14925 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72644014925.csv
274 ) IATA: GRB / WBAN: 14898 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72645014898.csv
275 ) IATA: ATW / WBAN: 4825 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72645704825.csv
276 ) IATA: CWA / WBAN: 94890 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/72646594890.csv
277 ) IATA: FSD /

336 ) IATA: LIH / WBAN: 22536 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91165022536.csv
337 ) IATA: HNL / WBAN: 22521 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91182022521.csv
338 ) IATA: OGG / WBAN: 22516 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91190022516.csv
339 ) IATA: KOA / WBAN: 21510 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91197521510.csv
340 ) IATA: GUM / WBAN: 41415 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91212041415.csv
341 ) IATA: ITO / WBAN: 21504 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91285021504.csv
342 ) IATA: PPG / WBAN: 61705 / Link: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/91765061705.csv


In [None]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

___

___