In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # De momento no ha hecho falta 

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
t0 = time.perf_counter() 

In [3]:
# Detect Operating System running and manage paths accordingly

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.transtats.bts.gov/Fields.asp?Table_ID=236&SYS_Table_Name=T_ONTIME_REPORTING&User_Table_Name=Reporting%20Carrier%20On-Time%20Performance%20(1987-present)&Year_Info=1&First_Year=1987&Last_Year=2020&Rate_Info=0&Frequency=Monthly&Data_Frequency=Annual,Quarterly,Monthly).

___

# 2. Get the data

### WBAN list (Meteo stations)

In [4]:
csv_path = os.path.join(root,
                        "Raw_Data",
                        "NOAA",
                        "WBAN",
                        "MASTER-STN-HIST.csv")

csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Raw_Data\\NOAA\\WBAN\\MASTER-STN-HIST.csv'

In [5]:
WBAN = pd.read_csv(csv_path,
                   sep='^',
                   encoding='latin1',
                   low_memory = False)

In [6]:
WBAN

Unnamed: 0,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTY,TIME_ZONE,HIST_WBAN_NAME
0,10000001,24285.0,,,,,LINCOLN,8.0,NEWPORT MUNI AP
1,10000001,24285.0,,,,,LINCOLN,8.0,NEWPORT MUNI AP
2,10000001,24285.0,,,,,LINCOLN,8.0,NEWPORT MUNI AP
3,10000001,24285.0,,,,,LINCOLN,8.0,NEWPORT MUNI AP
4,10000001,24285.0,,,,,LINCOLN,8.0,NEWPORT MUNI AP
...,...,...,...,...,...,...,...,...,...
178011,30125303,,,,,,,5.0,CAMDEN
178012,30125303,,,,,,,5.0,CAMDEN
178013,30125303,,,,,,,5.0,CAMDEN
178014,30125303,,,,,,,5.0,CAMDEN


In [7]:
WBAN.dropna(inplace=True)
WBAN.drop_duplicates(inplace=True)
WBAN

Unnamed: 0,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTY,TIME_ZONE,HIST_WBAN_NAME
13,10000158,25322.0,70367.0,GST,GST,PAGS,SKAGWAY-HOONAH-ANGOON,9.0,GUSTAVUS AP
19,10000202,25325.0,70395.0,KTN,KTN,PAKT,KETCHIKAN GATEWAY BOROUGH,9.0,KETCHIKAN INTL AP
28,10000202,25325.0,70395.0,KTN,KTN,PAKT,KETCHIKAN GATEWAY BOROUGH,9.0,KETCHIKAN AP
37,10000355,25335.0,70362.0,SGY,SGY,PAGY,SKAGWAY-HOONAH-ANGOON,9.0,SKAGWAY AP
41,10000446,25338.0,70387.0,WRG,WRG,PAWG,WRANGELL-PETERSBURG,9.0,WRANGELL AP
...,...,...,...,...,...,...,...,...,...
168721,30000752,53868.0,72427.0,OQT,OQTT,1 KOQT,ANDERSON,5.0,OAK RIDGE ASOS
171069,30001692,93874.0,74757.0,GPT,GPT,KGPT,HARRISON,6.0,GULFPORT - BILOXI AP
175241,30015538,26512.0,70246.0,MHM,MHM,PAMH,YUKON-KOYUKUK,9.0,MINCHUMINA AP
175756,30073829,14753.0,74492.0,HOM,3 MQEM,3,NORFOLK,5.0,BLUE HILL LCD


In [8]:
WBAN.isna().any().sum()

0

In [9]:
WBAN[WBAN.select_dtypes('number').columns] = WBAN.select_dtypes('number').astype('int64')
WBAN

Unnamed: 0,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTY,TIME_ZONE,HIST_WBAN_NAME
13,10000158,25322,70367,GST,GST,PAGS,SKAGWAY-HOONAH-ANGOON,9,GUSTAVUS AP
19,10000202,25325,70395,KTN,KTN,PAKT,KETCHIKAN GATEWAY BOROUGH,9,KETCHIKAN INTL AP
28,10000202,25325,70395,KTN,KTN,PAKT,KETCHIKAN GATEWAY BOROUGH,9,KETCHIKAN AP
37,10000355,25335,70362,SGY,SGY,PAGY,SKAGWAY-HOONAH-ANGOON,9,SKAGWAY AP
41,10000446,25338,70387,WRG,WRG,PAWG,WRANGELL-PETERSBURG,9,WRANGELL AP
...,...,...,...,...,...,...,...,...,...
168721,30000752,53868,72427,OQT,OQTT,1 KOQT,ANDERSON,5,OAK RIDGE ASOS
171069,30001692,93874,74757,GPT,GPT,KGPT,HARRISON,6,GULFPORT - BILOXI AP
175241,30015538,26512,70246,MHM,MHM,PAMH,YUKON-KOYUKUK,9,MINCHUMINA AP
175756,30073829,14753,74492,HOM,3 MQEM,3,NORFOLK,5,BLUE HILL LCD


In [10]:
WBAN.drop_duplicates(subset='WBAN_ID', keep='last', inplace=True)

In [11]:
WBAN_IATAs = WBAN['FAA_LOC_ID'].unique()
WBAN_IATAs

array(['KTN', 'SGY', 'EET', 'IGM', 'AVX', 'FAT', 'RBL', 'RDD', 'SAN',
       'SFO', 'GJT', 'LIC', 'ILG', 'AAF', 'EYW', 'PNS', 'TLH', 'VRB',
       'FFC', 'SAV', 'AYS', 'LWS', 'ORD', 'ICT', 'SDF', 'NHZ', 'LAN',
       'MQT', 'MKG', 'STJ', 'SGF', 'JAN', 'HKY', 'FAR', 'ISN', 'ELY',
       'DSV', 'SYR', 'ILN', 'OKC', 'PHL', 'DAL', 'STX', 'BTV', 'EPH',
       'HTS', 'LND', 'DEN', 'PKF', 'GST', 'EKA', 'WRG', 'MOB', 'MGM',
       'BHM', 'HSV', 'LZK', 'FSM', 'HRO', 'DUG', 'FHU', 'TUS', 'YUM',
       'PHX', 'INW', 'NJK', 'NSI', 'LGB', 'LAX', 'NTD', 'MWS', 'SBA',
       'SDB', 'SMX', 'BFL', 'NID', 'NLC', 'SNS', 'BIH', 'NUQ', 'HWD',
       'OAK', 'SCK', 'CCR', 'SUU', 'SAC', 'TRK', 'ALS', 'PUB', 'FCS',
       'COS', 'DNR', 'CAG', 'BDR', 'BDL', 'MIA', 'FLL', 'PBI', 'TPA',
       'MCO', 'DAB', 'GNV', 'CRG', 'VPS', 'JAX', 'AMG', 'MCN', 'AGS',
       'ATL', 'AHN', 'PIH', 'BOI', 'SLO', 'SPI', 'UIN', 'PIA', 'MLI',
       'MDW', 'RFD', 'EVV', 'IND', 'FWA', 'SBN', 'BRL', 'DSM', 'CID',
       'DBQ', 'SUX',

In [12]:
len(WBAN_IATAs)

405

In [13]:
output_csv_path = os.path.join(root,
                               "Output_Data",
                               "NOAA",
                               "MASTER-STN-HIST_preprocessed.csv")

WBAN.to_csv(path_or_buf=output_csv_path,
            index=False,
            encoding='latin1')

___

### Local Climatological Data (LCD) > 2019 Master file

Source: https://www.ncdc.noaa.gov/cdo-web/datatools/lcd

Local Climatological Data (LCD) is only available for stations and locations within the United States and its territories. Select the state or territory, location, and time to view specific data.

In [14]:
csv_path = os.path.join(root,
                        "Raw_Data",
                        "NOAA",
                        "LCD",
                        "2019.tar.gz")

csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Raw_Data\\NOAA\\LCD\\2019.tar.gz'

In [15]:
LCD = pd.read_csv(csv_path,
                  header=0,
                  nrows=10000,
                  sep=',',
                  encoding='latin1',
                  low_memory = False)
LCD.head(10)

Unnamed: 0,01001099999.csv,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,AJ1,AY1,AY2,GA1,GA2,GA3,GE1,GF1,IA1,KA1,KA2,MA1,MD1,MW1,OC1,OD1,SA1,UA1,REM,EQD
0,1001100000.0,2019-01-01T00:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"345,1,N,0095,1","00600,1,C,9",10000199,-651,-1231,101991,6999999.0,,,31061.0,21061.0,"08,1,+00800,1,08,1",,,"9,AGL ,+99999,+99999",08991081999006001999999,,,,999999101871,"3,1,052,1,+999,9",851.0,1621,39902201999.0,,9999999049.0,SYN004BUFR,
1,1001100000.0,2019-01-01T01:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"350,1,N,0086,1",99999999,999999999,-621,-1151,102131,,,,,,,,,,,,,,999999102011,"3,1,049,1,+999,9",,1271,,,,SYN004BUFR,
2,1001100000.0,2019-01-01T02:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"343,1,N,0086,1",99999999,999999999,-621,-1101,102231,,,,,,,,,,,,,,999999102111,"2,1,040,1,+999,9",,1331,,,,SYN004BUFR,
3,1001100000.0,2019-01-01T03:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"337,1,N,0077,1",99999999,999999999,-621,-1141,102341,,,,,,,,,,,,,,999999102221,"2,1,034,1,+999,9",,1311,,,,SYN004BUFR,
4,1001100000.0,2019-01-01T04:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"345,1,N,0086,1",99999999,999999999,-621,-1181,102421,,,,,,,,,,,,,,999999102301,"2,1,029,1,+999,9",,1601,,,,SYN004BUFR,
5,1001100000.0,2019-01-01T05:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"331,1,N,0063,1",99999999,999999999,-621,-1161,102521,,,,,,,,,,,,,,999999102401,"3,1,029,1,+999,9",,1051,,,,SYN004BUFR,
6,1001100000.0,2019-01-01T06:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"306,1,N,0038,1","00600,1,C,9",30000199,-621,-1141,102591,12999999.0,24000131.0,,21061.0,21061.0,"06,1,+00600,1,08,1",,,"9,AGL ,+99999,+99999",06991061999006001999999,299.0,"120,M,-0047,1","120,N,-0065,1",999999102471,"3,1,025,1,+999,9",11.0,1271,90601071999.0,,,SYN004BUFR,
7,1001100000.0,2019-01-01T07:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"281,1,N,0029,1",99999999,999999999,-621,-1141,102641,,,,,,,,,,,,,,999999102521,"2,1,022,1,+999,9",,761,,,,SYN004BUFR,
8,1001100000.0,2019-01-01T08:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"268,1,N,0039,1",99999999,999999999,-621,-1131,102681,,,,,,,,,,,,,,999999102561,"2,1,016,1,+999,9",,641,,,,SYN004BUFR,
9,1001100000.0,2019-01-01T09:00:00,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"254,1,N,0031,1",22000199,30000199,-591,-1161,102711,,,,11031.0,11031.0,"03,1,+00600,1,08,1",,,"9,AGL ,+99999,+99999",03991031999006001999999,,,,999999102591,"2,1,012,1,+999,9",11.0,611,,,,SYN004BUFR,


___

### OTP (2019)

In [16]:
cols = [
     'MONTH',
     'DAY_OF_MONTH',
     'DAY_OF_WEEK',
     'OP_UNIQUE_CARRIER',
# For the time being, 'TAIL_NUM' will be disregarded due to its high cardinality  (~ 4500) and low expected added value    
#      'TAIL_NUM',
     'ORIGIN',
     'DEST',
#      'CRS_DEP_TIME', # Redundant having the 'DEP_ARR_hour' feature
     'DEP_TIME_hour',
#      'DEP_TIME',
#      'DEP_DELAY',
#      'DEP_DEL15',
#      'TAXI_OUT',
     'TAXI_OUT_median',
#      'TAXI_IN',
     'TAXI_IN_median',
#      'CRS_ARR_TIME', # Redundant having the 'ARR_ARR_hour' feature
     'ARR_TIME_hour', 
#      'ARR_TIME',   
#      'ARR_DELAY',
     'ARR_DEL15', # → Target !!
#      'CANCELLED',
     'CRS_ELAPSED_TIME',
     'DISTANCE',
     'DISTANCE_GROUP',
#      'CARRIER_DELAY',
#      'WEATHER_DELAY',
#      'NAS_DELAY',
#      'SECURITY_DELAY',
#      'LATE_AIRCRAFT_DELAY',
]

In [17]:
preprocessed_input_csv_path = os.path.join(root,
                                           "Output_Data",
                                           "US_DoT",
                                           "AL_OTP_MVP_Preprocessed_19_v2_clean.csv")
preprocessed_input_csv_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\US_DoT\\AL_OTP_MVP_Preprocessed_19_v2_clean.csv'

In [18]:
cols_dtypes = {
               'MONTH' : 'category',
               'DAY_OF_MONTH' : 'category',
               'DAY_OF_WEEK' : 'category',
               'OP_UNIQUE_CARRIER' : 'category',
               'ORIGIN' : 'category',
               'DEST' : 'category',
               'DEP_TIME_hour' : 'category',
               'TAXI_OUT_median' : 'int64',
               'TAXI_IN_median' : 'int64',
               'ARR_TIME_hour' : 'category', 
               'ARR_DEL15' : 'int32', # → Target !!
               'CRS_ELAPSED_TIME' : 'int64',
               'DISTANCE' : 'int64',
               'DISTANCE_GROUP' : 'category'
               }

In [23]:
%%time

OTP = pd.read_csv(preprocessed_input_csv_path,
                  encoding='latin1',
                  usecols=cols,
                  low_memory = False)

Wall time: 3min 10s


In [24]:
# Cast types in accordance to previously defined dictionary:
OTP = OTP.astype(cols_dtypes)
# Display DF columns according to 'cols' order:
OTP = OTP[cols]
OTP

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_TIME_hour,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP
0,1,3,4,9E,TYS,ATL,11,15,8,12,1,70,152,1
1,1,4,5,9E,TYS,ATL,11,15,8,12,1,70,152,1
2,1,5,6,9E,ATL,SGF,9,17,5,10,0,121,563,3
3,1,6,7,9E,ATL,SGF,9,17,5,10,0,123,563,3
4,1,7,1,9E,ATL,SGF,9,17,5,10,0,123,563,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7268227,12,31,2,B6,MCO,SWF,13,15,6,16,1,163,989,4
7268228,12,31,2,B6,DCA,BOS,14,15,6,15,0,90,399,2
7268229,12,31,2,B6,PHL,BOS,7,17,6,8,0,85,280,2
7268230,12,31,2,B6,BOS,SJU,8,16,5,13,0,242,1674,7


In [33]:
OTP_IATAs = OTP['ORIGIN'].unique()
OTP_IATAs

['TYS', 'ATL', 'SGF', 'SRQ', 'DTW', ..., 'AKN', 'DLG', 'HYA', 'PGV', 'XWA']
Length: 360
Categories (360, object): ['TYS', 'ATL', 'SGF', 'SRQ', ..., 'DLG', 'HYA', 'PGV', 'XWA']

In [26]:
IATAs = []
for IATA in OTP_IATAs:
    if IATA not in WBAN_IATAs:
        IATAs.append(IATA)
len(IATAs)

139

In [27]:
WBAN

Unnamed: 0,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTY,TIME_ZONE,HIST_WBAN_NAME
28,10000202,25325,70395,KTN,KTN,PAKT,KETCHIKAN GATEWAY BOROUGH,9,KETCHIKAN AP
37,10000355,25335,70362,SGY,SGY,PAGY,SKAGWAY-HOONAH-ANGOON,9,SKAGWAY AP
47,10000485,53864,72230,EET,EET,KEET,SHELBY,6,ALABASTER SHELBY CO AP
109,10000865,93167,72370,IGM,IGM,KIGM,MOHAVE,7,KINGMAN AP
120,10001012,23191,72292,AVX,AVX,KAVX,LOS ANGELES,8,SANTA CATALINA AP
...,...,...,...,...,...,...,...,...,...
168139,30000521,4837,72634,APX,APXM,4 KAPX,OTSEGO,5,GAYLORD 9SSW
168721,30000752,53868,72427,OQT,OQTT,1 KOQT,ANDERSON,5,OAK RIDGE ASOS
171069,30001692,93874,74757,GPT,GPT,KGPT,HARRISON,6,GULFPORT - BILOXI AP
175241,30015538,26512,70246,MHM,MHM,PAMH,YUKON-KOYUKUK,9,MINCHUMINA AP


In [40]:
OTP_IATAs = OTP_IATAs.astype('object')

In [83]:
OTP_WBAN = OTP.merge(WBAN,
          how='left',
          left_on='ORIGIN',
          right_on='FAA_LOC_ID')
OTP_WBAN

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_TIME_hour,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID,NWS_LOC_ID,ICAO_ID,COUNTY,TIME_ZONE,HIST_WBAN_NAME
0,1,3,4,9E,TYS,ATL,11,15,8,12,1,70,152,1,20018222.0,13891.0,72326.0,TYS,TYS,KTYS,BLOUNT,5.0,KNOXVILLE AP
1,1,4,5,9E,TYS,ATL,11,15,8,12,1,70,152,1,20018222.0,13891.0,72326.0,TYS,TYS,KTYS,BLOUNT,5.0,KNOXVILLE AP
2,1,5,6,9E,ATL,SGF,9,17,5,10,0,121,563,3,20004906.0,13874.0,72219.0,ATL,ATL,KATL,FULTON,5.0,ATLANTA HARTSFIELD-JACKSON INT
3,1,6,7,9E,ATL,SGF,9,17,5,10,0,123,563,3,20004906.0,13874.0,72219.0,ATL,ATL,KATL,FULTON,5.0,ATLANTA HARTSFIELD-JACKSON INT
4,1,7,1,9E,ATL,SGF,9,17,5,10,0,123,563,3,20004906.0,13874.0,72219.0,ATL,ATL,KATL,FULTON,5.0,ATLANTA HARTSFIELD-JACKSON INT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7269059,12,31,2,B6,MCO,SWF,13,15,6,16,1,163,989,4,20004451.0,12815.0,72205.0,MCO,MCO,KMCO,ORANGE,5.0,ORLANDO INTL AP
7269060,12,31,2,B6,DCA,BOS,14,15,6,15,0,90,399,2,20027254.0,13743.0,72405.0,DCA,DCA,KDCA,ARLINGTON,5.0,WASHINGTON REAGAN NATL AP
7269061,12,31,2,B6,PHL,BOS,7,17,6,8,0,85,280,2,10010044.0,13739.0,72408.0,PHL,PHL,KPHL,DELAWARE,5.0,PHILA INTL AP
7269062,12,31,2,B6,BOS,SJU,8,16,5,13,0,242,1674,7,20009288.0,14739.0,72509.0,BOS,BOS,KBOS,SUFFOLK,5.0,BOSTON


In [84]:
OTP_WBAN = OTP_WBAN[['ORIGIN', 'HOMR_ID', 'WBAN_ID', 'WMO_ID', 'FAA_LOC_ID']]
OTP_WBAN

Unnamed: 0,ORIGIN,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID
0,TYS,20018222.0,13891.0,72326.0,TYS
1,TYS,20018222.0,13891.0,72326.0,TYS
2,ATL,20004906.0,13874.0,72219.0,ATL
3,ATL,20004906.0,13874.0,72219.0,ATL
4,ATL,20004906.0,13874.0,72219.0,ATL
...,...,...,...,...,...
7269059,MCO,20004451.0,12815.0,72205.0,MCO
7269060,DCA,20027254.0,13743.0,72405.0,DCA
7269061,PHL,10010044.0,13739.0,72408.0,PHL
7269062,BOS,20009288.0,14739.0,72509.0,BOS


In [85]:
OTP_WBAN[OTP_WBAN['ORIGIN'] == 'JFK']

Unnamed: 0,ORIGIN,HOMR_ID,WBAN_ID,WMO_ID,FAA_LOC_ID
78,JFK,20019418.0,94789.0,74486.0,JFK
659,JFK,20019418.0,94789.0,74486.0,JFK
823,JFK,20019418.0,94789.0,74486.0,JFK
824,JFK,20019418.0,94789.0,74486.0,JFK
825,JFK,20019418.0,94789.0,74486.0,JFK
...,...,...,...,...,...
7269032,JFK,20019418.0,94789.0,74486.0,JFK
7269034,JFK,20019418.0,94789.0,74486.0,JFK
7269037,JFK,20019418.0,94789.0,74486.0,JFK
7269055,JFK,20019418.0,94789.0,74486.0,JFK


In [87]:
JFK = pd.read_csv('https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/74486094789.csv',
              encoding='latin1',
              low_memory = False)

In [91]:
JFK.sample(5)

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
13302,74486094789,2019-12-27T12:51:00,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-15,7,30.19,45,51.0,0.0,,0.11,8.0,80.0,FEW:02 25 BKN:07 95 BKN:07 250,30.19,30.16,10.0,48.0,210,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET13212/27/19 12:51:02 METAR KJFK 271751Z 210...,ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01
13379,74486094789,2019-12-29T19:26:00,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-16,7,30.1,40,40.0,0.05,RA:02 BR:1 |RA |RA,,,100.0,SCT:04 18 BKN:07 24 OVC:08 35,,30.08,5.0,40.0,50,,7.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET11812/29/19 19:26:03 SPECI KJFK 300026Z 050...,ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01
864,74486094789,2019-01-24T11:51:00,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-15,7,29.42,51,51.0,0.32,RA:02 BR:1 |RA |RA,,,100.0,SCT:04 5 BKN:07 19 OVC:08 80,29.42,29.39,4.0,51.0,190,41.0,34.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET16901/24/19 11:51:02 METAR KJFK 241651Z 190...,ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01
12487,74486094789,2019-12-06T04:00:00,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-12,4,,22,35.0,,,0.02,9.0,59.0,41,30.14,30.11,9.94,30.0,300,,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SYN06474486 32666 43007 10017 21056 30195 4020...,ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01
5583,74486094789,2019-05-30T17:33:00,40.63915,-73.76401,3.4,"JFK INTERNATIONAL AIRPORT, NY US",FM-16,7,29.74,61,61.0,0.09,TS:7 +RA:02 BR:1 |s RA s TS s |RA s,,,100.0,BKN:07 3 OVC:08 30,,29.72,0.5,61.0,120,,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MET20205/30/19 17:33:02 SPECI KJFK 302233Z 120...,ESE,1.5,mi,"TEMP, PRECIP, SNOW",,"PSY, SRG, SNOWBOARD",,,FAA CWO,2009-05-01


### Local Climatological Data (LCD) > 2019 Master file

Source: https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/

In [63]:
LCD_tables = pd.read_html('https://www.ncei.noaa.gov/data/local-climatological-data/access/2019/')

In [71]:
LCD = LCD_tables[0].loc[2:len(LCD_tables[0])-2,:]
LCD

Unnamed: 0,Name,Last modified,Size,Description
2,01001099999.csv,2020-10-24 23:45,4.2M,
3,01001499999.csv,2020-10-25 00:04,3.2M,
4,01002099999.csv,2020-10-24 23:44,4.0M,
5,01003099999.csv,2020-10-25 00:08,4.3M,
6,01006099999.csv,2020-10-25 00:53,4.1M,
...,...,...,...,...
13463,A0735500241.csv,2020-10-24 23:52,14M,
13464,A0735700182.csv,2020-10-24 23:26,14M,
13465,A0735900240.csv,2020-10-24 23:41,17M,
13466,A5125500445.csv,2020-10-24 23:33,13M,


#### Run only the first time to generate the global CLEAN file (year 2019)

In [None]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

___

___