In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # Not needed so far

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Global plotting look-and-feel for the notebook.
# NOTE(review): sns.set_theme runs after plt.style.use and also sets matplotlib rcParams,
# so parts of the 'ggplot' style are presumably overridden — confirm which look is intended.
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
# as_cmap=True asks seaborn for a colormap object instead of a list of discrete colors
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
# Reference reading for the notebook-wide stopwatch; the final cell subtracts it from a later reading
t0 = time.perf_counter() 

In [3]:
# Detect Operating System running and manage paths accordingly.
# `os.name` is 'nt' on Windows and 'posix' on Linux/macOS; the POSIX branch
# points at the shared folder used on the Ubuntu machine.

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu (any POSIX system takes this branch)
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
else:
    # Fail here with an explicit message instead of a NameError on `root` below
    raise RuntimeError("Unsupported operating system: os.name = {!r}".format(os.name))
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.ncei.noaa.gov/data/local-climatological-data/doc/LCD_documentation.pdf).

___

# Get the data

### LCD individual file (2019)

Let's first check which airports are present in the OTP dataset, and later compare them to those appearing in the WBAN database. Ideally, all the airports contained in the OTP dataset should appear in the WBAN database. This would suggest that there is a weather station at each of those airports.

In [4]:
# Folder holding the LCD CSV files, built from the project root
csv_files_path = os.path.join(root, "Output_Data", "NOAA", "LCD_files")
csv_files_path

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\LCD_files'

In [5]:
# Single LCD file explored in this notebook, and its full path
individualLCDfileName = "ABE_14737.csv"
individualLCDfileNamePath = os.path.join(
    csv_files_path,
    individualLCDfileName,
)
individualLCDfileNamePath

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\NOAA\\LCD_files\\ABE_14737.csv'

## 2. Dataset overview

In [11]:
# Columns to load from the LCD file (passed to `pd.read_csv(usecols=...)` further down).
# Entries that are commented out are present in the file but deliberately not loaded.
cols = [
 'STATION', # Station ID (WMO+WBAN) — NOTE(review): identifier scheme to be confirmed against the LCD documentation
 'DATE', # Date: YYYY-MM-DD format / Time: 24-hour clock in local standard time (LST)
#  'LATITUDE',
#  'LONGITUDE',
#  'ELEVATION',
#  'NAME',
#  'REPORT_TYPE',
#  'SOURCE',
 'HourlyAltimeterSetting', # Atmospheric pressure reduced to sea level using temperature profile of the “standard” atmosphere
#  'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature', # Commonly used as the standard air temperature reported (given in whole degrees Fahrenheit)
 'HourlyPrecipitation', # Amount of precipitation in inches to hundredths over the past hour (“T” indicates a trace amount)
#  'HourlyPresentWeatherType',
#  'HourlyPressureChange',
#  'HourlyPressureTendency',
 'HourlyRelativeHumidity', # Relative humidity given to the nearest whole percentage
 'HourlySkyConditions', # A report of each cloud layer (up to 3). Each layer given in the following format: ccc:ll-xxx where:
                         ## ccc = Coverage: CLR (clear sky), FEW (few clouds), SCT (scattered clouds), BKN (broken clouds),
                         ##                 OVC (overcast), VV (obscured sky), 10 (partially obscured sky) → Last layer used (*)
                         ##  ll = Layer amount given in eighths (aka “oktas”) → Not used
                         ## xxx = Cloud base height at lowest point of layer → Not used   
                             ## (*) NOTE: Since up to 3 cloud layers can be reported, the full state of the sky 
                             ##           can best be determined by the contraction given for the last layer
#  'HourlySeaLevelPressure',
#  'HourlyStationPressure',
 'HourlyVisibility', # Horizontal distance an object can be seen and identified given in whole miles
#  'HourlyWetBulbTemperature',
#  'HourlyWindDirection', # wind direction/speed/gust variables would be useful in case RWY Magnetic Heading was also available
#  'HourlyWindGustSpeed',   # without it, there is no use for it. Strong crosswinds can greatly penalize A/C performance;
#  'HourlyWindSpeed',       # however, RWYs are normally designed so that headwind prevails to improve A/C performance
#  'Sunrise',
#  'Sunset',
#  'DailyAverageDewPointTemperature',
#  'DailyAverageDryBulbTemperature',
#  'DailyAverageRelativeHumidity',
#  'DailyAverageSeaLevelPressure',
#  'DailyAverageStationPressure',
#  'DailyAverageWetBulbTemperature',
#  'DailyAverageWindSpeed',
#  'DailyCoolingDegreeDays',
#  'DailyDepartureFromNormalAverageTemperature',
#  'DailyHeatingDegreeDays',
#  'DailyMaximumDryBulbTemperature',
#  'DailyMinimumDryBulbTemperature',
#  'DailyPeakWindDirection',
#  'DailyPeakWindSpeed',
#  'DailyPrecipitation',
#  'DailySnowDepth',
#  'DailySnowfall',
#  'DailySustainedWindDirection',
#  'DailySustainedWindSpeed',
#  'DailyWeather',
#  'MonthlyAverageRH',
#  'MonthlyDaysWithGT001Precip',
#  'MonthlyDaysWithGT010Precip',
#  'MonthlyDaysWithGT32Temp',
#  'MonthlyDaysWithGT90Temp',
#  'MonthlyDaysWithLT0Temp',
#  'MonthlyDaysWithLT32Temp',
#  'MonthlyDepartureFromNormalAverageTemperature',
#  'MonthlyDepartureFromNormalCoolingDegreeDays',
#  'MonthlyDepartureFromNormalHeatingDegreeDays',
#  'MonthlyDepartureFromNormalMaximumTemperature',
#  'MonthlyDepartureFromNormalMinimumTemperature',
#  'MonthlyDepartureFromNormalPrecipitation',
#  'MonthlyDewpointTemperature',
#  'MonthlyGreatestPrecip',
#  'MonthlyGreatestPrecipDate',
#  'MonthlyGreatestSnowDepth',
#  'MonthlyGreatestSnowDepthDate',
#  'MonthlyGreatestSnowfall',
#  'MonthlyGreatestSnowfallDate',
#  'MonthlyMaxSeaLevelPressureValue',
#  'MonthlyMaxSeaLevelPressureValueDate',
#  'MonthlyMaxSeaLevelPressureValueTime',
#  'MonthlyMaximumTemperature',
#  'MonthlyMeanTemperature',
#  'MonthlyMinSeaLevelPressureValue',
#  'MonthlyMinSeaLevelPressureValueDate',
#  'MonthlyMinSeaLevelPressureValueTime',
#  'MonthlyMinimumTemperature',
#  'MonthlySeaLevelPressure',
#  'MonthlyStationPressure',
#  'MonthlyTotalLiquidPrecipitation',
#  'MonthlyTotalSnowfall',
#  'MonthlyWetBulb',
#  'AWND',
#  'CDSD',
#  'CLDD',
#  'DSNW',
#  'HDSD',
#  'HTDD',
#  'NormalsCoolingDegreeDay',
#  'NormalsHeatingDegreeDay',
#  'ShortDurationEndDate005',
#  'ShortDurationEndDate010',
#  'ShortDurationEndDate015',
#  'ShortDurationEndDate020',
#  'ShortDurationEndDate030',
#  'ShortDurationEndDate045',
#  'ShortDurationEndDate060',
#  'ShortDurationEndDate080',
#  'ShortDurationEndDate100',
#  'ShortDurationEndDate120',
#  'ShortDurationEndDate150',
#  'ShortDurationEndDate180',
#  'ShortDurationPrecipitationValue005',
#  'ShortDurationPrecipitationValue010',
#  'ShortDurationPrecipitationValue015',
#  'ShortDurationPrecipitationValue020',
#  'ShortDurationPrecipitationValue030',
#  'ShortDurationPrecipitationValue045',
#  'ShortDurationPrecipitationValue060',
#  'ShortDurationPrecipitationValue080',
#  'ShortDurationPrecipitationValue100',
#  'ShortDurationPrecipitationValue120',
#  'ShortDurationPrecipitationValue150',
#  'ShortDurationPrecipitationValue180',
 'REM', # Surface Weather Observations & Reports (METAR)
#  'BackupDirection',
#  'BackupDistance',
#  'BackupDistanceUnit',
#  'BackupElements',
#  'BackupElevation',
#  'BackupEquipment',
#  'BackupLatitude',
#  'BackupLongitude',
#  'BackupName',
#  'WindEquipmentChangeDate'
]

Selected columns:

- `STATION` → Station ID (WMO+WBAN)  
- `DATE` → Date: YYYY-MM-DD format / Time: 24-hour clock in local standard time (LST)  
- `HourlyAltimeterSetting` → Atmospheric pressure reduced to sea level using temperature profile of the “standard” atmosphere  
- `HourlyDryBulbTemperature` → Commonly used as the standard air temperature reported (given in whole degrees Fahrenheit)  
- `HourlyPrecipitation` → Amount of precipitation in inches to hundredths over the past hour (“T” indicates a trace amount)   
- `HourlyRelativeHumidity` → Relative humidity given to the nearest whole percentage  
- `HourlySkyConditions` → A report of each cloud layer (up to 3).  
Each layer given in the following format: `ccc:ll-xxx` where:
    - ccc = Coverage:
        - CLR (clear sky)
        - FEW (few clouds)
        - SCT (scattered clouds)
        - BKN (broken clouds)
        - OVC (overcast)
        - VV (obscured sky)
        - 10 (partially obscured sky) → **Only last layer used (*)**
    - ll = Layer amount given in eighths (aka “oktas”) → *Not used*
    - xxx = Cloud base height at lowest point of layer → *Not used*  
    (*) NOTE: Since up to 3 cloud layers can be reported, the full state of the sky can best be determined by the contraction given for the last layer.
- `HourlyVisibility` → Horizontal distance an object can be seen and identified given in whole miles
- `REM` → Surface Weather Observations & Reports (METAR)
    
Additional note for **wind**:  
- `HourlyWindDirection`
- `HourlyWindGustSpeed`
- `HourlyWindSpeed`  
Wind direction/speed/gust variables would be useful if the RWY magnetic heading were also available; without it, the wind is of no use to the model. Strong crosswinds can greatly penalize A/C performance; however, RWYs are normally designed so that headwind prevails to improve A/C performance. Therefore, such winds should be unusual on average.

In [12]:
# Load only the selected columns of the individual LCD file and preview five random rows
lcd = pd.read_csv(
    individualLCDfileNamePath,
    encoding='latin1',
    usecols=cols,
    low_memory=False,
)
lcd.sample(5)

Unnamed: 0,STATION,DATE,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM
12784,72517014737,2019-12-18T07:00:00,,28.0,,75.0,,9.94,SYN08672517 32966 02406 11022 21061 39982 4012...
6889,72517014737,2019-07-09T04:04:00,30.05,65.0,,90.0,CLR:00,1.75,MET09207/09/19 04:04:02 SPECI KABE 090904Z 000...
10700,72517014737,2019-10-23T01:00:00,,58.0,,87.0,74,9.94,SYN08672517 15866 62606 10144 20122 39959 4009...
8866,72517014737,2019-08-31T15:51:00,30.23,81.0,0.0,28.0,CLR:00,10.0,MET10208/31/19 15:51:02 METAR KABE 312051Z 310...
6212,72517014737,2019-06-20T03:51:00,29.66,69.0,0.1,90.0,SCT:04 5 OVC:08 10,5.0,MET13106/20/19 03:51:02 METAR KABE 200851Z 090...


In [11]:
# Column names of the loaded frame, as a plain Python list
lcd.columns.tolist()

['STATION',
 'DATE',
 'LATITUDE',
 'LONGITUDE',
 'ELEVATION',
 'NAME',
 'REPORT_TYPE',
 'SOURCE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyPrecipitation',
 'HourlyPresentWeatherType',
 'HourlyPressureChange',
 'HourlyPressureTendency',
 'HourlyRelativeHumidity',
 'HourlySkyConditions',
 'HourlySeaLevelPressure',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindGustSpeed',
 'HourlyWindSpeed',
 'Sunrise',
 'Sunset',
 'DailyAverageDewPointTemperature',
 'DailyAverageDryBulbTemperature',
 'DailyAverageRelativeHumidity',
 'DailyAverageSeaLevelPressure',
 'DailyAverageStationPressure',
 'DailyAverageWetBulbTemperature',
 'DailyAverageWindSpeed',
 'DailyCoolingDegreeDays',
 'DailyDepartureFromNormalAverageTemperature',
 'DailyHeatingDegreeDays',
 'DailyMaximumDryBulbTemperature',
 'DailyMinimumDryBulbTemperature',
 'DailyPeakWindDirection',
 'DailyPeakWindSpeed',
 'DailyPrecipit

In [8]:
def val_freq(col='', df=None):
    """Print the absolute and relative frequency of each distinct value in a column.

    One line is printed per distinct value, in ascending order of the value,
    formatted as ``<value> : <count> records (<pct>%)``. Percentages are
    relative to the total number of rows in ``df``; since ``value_counts``
    skips missing values by default, they do not add up to 100 when the
    column contains NaNs.

    Parameters
    ----------
    col : str
        Name of the column to summarize.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``lcd`` frame is used; it is looked up at call time so that a
        re-loaded ``lcd`` is picked up instead of the object that existed
        when this function was defined.
    """
    if df is None:
        df = lcd
    # Count once up front instead of recomputing the sorted counts on every iteration
    counts = df[col].value_counts().sort_index()
    total = len(df)
    for value, count in counts.items():
        print("{} : {} records ({:.2f}%)".format(value, count, count / total * 100))

In [9]:
# Report the cardinality of every column; low-cardinality columns (< 50 distinct
# values) additionally get a per-value frequency breakdown.
for col in lcd.columns:
    n_unique = lcd[col].nunique()
    print(col, ':', n_unique, 'unique values')
    if n_unique < 50:
        val_freq(col)
    print("")

STATION : 1 unique values
72517014737 : 13299 records (100.00%)

DATE : 13274 unique values

LATITUDE : 1 unique values
40.64985 : 13299 records (100.00%)

LONGITUDE : 1 unique values
-75.44771 : 13299 records (100.00%)

ELEVATION : 1 unique values
118.9 : 13299 records (100.00%)

NAME : 1 unique values
ALLENTOWN LEHIGH VALLEY INTERNATIONAL AIRPORT, PA US : 13299 records (100.00%)

REPORT_TYPE : 6 unique values
FM-12 : 1428 records (10.74%)
FM-15 : 8745 records (65.76%)
FM-16 : 2748 records (20.66%)
SOD   : 365 records (2.74%)
SOM   : 12 records (0.09%)
SY-MT : 1 records (0.01%)

SOURCE : 3 unique values
4 : 1482 records (11.14%)
6 : 820 records (6.17%)
7 : 10997 records (82.69%)

HourlyAltimeterSetting : 153 unique values

HourlyDewPointTemperature : 99 unique values

HourlyDryBulbTemperature : 100 unique values

HourlyPrecipitation : 101 unique values

HourlyPresentWeatherType : 70 unique values

HourlyPressureChange : 42 unique values
-0.21 : 2 records (0.02%)
-0.2 : 2 records (0.02

47.0 : 1 records (0.01%)
50.0 : 1 records (0.01%)
54.0 : 1 records (0.01%)
58.0 : 1 records (0.01%)
59.0 : 1 records (0.01%)

DailyMaximumDryBulbTemperature : 73 unique values

DailyMinimumDryBulbTemperature : 70 unique values

DailyPeakWindDirection : 36 unique values
10.0 : 5 records (0.04%)
20.0 : 1 records (0.01%)
30.0 : 3 records (0.02%)
40.0 : 7 records (0.05%)
50.0 : 7 records (0.05%)
60.0 : 11 records (0.08%)
70.0 : 11 records (0.08%)
80.0 : 15 records (0.11%)
90.0 : 6 records (0.05%)
100.0 : 9 records (0.07%)
110.0 : 5 records (0.04%)
120.0 : 3 records (0.02%)
130.0 : 2 records (0.02%)
140.0 : 11 records (0.08%)
150.0 : 11 records (0.08%)
160.0 : 4 records (0.03%)
170.0 : 6 records (0.05%)
180.0 : 11 records (0.08%)
190.0 : 5 records (0.04%)
200.0 : 1 records (0.01%)
210.0 : 13 records (0.10%)
220.0 : 11 records (0.08%)
230.0 : 16 records (0.12%)
240.0 : 16 records (0.12%)
250.0 : 19 records (0.14%)
260.0 : 25 records (0.19%)
270.0 : 5 records (0.04%)
280.0 : 7 records (0.05%)

12-12 : 1 records (0.01%)
16-16 : 1 records (0.01%)
18-18 : 1 records (0.01%)
19-20 : 1 records (0.01%)
20-21 : 1 records (0.01%)
21-22 : 1 records (0.01%)
23-24 : 1 records (0.01%)
29-30 : 1 records (0.01%)

MonthlyGreatestSnowDepth : 5 unique values
0.0 : 8 records (0.06%)
1.0 : 1 records (0.01%)
4.0 : 1 records (0.01%)
6.0 : 1 records (0.01%)
30.0 : 1 records (0.01%)

MonthlyGreatestSnowDepthDate : 4 unique values
4.0 : 1 records (0.01%)
11.0 : 1 records (0.01%)
13.0 : 1 records (0.01%)
30.0 : 1 records (0.01%)

MonthlyGreatestSnowfall : 6 unique values
0.0 : 6 records (0.05%)
1.6 : 1 records (0.01%)
2.8 : 1 records (0.01%)
3.9 : 1 records (0.01%)
6.3 : 1 records (0.01%)
T : 2 records (0.02%)

MonthlyGreatestSnowfallDate : 6 unique values
03-03 : 1 records (0.01%)
05-05 : 1 records (0.01%)
08-08 : 1 records (0.01%)
11-11 : 1 records (0.01%)
12-12 : 1 records (0.01%)
29-29 : 1 records (0.01%)

MonthlyMaxSeaLevelPressureValue : 10 unique values
30.28 : 1 records (0.01%)
30.29 : 2 reco

2019-08-07 15:03 : 1 records (0.01%)
2019-09-12 14:40 : 1 records (0.01%)
2019-10-31 22:30 : 1 records (0.01%)
2019-11-24 03:35 : 1 records (0.01%)
2019-12-30 08:49 : 1 records (0.01%)

ShortDurationEndDate080 : 8 unique values
2019-04-20 07:26 : 1 records (0.01%)
2019-05-04 23:45 : 1 records (0.01%)
2019-06-28 16:42 : 1 records (0.01%)
2019-08-07 15:46 : 1 records (0.01%)
2019-09-12 14:34 : 1 records (0.01%)
2019-10-31 22:30 : 1 records (0.01%)
2019-11-24 03:35 : 1 records (0.01%)
2019-12-09 17:22 : 1 records (0.01%)

ShortDurationEndDate100 : 8 unique values
2019-04-20 07:54 : 1 records (0.01%)
2019-05-05 00:08 : 1 records (0.01%)
2019-06-28 16:42 : 1 records (0.01%)
2019-08-07 16:02 : 1 records (0.01%)
2019-09-12 14:40 : 1 records (0.01%)
2019-10-31 22:55 : 1 records (0.01%)
2019-11-24 03:53 : 1 records (0.01%)
2019-12-09 17:22 : 1 records (0.01%)

ShortDurationEndDate120 : 8 unique values
2019-04-20 08:13 : 1 records (0.01%)
2019-05-05 00:26 : 1 records (0.01%)
2019-06-28 16:42 : 1

In [None]:
# Absolute & Relative frequency of missing values by column, most incomplete column first.
# NOTE(review): assumes the frame to inspect is `lcd`, the only DataFrame loaded in the visible cells.
pd.set_option('display.max_rows', lcd.shape[1]) # one output row per column → show the whole table
na_counts = lcd.isna().sum() # computed once and reused for both measures
missing = pd.DataFrame({'Absolute': na_counts,
                        'Relative': na_counts / len(lcd) * 100}).sort_values(by='Relative', ascending=False)
missing

In [None]:
# Quick approach → check how many rows contain empty values to see if directly dropping them would be feasible:

# Check which rows have at least 1 NaN
# (NOTE(review): assumes the frame to inspect is `lcd`, the only DataFrame loaded in the visible cells):
lcd[lcd.isna().any(axis=1)]

In [None]:
%%time

# NOTE(review): this cell looks like a template carried over from another notebook:
# `OTP_WBAN_OriginDest` is not defined in the visible cells and "LCD_xxx" is a
# placeholder file name — TODO confirm both (and the target folder) before running.
output_csv_path = os.path.join(root,
                               "Output_Data",
                               "US_DoT",
                               "LCD_xxx") # placeholder name, restored so that the call is properly closed

OTP_WBAN_OriginDest.to_csv(path_or_buf=output_csv_path,
                           index=False,
                           encoding='latin1')

In [None]:
from pandas_profiling import ProfileReport

# Name the HTML report after the CSV file, dropping its extension whatever its length
report_name = os.path.splitext(individualLCDfileName)[0] + '.html'

# Report over the whole frame, using the reduced configuration selected by minimal=True:
prof = ProfileReport(lcd, minimal=True)
prof.to_file(report_name)

# # # Sample report (more computationally efficient)
# prof = ProfileReport(lcd.sample(1000)) 
# prof.to_file(report_name)

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=131.0), HTML(value='')))

In [None]:
# Wall-clock seconds since `t0` was recorded; perf_counter also counts time spent sleeping/waiting,
# so this is elapsed time rather than CPU time.
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # seconds elapsed (floating point)

___

___