In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # De momento no ha hecho falta 

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
t0 = time.perf_counter() 

In [3]:
# Detect Operating System running and manage paths accordingly

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.ncei.noaa.gov/data/local-climatological-data/doc/LCD_documentation.pLCD).

___

# Get the data

## LCD clean file (2019)

### Import file

#### Define file path

In [164]:
output_csv_dir = os.path.join(root,
                              "Output_Data",
                              "NOAA",
                              "LCD_AllStations")

file_name = "LCD_all_clean.csv"

LCD = pd.read_csv(os.path.join(output_csv_dir, file_name),
                  encoding='latin1',
                  dtype = {
                           'STATION' : 'string',
                           'WMO' : 'string',
                           'WBAN' : 'string',
                           'DATE' : 'string',
                           'Date' : 'string',
                           'Hour' : 'string',
                           'HourlyAltimeterSetting' : 'float64',
                           'HourlyDryBulbTemperature' : 'float64',
                           'HourlyPrecipitation' : 'float64',
                           'HourlyRelativeHumidity' : 'float64', 
                           'HourlySkyConditions' : 'string',
                           'HourlyVisibility' : 'float64',
                           'REM' : 'string'
                           }
                 )

In [165]:
LCD.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM
882461,72643514991,726435,14991,2019-04-14 00:56:00,2019-04-14,0,30.05,29.0,0.0,75.0,OVC,10.0,MET09504/14/19 00:56:02 METAR KEAU 140656Z 050...
1797796,72334013893,723340,13893,2019-03-26 12:00:00,2019-03-26,12,30.305,54.0,0.0,59.0,(missing),9.94,SYN08672334 32666 63213 10122 20044 30160 4026...
1636405,72422093820,724220,93820,2019-10-16 02:04:00,2019-10-16,2,29.8,62.0,0.02,86.0,OVC,5.0,MET14310/16/19 02:04:02 SPECI KLEX 160704Z 180...
1837467,72455503936,724555,3936,2019-10-05 22:52:00,2019-10-05,22,30.06,48.0,0.0,93.0,CLR,10.0,MET09010/05/19 22:52:01 METAR KMHK 060452Z 000...
690572,72569024089,725690,24089,2019-04-08 22:53:00,2019-04-08,22,30.02,34.0,0.0,92.0,CLR,10.0,MET10804/08/19 22:53:02 METAR KCPR 090553Z 330...


In [166]:
LCD_year = LCD['Date'].str[:4]
LCD_month = LCD['Date'].str[5:7]
LCD_day = LCD['Date'].str[8:10]
LCD_id = LCD['WBAN'] + '_' + LCD_year + '-' + LCD_month + '-' + LCD_day + '_' + LCD['Hour']
LCD['LCD_id'] = LCD_id
LCD.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM,LCD_id
1086642,72451523064,724515,23064,2019-08-10 10:54:00,2019-08-10,10,29.98,83.0,0.0,59.0,CLR,10.0,MET09508/10/19 10:54:02 METAR KGCK 101654Z 150...,23064_2019-08-10_10
913486,72582524121,725825,24121,2019-11-01 18:56:00,2019-11-01,18,30.38,36.0,0.0,31.0,CLR,10.0,MET09711/01/19 18:56:01 METAR KEKO 020256Z 020...,24121_2019-11-01_18
1829062,72226013895,722260,13895,2019-10-20 14:53:00,2019-10-20,14,29.93,79.0,0.0,52.0,CLR,10.0,MET10210/20/19 14:53:02 METAR KMGM 202053Z 000...,13895_2019-10-20_14
875345,72552614905,725526,14905,2019-06-19 14:33:00,2019-06-19,14,29.79,72.0,0.0,69.0,BKN,10.0,MET09006/19/19 14:33:02 METAR KEAR 192033Z VRB...,14905_2019-06-19_14
915383,72515614748,725156,14748,2019-01-19 19:33:00,2019-01-19,19,29.91,20.0,0.02,85.0,VV,0.75,MET11301/19/19 19:33:01 SPECI KELM 200033Z 080...,14748_2019-01-19_19


## OTP-LCD merge

### Import OTP dataset

In [56]:
cols = [
     'MONTH',
     'DAY_OF_MONTH',
     'DAY_OF_WEEK',
     'OP_UNIQUE_CARRIER',
     'ORIGIN',
     'WBAN_Origin',
     'DEST',
     'WBAN_Dest',
     'DEP_TIME_hour',
     'ARR_TIME_hour',
     'TAXI_OUT_median',
     'TAXI_IN_median',
     'ARR_DEL15',
     'CRS_ELAPSED_TIME',
     'DISTANCE',
     'DISTANCE_GROUP'
]

In [57]:
cols_dtypes = {
               'MONTH' : 'string',
               'DAY_OF_MONTH' : 'string',
               'DAY_OF_WEEK' : 'category',
               'OP_UNIQUE_CARRIER' : 'category',
               'ORIGIN' : 'category',
               'DEST' : 'category',
               'DEP_TIME_hour' : 'string',
               'TAXI_OUT_median' : 'int64',
               'TAXI_IN_median' : 'int64',
               'ARR_TIME_hour' : 'string', 
               'ARR_DEL15' : 'int32', # → Target !!
               'CRS_ELAPSED_TIME' : 'int64',
               'DISTANCE' : 'int64',
               'DISTANCE_GROUP' : 'category',
               'WBAN_Origin' : 'string',
               'WBAN_Dest' : 'string'
               }

In [58]:
csv_path = os.path.join(root,
                        "Output_Data",
                        "US_DoT",
                        "AL_OTP_WBAN_19.csv")

OTP = pd.read_csv(csv_path,
                  encoding='latin1',
                  usecols=cols,
                  dtype=cols_dtypes)

In [112]:
# It is observed that WBAN format is not always in 5-digit format due to conversion from 'int' dtype, so:
for c in ['WBAN_Origin', 'WBAN_Dest']:
    OTP[c] = OTP[c].apply(lambda x: ('0'+x) if len(x)==4 else x)
# Likewise, convert months and days into 2-digit format:
for d in ['MONTH', 'DAY_OF_MONTH']:
    OTP[d] = OTP[d].apply(lambda x: ('0'+x) if len(x)==1 else x)

# SEGUIR AQUÍ: Parece que funciona! Ahora habría que incluir también las de WBAN_Dest, nombrar cada variable acordemente, y probar con el dataset entero

In [167]:
LCD.sample(1)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM,LCD_id
425119,72655594938,726555,94938,2019-12-09 23:53:00,2019-12-09,23,29.96,-8.0,0.0,71.0,CLR,9.0,MET11912/09/19 23:53:02 METAR KBRD 100553Z 230...,94938_2019-12-09_23


In [168]:
LCD_original_cols = LCD.columns
# Origin merge:
LCD.columns = LCD_original_cols.map(lambda x: str(x) + '_Origin')
OTP_id_Origin = OTP['WBAN_Origin'] + '_' + '2019-' + OTP['MONTH'] + '-' + OTP['DAY_OF_MONTH'] + '_' + OTP['DEP_TIME_hour']
OTP['LCD_id_Origin'] = OTP_id_Origin
OTP_LCD_Origin = OTP.merge(LCD, how='inner', left_on='LCD_id_Origin', right_on='LCD_id')

# Dest merge:
LCD.columns = LCD_original_cols.map(lambda x: str(x) + '_Dest')
OTP_id_Dest = OTP['WBAN_Dest'] + '_' + '2019-' + OTP['MONTH'] + '-' + OTP['DAY_OF_MONTH'] + '_' + OTP['ARR_TIME_hour']
OTP_LCD_Origin['LCD_id_Dest'] = OTP_id_Dest
OTP_LCD = OTP_LCD_Origin.merge(LCD, how='inner', left_on='LCD_id_Dest', right_on='LCD_id')

MemoryError: 

In [None]:
OTP_LCD.sample(5)

### 1st attempt: brute force (all at once)

As expected, the "brute-force" approach is unfeasible. The following error is displayed:
```
MemoryError: Unable to allocate 617. GiB for an array with shape (82863097036,) and data type int64
```

### 2nd attempt: individual merges concatenated (file-by-file composition)

In [None]:
LCD_ind = LCD[(LCD['WBAN'] == '13891') \
#             & (LCD['Date'] == '2019-01-01') \
#             & (LCD['Time_h'].isin(['00', '01']))
              ]
LCD_ind

In [None]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

The `missingno` represent data with horizontal sticks, the absence of a stick in place shows a null value

In [None]:
import missingno as msno

msno.matrix(LCD_ind, figsize=(14, 5), color=(0.24, 0.77, 0.77))

Observed problems:
- Missing values (NaN's):
    - By column:
        - `HourlyPrecipitation`: fill with 0's
        - `HourlySkyConditions` and `REM`: fill with empty strings i.e. ''
        - 
    
        
    - There are rows which have many NaN elements → If there are more than 
- There might be different weather measurements for each hour:
    - Let's keep the first one which presents no `NaN`

In [None]:
OTPLCD = OTP.merge(LCD, how='left', left_on='WBAN_Origin', right_on='WBAN')
OTPLCD

In [None]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

___

___