In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # De momento no ha hecho falta 

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
t0 = time.perf_counter() 

In [3]:
# Detect Operating System running and manage paths accordingly

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on each column meaning can be found [here](https://www.ncei.noaa.gov/data/local-climatological-data/doc/LCD_documentation.pLCD).

___

# Get the data

## LCD clean file (2019)

### Import file

#### Define file path

In [55]:
output_csv_dir = os.path.join(root,
                              "Output_Data",
                              "NOAA",
                              "LCD_AllStations")

file_name = "LCD_all_clean.csv"

LCD = pd.read_csv(os.path.join(output_csv_dir, file_name),
                  encoding='latin1',
                  dtype = {
                           'STATION' : 'string',
                           'WMO' : 'string',
                           'WBAN' : 'string',
                           'DATE' : 'string',
                           'Date' : 'string',
                           'Hour' : 'string',
                           'HourlyAltimeterSetting' : 'float64',
                           'HourlyDryBulbTemperature' : 'float64',
                           'HourlyPrecipitation' : 'float64',
                           'HourlyRelativeHumidity' : 'float64', 
                           'HourlySkyConditions' : 'string',
                           'HourlyVisibility' : 'float64',
                           'REM' : 'string'
                           }
                 )

In [5]:
LCD.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM
1130166,74756012816,747560,12816,2019-07-30 11:53:00,2019-07-30,11,30.08,88.0,0.0,52.0,CLR,10.0,MET09507/30/19 11:53:02 METAR KGNV 301653Z VRB...
1746302,72637914845,726379,14845,2019-05-09 12:01:00,2019-05-09,12,29.68,62.0,0.0,96.0,OVC,8.0,MET10905/09/19 12:01:02 SPECI KMBS 091701Z 210...
880040,72643514991,726435,14991,2019-01-03 03:10:00,2019-01-03,3,29.88,25.0,0.0,78.0,OVC,10.0,MET08801/03/19 03:10:02 SPECI KEAU 030910Z 210...
1390821,72302013748,723020,13748,2019-06-13 06:08:00,2019-06-13,6,29.87,73.0,0.0,84.0,OVC,10.0,MET09306/13/19 06:08:02 SPECI KILM 131108Z 250...
524452,72475593129,724755,93129,2019-04-13 16:53:00,2019-04-13,16,30.03,55.0,0.0,35.0,CLR,10.0,MET10904/13/19 16:53:02 METAR KCDC 132353Z 310...


In [138]:
LCD_year = LCD['Date'].str[:4]
LCD_month = LCD['Date'].str[5:7]
LCD_day = LCD['Date'].str[8:10]
LCD_id = LCD['WBAN'] + '_' + LCD_year + '-' + LCD_month + '-' + LCD_day + '_' + LCD['Hour']
LCD['LCD_id'] = LCD_id
LCD.sample(5)

## OTP-LCD merge

### Import OTP dataset

In [56]:
cols = [
     'MONTH',
     'DAY_OF_MONTH',
     'DAY_OF_WEEK',
     'OP_UNIQUE_CARRIER',
     'ORIGIN',
     'WBAN_Origin',
     'DEST',
     'WBAN_Dest',
     'DEP_TIME_hour',
     'ARR_TIME_hour',
     'TAXI_OUT_median',
     'TAXI_IN_median',
     'ARR_DEL15',
     'CRS_ELAPSED_TIME',
     'DISTANCE',
     'DISTANCE_GROUP'
]

In [57]:
cols_dtypes = {
               'MONTH' : 'string',
               'DAY_OF_MONTH' : 'string',
               'DAY_OF_WEEK' : 'category',
               'OP_UNIQUE_CARRIER' : 'category',
               'ORIGIN' : 'category',
               'DEST' : 'category',
               'DEP_TIME_hour' : 'string',
               'TAXI_OUT_median' : 'int64',
               'TAXI_IN_median' : 'int64',
               'ARR_TIME_hour' : 'string', 
               'ARR_DEL15' : 'int32', # → Target !!
               'CRS_ELAPSED_TIME' : 'int64',
               'DISTANCE' : 'int64',
               'DISTANCE_GROUP' : 'category',
               'WBAN_Origin' : 'string',
               'WBAN_Dest' : 'string'
               }

In [58]:
csv_path = os.path.join(root,
                        "Output_Data",
                        "US_DoT",
                        "AL_OTP_WBAN_19.csv")

OTP = pd.read_csv(csv_path,
                  encoding='latin1',
                  usecols=cols,
                  dtype=cols_dtypes)

In [112]:
# It is observed that WBAN format is not always in 5-digit format due to conversion from 'int' dtype, so:
for c in ['WBAN_Origin', 'WBAN_Dest']:
    OTP[c] = OTP[c].apply(lambda x: ('0'+x) if len(x)==4 else x)
# Likewise, convert months and days into 2-digit format:
for d in ['MONTH', 'DAY_OF_MONTH']:
    OTP[d] = OTP[d].apply(lambda x: ('0'+x) if len(x)==1 else x)

# SEGUIR AQUÍ: Parece que funciona! Ahora habría que incluir también las de WBAN_Dest, nombrar cada variable acordemente, y probar con el dataset entero

In [147]:
LCD_original_cols = LCD.columns
# Origin merge:
LCD.columns[:-1] = LCD_original_cols[:-1].map(lambda x: str(x) + '_Origin')
OTP_id_Origin = OTP['WBAN_Origin'] + '_' + '2019-' + OTP['MONTH'] + '-' + OTP['DAY_OF_MONTH'] + '_' + OTP['DEP_TIME_hour']
OTP['LCD_id'] = OTP_id_Origin
OTP_LCD_Origin = OTP.merge(LCD, how='inner', on='LCD_id')

# Dest merge:
LCD.columns[:-1] = LCD_original_cols[:-1].map(lambda x: str(x) + '_Dest')
OTP_id_Dest = OTP['WBAN_Dest'] + '_' + '2019-' + OTP['MONTH'] + '-' + OTP['DAY_OF_MONTH'] + '_' + OTP['ARR_TIME_hour']
OTP['LCD_id'] = OTP_id_Dest
OTP_LCD = OTP_LCD_Origin.merge(LCD, how='inner', on='LCD_id')

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,WBAN_Origin,DEST,WBAN_Dest,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,LCD_id
5616806,7,29,1,AS,SJC,23293,SEA,24233,17,15,9,0,19,124,696,3,23293_2019-07-29_17
4225125,8,23,5,DL,SEA,24233,LAX,23174,7,18,9,0,10,176,954,4,24233_2019-08-23_7
6480571,5,25,6,WN,MDW,14819,MEM,13893,18,10,4,0,19,95,480,2,14819_2019-05-25_18
830178,12,18,3,DL,DTW,94847,LAS,23169,12,16,6,0,13,280,1749,7,94847_2019-12-18_12
1129452,5,17,5,AA,SAV,3822,CLT,13881,14,14,9,1,15,74,213,1,03822_2019-05-17_14


### 1st attempt: brute force (all at once)

As expected, the "brute-force" approach is unfeasible. The following error is displayed:
```
MemoryError: Unable to allocate 617. GiB for an array with shape (82863097036,) and data type int64
```

### 2nd attempt: individual merges concatenated (file-by-file composition)

In [None]:
LCD_ind = LCD[(LCD['WBAN'] == '13891') \
#             & (LCD['Date'] == '2019-01-01') \
#             & (LCD['Time_h'].isin(['00', '01']))
              ]
LCD_ind

In [None]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

The `missingno` represent data with horizontal sticks, the absence of a stick in place shows a null value

In [None]:
import missingno as msno

msno.matrix(LCD_ind, figsize=(14, 5), color=(0.24, 0.77, 0.77))

Observed problems:
- Missing values (NaN's):
    - By column:
        - `HourlyPrecipitation`: fill with 0's
        - `HourlySkyConditions` and `REM`: fill with empty strings i.e. ''
        - 
    
        
    - There are rows which have many NaN elements → If there are more than 
- There might be different weather measurements for each hour:
    - Let's keep the first one which presents no `NaN`

In [None]:
OTPLCD = OTP.merge(LCD, how='left', left_on='WBAN_Origin', right_on='WBAN')
OTPLCD

In [None]:
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # CPU seconds elapsed (floating point)

___

___