# TODO: UPDATE THIS NOTEBOOK ONCE WIND AND LOCATION ARE INCORPORATED IN THE 'LCD_DataPreparation_v2' NOTEBOOK

In [1]:
# Import libraries to be used

# Warning messages display
## import warnings
## warnings.filterwarnings(action='ignore') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # Not needed so far

# Timing
import time

# Memory monitoring
%load_ext memory_profiler
### Use '%memit' to check at each point

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
## pd.set_option('display.max_rows', None) # It greatly slows down the output display and freezes the kernel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
# Start a timer: the final cell subtracts this reading from a later one to report the total run time.
t0 = time.perf_counter()

In [3]:
# Detect the operating system family and select the project root folder accordingly.
# `os.name` is 'nt' on Windows and 'posix' on Linux/macOS; the two paths below are the
# author's shared project folder as seen from each machine.

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu (or any other POSIX system)
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
else:
    # Without this branch `root` would stay undefined and the print below would fail
    # with an unrelated-looking NameError.
    raise OSError("Unsupported operating system (os.name = '{}'): "
                  "define 'root' manually.".format(os.name))
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


Additional information on the meaning of each column can be found [here](https://www.ncei.noaa.gov/data/local-climatological-data/doc/LCD_documentation.pdf).

___

# Get the data

## LCD clean file (2019)

### Import file

#### Define file path

In [4]:
# Folder and name of the cleaned LCD file (an output of an earlier notebook, hence the variable name).
output_csv_dir = os.path.join(root, "Output_Data", "NOAA", "LCD_AllStations")
file_name = "LCD_all_clean_v2.csv"

# Explicit column dtypes, grouped by target type.
# Identifiers, dates and hours are read as text because they are concatenated into a merge key later on.
lcd_dtypes = {}
lcd_dtypes.update(dict.fromkeys(['STATION', 'WMO', 'WBAN',
                                 'DATE', 'Date', 'Hour',
                                 'HourlySkyConditions',
                                 'HourlyWindDirection',
                                 'REM'], 'string'))
lcd_dtypes.update(dict.fromkeys(['LATITUDE', 'LONGITUDE'], 'float64'))
lcd_dtypes.update(dict.fromkeys(['HourlyAltimeterSetting',
                                 'HourlyPrecipitation',
                                 'HourlyVisibility'], 'float32'))
lcd_dtypes.update(dict.fromkeys(['HourlyDryBulbTemperature',
                                 'HourlyRelativeHumidity',
                                 'HourlyWindGustSpeed',
                                 'HourlyWindSpeed'], 'int32'))

# Load the whole file into memory.
lcd_path = os.path.join(output_csv_dir, file_name)
LCD = pd.read_csv(lcd_path, encoding='latin1', dtype=lcd_dtypes)

In [5]:
# Show five random rows as a quick sanity check of the import.
LCD.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM
1706414,72643014920,726430,14920,2019-10-15 03:53:00,2019-10-15,3,29.79,44.0,0.0,71.0,CLR,10.0,MET09010/15/19 03:53:02 METAR KLSE 150953Z 140...
2928837,72216693845,722166,93845,2019-08-10 23:53:00,2019-08-10,23,29.97,78.0,0.0,87.0,CLR,10.0,MET10008/10/19 23:53:02 METAR KVLD 110453Z 130...
2514557,72392523190,723925,23190,2019-03-20 16:53:00,2019-03-20,16,30.02,61.0,0.0,67.0,SCT,10.0,MET10003/20/19 16:53:01 METAR KSBA 210053Z 260...
2025622,91190022516,911900,22516,2019-03-21 23:54:00,2019-03-21,23,30.13,71.0,0.0,81.0,SCT,10.0,MET13703/21/19 23:54:02 METAR PHOG 220954Z 050...
637707,72531594870,725315,94870,2019-03-24 07:53:00,2019-03-24,7,30.12,44.0,0.0,73.0,OVC,10.0,MET09903/24/19 07:53:02 METAR KCMI 241353Z 120...


In [6]:
# Build the key used to match weather observations with flights:
#     '<WBAN>_<YYYY>-<MM>-<DD>_<Hour>'
# 'Hour' is used exactly as stored; the sample output shows single-digit hours
# without zero-padding (e.g. '..._3').
lcd_date_text = LCD['Date']
LCD_year = lcd_date_text.str.slice(0, 4)
LCD_month = lcd_date_text.str.slice(5, 7)
LCD_day = lcd_date_text.str.slice(8, 10)

lcd_date_part = LCD_year + '-' + LCD_month + '-' + LCD_day
LCD_id = LCD['WBAN'] + '_' + lcd_date_part + '_' + LCD['Hour']

LCD['LCD_id'] = LCD_id
LCD.sample(5)

Unnamed: 0,STATION,WMO,WBAN,DATE,Date,Hour,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyVisibility,REM,LCD_id
2734066,72434013994,724340,13994,2019-04-19 19:51:00,2019-04-19,19,29.81,53.0,0.0,40.0,FEW,10.0,MET09404/19/19 19:51:01 METAR KSTL 200151Z 340...,13994_2019-04-19_19
484430,72288023152,722880,23152,2019-09-17 21:53:00,2019-09-17,21,29.91,70.0,0.0,66.0,CLR,10.0,MET10709/17/19 21:53:02 METAR KBUR 180553Z 160...,23152_2019-09-17_21
1666370,91165022536,911650,22536,2019-03-19 03:53:00,2019-03-19,3,30.06,70.0,0.0,61.0,CLR,10.0,MET09503/19/19 03:53:02 METAR PHLI 191353Z 060...,22536_2019-03-19_3
2579607,72205712854,722057,12854,2019-08-23 20:53:00,2019-08-23,20,29.98,79.0,0.0,88.0,CLR,10.0,MET09008/23/19 20:53:01 METAR KSFB 240153Z 060...,12854_2019-08-23_20
2806526,72214093805,722140,93805,2019-08-23 06:53:00,2019-08-23,6,30.04,77.0,0.0,96.0,SCT,10.0,MET13108/23/19 06:53:02 METAR KTLH 231153Z 000...,93805_2019-08-23_6


## OTP dataset

### Import OTP dataset

# TODO: RUN THE "WBAN_IATA" NOTEBOOK FIRST!

## TODO: UPDATE THE IMPORTED FEATURES TO INCLUDE THE RECENTLY ADDED ONES

In [7]:
# Columns to load from the OTP (on-time performance) CSV file; passed as `usecols` to `pd.read_csv`.
cols = [
     'MONTH',
     'DAY_OF_MONTH',
     'DAY_OF_WEEK',
     'OP_UNIQUE_CARRIER',
     'TAIL_NUM',
     'ORIGIN',
     'ORIGIN_CITY_NAME',
     'ORIGIN_STATE_ABR',
     'ORIGIN_STATE_NM',
     'WBAN_Origin',
     'DEST',
     # TODO: a work-in-progress marker ("seguir!!!" = "continue from here") was left at this
     #       point as bare text, which made the cell a SyntaxError. The feature list
     #       still has to be reviewed from here on.
     # NOTE(review): every name listed must exist in the CSV header, otherwise `usecols`
     #       makes `read_csv` fail - confirm against the current 'AL_OTP_WBAN_19.csv'.
     'DEST_CITY_NAME',
     'DEST_STATE_ABR',
     'DEST_STATE_NM',
     'WBAN_Dest',
     'DEP_TIME_hour',
     'ARR_TIME_hour',
     'TAXI_OUT_median',
     'TAXI_IN_median',
     'ARR_DEL15',
     'CRS_ELAPSED_TIME',
     'DISTANCE',
     'DISTANCE_GROUP'
]

In [8]:
# dtypes applied while reading the OTP file, grouped by target type.
# Date parts, hours and WBAN codes are kept as text because they are concatenated
# into the merge keys further down.
cols_dtypes = {
    **dict.fromkeys(['MONTH', 'DAY_OF_MONTH',
                     'DEP_TIME_hour', 'ARR_TIME_hour',
                     'WBAN_Origin', 'WBAN_Dest'], 'string'),
    **dict.fromkeys(['DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
                     'ORIGIN', 'DEST',
                     'DISTANCE_GROUP'], 'category'),
    **dict.fromkeys(['TAXI_OUT_median', 'TAXI_IN_median',
                     'CRS_ELAPSED_TIME', 'DISTANCE'], 'int64'),
    'ARR_DEL15': 'int32',  # → Target !!
}

In [9]:
# Location of the OTP file that already carries the WBAN station codes.
otp_path_parts = ("Output_Data", "US_DoT", "AL_OTP_WBAN_19.csv")
csv_path = os.path.join(root, *otp_path_parts)

# Load only the selected columns, with the dtypes declared above.
OTP = pd.read_csv(
    csv_path,
    usecols=cols,
    dtype=cols_dtypes,
    encoding='latin1',
)

### Adapt OTP dataset format

In [10]:
# It is observed that WBAN format is not always in 5-digit format due to conversion from 'int' dtype, so:
for c in ['WBAN_Origin', 'WBAN_Dest']:
    OTP[c] = OTP[c].apply(lambda x: ('0'+x) if len(x)==4 else x)
# Likewise, convert months and days into 2-digit format:
for d in ['MONTH', 'DAY_OF_MONTH']:
    OTP[d] = OTP[d].apply(lambda x: ('0'+x) if len(x)==1 else x)

## OTP-LCD merge

### 1st merge: `Origin`

In [12]:
# Origin merge: attach to each flight the weather observed at the origin station
# during the departure hour.
LCD_original_cols = LCD.columns  # kept for the destination merge

# Work on a renamed copy (every column gets the '_Origin' suffix) instead of renaming
# LCD's own columns: the shared frame stays untouched, so running this cell twice in
# a row no longer stacks suffixes ('..._Origin_Origin').
LCD_origin = LCD.add_suffix('_Origin')

# Same key layout as LCD's 'LCD_id': '<WBAN>_<YYYY>-<MM>-<DD>_<hour>' (year fixed to 2019).
OTP_id_Origin = OTP['WBAN_Origin'] + '_' + '2019-' + OTP['MONTH'] + '-' + OTP['DAY_OF_MONTH'] + '_' + OTP['DEP_TIME_hour']
OTP['LCD_id_Origin'] = OTP_id_Origin

# Inner join: flights without a matching observation are dropped.
# 'WBAN_Origin' exists on both sides, hence the '_OTP' / '_LCD' suffixes.
OTP_LCD_Origin = OTP.merge(LCD_origin, how='inner', on='LCD_id_Origin', suffixes=('_OTP', '_LCD'))
del LCD_origin  # release the temporary renamed copy

In [13]:
# Show one random row to inspect the columns produced by the origin merge.
OTP_LCD_Origin.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,WBAN_Origin_OTP,DEST,WBAN_Dest,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,LCD_id_Origin,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,DATE_Origin,Date_Origin,Hour_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,REM_Origin
6409486,9,19,4,MQ,SPI,93822,DFW,3927,14,12,11,0,16,124,630,3,93822_2019-09-19_14,72439093822,724390,93822,2019-09-19 14:52:00,2019-09-19,14,30.02,87.0,0.0,50.0,CLR,10.0,MET10209/19/19 14:52:02 METAR KSPI 192052Z VRB...


### 2nd merge: `Dest`

In [14]:
# Dest merge: attach to each flight the weather observed at the destination station
# during the arrival hour.

# Relabel a copy of LCD positionally from the original column names, so the result is
# '<name>_Dest' whatever LCD's columns are currently called, and LCD itself is not modified.
LCD_dest = LCD.set_axis(LCD_original_cols.map(lambda x: str(x) + '_Dest'), axis=1)

# NOTE(review): the key combines MONTH/DAY_OF_MONTH with ARR_TIME_hour. If those date
# fields refer to the departure day, flights landing after midnight would be matched
# with the wrong day's weather - confirm.
OTP_id_Dest = OTP_LCD_Origin['WBAN_Dest'] + '_' + '2019-' + OTP_LCD_Origin['MONTH'] + '-' \
              + OTP_LCD_Origin['DAY_OF_MONTH'] + '_' + OTP_LCD_Origin['ARR_TIME_hour']
OTP_LCD_Origin['LCD_id_Dest'] = OTP_id_Dest

# Inner join: flights without a matching observation are dropped.
# 'WBAN_Dest' exists on both sides, hence the '_OTP' / '_LCD' suffixes.
OTP_LCD = OTP_LCD_Origin.merge(LCD_dest, how='inner', on='LCD_id_Dest', suffixes=('_OTP', '_LCD'))
del LCD_dest  # release the temporary renamed copy

In [15]:
# Show one random row to inspect the columns produced by both merges.
OTP_LCD.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,WBAN_Origin_OTP,DEST,WBAN_Dest_OTP,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,LCD_id_Origin,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,DATE_Origin,Date_Origin,Hour_Origin,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,REM_Origin,LCD_id_Dest,STATION_Dest,WMO_Dest,WBAN_Dest_LCD,DATE_Dest,Date_Dest,Hour_Dest,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,REM_Dest
2806507,8,5,1,B6,JFK,94789,LGB,23129,17,20,4,1,20,384,2465,10,94789_2019-08-05_17,74486094789,744860,94789,2019-08-05 17:51:00,2019-08-05,17,29.96,75.0,0.0,84.0,BKN,10.0,MET10508/05/19 17:51:02 METAR KJFK 052251Z 120...,23129_2019-08-05_20,72297023129,722970,23129,2019-08-05 20:53:00,2019-08-05,20,29.9,72.0,0.0,66.0,CLR,10.0,MET09208/05/19 20:53:02 METAR KLGB 060453Z 320...


### Final points before exporting

In [16]:
# Check how many flights have been lost throughout the merging process:
print("OTP dataset number of flights : {}".format(len(OTP)))
print("OTP-LCD dataset number of flights : {} ({:4.2f}% dropped)".format(len(OTP_LCD),
                                                                         (len(OTP) - len(OTP_LCD)) * 100 / len(OTP)))

OTP dataset number of flights : 7208372
OTP-LCD dataset number of flights : 7200051 (0.12% dropped)


In [17]:
# Drop redundant columns:
drop_cols = ['LCD_id_Origin', 'DATE_Origin', 'Date_Origin', 'Hour_Origin', 'LCD_id_Dest', 'DATE_Dest', 'Date_Dest', 'Hour_Dest']
OTP_LCD.drop(drop_cols, axis=1, inplace=True)
OTP_LCD.sample(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,WBAN_Origin_OTP,DEST,WBAN_Dest_OTP,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,STATION_Origin,WMO_Origin,WBAN_Origin_LCD,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,REM_Origin,STATION_Dest,WMO_Dest,WBAN_Dest_LCD,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest,REM_Dest
5758194,6,12,3,MQ,DFW,3927,CMI,94870,18,19,4,0,20,124,692,3,72259003927,722590,3927,29.98,84.0,0.0,38.0,(missing),9.94,SYN08672259 32766 20112 10289 20133 39941 4014...,72531594870,725315,94870,29.82,58.0,0.0,93.0,OVC,10.0,MET13506/12/19 20:53:01 METAR KCMI 130253Z 210...


___

## Export resulting DF into CSV file

In [18]:
# Export the merged dataset to CSV, only if the file has not been generated before.
output_csv_dir = os.path.join(root,
                              "Output_Data",
                              "US_DoT-NOAA")
file_name = "OTP_LCD_allColumns.csv"
output_csv_path = os.path.join(output_csv_dir, file_name)

# Make sure the target folder exists: listing a missing folder used to abort the cell
# with FileNotFoundError before anything was written.
os.makedirs(output_csv_dir, exist_ok=True)

if not os.path.exists(output_csv_path):
    # Save such DataFrame into a CSV file (only once):
    OTP_LCD.to_csv(path_or_buf=output_csv_path,
                   index=False,
                   encoding='latin1')
    print("File '" + file_name + "' has been generated.")
else:
    # An existing file is never overwritten: delete it manually to regenerate the export.
    print("File '" + file_name + "' already exists.\nNo file has been generated (previous one remains).")

File 'OTP_LCD_allColumns.csv' already exists.
No file has been generated (previous one remains).


___

In [19]:
# Total run time since `t0` was taken near the top of the notebook.
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1) # Wall-clock seconds elapsed (floating point); perf_counter is not CPU time

Time elapsed:  143.8933288


___