___

In [1]:
# Import libraries to be used

# Warning messages display
# import warnings
# warnings.filterwarnings(action='once') # https://docs.python.org/3/library/warnings.html#the-warnings-filter

# Directories/Files management
import os.path
## from zipfile import ZipFile # Not needed so far

# Timing
import time

# Data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns in DataFrames
pd.set_option('display.max_rows', 100) # If too high, it greatly slows down the output display and freezes the kernel


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # choose a style: 'plt.style.available'
sns.set_theme(context='notebook',
              style="darkgrid") # {darkgrid, whitegrid, dark, white, ticks}
palette = sns.color_palette("flare", as_cmap=True);
import altair as alt

# Machine Learning
## from sklearn.[...] import ...

In [2]:
%load_ext memory_profiler

In [3]:
# Reference timestamp taken at the start of the run
# (presumably compared against a later perf_counter() reading to report total elapsed time — not visible in this section)
t0 = time.perf_counter() 

In [4]:
# Detect Operating System running and manage paths accordingly.
# `root` is the base directory from which the data paths below are built.

if os.name == 'nt': # Windows
    root = r"C:\Users\turge\CompartidoVM\0.TFM"
    print("Running on Windows.")
elif os.name == 'posix': # Ubuntu (note: os.name is 'posix' on every Unix-like system)
    root = "/home/dsc/shared/0.TFM"
    print("Running on Ubuntu.")
else:
    # Without this branch `root` would stay undefined and the print below
    # would fail with a NameError that hides the real cause.
    raise OSError("Unsupported operating system (os.name = '" + os.name + "'): "
                  "cannot determine the root path.")
print("root path\t", root)

Running on Windows.
root path	 C:\Users\turge\CompartidoVM\0.TFM


___

## 1. Load the dataset

In [5]:
# Columns to read from the preprocessed CSV: model features plus the target.
#
# Columns deliberately left out:
#   - 'WBAN_Origin_OTP', 'STATION_Origin', 'WMO_Origin', 'WBAN_Origin_LCD':
#       redundant having the 'ORIGIN' feature
#   - 'WBAN_Dest_OTP', 'STATION_Dest', 'WMO_Dest', 'WBAN_Dest_LCD':
#       redundant having the 'DEST' feature
#   - 'REM_Origin': not relevant for the model
#   - 'REM_Dest': excluded too (presumably for the same reason as 'REM_Origin')
cols = [
    # Calendar
    'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
    # Carrier and route
    'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST',
    # Departure / arrival hours, taxi times, planned duration and distance
    'DEP_TIME_hour', 'TAXI_OUT_median', 'TAXI_IN_median', 'ARR_TIME_hour',
    'CRS_ELAPSED_TIME', 'DISTANCE',
    'DISTANCE_GROUP',  # kept, although redundant having the 'DISTANCE' feature
    # Hourly weather measures: the same six for the origin, then the destination
    *[
        measure + '_' + airport
        for airport in ('Origin', 'Dest')
        for measure in (
            'HourlyAltimeterSetting',
            'HourlyDryBulbTemperature',
            'HourlyPrecipitation',
            'HourlyRelativeHumidity',
            'HourlySkyConditions',
            'HourlyVisibility',
        )
    ],
    # Target variable
    'ARR_DEL15',
]

In [6]:
# Full path of the preprocessed input CSV, built from `root`
preprocessed_input_csv_path = os.path.join(
    root, *("Output_Data", "US_DoT-NOAA", "OTP_LCD_allColumns.csv")
)
preprocessed_input_csv_path  # echo the resolved path as the cell output

'C:\\Users\\turge\\CompartidoVM\\0.TFM\\Output_Data\\US_DoT-NOAA\\OTP_LCD_allColumns.csv'

In [7]:
%%time

# Load only the selected columns (`cols`) and give every one of them an
# explicit dtype, so pandas does not have to infer the types.
df_all = pd.read_csv(preprocessed_input_csv_path,
                     encoding='latin1',
                     usecols=cols,
                     dtype={
                            # Date parts, codes and hours are read as text
                            # (presumably because they are categories, not quantities)
                            'MONTH' : 'string',
                            'DAY_OF_MONTH' : 'string',
                            'DAY_OF_WEEK' : 'string',
                            'OP_UNIQUE_CARRIER' : 'string',
                            'ORIGIN' : 'string',
                            'DEST' : 'string',
                            'DEP_TIME_hour' : 'string',
                            'TAXI_OUT_median' : 'int32',
                            'TAXI_IN_median' : 'int32',
                            'ARR_TIME_hour' : 'string',
                            'CRS_ELAPSED_TIME' : 'int32',
                            'DISTANCE' : 'int32',
                            'DISTANCE_GROUP' : 'int32',
                            # 32-bit floats for the weather measures at the origin...
                            'HourlyAltimeterSetting_Origin' : 'float32',
                            'HourlyDryBulbTemperature_Origin' : 'float32',
                            'HourlyPrecipitation_Origin' : 'float32',
                            'HourlyRelativeHumidity_Origin' : 'float32',
                            'HourlySkyConditions_Origin' : 'string',
                            'HourlyVisibility_Origin' : 'float32',
                            # ...and at the destination
                            'HourlyAltimeterSetting_Dest' : 'float32',
                            'HourlyDryBulbTemperature_Dest' : 'float32',
                            'HourlyPrecipitation_Dest' : 'float32',
                            'HourlyRelativeHumidity_Dest' : 'float32',
                            'HourlySkyConditions_Dest' : 'string',
                            'HourlyVisibility_Dest' : 'float32',
                            # Target
                            'ARR_DEL15' : 'int32'
                           }
                    )
# Fixed seed: the preview shows the same 5 rows on every run of the notebook
df_all.sample(5, random_state=0)

Wall time: 54.9 s


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest
2366470,12,2,1,YV,PVD,IAD,6,13,5,1,7,94,371,2,29.549999,37.0,0.0,100.0,OVC,3.0,29.584999,37.0,0.0,89.0,(missing),9.94
1616397,7,5,5,OH,OMA,CLT,6,14,13,0,10,163,913,4,30.045,73.0,0.0,90.0,(missing),9.94,30.16,80.0,0.0,74.0,(missing),9.94
2870244,9,10,2,YX,CMH,DCA,15,14,6,0,16,83,323,2,30.16,90.0,0.0,41.0,SCT,10.0,30.26,87.0,0.0,43.0,(missing),9.94
5245276,8,10,6,AS,SFO,SEA,14,20,9,0,16,135,679,3,29.99,74.0,0.0,56.0,FEW,10.0,29.950001,73.0,0.0,62.0,(missing),9.94
5864933,10,29,2,F9,MCO,SAT,16,15,7,0,18,182,1041,5,30.030001,87.0,0.0,61.0,BKN,10.0,29.905001,59.0,0.0,90.0,(missing),3.73


In [16]:
# Distinct values of 'HourlyPrecipitation_Origin', in ascending order
sorted(pd.unique(df_all['HourlyPrecipitation_Origin']))

[0.0,
 0.01,
 0.02,
 0.03,
 0.04,
 0.05,
 0.06,
 0.07,
 0.08,
 0.09,
 0.1,
 0.11,
 0.12,
 0.13,
 0.14,
 0.15,
 0.16,
 0.17,
 0.18,
 0.19,
 0.2,
 0.21,
 0.22,
 0.23,
 0.24,
 0.25,
 0.26,
 0.27,
 0.28,
 0.29,
 0.3,
 0.31,
 0.32,
 0.33,
 0.34,
 0.35,
 0.36,
 0.37,
 0.38,
 0.39,
 0.4,
 0.41,
 0.42,
 0.43,
 0.44,
 0.45,
 0.46,
 0.47,
 0.48,
 0.49,
 0.5,
 0.51,
 0.52,
 0.53,
 0.54,
 0.55,
 0.56,
 0.57,
 0.58,
 0.59,
 0.6,
 0.61,
 0.62,
 0.63,
 0.64,
 0.65,
 0.66,
 0.67,
 0.68,
 0.69,
 0.7,
 0.71,
 0.73,
 0.75,
 0.76,
 0.77,
 0.78,
 0.81,
 0.82,
 0.83,
 0.85,
 0.86,
 0.87,
 0.88,
 0.92,
 0.95,
 0.98,
 1.0,
 1.01,
 1.02,
 1.03,
 1.05,
 1.07,
 1.09,
 1.1,
 1.13,
 1.16,
 1.2,
 1.35,
 1.42,
 1.43,
 1.44,
 1.49,
 1.77,
 2.08,
 2.8]

In [18]:
# Number of rows per distinct 'HourlyPrecipitation_Origin' value (most frequent first)
df_all.HourlyPrecipitation_Origin.value_counts()

0.00    6948899
0.01      94273
0.02      41002
0.03      27584
0.04      19039
         ...   
0.85          1
1.16          1
1.20          1
1.35          1
0.48          1
Name: HourlyPrecipitation_Origin, Length: 106, dtype: int64

In [17]:
# Distinct values of 'HourlyPrecipitation_Dest', in ascending order
sorted(pd.unique(df_all['HourlyPrecipitation_Dest']))

[0.0,
 0.01,
 0.02,
 0.03,
 0.04,
 0.05,
 0.06,
 0.07,
 0.08,
 0.09,
 0.1,
 0.11,
 0.12,
 0.13,
 0.14,
 0.15,
 0.16,
 0.17,
 0.18,
 0.19,
 0.2,
 0.21,
 0.22,
 0.23,
 0.24,
 0.25,
 0.26,
 0.27,
 0.28,
 0.29,
 0.3,
 0.31,
 0.32,
 0.33,
 0.34,
 0.35,
 0.36,
 0.37,
 0.38,
 0.39,
 0.4,
 0.41,
 0.42,
 0.43,
 0.44,
 0.45,
 0.46,
 0.47,
 0.48,
 0.49,
 0.5,
 0.51,
 0.52,
 0.53,
 0.54,
 0.55,
 0.56,
 0.57,
 0.58,
 0.59,
 0.6,
 0.61,
 0.62,
 0.63,
 0.64,
 0.65,
 0.66,
 0.67,
 0.68,
 0.69,
 0.7,
 0.71,
 0.75,
 0.76,
 0.77,
 0.78,
 0.8,
 0.81,
 0.82,
 0.83,
 0.85,
 0.86,
 0.88,
 0.89,
 0.92,
 0.95,
 0.98,
 1.01,
 1.02,
 1.03,
 1.09,
 1.1,
 1.13,
 1.16,
 1.2,
 1.37,
 1.42,
 1.43,
 1.44,
 1.49,
 1.77]

In [21]:
# Number of rows per distinct 'HourlyVisibility_Origin' value (most frequent first)
df_all.HourlyVisibility_Origin.value_counts()

10.000000    4900238
9.940000     1311768
9.000000      137968
8.000000      108501
7.000000       95977
              ...   
9.250000           1
1.880000           1
9.428572           1
9.619047           1
1.178571           1
Name: HourlyVisibility_Origin, Length: 155, dtype: int64

In [20]:
# Number of rows per distinct 'HourlyVisibility_Dest' value (most frequent first)
df_all.HourlyVisibility_Dest.value_counts()

10.000000    5029002
9.940000     1255356
9.000000      134830
8.000000      103427
7.000000       92499
              ...   
9.761905           1
6.080000           1
9.777778           1
9.866667           1
1.166667           1
Name: HourlyVisibility_Dest, Length: 151, dtype: int64

In [15]:
# Rows whose 'HourlyPrecipitation_Origin' is strictly positive
df_all.loc[df_all['HourlyPrecipitation_Origin'].gt(0)]

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest
11,01,03,4,B6,BOS,ATL,9,16,9,0,12,179,946,4,29.879999,38.0,0.01,86.0,OVC,10.00,30.000000,54.0,0.00,93.0,OVC,1.50
17,01,03,4,9E,BTR,ATL,9,15,8,0,12,97,448,2,30.000000,55.0,0.02,96.0,OVC,10.00,30.000000,54.0,0.00,93.0,OVC,1.50
19,01,03,4,WN,GSP,ATL,11,10,7,1,12,55,153,1,30.040001,53.0,0.04,96.0,OVC,5.00,30.000000,54.0,0.00,93.0,OVC,1.50
25,01,03,4,AA,DFW,ATL,8,16,9,0,12,124,731,3,30.030001,37.0,0.01,96.0,OVC,0.75,30.000000,54.0,0.00,93.0,OVC,1.50
62,01,03,4,EV,GSP,ORD,11,16,13,0,12,125,577,3,30.040001,53.0,0.04,96.0,OVC,5.00,29.965000,35.0,0.00,67.0,(missing),9.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7199992,06,18,2,AS,JNU,GST,17,12,6,0,17,35,41,1,30.040001,51.0,0.02,92.0,OVC,10.00,30.059999,53.0,0.01,83.0,OVC,10.00
7200007,07,27,6,AS,JNU,GST,17,12,6,0,17,35,41,1,30.049999,58.0,0.05,93.0,OVC,3.00,30.049999,58.0,0.00,90.0,OVC,10.00
7200023,07,16,2,AS,JNU,GST,17,12,6,0,17,35,41,1,29.690001,56.0,0.02,97.0,OVC,10.00,29.660000,59.0,0.01,93.0,(missing),10.00
7200028,08,22,4,AS,JNU,GST,17,12,6,1,17,35,41,1,29.680000,50.0,0.04,93.0,OVC,6.00,29.650000,52.0,0.05,80.0,OVC,10.00


In [14]:
# Rows whose 'HourlyPrecipitation_Dest' is strictly positive
df_all.loc[df_all['HourlyPrecipitation_Dest'].gt(0)]

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_TIME_hour,TAXI_OUT_median,TAXI_IN_median,ARR_DEL15,ARR_TIME_hour,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,HourlyAltimeterSetting_Origin,HourlyDryBulbTemperature_Origin,HourlyPrecipitation_Origin,HourlyRelativeHumidity_Origin,HourlySkyConditions_Origin,HourlyVisibility_Origin,HourlyAltimeterSetting_Dest,HourlyDryBulbTemperature_Dest,HourlyPrecipitation_Dest,HourlyRelativeHumidity_Dest,HourlySkyConditions_Dest,HourlyVisibility_Dest
1028,01,04,5,9E,TYS,ATL,7,15,8,0,8,69,152,1,29.840000,48.0,0.00,100.0,(missing),3.73,29.790001,56.0,0.24,90.0,OVC,2.0
1029,01,04,5,DL,DTW,ATL,6,16,8,0,8,140,594,3,29.850000,31.0,0.00,79.0,FEW,9.00,29.790001,56.0,0.24,90.0,OVC,2.0
1030,01,04,5,DL,LGA,ATL,6,22,8,1,8,157,762,4,30.000000,37.0,0.00,62.0,BKN,10.00,29.790001,56.0,0.24,90.0,OVC,2.0
1031,01,04,5,OO,CHA,ATL,7,14,8,1,8,64,106,1,29.785000,49.0,0.00,90.0,(missing),6.84,29.790001,56.0,0.24,90.0,OVC,2.0
1032,01,04,5,DL,TPA,ATL,7,13,8,0,8,100,406,2,29.969999,71.0,0.00,94.0,(missing),9.94,29.790001,56.0,0.24,90.0,OVC,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7200005,07,25,4,AS,JNU,GST,17,12,6,0,17,35,41,1,29.889999,57.0,0.00,90.0,OVC,10.00,29.910000,57.0,0.02,94.0,OVC,9.0
7200016,07,24,3,AS,JNU,GST,17,12,6,0,17,35,41,1,30.040001,59.0,0.00,93.0,(missing),10.00,30.059999,57.0,0.01,88.0,OVC,7.0
7200023,07,16,2,AS,JNU,GST,17,12,6,0,17,35,41,1,29.690001,56.0,0.02,97.0,OVC,10.00,29.660000,59.0,0.01,93.0,(missing),10.0
7200028,08,22,4,AS,JNU,GST,17,12,6,1,17,35,41,1,29.680000,50.0,0.04,93.0,OVC,6.00,29.650000,52.0,0.05,80.0,OVC,10.0


In [13]:
# First 50 rows of both precipitation columns, side by side
df_all.loc[:, ['HourlyPrecipitation_Origin', 'HourlyPrecipitation_Dest']].head(50)

Unnamed: 0,HourlyPrecipitation_Origin,HourlyPrecipitation_Dest
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


## Export the resulting DataFrame to a CSV file

In [8]:
# Export `df_all` to a CSV file, unless that file has already been generated.
output_csv_dir = os.path.join(root,
                              "Output_Data",
                              "US_DoT-NOAA")
file_name = "OTP_LCD_PowerBI_AllRows.csv"
output_csv_path = os.path.join(output_csv_dir, file_name)

# Ask the file system whether the target exists instead of listing the whole
# directory and comparing names: the name comparison is case-sensitive even on
# case-insensitive file systems (Windows), and the listing grows with the folder.
if not os.path.exists(output_csv_path):
    # Save such DataFrame into a CSV file (only once):
    df_all.to_csv(path_or_buf=output_csv_path,
                  index=False,
                  encoding='latin1')
    print("File '" + file_name + "' has been generated.")
else:
    print("File '" + file_name + "' already exists.\nNo file has been generated (previous one remains).")

File 'OTP_LCD_PowerBI_AllRows.csv' has been generated.


___