# Imports & settings

In [487]:
import pandas as pd
import numpy as np

# Weather dataset inspection

In [488]:
# Loads the weather dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
# NOTE(review): the previous run of this cell appears to have emitted a
# DtypeWarning for exactly that reason — confirm against a fresh run.
weather = pd.read_csv('data/weather.csv', low_memory=False)

  weather = pd.read_csv('data/weather.csv')


In [489]:
print('The weather raw dataset shape is: ', weather.shape)

The weather raw dataset shape is:  (90667, 66)


In [490]:
print('The weather raw dataset columns are: ', weather.columns)

The weather raw dataset columns are:  Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'AWND_ATTRIBUTES', 'DAPR', 'DAPR_ATTRIBUTES', 'DASF', 'DASF_ATTRIBUTES',
       'MDPR', 'MDPR_ATTRIBUTES', 'MDSF', 'MDSF_ATTRIBUTES', 'PGTM',
       'PGTM_ATTRIBUTES', 'PRCP', 'PRCP_ATTRIBUTES', 'PSUN', 'PSUN_ATTRIBUTES',
       'SNOW', 'SNOW_ATTRIBUTES', 'SNWD', 'SNWD_ATTRIBUTES', 'TAVG',
       'TAVG_ATTRIBUTES', 'TMAX', 'TMAX_ATTRIBUTES', 'TMIN', 'TMIN_ATTRIBUTES',
       'TOBS', 'TOBS_ATTRIBUTES', 'TSUN', 'TSUN_ATTRIBUTES', 'WDF2',
       'WDF2_ATTRIBUTES', 'WDF5', 'WDF5_ATTRIBUTES', 'WESD', 'WESD_ATTRIBUTES',
       'WESF', 'WESF_ATTRIBUTES', 'WSF2', 'WSF2_ATTRIBUTES', 'WSF5',
       'WSF5_ATTRIBUTES', 'WT01', 'WT01_ATTRIBUTES', 'WT02', 'WT02_ATTRIBUTES',
       'WT03', 'WT03_ATTRIBUTES', 'WT04', 'WT04_ATTRIBUTES', 'WT05',
       'WT05_ATTRIBUTES', 'WT06', 'WT06_ATTRIBUTES', 'WT08', 'WT08_ATTRIBUTES',
       'WT09', 'WT09_ATTRIBUTES', 'WT11', 'WT11_ATTRIBUTE

In [491]:
print('Weather raw dataset variable types: ', weather.dtypes)

Weather raw dataset variable types:  STATION             object
NAME                object
LATITUDE           float64
LONGITUDE          float64
ELEVATION          float64
                    ...   
WT08_ATTRIBUTES     object
WT09               float64
WT09_ATTRIBUTES     object
WT11               float64
WT11_ATTRIBUTES     object
Length: 66, dtype: object


In [492]:
print('Some rows of the weather raw dataset: ', weather.head())

Some rows of the weather raw dataset:         STATION                 NAME  LATITUDE  LONGITUDE  ELEVATION  \
0  USC00280907  BOONTON 1 SE, NJ US  40.89174  -74.39635       85.3   
1  USC00280907  BOONTON 1 SE, NJ US  40.89174  -74.39635       85.3   
2  USC00280907  BOONTON 1 SE, NJ US  40.89174  -74.39635       85.3   
3  USC00280907  BOONTON 1 SE, NJ US  40.89174  -74.39635       85.3   
4  USC00280907  BOONTON 1 SE, NJ US  40.89174  -74.39635       85.3   

         DATE  AWND AWND_ATTRIBUTES  DAPR DAPR_ATTRIBUTES  ...  WT05  \
0  2018-01-01   NaN             NaN   NaN             NaN  ...   NaN   
1  2018-01-02   NaN             NaN   NaN             NaN  ...   NaN   
2  2018-01-03   NaN             NaN   NaN             NaN  ...   NaN   
3  2018-01-04   NaN             NaN   NaN             NaN  ...   NaN   
4  2018-01-05   NaN             NaN   NaN             NaN  ...   NaN   

   WT05_ATTRIBUTES  WT06 WT06_ATTRIBUTES  WT08  WT08_ATTRIBUTES  WT09  \
0              NaN   NaN    

# Weather dataset preprocessing

In [493]:
def filter_weather(weather):
    """
        Splits the weather dataset into the summer (June-August) observations
        of 2018 and of 2020.

        Parameters
        ----------
        weather : pd.DataFrame
            Weather observations with a 'DATE' column (strings or datetimes).
            The frame is not modified.

        Returns
        -------
        tuple of (pd.DataFrame, pd.DataFrame)
            The summer 2018 rows and the summer 2020 rows, in that order.
            Both are independent copies that keep the original row index and
            whose 'DATE' column has been converted to datetime.
    """

    # Parse 'DATE' on a derived frame instead of overwriting the caller's
    # column, so re-running the cell never depends on earlier mutations.
    # Unparseable dates become NaT and can never match the filters below.
    dated = weather.assign(DATE=pd.to_datetime(weather['DATE'], errors='coerce'))

    # Keeps the rows of `data` observed in June, July or August of `year`.
    def filter_summer_weather(data, year):
        dates = data['DATE'].dt
        in_year = dates.year == year
        in_summer = dates.month.between(6, 8)
        # .copy() hands back a standalone frame, so later column assignments
        # on the result cannot trigger chained-assignment warnings.
        return data[in_year & in_summer].copy()

    summer_weather_2018 = filter_summer_weather(dated, 2018)
    summer_weather_2020 = filter_summer_weather(dated, 2020)

    return summer_weather_2018, summer_weather_2020

# Gets the summer weather for 2018 and 2020 datasets.
summer_weather_2018, summer_weather_2020 = filter_weather(weather)

In [494]:
# The dataset columns are grouped below by the kind of information they carry.

# Station and date that identify each observation.
observation_columns = ['STATION', 'NAME', 'DATE']

# Coordinate location of the observation.
coordinates = ['LATITUDE', 'LONGITUDE', 'ELEVATION']

# Weather conditions (snowfall):
#   SNOW - snowfall, SNWD - snow depth.
snowfall_columns = ['SNOW', 'SNWD']

# Weather conditions (temperature):
#   TAVG - average, TMAX - maximum, TMIN - minimum temperature.
temperature_columns = ['TAVG', 'TMAX', 'TMIN']

# Weather conditions (wind):
#   AWND - average daily wind speed,
#   WDF2 / WDF5 - direction of the fastest 2-minute / 5-second wind,
#   WSF2 / WSF5 - fastest 2-minute / 5-second wind speed.
wind_columns = ['AWND', 'WDF2', 'WDF5', 'WSF2', 'WSF5']

# Weather conditions (precipitation):
#   only the 'PRCP' column is considered.

In [495]:
def filter_columns_of_interest(summer_weather, columns_of_interest):
    """
        Keeps only the columns of interest of a summer weather dataset.

        Returns a new DataFrame. The surviving columns keep the order they
        have in `summer_weather`; names listed in `columns_of_interest` that
        are not present in the frame are simply ignored.
    """

    # Flags, column by column, whether the name is one we want to keep.
    is_wanted = summer_weather.columns.isin(columns_of_interest)

    # Everything that is not flagged gets dropped.
    return summer_weather.drop(columns=summer_weather.columns[~is_wanted])

# Defines the columns of interest: every column group above plus 'PRCP'.
columns_of_interest = observation_columns + coordinates + snowfall_columns + temperature_columns + wind_columns + ['PRCP']

# Keep only the columns of interest in both summer_weather_2018 and summer_weather_2020.
summer_weather_2018 = filter_columns_of_interest(summer_weather_2018, columns_of_interest)
summer_weather_2020 = filter_columns_of_interest(summer_weather_2020, columns_of_interest)

In [496]:
summer_weather_2020.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'PRCP', 'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN', 'WDF2', 'WDF5', 'WSF2',
       'WSF5'],
      dtype='object')

In [497]:
# Count the missing values per column in summer_weather_2018.
missing_values_2018 = summer_weather_2018.isna().sum()

# Count the missing values per column in summer_weather_2020.
missing_values_2020 = summer_weather_2020.isna().sum()

# Display the 2018 counts (the 2020 counts are printed in a later cell).
print("Missing values in 2018:")
print(missing_values_2018)

Missing values in 2018:
STATION         0
NAME            0
LATITUDE        0
LONGITUDE       0
ELEVATION       0
DATE            0
AWND         6614
PRCP          140
SNOW         4104
SNWD         6116
TAVG         7074
TMAX         6157
TMIN         6169
WDF2         6614
WDF5         6614
WSF2         6614
WSF5         6614
dtype: int64


In [498]:
# The frame printed here is the 2018 one, so the label must say 2018.
print('The summer_2018 dataset shape is: ', summer_weather_2018.shape)

The summer_2020 dataset shape is:  (7350, 17)


In [499]:
print("\nMissing values in 2020:")
print(missing_values_2020)


Missing values in 2020:
STATION         0
NAME            0
LATITUDE        0
LONGITUDE       0
ELEVATION       0
DATE            0
AWND         7384
PRCP           81
SNOW         4470
SNWD         6578
TAVG         7826
TMAX         6817
TMIN         6817
WDF2         7384
WDF5         7388
WSF2         7384
WSF5         7388
dtype: int64


In [500]:
print('The summer_2020 dataset shape is: ', summer_weather_2020.shape)

The summer_2020 dataset shape is:  (8102, 17)


In [501]:
summer_weather_2020.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
880,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-01,,0.0,0.0,0.0,,20.0,8.9,,,,
881,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-02,,0.0,0.0,0.0,,21.1,8.9,,,,
882,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-03,,18.5,0.0,0.0,,22.8,10.0,,,,
883,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-04,,5.1,0.0,0.0,,26.1,15.6,,,,
884,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-05,,9.1,0.0,0.0,,28.9,16.7,,,,


In [502]:
def replace_nan_with_zero(summer_weather):
    """
        Replaces every missing value (NaN) in the DataFrame with 0.

        The input frame is left untouched; a new DataFrame is returned, so
        re-running the calling cell always starts from the same data.

        NOTE(review): the fill is applied to every column. When the frame
        holds temperature columns (TAVG / TMAX / TMIN), a missing reading
        becomes 0 and can no longer be told apart from a measured 0 —
        confirm this is intended before using those columns.
    """
    return summer_weather.fillna(0)

summer_weather_2018 = replace_nan_with_zero(summer_weather_2018)
summer_weather_2020 = replace_nan_with_zero(summer_weather_2020)

In [503]:
def encode_weather_conditions(summer_weather):
    """
        Adds boolean weather-condition labels to a summer weather dataset.

        The returned DataFrame is a new frame (the input is not modified)
        with four extra columns appended in this order:

        - WINDY: any of AWND, WDF2, WDF5, WSF2, WSF5 is greater than 0.
        - RAINY: PRCP is greater than 0.
        - SNOWY: SNOW or SNWD is greater than 0.
        - SUNNY: none of the three labels above is set.

        NOTE(review): WDF2 / WDF5 are wind *directions*; a non-zero direction
        marks the row as WINDY regardless of wind speed — confirm intended.
    """
    wind_measurements = ['AWND', 'WDF2', 'WDF5', 'WSF2', 'WSF5']
    snow_measurements = ['SNOW', 'SNWD']

    # A label is set when at least one of its measurements is strictly positive.
    windy = summer_weather[wind_measurements].gt(0).any(axis=1)
    rainy = summer_weather['PRCP'].gt(0)  # Considers only non-zero precipitation as rainy
    snowy = summer_weather[snow_measurements].gt(0).any(axis=1)
    sunny = ~(windy | rainy | snowy)

    # assign() builds a new frame, so the caller's DataFrame gains no columns.
    return summer_weather.assign(WINDY=windy, RAINY=rainy, SNOWY=snowy, SUNNY=sunny)

summer_weather_2018 = encode_weather_conditions(summer_weather_2018)
summer_weather_2020 = encode_weather_conditions(summer_weather_2020)

In [504]:
summer_weather_2020.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
880,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-01,0.0,0.0,0.0,0.0,...,20.0,8.9,0.0,0.0,0.0,0.0,False,False,False,True
881,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-02,0.0,0.0,0.0,0.0,...,21.1,8.9,0.0,0.0,0.0,0.0,False,False,False,True
882,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-03,0.0,18.5,0.0,0.0,...,22.8,10.0,0.0,0.0,0.0,0.0,False,True,False,True
883,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-04,0.0,5.1,0.0,0.0,...,26.1,15.6,0.0,0.0,0.0,0.0,False,True,False,True
884,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-05,0.0,9.1,0.0,0.0,...,28.9,16.7,0.0,0.0,0.0,0.0,False,True,False,True


In [505]:
summer_weather_2018.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
151,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-01,0.0,1.8,0.0,0.0,...,20.6,16.7,0.0,0.0,0.0,0.0,False,True,False,True
152,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-02,0.0,5.3,0.0,0.0,...,28.9,18.9,0.0,0.0,0.0,0.0,False,True,False,True
153,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-03,0.0,14.0,0.0,0.0,...,29.4,14.4,0.0,0.0,0.0,0.0,False,True,False,True
154,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-04,0.0,16.8,0.0,0.0,...,18.9,10.6,0.0,0.0,0.0,0.0,False,True,False,True
155,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-05,0.0,0.0,0.0,0.0,...,23.3,11.1,0.0,0.0,0.0,0.0,False,False,False,True


In [506]:
summer_weather_2020.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
880,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-01,0.0,0.0,0.0,0.0,...,20.0,8.9,0.0,0.0,0.0,0.0,False,False,False,True
881,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-02,0.0,0.0,0.0,0.0,...,21.1,8.9,0.0,0.0,0.0,0.0,False,False,False,True
882,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-03,0.0,18.5,0.0,0.0,...,22.8,10.0,0.0,0.0,0.0,0.0,False,True,False,True
883,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-04,0.0,5.1,0.0,0.0,...,26.1,15.6,0.0,0.0,0.0,0.0,False,True,False,True
884,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-05,0.0,9.1,0.0,0.0,...,28.9,16.7,0.0,0.0,0.0,0.0,False,True,False,True


In [507]:
def memory_usage(summer_weather):
    """
        Returns the total memory footprint of the DataFrame in megabytes
        (bytes divided by 1024**2), counting the real size of object columns.
    """
    bytes_per_megabyte = 1024**2
    total_bytes = summer_weather.memory_usage(deep=True).sum()
    return total_bytes / bytes_per_megabyte

# Calculate memory usage for both datasets.
memory_usage_2018 = memory_usage(summer_weather_2018)
memory_usage_2020 = memory_usage(summer_weather_2020)

print('Memory usage for summer_weather_2018 is: ', memory_usage_2018, ' MB')
print('Memory usage for summer_weather_2020 is: ', memory_usage_2020, ' MB')

Memory usage for summer_weather_2018 is:  1.9763164520263672  MB
Memory usage for summer_weather_2020 is:  2.1771793365478516  MB


In [508]:
# Reset the index of the summer_weather_2018 dataset (the old index is discarded).
summer_weather_2018.reset_index(drop=True, inplace=True)

# Reset the index of the summer_weather_2020 dataset (the old index is discarded).
summer_weather_2020.reset_index(drop=True, inplace=True)

In [509]:
# Store datasets in pickle format (bz2-compressed).
# NOTE(review): these frames hold weather data, yet the output files are named
# 'summer_collisions_*' — confirm the names are intended and that this does not
# overwrite collision datasets written elsewhere.
summer_weather_2018.to_pickle('summer_collisions_2018', compression='bz2')
summer_weather_2020.to_pickle('summer_collisions_2020', compression='bz2')