# Imports & settings

In [169]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

# Preprocessing

## 1. Collisions dataset preprocessing

In [170]:
# Loads the collision dataset.
# low_memory=False makes pandas infer every column's dtype from the whole file instead of
# chunk by chunk; the warning echoed under this cell presumably is the mixed-dtype
# DtypeWarning that chunked inference produces -- TODO confirm on the next run.
collisions = pd.read_csv('raw_data/collisions.csv', low_memory=False)

  collisions = pd.read_csv('raw_data/collisions.csv')


In [171]:
def filter_collisions(collisions):
    """
        Splits the collisions dataset into the summer (June to August) records of 2018 and 2020.

        Parameters:
            collisions: DataFrame with a 'CRASH DATE' column holding dates formatted as '%m/%d/%Y'.

        Returns:
            A tuple (summer_collisions_2018, summer_collisions_2020). Both are independent
            DataFrames whose 'CRASH DATE' column is datetime; the input frame is not modified.
    """

    # Parses the dates into a separate Series so the caller's frame keeps its original column.
    crash_dates = pd.to_datetime(collisions['CRASH DATE'], format='%m/%d/%Y')

    # Mask for June, July and August, shared by both years.
    is_summer = crash_dates.dt.month.between(6, 8)

    def filter_summer_collisions(year):
        selected = is_summer & (crash_dates.dt.year == year)
        # .assign() returns a new frame, so later column assignments on the result
        # never write into (or warn about) a slice of the full dataset.
        return collisions.loc[selected].assign(**{'CRASH DATE': crash_dates[selected]})

    return filter_summer_collisions(2018), filter_summer_collisions(2020)

# Splits the full collisions frame into its June-August 2018 and June-August 2020 subsets.
summer_collisions_2018, summer_collisions_2020 = filter_collisions(collisions)

In [172]:
# Groups the columns of the collisions dataset by the kind of information they carry.
# Only some of these groups are combined into `columns_of_interest` further down.

# Columns that identify when (date, time) and in which area (zip code, borough) a collision happened.
id_columns= [
    'CRASH DATE',
    'CRASH TIME',
    'ZIP CODE',
    'BOROUGH'
]

# Columns that describe the collision location by street (not part of columns_of_interest).
location_columns = [
    'LOCATION',
    'ON STREET NAME', 
    'CROSS STREET NAME',
    'OFF STREET NAME'
]

# Columns with the geographic coordinates of the collision.
# NOTE: the weather section of this notebook rebinds the name `coordinates` to a
# different list, so code reading this global depends on cell execution order.
coordinates = [
    'LATITUDE',
    'LONGITUDE'
]

# Columns with the type of each vehicle involved in the collision (up to five vehicles).
vehicle_type = [
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5'
]

# Columns with the contributing factor recorded for each vehicle (not part of columns_of_interest).
factor_columns = [
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2', 
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4', 
    'CONTRIBUTING FACTOR VEHICLE 5'
]

# Columns with the total number of persons injured and killed in a collision.
persons_injured_columns = [
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PERSONS KILLED', 
]

# Columns treated by this notebook as casualties on two-wheeled vehicles.
# NOTE(review): the MOTORIST columns are grouped with the CYCLIST ones here; confirm that
# 'motorist' in this dataset means motorcycle riders rather than motor-vehicle occupants.
two_wheeled_vehicle_columns = [
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED',
    'NUMBER OF MOTORIST KILLED',
]

# Remaining columns: the collision identifier and the pedestrian casualties
# (not part of columns_of_interest).
extra_columns = [
    'COLLISION_ID',
    'NUMBER OF PEDESTRIANS INJURED', 
    'NUMBER OF PEDESTRIANS KILLED'
]

In [173]:
def filter_columns_of_interest(summer_collisions, columns_of_interest):
    """
        Returns the summer collisions restricted to the columns of interest.

        Columns of the frame that are not listed in columns_of_interest are dropped;
        the surviving columns keep their original order. Names in columns_of_interest
        that the frame does not contain are ignored.
    """

    # Collects every column of the frame that was not asked for.
    unwanted = []
    for name in summer_collisions.columns:
        if name not in columns_of_interest:
            unwanted.append(name)

    # Returns a new frame without the unwanted columns.
    return summer_collisions.drop(columns=unwanted)

# Defines the columns of interest: identification, coordinates, vehicle types and casualty counts.
# The location, contributing-factor and extra column groups are left out.
columns_of_interest = id_columns + coordinates + vehicle_type + persons_injured_columns + two_wheeled_vehicle_columns

# Keeps only the columns of interest in both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = filter_columns_of_interest(summer_collisions_2018, columns_of_interest)
summer_collisions_2020 = filter_columns_of_interest(summer_collisions_2020, columns_of_interest)

In [174]:
def manage_missing_values(summer_collisions, required_columns=None, coordinate_columns=None,
                          vehicle_columns=None, min_vehicle_types=2):
    """
        Removes the collisions that lack the information needed by the visualizations.

        The string 'Unspecified' is treated as a missing value. A row is kept only if it has
        a value in every required and coordinate column and at least min_vehicle_types
        vehicle types recorded.

        Parameters:
            summer_collisions: DataFrame of collisions; it is not modified.
            required_columns: columns that must all be present (defaults to the notebook-level id_columns).
            coordinate_columns: coordinate columns that must all be present (defaults to LATITUDE and LONGITUDE).
            vehicle_columns: vehicle type columns (defaults to the notebook-level vehicle_type).
            min_vehicle_types: minimum number of non-missing vehicle type values per row.

        Returns:
            A new DataFrame containing only the rows that satisfy the criteria above.
    """

    if required_columns is None:
        required_columns = id_columns
    if coordinate_columns is None:
        # Deliberately not the global `coordinates`: the weather section rebinds that name to a
        # list containing 'ELEVATION', which would make this cell fail when re-run afterwards.
        coordinate_columns = ['LATITUDE', 'LONGITUDE']
    if vehicle_columns is None:
        vehicle_columns = vehicle_type

    # Replaces 'Unspecified' with NaN values so that dropna() sees them as missing.
    cleaned = summer_collisions.replace('Unspecified', np.nan)

    # Drops rows with any missing value in the required or coordinate columns.
    cleaned = cleaned.dropna(subset=list(required_columns) + list(coordinate_columns), how='any')

    # Keeps only rows that have at least min_vehicle_types vehicle types recorded.
    cleaned = cleaned.dropna(subset=list(vehicle_columns), thresh=min_vehicle_types)

    return cleaned

# Drops, from both summer frames, the rows that lack the information needed for the
# visualizations (identification, coordinates and at least two vehicle types).
summer_collisions_2018 = manage_missing_values(summer_collisions_2018)
summer_collisions_2020 = manage_missing_values(summer_collisions_2020)

In [175]:
def add_persons_affected(summer_collisions):
    """
        Adds a 'PERSONS AFFECTED' column with the persons injured plus the persons killed.

        The two source columns are removed from the result. A missing value in either
        source column yields a missing 'PERSONS AFFECTED' value.

        Parameters:
            summer_collisions: DataFrame with the columns 'NUMBER OF PERSONS INJURED' and
                'NUMBER OF PERSONS KILLED'; it is not modified.

        Returns:
            A new DataFrame with 'PERSONS AFFECTED' appended and the source columns dropped.
    """
    source_columns = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']

    # Combines the data.
    affected = summer_collisions[source_columns[0]] + summer_collisions[source_columns[1]]

    # assign() builds a new frame, so the caller's frame does not gain a stray column
    # and no chained-assignment warning is raised when it is a slice of another frame.
    return summer_collisions.assign(**{'PERSONS AFFECTED': affected}).drop(columns=source_columns)

# Replaces the injured/killed person counts with a single 'PERSONS AFFECTED' column in both frames.
summer_collisions_2018 = add_persons_affected(summer_collisions_2018)
summer_collisions_2020 = add_persons_affected(summer_collisions_2020)

In [176]:
def add_two_wheels_vehicle_affected(summer_collisions):
    """
        Adds a '2 WHEELS VEHICLE AFFECTED' column with the sum of the cyclist and motorist casualties.

        The four source columns (cyclists and motorists, injured and killed) are removed from
        the result. A missing value in any source column yields a missing total.

        NOTE(review): the MOTORIST columns are counted as two-wheeled vehicle casualties; confirm
        that 'motorist' in this dataset does not mean motor-vehicle occupants in general.

        Parameters:
            summer_collisions: DataFrame containing the four source columns; it is not modified.

        Returns:
            A new DataFrame with '2 WHEELS VEHICLE AFFECTED' appended and the source columns dropped.
    """
    source_columns = [
        'NUMBER OF CYCLIST INJURED',
        'NUMBER OF CYCLIST KILLED',
        'NUMBER OF MOTORIST INJURED',
        'NUMBER OF MOTORIST KILLED',
    ]

    # Combines the data; skipna=False keeps the NaN propagation of a plain '+' chain.
    affected = summer_collisions[source_columns].sum(axis=1, skipna=False)

    # assign() builds a new frame instead of writing the column into the caller's frame.
    return summer_collisions.assign(**{'2 WHEELS VEHICLE AFFECTED': affected}).drop(columns=source_columns)

# Replaces the cyclist/motorist casualty counts with a single '2 WHEELS VEHICLE AFFECTED' column in both frames.
summer_collisions_2018 = add_two_wheels_vehicle_affected(summer_collisions_2018)
summer_collisions_2020 = add_two_wheels_vehicle_affected(summer_collisions_2020)

In [177]:
def categorize_time(summer_collisions, time_format='%H:%M'):
    """
        Replaces the 'CRASH TIME' column with a time-of-day category.

        Categories by hour of the day: 'Night' for 0-7, 'Day' for 8-15 and 'Evening' for 16-23.

        Parameters:
            summer_collisions: DataFrame with a 'CRASH TIME' column of time strings; it is not modified.
            time_format: strptime format of the time strings. Passing None lets pandas infer the
                format, which is slower and emits a warning for time-only strings.

        Returns:
            A new DataFrame whose 'CRASH TIME' column holds the category labels.
    """

    # An explicit format avoids the element-by-element fallback parsing of pd.to_datetime.
    crash_hour = pd.to_datetime(summer_collisions['CRASH TIME'], format=time_format).dt.hour

    # Categorizes time. Conditions are checked in order, so the second one only
    # applies to hours 8-15; anything else (including missing hours) is 'Evening'.
    period = np.select([crash_hour < 8, crash_hour < 16], ['Night', 'Day'], default='Evening')

    return summer_collisions.assign(**{'CRASH TIME': period})

# Turns 'CRASH TIME' into a 'Night' / 'Day' / 'Evening' label in both frames.
summer_collisions_2018 = categorize_time(summer_collisions_2018)
summer_collisions_2020 = categorize_time(summer_collisions_2020)


  summer_collisions['CRASH TIME'] = pd.to_datetime(summer_collisions['CRASH TIME'])
  summer_collisions['CRASH TIME'] = pd.to_datetime(summer_collisions['CRASH TIME'])


In [178]:
def process_vehicle_types(summer_collisions):
    """
        Adds a column called 'VEHICLE TYPE' with the list of vehicle types involved in each collision.

        The list gathers the non-missing values of 'VEHICLE TYPE CODE 1' to 'VEHICLE TYPE CODE 5',
        in that order. The five source columns are removed from the result.

        Parameters:
            summer_collisions: DataFrame containing the five vehicle type code columns; it is not modified.

        Returns:
            A new DataFrame with 'VEHICLE TYPE' appended and the source columns dropped.
    """
    # Selects the source columns by name rather than by position, so the result
    # does not depend on where earlier steps left those columns in the frame.
    vehicle_code_columns = [f'VEHICLE TYPE CODE {number}' for number in range(1, 6)]

    # Builds one list per row while excluding NaN values.
    vehicle_lists = [
        [value for value in row if not pd.isnull(value)]
        for row in summer_collisions[vehicle_code_columns].itertuples(index=False, name=None)
    ]
    combined = pd.Series(vehicle_lists, index=summer_collisions.index, dtype=object)

    # Adds the combined column and drops the original vehicle type code columns.
    return summer_collisions.assign(**{'VEHICLE TYPE': combined}).drop(columns=vehicle_code_columns)

# Collapses the five vehicle type code columns into a single list-valued 'VEHICLE TYPE' column in both frames.
summer_collisions_2018 = process_vehicle_types(summer_collisions_2018)
summer_collisions_2020 = process_vehicle_types(summer_collisions_2020)

In [179]:
def add_daytype(summer_collisions):
    """
        Adds a 'DAY TYPE' column indicating whether the collision happened on a weekday or a weekend.

        Saturdays and Sundays are labelled 'Weekend'; every other day is labelled 'Weekday'.

        Parameters:
            summer_collisions: DataFrame with a datetime 'CRASH DATE' column; it is not modified.

        Returns:
            A new DataFrame with the 'DAY TYPE' column appended.
    """

    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday.
    is_weekend = summer_collisions['CRASH DATE'].dt.dayofweek >= 5
    day_type = np.where(is_weekend, 'Weekend', 'Weekday')

    return summer_collisions.assign(**{'DAY TYPE': day_type})

# Adds the 'DAY TYPE' ('Weekday' / 'Weekend') column to both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = add_daytype(summer_collisions_2018)
summer_collisions_2020 = add_daytype(summer_collisions_2020)

In [180]:
def add_covid_restrictions(summer_collisions, threshold_date):
    """
        Adds a 'COVID-19 RESTRICTIONS' column indicating whether COVID-19 restrictions were in force.

        Collisions on or after threshold_date are labelled 'Covid-19'; earlier ones 'No Covid-19'.

        Parameters:
            summer_collisions: DataFrame with a datetime 'CRASH DATE' column; it is not modified.
            threshold_date: first date (inclusive) on which the restrictions are considered in force.

        Returns:
            A new DataFrame with the 'COVID-19 RESTRICTIONS' column appended.
    """

    under_restrictions = summer_collisions['CRASH DATE'] >= threshold_date
    label = np.where(under_restrictions, 'Covid-19', 'No Covid-19')

    return summer_collisions.assign(**{'COVID-19 RESTRICTIONS': label})

# Date from which COVID-19 restrictions are considered to be in force.
covid_threshold_date = pd.to_datetime('2020-03-15')

# Adds the COVID-19 restriction column to both summer_collisions_2018 and summer_collisions_2020.
# Both frames only hold June-August records, so every 2018 row falls before the threshold
# and every 2020 row falls after it: the column is constant within each frame.
summer_collisions_2018 = add_covid_restrictions(summer_collisions_2018, covid_threshold_date)
summer_collisions_2020 = add_covid_restrictions(summer_collisions_2020, covid_threshold_date)

In [181]:
def add_year_column(summer_collisions):
    """
        Adds a 'YEAR' column specifying the year of the collision.

        Parameters:
            summer_collisions: DataFrame with a datetime 'CRASH DATE' column; it is not modified.

        Returns:
            A new DataFrame with the 'YEAR' column appended.
    """

    # Only the frame passed in is used: the previous version also re-parsed the dates of the
    # unrelated global `collisions` frame, which added a hidden dependency and no result.
    return summer_collisions.assign(YEAR=summer_collisions['CRASH DATE'].dt.year)

# Adds the 'YEAR' column to both frames (constant within each frame, since each holds a single year).
summer_collisions_2018 = add_year_column(summer_collisions_2018)
summer_collisions_2020 = add_year_column(summer_collisions_2020)

In [182]:
# Converts the ZIP CODE column to numbers; values that cannot be parsed become NaN (errors='coerce').
# NOTE(review): downcast='integer' presumably only yields an integer dtype when no NaN remains
# after coercion; otherwise the column stays float -- confirm the resulting dtype.
summer_collisions_2018['ZIP CODE'] = pd.to_numeric(summer_collisions_2018['ZIP CODE'], errors='coerce', downcast='integer')
summer_collisions_2020['ZIP CODE'] = pd.to_numeric(summer_collisions_2020['ZIP CODE'], errors='coerce', downcast='integer')

In [183]:
# Renumbers the rows of summer_collisions_2018 from 0; drop=True discards the old index
# (left over from the full dataset) instead of keeping it as a column.
summer_collisions_2018.reset_index(drop=True, inplace=True)

# Same renumbering for summer_collisions_2020.
summer_collisions_2020.reset_index(drop=True, inplace=True)

In [184]:
# Previews the first rows of the preprocessed summer 2018 collisions.
summer_collisions_2018.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,PERSONS AFFECTED,2 WHEELS VEHICLE AFFECTED,VEHICLE TYPE,DAY TYPE,COVID-19 RESTRICTIONS,YEAR
0,2018-08-30,Day,BROOKLYN,11204,40.61662,-73.99972,0.0,0,"[Sedan, Sedan]",Weekday,No Covid-19,2018
1,2018-08-31,Night,BRONX,10472,40.827168,-73.870125,0.0,0,"[Sedan, Sedan]",Weekday,No Covid-19,2018
2,2018-08-28,Evening,BROOKLYN,11211,40.70654,-73.95041,0.0,0,"[Station Wagon/Sport Utility Vehicle, Station ...",Weekday,No Covid-19,2018
3,2018-08-29,Day,QUEENS,11694,40.586067,-73.82263,1.0,1,"[Station Wagon/Sport Utility Vehicle, Sedan]",Weekday,No Covid-19,2018
4,2018-08-05,Evening,MANHATTAN,10075,40.77364,-73.95986,1.0,1,"[Taxi, Station Wagon/Sport Utility Vehicle]",Weekend,No Covid-19,2018


In [185]:
# Previews the first rows of the preprocessed summer 2020 collisions.
summer_collisions_2020.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,PERSONS AFFECTED,2 WHEELS VEHICLE AFFECTED,VEHICLE TYPE,DAY TYPE,COVID-19 RESTRICTIONS,YEAR
0,2020-08-01,Evening,BRONX,10462,40.840508,-73.85515,1.0,1,"[Sedan, E-Bike]",Weekend,Covid-19,2020
1,2020-06-04,Night,QUEENS,11368,40.744232,-73.861275,0.0,0,"[Bus, Station Wagon/Sport Utility Vehicle]",Weekday,Covid-19,2020
2,2020-06-11,Night,BROOKLYN,11208,40.67743,-73.87591,0.0,0,"[Tractor Truck Diesel, Station Wagon/Sport Uti...",Weekday,Covid-19,2020
3,2020-06-06,Evening,BRONX,10451,40.824898,-73.927635,1.0,1,"[Taxi, Bike]",Weekend,Covid-19,2020
4,2020-07-19,Night,MANHATTAN,10039,40.826176,-73.93877,1.0,1,"[Sedan, E-Bike]",Weekend,Covid-19,2020


In [186]:
def save(data, file_path):
    """
        Saves the dataset as a pickle file.

        The destination directory is created when it does not exist yet, so the notebook
        also runs on a fresh checkout.

        Parameters:
            data: object to serialize (the preprocessed DataFrame in this notebook).
            file_path: path of the pickle file to write; an existing file is overwritten.
    """

    target = Path(file_path)

    # Creates missing parent directories; exist_ok avoids an error when they are already there.
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open('wb') as file:
        pickle.dump(data, file)
    print(f"The dataset has been saved in pickle format at {file_path}.")

# Persists both preprocessed collision frames as pickle files under preprocessed_data/.
save(summer_collisions_2018, 'preprocessed_data/summer_collisions_2018.pkl')
save(summer_collisions_2020, 'preprocessed_data/summer_collisions_2020.pkl')

The dataset has been saved in pickle format at preprocessed_data/summer_collisions_2018.pkl.
The dataset has been saved in pickle format at preprocessed_data/summer_collisions_2020.pkl.


## 2. Weather dataset preprocessing

In [187]:
# Loads the weather dataset.
# low_memory=False makes pandas infer every column's dtype from the whole file instead of
# chunk by chunk; the warning echoed under this cell presumably is the mixed-dtype
# DtypeWarning that chunked inference produces -- TODO confirm on the next run.
weather = pd.read_csv('raw_data/weather.csv', low_memory=False)

  weather = pd.read_csv('raw_data/weather.csv')


In [188]:
def filter_weather(weather):
    """
        Splits the weather dataset into the summer (June to August) observations of 2018 and 2020.

        Parameters:
            weather: DataFrame with a 'DATE' column. Values that cannot be parsed as dates are
                treated as missing and therefore end up in neither result.

        Returns:
            A tuple (summer_weather_2018, summer_weather_2020). Both are independent DataFrames
            whose 'DATE' column is datetime; the input frame is not modified.
    """

    # Parses the dates into a separate Series so the caller's frame keeps its original column.
    dates = pd.to_datetime(weather['DATE'], errors='coerce')

    # Mask for June, July and August, shared by both years (unparseable dates compare False).
    is_summer = dates.dt.month.between(6, 8)

    def filter_summer_weather(year):
        selected = is_summer & (dates.dt.year == year)
        # .assign() returns a new frame, so later in-place operations on the result
        # never act on (or warn about) a slice of the full dataset.
        return weather.loc[selected].assign(DATE=dates[selected])

    return filter_summer_weather(2018), filter_summer_weather(2020)

# Splits the full weather frame into its June-August 2018 and June-August 2020 subsets.
summer_weather_2018, summer_weather_2020 = filter_weather(weather)

In [189]:
# Groups the columns of the weather dataset by the kind of information they carry.

# Columns that identify the weather station and the date of the observation.
observation_columns = [
    'STATION', 
    'NAME', 
    'DATE'
]

# Columns with the location of the weather station (coordinates and elevation).
# NOTE: this rebinds the name `coordinates` already used in the collisions section,
# so any code reading that global behaves differently once this cell has run.
coordinates = [
    'LATITUDE', 
    'LONGITUDE', 
    'ELEVATION'
]

# Columns about the weather conditions (snowfall).
snowfall_columns = [
    'SNOW', # Snowfall.
    'SNWD'  # Snow depth.
]

# Columns about the weather conditions (temperature).
temperature_columns = [
    'TAVG', # Average temperature.
    'TMAX', # Maximum temperature.
    'TMIN'  # Minimum temperature.
]

# Columns about the weather conditions (wind).
wind_columns = [
    'AWND', # Average daily wind speed.
    'WDF2', # Direction of the fastest 2-minute wind.
    'WDF5', # Direction of the fastest 5-second wind.
    'WSF2', # Fastest 2-minute wind speed.
    'WSF5'  # Fastest 5-second wind speed.
]

# Columns about the weather conditions (precipitation).
# Only the 'PRCP' column is used; it is added directly to columns_of_interest below.

In [190]:
def filter_columns_of_interest(summer_weather, columns_of_interest):
    """
        Returns the summer weather restricted to the columns of interest.

        Every column of the frame that is not listed in columns_of_interest is dropped;
        the remaining columns keep their original order. Names in columns_of_interest
        that the frame does not contain are ignored.
    """

    # Picks out the columns that were not asked for.
    discarded = list(filter(lambda name: name not in columns_of_interest, summer_weather.columns))

    # Returns a new frame without them.
    trimmed = summer_weather.drop(columns=discarded)
    return trimmed

# Defines the columns of interest: station/date, location, snowfall, temperature, wind and precipitation.
# This rebinds the `columns_of_interest` name used earlier for the collisions dataset.
columns_of_interest = observation_columns + coordinates + snowfall_columns + temperature_columns + wind_columns + ['PRCP']

# Keeps only the columns of interest in both summer_weather_2018 and summer_weather_2020.
summer_weather_2018 = filter_columns_of_interest(summer_weather_2018, columns_of_interest)
summer_weather_2020 = filter_columns_of_interest(summer_weather_2020, columns_of_interest)

In [191]:
def replace_nan_with_zero(summer_weather):
    """
        Replaces all NaN values with 0 values.

        Parameters:
            summer_weather: DataFrame of weather observations; it is not modified.

        Returns:
            A new DataFrame in which every missing value, in any column, is 0.
    """
    # fillna() without inplace returns a new frame, so the caller's frame is left as it was
    # and no chained-assignment warning is raised when it is a slice of another frame.
    return summer_weather.fillna(0)

# Fills the missing values of both weather frames with 0.
summer_weather_2018 = replace_nan_with_zero(summer_weather_2018)
summer_weather_2020 = replace_nan_with_zero(summer_weather_2020)

In [192]:
def filter_weather_data(summer_weather):
    """
        Removes the observations without an average temperature ('TAVG') or average wind speed ('AWND').

        A reading counts as missing when it is NaN or 0; 0 is the placeholder that
        replace_nan_with_zero writes for missing values earlier in this notebook.
        NOTE(review): a genuine reading of exactly 0 is removed as well -- confirm this is acceptable.

        Parameters:
            summer_weather: DataFrame with 'TAVG' and 'AWND' columns; it is not modified.

        Returns:
            A new, independent DataFrame with only the rows that have both readings.
    """

    tavg = summer_weather['TAVG']
    awnd = summer_weather['AWND']

    # NaN != 0 evaluates to True, so missing values need their own notna() check.
    has_readings = tavg.notna() & tavg.ne(0) & awnd.notna() & awnd.ne(0)

    # .copy() detaches the result from the input so later column assignments are safe.
    return summer_weather[has_readings].copy()

# Drops, from both weather frames, the observations whose 'TAVG' or 'AWND' is 0.
summer_weather_2018 = filter_weather_data(summer_weather_2018)
summer_weather_2020 = filter_weather_data(summer_weather_2020)

In [193]:
def encode_weather_conditions(summer_weather):
    """
        Adds the boolean columns 'WINDY', 'RAINY', 'SNOWY' and 'SUNNY' describing each observation.

        WINDY: any of 'AWND', 'WDF2', 'WDF5', 'WSF2', 'WSF5' is greater than 0.
        RAINY: 'PRCP' is greater than 0.
        SNOWY: 'SNOW' or 'SNWD' is greater than 0.
        SUNNY: none of the three flags above is set.

        NOTE(review): 'WDF2' and 'WDF5' are described in this notebook as wind directions, so a
        non-zero direction alone marks a day as windy. In addition, filter_weather_data removes
        the rows whose 'AWND' is 0, so (assuming wind speeds are non-negative) WINDY is True and
        SUNNY is False for every row that reaches this step -- confirm whether a wind speed
        threshold was intended.

        Parameters:
            summer_weather: DataFrame with the weather measurement columns; it is not modified.

        Returns:
            A new DataFrame with the four flag columns appended in the order listed above.
    """
    wind_measurements = ['AWND', 'WDF2', 'WDF5', 'WSF2', 'WSF5']

    # A row is windy when at least one wind measurement is positive.
    windy = summer_weather[wind_measurements].gt(0).any(axis=1)
    # Considers only non-zero precipitation as rainy.
    rainy = summer_weather['PRCP'].gt(0)
    snowy = summer_weather['SNOW'].gt(0) | summer_weather['SNWD'].gt(0)
    sunny = ~(windy | rainy | snowy)

    # assign() builds a new frame instead of writing the columns into the caller's frame.
    return summer_weather.assign(WINDY=windy, RAINY=rainy, SNOWY=snowy, SUNNY=sunny)

# Adds the WINDY / RAINY / SNOWY / SUNNY flag columns to both weather frames.
summer_weather_2018 = encode_weather_conditions(summer_weather_2018)
summer_weather_2020 = encode_weather_conditions(summer_weather_2020)

In [194]:
# Renumbers the rows of summer_weather_2018 from 0; drop=True discards the old index
# (left over from the full dataset) instead of keeping it as a column.
summer_weather_2018.reset_index(drop=True, inplace=True)

# Same renumbering for summer_weather_2020.
summer_weather_2020.reset_index(drop=True, inplace=True)

In [195]:
# Previews the first rows of the preprocessed summer 2018 weather.
summer_weather_2018.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
0,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-01,2.9,1.3,0.0,0.0,...,27.8,17.2,170.0,50.0,5.8,8.9,True,True,False,False
1,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-02,4.1,0.3,0.0,0.0,...,30.6,19.4,110.0,100.0,10.3,12.1,True,True,False,False
2,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-03,5.8,7.6,0.0,0.0,...,19.4,11.7,100.0,90.0,10.7,12.5,True,True,False,False
3,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-04,4.4,8.9,0.0,0.0,...,22.2,11.1,50.0,50.0,8.9,10.3,True,True,False,False
4,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-05,4.2,0.0,0.0,0.0,...,25.0,15.0,290.0,290.0,10.7,14.8,True,False,False,False


In [196]:
# Previews the first rows of the preprocessed summer 2020 weather.
summer_weather_2020.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
0,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-01,4.5,0.0,0.0,0.0,...,22.2,12.2,320.0,320.0,9.8,13.4,True,False,False,False
1,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-02,3.7,0.0,0.0,0.0,...,24.4,17.2,270.0,250.0,8.1,10.3,True,False,False,False
2,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-03,4.0,4.8,0.0,0.0,...,27.8,17.2,350.0,250.0,10.7,14.3,True,True,False,False
3,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-04,3.2,3.6,0.0,0.0,...,31.1,20.6,210.0,210.0,8.1,10.7,True,True,False,False
4,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-05,3.9,6.9,0.0,0.0,...,28.3,20.6,170.0,170.0,8.1,10.3,True,True,False,False


In [197]:
def save(data, file_path):
    """
        Saves the dataset as a pickle file.

        Missing parent directories of file_path are created first, so saving does not fail
        when the output folder has not been created yet.

        Parameters:
            data: object to serialize (the preprocessed DataFrame in this notebook).
            file_path: path of the pickle file to write; an existing file is overwritten.
    """

    destination = Path(file_path)

    # parents=True builds the whole directory chain; exist_ok=True tolerates an existing folder.
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open('wb') as file:
        pickle.dump(data, file)
    print(f"The dataset has been saved in pickle format at {file_path}.")

# Persists both preprocessed weather frames as pickle files under preprocessed_data/.
save(summer_weather_2018, 'preprocessed_data/summer_weather_2018.pkl')
save(summer_weather_2020, 'preprocessed_data/summer_weather_2020.pkl')

The dataset has been saved in pickle format at preprocessed_data/summer_weather_2018.pkl.
The dataset has been saved in pickle format at preprocessed_data/summer_weather_2020.pkl.
