# Imports & settings

In [None]:
import pandas as pd
import numpy as np

# Preprocessing

## 1. Collisions dataset preprocessing

In [None]:
# Reads the collision records from the CSV file into a DataFrame.
collisions = pd.read_csv(filepath_or_buffer='data/collisions.csv')

In [None]:
def filter_collisions(collisions):
    """
        Extracts the summer (June to August) collisions of 2018 and of 2020.

        Side effect: the 'CRASH DATE' column of the given DataFrame is converted
        to datetime in place, so the caller's DataFrame is modified.

        Parameters:
            collisions: DataFrame with a 'CRASH DATE' column parseable with the
                '%m/%d/%Y' format.

        Returns:
            A tuple (summer_collisions_2018, summer_collisions_2020). Both are
            independent copies, so adding columns to them later neither touches
            `collisions` nor triggers chained-assignment warnings.
    """

    # Converts the 'CRASH DATE' column to a datetime format (in place, see docstring).
    collisions['CRASH DATE'] = pd.to_datetime(collisions['CRASH DATE'], format='%m/%d/%Y')

    crash_dates = collisions['CRASH DATE']

    # The summer mask does not depend on the year, so it is computed only once.
    # `between` is inclusive on both ends: June (6) through August (8).
    in_summer = crash_dates.dt.month.between(6, 8)

    # Selects the summer rows of a given year as a standalone DataFrame.
    def filter_summer_collisions(year):
        return collisions[in_summer & (crash_dates.dt.year == year)].copy()

    summer_collisions_2018 = filter_summer_collisions(2018)
    summer_collisions_2020 = filter_summer_collisions(2020)

    return summer_collisions_2018, summer_collisions_2020

# Splits the loaded collisions into the summer 2018 and summer 2020 datasets.
(
    summer_collisions_2018,
    summer_collisions_2020,
) = filter_collisions(collisions)

In [None]:
# Groups the columns of the dataset by the kind of information they hold.

# Date and time of the collision, plus its borough, ZIP code and record ID.
time_columns = ['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'COLLISION_ID']

# Street names that locate the collision.
location_columns = ['ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME']

# Geographic coordinates of the collision.
coordinates = ['LATITUDE', 'LONGITUDE']

# Injured/killed counts: one pair of columns per group of people.
people_involved = [
    f'NUMBER OF {group} {outcome}'
    for group in ('PERSONS', 'PEDESTRIANS', 'CYCLIST', 'MOTORIST')
    for outcome in ('INJURED', 'KILLED')
]

# Contributing factors, one column per vehicle slot (1 to 5).
contributing_factor_vehicle = [
    f'CONTRIBUTING FACTOR VEHICLE {slot}' for slot in range(1, 6)
]

# Vehicle types, one column per vehicle slot (1 to 5).
vehicle_type = [f'VEHICLE TYPE CODE {slot}' for slot in range(1, 6)]

In [None]:
def filter_columns_of_interest(summer_collisions, columns_of_interest):
    """
        Returns the summer collisions restricted to the columns of interest.

        Columns keep their original order; names in columns_of_interest that
        are absent from the dataset are ignored.
    """

    # Flags every column of the dataset whose name is listed in columns_of_interest.
    all_columns = summer_collisions.columns
    is_of_interest = all_columns.isin(columns_of_interest)

    # Drops the remaining columns and hands back the resulting DataFrame.
    return summer_collisions.drop(columns=all_columns[~is_of_interest])

# Defines columns of interest.
columns_of_interest = time_columns + coordinates + vehicle_type

# Filters columns of interest for both summer_collisions_2018 and summer_collisions_2020.
# The column list must be passed explicitly: filter_columns_of_interest has no default
# for it, so calling it with the DataFrame alone raises a TypeError.
summer_collisions_2018 = filter_columns_of_interest(summer_collisions_2018, columns_of_interest)
summer_collisions_2020 = filter_columns_of_interest(summer_collisions_2020, columns_of_interest)

In [None]:
def manage_missing_values(summer_collisions):
    """
        Removes the rows that lack required information.

        'Unspecified' entries are treated as missing values. A row is kept only
        if every column in time_columns and coordinates is present and at least
        two of the vehicle_type columns are present.
    """

    return (
        summer_collisions
        # Treats 'Unspecified' as a missing value.
        .replace('Unspecified', np.nan)
        # Requires every time_columns value, then every coordinates value.
        .dropna(subset=time_columns)
        .dropna(subset=coordinates)
        # Requires at least two non-missing vehicle types.
        .dropna(subset=vehicle_type, thresh=2)
    )

# Removes the rows with missing required information from both summer datasets,
# leaving the data needed to conduct the visualizations.
summer_collisions_2018, summer_collisions_2020 = (
    manage_missing_values(dataset)
    for dataset in (summer_collisions_2018, summer_collisions_2020)
)

In [None]:
def add_daytype(summer_collisions):
    """
        Adds a 'DAY TYPE' column indicating whether each collision happened on a weekend.

        The value is 1 for Saturday/Sunday and 0 for Monday to Friday. The
        computed values are unchanged from the previous implementation; only the
        description is corrected (it used to state the opposite encoding).

        Parameters:
            summer_collisions: DataFrame with a datetime 'CRASH DATE' column.

        Returns:
            A copy of the DataFrame with the extra 'DAY TYPE' column. The input
            DataFrame is left untouched.
    """

    # Works on a copy so the caller's DataFrame is not modified.
    result = summer_collisions.copy()

    # dayofweek runs from Monday (0) to Sunday (6), so values >= 5 are the weekend.
    result['DAY TYPE'] = (result['CRASH DATE'].dt.dayofweek >= 5).astype(int)

    return result

# Appends the day type column to both summer datasets.
summer_collisions_2018, summer_collisions_2020 = (
    add_daytype(dataset)
    for dataset in (summer_collisions_2018, summer_collisions_2020)
)

In [None]:
def add_covid_restrictions(summer_collisions, threshold_date):
    """
        Adds a 'COVID-19 RESTRICTIONS' column indicating whether each collision
        happened on or after the given threshold date.

        Parameters:
            summer_collisions: DataFrame with a datetime 'CRASH DATE' column.
            threshold_date: date from which the restrictions are considered in force.

        Returns:
            A copy of the DataFrame with the extra 'COVID-19 RESTRICTIONS' column
            (1 if 'CRASH DATE' >= threshold_date, 0 otherwise). The input
            DataFrame is left untouched.
    """

    # Works on a copy so the caller's DataFrame is not modified.
    result = summer_collisions.copy()

    # The threshold date itself counts as restricted (inclusive comparison).
    result['COVID-19 RESTRICTIONS'] = (result['CRASH DATE'] >= threshold_date).astype(int)

    return result

# Date from which collisions are flagged as happening under COVID-19 restrictions.
covid_threshold_date = pd.to_datetime('2020-03-15')

# Appends the COVID-19 restriction column to both summer datasets.
summer_collisions_2018, summer_collisions_2020 = (
    add_covid_restrictions(dataset, covid_threshold_date)
    for dataset in (summer_collisions_2018, summer_collisions_2020)
)

## 2. Weather dataset preprocessing