# Imports & settings

In [156]:
import pandas as pd
import numpy as np

# Collisions dataset preprocessing

In [157]:
# Loads the downloaded raw collision data (entire dataset).
collisions = pd.read_csv('data/collisions.csv')

  collisions = pd.read_csv('data/collisions.csv')


In [158]:
# Dataset inspection.
print('The collisions raw dataset shape is: ', collisions.shape)

The collisions raw dataset shape is:  (2034951, 29)


In [159]:
print('The collisions raw dataset columns are: ', collisions.columns)

The collisions raw dataset columns are:  Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')


In [160]:
print('Collisions raw dataset variable types: ', collisions.dtypes)

Collisions raw dataset variable types:  CRASH DATE                        object
CRASH TIME                        object
BOROUGH                           object
ZIP CODE                          object
LATITUDE                         float64
LONGITUDE                        float64
LOCATION                          object
ON STREET NAME                    object
CROSS STREET NAME                 object
OFF STREET NAME                   object
NUMBER OF PERSONS INJURED        float64
NUMBER OF PERSONS KILLED         float64
NUMBER OF PEDESTRIANS INJURED      int64
NUMBER OF PEDESTRIANS KILLED       int64
NUMBER OF CYCLIST INJURED          int64
NUMBER OF CYCLIST KILLED           int64
NUMBER OF MOTORIST INJURED         int64
NUMBER OF MOTORIST KILLED          int64
CONTRIBUTING FACTOR VEHICLE 1     object
CONTRIBUTING FACTOR VEHICLE 2     object
CONTRIBUTING FACTOR VEHICLE 3     object
CONTRIBUTING FACTOR VEHICLE 4     object
CONTRIBUTING FACTOR VEHICLE 5     object
COLLISION_ID     

In [161]:
print('Some rows of the collisions raw dataset: ', collisions.head())

Some rows of the collisions raw dataset:     CRASH DATE CRASH TIME   BOROUGH ZIP CODE   LATITUDE  LONGITUDE  \
0  09/11/2021       2:39       NaN      NaN        NaN        NaN   
1  03/26/2022      11:45       NaN      NaN        NaN        NaN   
2  06/29/2022       6:55       NaN      NaN        NaN        NaN   
3  09/11/2021       9:35  BROOKLYN  11208.0  40.667202 -73.866500   
4  12/14/2021       8:13  BROOKLYN  11233.0  40.683304 -73.917274   

                  LOCATION           ON STREET NAME CROSS STREET NAME  \
0                      NaN    WHITESTONE EXPRESSWAY         20 AVENUE   
1                      NaN  QUEENSBORO BRIDGE UPPER               NaN   
2                      NaN       THROGS NECK BRIDGE               NaN   
3    (40.667202, -73.8665)                      NaN               NaN   
4  (40.683304, -73.917274)          SARATOGA AVENUE    DECATUR STREET   

           OFF STREET NAME  ...  CONTRIBUTING FACTOR VEHICLE 2  \
0                      NaN  ...       

In [162]:
# Convert the "CRASH DATE" column to a datetime format.
collisions['CRASH DATE'] = pd.to_datetime(collisions['CRASH DATE'], format='%m/%d/%Y')

# Filter the data for the year 2018.
collisions_2018 = collisions[collisions['CRASH DATE'].dt.year == 2018]

# Filter the data for the year 2020.
collisions_2020 = collisions[collisions['CRASH DATE'].dt.year == 2020]

In [163]:
# Study columns missing values:
collisions = collisions_2020
missing_values = collisions.isna().sum()
missing_values

CRASH DATE                            0
CRASH TIME                            0
BOROUGH                           39228
ZIP CODE                          39236
LATITUDE                           8917
LONGITUDE                          8917
LOCATION                           8917
ON STREET NAME                    29460
CROSS STREET NAME                 59628
OFF STREET NAME                   83456
NUMBER OF PERSONS INJURED             0
NUMBER OF PERSONS KILLED              0
NUMBER OF PEDESTRIANS INJURED         0
NUMBER OF PEDESTRIANS KILLED          0
NUMBER OF CYCLIST INJURED             0
NUMBER OF CYCLIST KILLED              0
NUMBER OF MOTORIST INJURED            0
NUMBER OF MOTORIST KILLED             0
CONTRIBUTING FACTOR VEHICLE 1       512
CONTRIBUTING FACTOR VEHICLE 2     24624
CONTRIBUTING FACTOR VEHICLE 3    102208
CONTRIBUTING FACTOR VEHICLE 4    110006
CONTRIBUTING FACTOR VEHICLE 5    112063
COLLISION_ID                          0
VEHICLE TYPE CODE 1                1073


In [164]:
# The following are the columns we are interestedd in to perform the visualizations.
# Key columns we need to identify the collision.
key_columns = [
    'CRASH DATE',
    'CRASH TIME',
    'BOROUGH',
    'ZIP CODE',
    'ON STREET NAME',
    'CROSS STREET NAME',
    'OFF STREET NAME',
    'COLLISION_ID']

# Coordinates locate collision univocaly.
coordinates = [
    'LATITUDE',
    'LONGITUDE'
]

# Columns we need to identify the vehicles involved in the collision.
contributing_factor_vehicle = [
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5'
]

vehicle_type = [
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5'
]

# Columns we need to identify the people involved in the collision.
people_involved = [
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED',
    'NUMBER OF MOTORIST KILLED'
]

In [165]:
# We create a new dataset with the columns we are interested in.
collisions = collisions[key_columns+coordinates+contributing_factor_vehicle+vehicle_type]

In [166]:
# We remove all rows that have missing values.
collisions = collisions.replace('Unspecified', np.nan)
collisions = collisions.dropna(subset=coordinates, how='any') # We remove all rows that haven't specified coordinates.
collisions = collisions.dropna(subset=vehicle_type, thresh=1) # We remove all rows that haven't sepecified at least one vehicle type.
collisions = collisions.dropna(subset=contributing_factor_vehicle, thresh=1) # We remove all rows that haven't sepecified at least one contributing factor.
missing_values = collisions.isna().sum()
missing_values
print('The collisions dataset with useful columns shape is: ', collisions.shape)

The collisions dataset with useful columns shape is:  (75422, 20)


In [167]:
memory_usage = collisions.memory_usage(deep=True).sum() / (1024**2)  # Convert to megabytes
print('Memory usage is: ', memory_usage)

Memory usage is:  58.8120174407959
