# EDA notebook

In [21]:
# Standard imports

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [22]:
# Geo imports

from shapely.geometry import Point, Polygon, shape
import geopandas as gpd

In [2]:
# Set pandas preferences

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

### Functions

In [28]:
def neighborhood_json(point):
    '''
    Return the name of the NYC neighborhood whose geometry contains a point.

    Walks every feature of the module-level ``nycmap`` GeoJSON dict (it must be
    loaded before this function is called), builds a shapely geometry from the
    feature's ``geometry`` entry and tests whether that geometry contains ``point``.

    Parameters
    ----------
    point : shapely.geometry.Point
        Location to look up. Elsewhere in this notebook points are built as
        Point(longitude, latitude).

    Returns
    -------
    The ``ntaname`` property of the first feature that contains ``point``,
    or None when no feature contains it.
    '''
    for feature in nycmap['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['ntaname']
    # No feature contains the point: make the "not found" result explicit.
    return None

In [29]:
def borough_json(point):
    '''
    Return the name of the NYC borough whose neighborhood geometry contains a point.

    Same lookup as the neighborhood version, except that the borough name is
    returned instead of the neighborhood name. Walks every feature of the
    module-level ``nycmap`` GeoJSON dict (it must be loaded before this function
    is called), builds a shapely geometry from the feature's ``geometry`` entry
    and tests whether that geometry contains ``point``.

    Parameters
    ----------
    point : shapely.geometry.Point
        Location to look up. Elsewhere in this notebook points are built as
        Point(longitude, latitude).

    Returns
    -------
    The ``boro_name`` property of the first feature that contains ``point``,
    or None when no feature contains it.
    '''
    for feature in nycmap['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['boro_name']
    # No feature contains the point: make the "not found" result explicit.
    return None

In [3]:
# Import data

# low_memory=False parses the file in a single pass rather than in chunks,
# which avoids mixed-type inference within a column.
df_collisions = pd.read_csv(
    filepath_or_buffer='./data/nypd-motor-vehicle-collisions.csv',
    low_memory=False,
)

In [4]:
# Preview the first rows of the freshly loaded collisions frame
df_collisions.head()

Unnamed: 0,ACCIDENT DATE,ACCIDENT TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2019-08-05T00:00:00.000,16:30,QUEENS,11434,40.676052,-73.790184,"{'type': 'Point', 'coordinates': [-73.790184, ...",,,150-08 123 AVENUE,0.0,0.0,0,0,0,0,0,0,Passing Too Closely,Unspecified,,,,4184637,Sedan,Pick-up Truck,,,
1,2019-08-27T00:00:00.000,16:02,BROOKLYN,11225,40.65778,-73.951096,"{'type': 'Point', 'coordinates': [-73.951096, ...",,,288 HAWTHORNE STREET,0.0,0.0,0,0,0,0,0,0,Passing Too Closely,Unspecified,,,,4195773,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
2,2019-08-15T00:00:00.000,17:57,MANHATTAN,10002,40.718143,-73.993835,"{'type': 'Point', 'coordinates': [-73.993835, ...",CHRYSTIE STREET,GRAND STREET,,1.0,0.0,0,0,0,0,1,0,Driver Inattention/Distraction,,,,,4202457,Sedan,,,,
3,2019-08-30T00:00:00.000,21:53,BRONX,10460,40.840534,-73.86661,"{'type': 'Point', 'coordinates': [-73.86661, 4...",,,1837 EAST TREMONT AVENUE,0.0,0.0,0,0,0,0,0,0,Unspecified,Unspecified,,,,4198749,Taxi,Station Wagon/Sport Utility Vehicle,,,
4,2019-08-06T00:00:00.000,9:45,MANHATTAN,10016,40.74544,-73.9754,"{'type': 'Point', 'coordinates': [-73.9754, 40...",EAST 35 STREET,2 AVENUE,,1.0,0.0,0,0,1,0,0,0,Driver Inattention/Distraction,Driver Inattention/Distraction,,,,4183798,Station Wagon/Sport Utility Vehicle,Bike,,,


In [5]:
# Summarize column dtypes and non-null counts to spot missing data
df_collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 29 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   ACCIDENT DATE                  1612178 non-null  object 
 1   ACCIDENT TIME                  1612178 non-null  object 
 2   BOROUGH                        1127553 non-null  object 
 3   ZIP CODE                       1127376 non-null  object 
 4   LATITUDE                       1415893 non-null  float64
 5   LONGITUDE                      1415893 non-null  float64
 6   LOCATION                       1415893 non-null  object 
 7   ON STREET NAME                 1298002 non-null  object 
 8   CROSS STREET NAME              1079193 non-null  object 
 9   OFF STREET NAME                219732 non-null   object 
 10  NUMBER OF PERSONS INJURED      1612161 non-null  float64
 11  NUMBER OF PERSONS KILLED       1612145 non-null  float64
 12  NUMBER OF PEDE

In [6]:
# Convert accident date to a datetime object

# Parse the date strings once and store the result back in the same column.
df_collisions['ACCIDENT DATE'] = df_collisions['ACCIDENT DATE'].pipe(pd.to_datetime)

In [7]:
# There is data from 2012 through 2019 in this dataset
# NOTE(review): `datetime_is_numeric` is only accepted by pandas 1.x; pandas 2.0 removed the
# argument (datetimes are always summarized numerically there) -- drop it when upgrading.

df_collisions['ACCIDENT DATE'].describe(datetime_is_numeric=True)

count                          1612178
mean     2016-04-03 12:22:05.011009536
min                2012-07-01 00:00:00
25%                2014-06-15 00:00:00
50%                2016-04-10 00:00:00
75%                2018-02-21 00:00:00
max                2019-11-26 00:00:00
Name: ACCIDENT DATE, dtype: object

### Target variable

- This dataset shows extreme class imbalance with 99.9% of traffic incidents not resulting in a fatality

In [9]:
# Number of collision records for each 'persons killed' value
df_collisions['NUMBER OF PERSONS KILLED'].value_counts()

0.0    1610308
1.0       1783
2.0         42
3.0          8
4.0          2
8.0          1
5.0          1
Name: NUMBER OF PERSONS KILLED, dtype: int64

In [17]:
# Same distribution expressed as relative frequencies instead of raw counts
df_collisions['NUMBER OF PERSONS KILLED'].value_counts(normalize=True)

0.0    9.988605e-01
1.0    1.105980e-03
2.0    2.605225e-05
3.0    4.962333e-06
4.0    1.240583e-06
8.0    6.202916e-07
5.0    6.202916e-07
Name: NUMBER OF PERSONS KILLED, dtype: float64

In [11]:
# Number of collision records for each 'pedestrians killed' value
df_collisions['NUMBER OF PEDESTRIANS KILLED'].value_counts()

0    1611180
1        984
2         13
6          1
Name: NUMBER OF PEDESTRIANS KILLED, dtype: int64

In [12]:
# Number of collision records for each 'cyclist killed' value
df_collisions['NUMBER OF CYCLIST KILLED'].value_counts()

0    1612030
1        147
2          1
Name: NUMBER OF CYCLIST KILLED, dtype: int64

In [14]:
# Number of collision records for each 'motorist killed' value
df_collisions['NUMBER OF MOTORIST KILLED'].value_counts()

0    1611480
1        662
2         26
3          8
5          1
4          1
Name: NUMBER OF MOTORIST KILLED, dtype: int64

In [15]:
# Confirm that 'NUMBER OF PERSONS KILLED' is not additive with the other death columns,
# i.e. that it already counts the deaths reported in the per-category columns.
# Look for collisions where at least one pedestrian died but no person death was recorded.
# `> 0` (rather than `== 1`) so that records with several pedestrian deaths are checked too.

df_overlap = df_collisions[(df_collisions['NUMBER OF PEDESTRIANS KILLED'] > 0) &
                           (df_collisions['NUMBER OF PERSONS KILLED'] == 0)]

# Zero rows here means every pedestrian death is also counted as a 'person' death.
df_overlap.shape

(0, 29)

## Add missing neighborhoods

In [23]:
# Read in JSON map of NYC neighborhoods

# Open the file in a context manager so the handle is closed as soon as parsing finishes
# (the bare json.load(open(...)) form leaves the file open).
with open('./nyc_geo_data/2010 Neighborhood Tabulation Areas (NTAs).geojson') as geojson_file:
    nycmap = json.load(geojson_file)

In [27]:
# Create a 'point' variable column out of the longitude and latitude

# Zip the two coordinate columns instead of using a row-wise DataFrame.apply(axis=1):
# the resulting points are the same, but no per-row Series is built for each record.
# NOTE: rows without coordinates still get a Point built from NaN values, as before.
df_collisions['point_coordinates'] = [
    Point(longitude, latitude)
    for longitude, latitude in zip(df_collisions['LONGITUDE'], df_collisions['LATITUDE'])
]

df_collisions.head()

  values = construct_1d_object_array_from_listlike(values)


Unnamed: 0,ACCIDENT DATE,ACCIDENT TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,point_coordinates
0,2019-08-05,16:30,QUEENS,11434,40.676052,-73.790184,"{'type': 'Point', 'coordinates': [-73.790184, ...",,,150-08 123 AVENUE,0.0,0.0,0,0,0,0,0,0,Passing Too Closely,Unspecified,,,,4184637,Sedan,Pick-up Truck,,,,POINT (-73.790184 40.676052)
1,2019-08-27,16:02,BROOKLYN,11225,40.65778,-73.951096,"{'type': 'Point', 'coordinates': [-73.951096, ...",,,288 HAWTHORNE STREET,0.0,0.0,0,0,0,0,0,0,Passing Too Closely,Unspecified,,,,4195773,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,POINT (-73.95109599999999 40.65778)
2,2019-08-15,17:57,MANHATTAN,10002,40.718143,-73.993835,"{'type': 'Point', 'coordinates': [-73.993835, ...",CHRYSTIE STREET,GRAND STREET,,1.0,0.0,0,0,0,0,1,0,Driver Inattention/Distraction,,,,,4202457,Sedan,,,,,POINT (-73.993835 40.718143)
3,2019-08-30,21:53,BRONX,10460,40.840534,-73.86661,"{'type': 'Point', 'coordinates': [-73.86661, 4...",,,1837 EAST TREMONT AVENUE,0.0,0.0,0,0,0,0,0,0,Unspecified,Unspecified,,,,4198749,Taxi,Station Wagon/Sport Utility Vehicle,,,,POINT (-73.86660999999999 40.84053400000001)
4,2019-08-06,9:45,MANHATTAN,10016,40.74544,-73.9754,"{'type': 'Point', 'coordinates': [-73.9754, 40...",EAST 35 STREET,2 AVENUE,,1.0,0.0,0,0,1,0,0,0,Driver Inattention/Distraction,Driver Inattention/Distraction,,,,4183798,Station Wagon/Sport Utility Vehicle,Bike,,,,POINT (-73.97539999999999 40.74544)


In [None]:
# Apply neighborhood function to the coordinates

# Pass the function itself; wrapping it in `lambda x: neighborhood_json(x)` adds nothing.
# The result is None for points that fall inside no neighborhood geometry.
df_collisions['neighborhood'] = df_collisions['point_coordinates'].apply(neighborhood_json)

In [None]:
# Apply borough function to the coordinates

# Pass the function itself; wrapping it in `lambda x: borough_json(x)` adds nothing.
# NOTE: this creates a lowercase 'borough' column next to the original 'BOROUGH' column.
df_collisions['borough'] = df_collisions['point_coordinates'].apply(borough_json)