Importing necessary libraries

In [59]:
import pandas as pd
import numpy as np

Setting up DataFrame.

In [60]:
# Read the raw crash records from disk and report the frame's (rows, columns).
nyc_crashes = pd.read_csv(filepath_or_buffer="original-crashes.csv")
print(nyc_crashes.shape)

(100000, 29)


Dropping the columns that are not needed.
Of the contributing-factor columns, only contributing_factor_vehicle_1 is kept.

In [61]:
# Column labels to remove from the frame, one per line so the list is easy
# to scan and to diff.
to_drop = [
    # location details
    "borough",
    "zip_code",
    "location",
    "on_street_name",
    "off_street_name",
    "cross_street_name",
    # per-group injury / fatality counts
    "number_of_pedestrians_injured",
    "number_of_pedestrians_killed",
    "number_of_cyclist_injured",
    "number_of_cyclist_killed",
    "number_of_motorist_injured",
    "number_of_motorist_killed",
    # contributing factors of vehicles 2-5
    "contributing_factor_vehicle_2",
    "contributing_factor_vehicle_3",
    "contributing_factor_vehicle_4",
    "contributing_factor_vehicle_5",
    # identifiers and vehicle types
    "collision_id",
    "vehicle_type_code1",
    "vehicle_type_code2",
    "vehicle_type_code_3",
    "vehicle_type_code_4",
    "vehicle_type_code_5",
]

nyc_crashes = nyc_crashes.drop(to_drop, axis="columns")

Exploring DataFrame "nyc_crashes"

In [62]:
# Show, in order: the remaining column labels, the NaN count per column,
# and the dtype of each column.
print(
    nyc_crashes.columns,
    nyc_crashes.isna().sum(),
    nyc_crashes.dtypes,
    sep="\n",
)

Index(['crash_date', 'crash_time', 'latitude', 'longitude',
       'number_of_persons_injured', 'number_of_persons_killed',
       'contributing_factor_vehicle_1'],
      dtype='object')
crash_date                          0
crash_time                          0
latitude                         8035
longitude                        8035
number_of_persons_injured           0
number_of_persons_killed            0
contributing_factor_vehicle_1     371
dtype: int64
crash_date                        object
crash_time                        object
latitude                         float64
longitude                        float64
number_of_persons_injured          int64
number_of_persons_killed           int64
contributing_factor_vehicle_1     object
dtype: object


In [63]:
# Print the first record on a single untruncated line to see the raw value formats.
print(nyc_crashes.iloc[:1].to_string())

                crash_date crash_time   latitude  longitude  number_of_persons_injured  number_of_persons_killed   contributing_factor_vehicle_1
0  2017-04-18T00:00:00.000      23:10  40.536728 -74.193344                          0                         0  Driver Inattention/Distraction


Splitting crash_date into the separate features ["day", "month", "year"]

In [64]:
# Helper meant to be passed to Series.apply on the date column
def splitting_date(date):
    """Break a timestamp string such as "2017-04-18T00:00:00.000" into date parts.

    Everything from the first "T" onwards is ignored; the text before it is
    split on "-" and read as year, month, day.

    Returns:
        A (day, month, year) tuple of ints.
    """
    date_text = date.partition("T")[0]
    pieces = date_text.split("-")
    # pieces is [year, month, day]; the tuple is returned in day-first order
    return int(pieces[2]), int(pieces[1]), int(pieces[0])

# The three columns derived from "crash_date"
cols_to_move = ["day", "month", "year"]

# splitting_date returns one (day, month, year) tuple per row; zip(*...)
# transposes those tuples into one sequence per new column.
(
    nyc_crashes["day"],
    nyc_crashes["month"],
    nyc_crashes["year"],
) = zip(*nyc_crashes["crash_date"].apply(splitting_date))

# Sanity check: inspect min/max of the new columns
print(nyc_crashes[cols_to_move].describe())

# The original timestamp column has been replaced by the three parts
nyc_crashes = nyc_crashes.drop(columns="crash_date")

# Lead with the date parts, then every other column in its existing order
nyc_crashes = nyc_crashes[
    cols_to_move
    + [name for name in nyc_crashes.columns if name not in cols_to_move]
]

                 day          month           year
count  100000.000000  100000.000000  100000.000000
mean       16.336260       7.022990    2019.378560
std         8.959698       1.828326       0.779332
min         1.000000       2.000000    2013.000000
25%         8.000000       6.000000    2019.000000
50%        17.000000       7.000000    2020.000000
75%        24.000000       8.000000    2020.000000
max        31.000000      12.000000    2020.000000


"crash_time" to type: int.
Keeping just the hour.

In [65]:
# Reduce each time string (e.g. "23:10") to the integer in front of the first ":"
nyc_crashes["crash_time"] = nyc_crashes["crash_time"].map(
    lambda clock_text: int(clock_text.partition(":")[0])
)
print(nyc_crashes["crash_time"].value_counts())

17    7128
16    7033
14    6723
15    6302
18    6150
13    5978
12    5524
11    5041
19    4878
10    4670
9     4545
8     4405
20    4255
0     4139
21    3927
22    3639
23    3151
7     2832
6     2127
1     2024
2     1549
5     1384
3     1327
4     1269
Name: crash_time, dtype: int64


Checking how many NaNs each column contains.

In [66]:
# Count missing values column by column (summing the boolean mask down the rows)
print(nyc_crashes.isna().sum(axis="index"))

day                                 0
month                               0
year                                0
crash_time                          0
latitude                         8035
longitude                        8035
number_of_persons_injured           0
number_of_persons_killed            0
contributing_factor_vehicle_1     371
dtype: int64


Dropping rows where the columns "longitude" and "latitude" contain NaNs.

In [67]:
# A coordinate pair is only usable when both values are present, so drop a
# row as soon as either "latitude" or "longitude" is NaN. With how="all" a row
# missing just one of the two would have been kept.
nyc_crashes = nyc_crashes.dropna(subset=["latitude", "longitude"], how="any")

Dropping rows where the columns "longitude" and "latitude" contain the same value (e.g. both recorded as 0).

In [68]:
# Before filtering: NaN counts, frame shape, and the shape of the subsets
# whose longitude / latitude is exactly 0.
print(
    nyc_crashes.isna().sum(),
    nyc_crashes.shape,
    nyc_crashes.loc[nyc_crashes["longitude"].eq(0)].shape,
    nyc_crashes.loc[nyc_crashes["latitude"].eq(0)].shape,
    sep="\n",
)

# Keep only the rows whose longitude differs from their latitude
nyc_crashes = nyc_crashes.loc[nyc_crashes["longitude"].ne(nyc_crashes["latitude"])]
print(nyc_crashes.shape)

day                                0
month                              0
year                               0
crash_time                         0
latitude                           0
longitude                          0
number_of_persons_injured          0
number_of_persons_killed           0
contributing_factor_vehicle_1    355
dtype: int64
(91965, 9)
(169, 9)
(169, 9)
(91796, 9)


Getting to know the column "contributing_factor_vehicle_1"

In [69]:
print(nyc_crashes.columns)

# Distinct contributing factors, in the order value_counts() reports them
factor_lst = (
    nyc_crashes["contributing_factor_vehicle_1"]
    .value_counts()
    .index
    .tolist()
)
print(factor_lst)

Index(['day', 'month', 'year', 'crash_time', 'latitude', 'longitude',
       'number_of_persons_injured', 'number_of_persons_killed',
       'contributing_factor_vehicle_1'],
      dtype='object')
['Driver Inattention/Distraction', 'Unspecified', 'Following Too Closely', 'Failure to Yield Right-of-Way', 'Backing Unsafely', 'Passing or Lane Usage Improper', 'Passing Too Closely', 'Other Vehicular', 'Unsafe Speed', 'Unsafe Lane Changing', 'Traffic Control Disregarded', 'Turning Improperly', 'Driver Inexperience', 'Reaction to Uninvolved Vehicle', 'Alcohol Involvement', 'View Obstructed/Limited', 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion', 'Aggressive Driving/Road Rage', 'Oversized Vehicle', 'Pavement Slippery', 'Brakes Defective', 'Fell Asleep', 'Passenger Distraction', 'Obstruction/Debris', 'Outside Car Distraction', 'Steering Failure', 'Tire Failure/Inadequate', 'Lost Consciousness', 'Illnes', 'Pavement Defective', 'Glare', 'Fatigued/Drowsy', 'Failure to Keep Right', 'Anim

Removing rows whose "contributing_factor_vehicle_1" category is "Unspecified"


In [81]:
# Keep every row whose primary contributing factor is not the literal "Unspecified"
nyc_crashes = nyc_crashes.loc[
    nyc_crashes["contributing_factor_vehicle_1"].ne("Unspecified")
]

# Rebuild the category list from the filtered frame
factor_lst = (
    nyc_crashes["contributing_factor_vehicle_1"]
    .value_counts()
    .index
    .tolist()
)

# One category per line
for x in factor_lst:
    print(x)

Driver Inattention/Distraction
Following Too Closely
Failure to Yield Right-of-Way
Backing Unsafely
Passing or Lane Usage Improper
Passing Too Closely
Other Vehicular
Unsafe Speed
Unsafe Lane Changing
Traffic Control Disregarded
Turning Improperly
Driver Inexperience
Reaction to Uninvolved Vehicle
Alcohol Involvement
View Obstructed/Limited
Pedestrian/Bicyclist/Other Pedestrian Error/Confusion
Aggressive Driving/Road Rage
Oversized Vehicle
Pavement Slippery
Brakes Defective
Fell Asleep
Passenger Distraction
Obstruction/Debris
Outside Car Distraction
Steering Failure
Tire Failure/Inadequate
Lost Consciousness
Illnes
Pavement Defective
Glare
Fatigued/Drowsy
Failure to Keep Right
Animals Action
Driverless/Runaway Vehicle
Drugs (illegal)
Accelerator Defective
Traffic Control Device Improper/Non-Working
Cell Phone (hand-Held)
Lane Marking Improper/Inadequate
Physical Disability
Tow Hitch Defective
Using On Board Navigation Device
Other Lighting Defects
Prescription Medication
Vehicle Vandal

In [70]:
# Function to narrow down categories
def narrowing_down_factor(factor):
    distraction = ['Driver Inattention/Distraction','Passenger Distraction',]
    driver_mistake = ['Following Too Closely','Failure to Yield Right-of-Way','Backing Unsafely',
                      'Passing or Lane Usage Improper','Passing Too Closely','Unsafe Lane Changing',
                      'Turning Improperly','Driver Inexperience',#'Oversized Vehicle'#, ]
    illegal_action = ['Unsafe Speed','Alcohol Involvement','Traffic Control Disregarded','Aggressive Driving/Road Rage',
                      ]
    other_involvement =  ['Other Vehicular','Reaction to Uninvolved Vehicle',
                          'View Obstructed/Limited','Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
                          ]