Importing necessary libraries

In [345]:
import pandas as pd

Setting up DataFrame.

In [346]:
nyc_crashes = pd.read_csv("original-crashes.csv")
print(nyc_crashes.shape)

(100000, 29)


Dropping columns.
Keeping column contributing_factor_vehicle_1

In [347]:
to_drop = \
    ["borough","zip_code","location","on_street_name",
     "off_street_name","cross_street_name", "number_of_pedestrians_injured",
     "number_of_pedestrians_killed", "number_of_cyclist_injured",
     "number_of_cyclist_killed", "number_of_motorist_injured",
     "number_of_motorist_killed", "collision_id","vehicle_type_code1","vehicle_type_code2",
     "vehicle_type_code_3","vehicle_type_code_4","vehicle_type_code_5"]

nyc_crashes = nyc_crashes.drop(columns=to_drop)

Exploring DataFrame "nyc_crashes"

In [348]:
print(nyc_crashes.columns)
print(nyc_crashes.isna().sum())
print(nyc_crashes.dtypes)

Index(['crash_date', 'crash_time', 'latitude', 'longitude',
       'number_of_persons_injured', 'number_of_persons_killed',
       'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
       'contributing_factor_vehicle_3', 'contributing_factor_vehicle_4',
       'contributing_factor_vehicle_5'],
      dtype='object')
crash_date                           0
crash_time                           0
latitude                          8035
longitude                         8035
number_of_persons_injured            0
number_of_persons_killed             0
contributing_factor_vehicle_1      371
contributing_factor_vehicle_2    19243
contributing_factor_vehicle_3    91239
contributing_factor_vehicle_4    97760
contributing_factor_vehicle_5    99333
dtype: int64
crash_date                        object
crash_time                        object
latitude                         float64
longitude                        float64
number_of_persons_injured          int64
number_of_persons_ki

In [349]:
print(nyc_crashes.head(1).to_string())

                crash_date crash_time   latitude  longitude  number_of_persons_injured  number_of_persons_killed   contributing_factor_vehicle_1 contributing_factor_vehicle_2 contributing_factor_vehicle_3 contributing_factor_vehicle_4 contributing_factor_vehicle_5
0  2017-04-18T00:00:00.000      23:10  40.536728 -74.193344                          0                         0  Driver Inattention/Distraction                   Unspecified                           NaN                           NaN                           NaN


Feature slicing crash_date into ["day", "month", "year"]

In [350]:
# Function for the apply-method
def splitting_date(date):
    date = date.split(sep="T")
    date_lst = date[0].split(sep="-")
    day = int(date_lst[2])
    month = int(date_lst[1])
    year = int(date_lst[0])
    return day, month, year

nyc_crashes["day"], nyc_crashes["month"], nyc_crashes["year"] \
    = zip(*nyc_crashes["crash_date"].apply(lambda x: splitting_date(x)))

# Checking if the new columns have the correct min-max
print(nyc_crashes[["day","month","year"]].describe())

# Dropping "crash_date" column.
nyc_crashes = nyc_crashes.drop(columns=["crash_date"])

# Bringing new columns to the front
cols_to_move = ["day", "month","year"]
nyc_crashes = nyc_crashes[cols_to_move + [col for col in nyc_crashes.columns if col not in cols_to_move]]

                 day          month           year
count  100000.000000  100000.000000  100000.000000
mean       16.336260       7.022990    2019.378560
std         8.959698       1.828326       0.779332
min         1.000000       2.000000    2013.000000
25%         8.000000       6.000000    2019.000000
50%        17.000000       7.000000    2020.000000
75%        24.000000       8.000000    2020.000000
max        31.000000      12.000000    2020.000000


"crash_time" to type: int.
Keeping just the hour.

In [351]:
nyc_crashes["crash_time"] \
    = nyc_crashes["crash_time"].apply(lambda x: int(x.split(sep=":")[0]))
print(nyc_crashes.crash_time.value_counts())

17    7128
16    7033
14    6723
15    6302
18    6150
13    5978
12    5524
11    5041
19    4878
10    4670
9     4545
8     4405
20    4255
0     4139
21    3927
22    3639
23    3151
7     2832
6     2127
1     2024
2     1549
5     1384
3     1327
4     1269
Name: crash_time, dtype: int64


Checking for rows that contains NaN's.

In [352]:
print(nyc_crashes.isna().sum())

day                                  0
month                                0
year                                 0
crash_time                           0
latitude                          8035
longitude                         8035
number_of_persons_injured            0
number_of_persons_killed             0
contributing_factor_vehicle_1      371
contributing_factor_vehicle_2    19243
contributing_factor_vehicle_3    91239
contributing_factor_vehicle_4    97760
contributing_factor_vehicle_5    99333
dtype: int64


Dropping rows where the columns "longitude", and "latitude" contain NaN"s.

In [353]:
nyc_crashes = nyc_crashes.dropna(subset=["latitude","longitude"],how="all")

Dropping rows where the columns "longitude", and "latitude" contain same values.

In [354]:
print(nyc_crashes.isna().sum())
print(nyc_crashes.shape)
print(nyc_crashes[nyc_crashes["longitude"] == 0].shape)
print(nyc_crashes[nyc_crashes["latitude"] == 0].shape)
nyc_crashes = nyc_crashes[nyc_crashes["longitude"] != nyc_crashes["latitude"]]
print(nyc_crashes.shape)

day                                  0
month                                0
year                                 0
crash_time                           0
latitude                             0
longitude                            0
number_of_persons_injured            0
number_of_persons_killed             0
contributing_factor_vehicle_1      355
contributing_factor_vehicle_2    17754
contributing_factor_vehicle_3    83966
contributing_factor_vehicle_4    89906
contributing_factor_vehicle_5    91338
dtype: int64
(91965, 13)
(169, 13)
(169, 13)
(91796, 13)


Getting to know the columns "contributing_factor_vehicle"

In [355]:
print(nyc_crashes.columns)
factor_lst = nyc_crashes["contributing_factor_vehicle_1"].value_counts().index.to_list()
print(factor_lst)

Index(['day', 'month', 'year', 'crash_time', 'latitude', 'longitude',
       'number_of_persons_injured', 'number_of_persons_killed',
       'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
       'contributing_factor_vehicle_3', 'contributing_factor_vehicle_4',
       'contributing_factor_vehicle_5'],
      dtype='object')
['Driver Inattention/Distraction', 'Unspecified', 'Following Too Closely', 'Failure to Yield Right-of-Way', 'Backing Unsafely', 'Passing or Lane Usage Improper', 'Passing Too Closely', 'Other Vehicular', 'Unsafe Speed', 'Unsafe Lane Changing', 'Traffic Control Disregarded', 'Turning Improperly', 'Driver Inexperience', 'Reaction to Uninvolved Vehicle', 'Alcohol Involvement', 'View Obstructed/Limited', 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion', 'Aggressive Driving/Road Rage', 'Oversized Vehicle', 'Pavement Slippery', 'Brakes Defective', 'Fell Asleep', 'Passenger Distraction', 'Obstruction/Debris', 'Outside Car Distraction', 'Steering Fai

Dropping rows if all values in factor columns are NaN

In [356]:
print(nyc_crashes.shape)
nyc_crashes = nyc_crashes.dropna(subset=["contributing_factor_vehicle_1",
"contributing_factor_vehicle_2",
"contributing_factor_vehicle_3",
"contributing_factor_vehicle_4",
"contributing_factor_vehicle_5"], how="all")
print(nyc_crashes.shape)

(91796, 13)
(91442, 13)


Making new column for each car, combining the factors into categories.

In [357]:
# Function to narrow down categories
def narrowing_down_factor(factor):
    distraction = ['Driver Inattention/Distraction','Passenger Distraction','Fell Asleep','Outside Car Distraction',
                   'Fatigued/Drowsy','Cell Phone (hand-Held)','Using On Board Navigation Device',
                   'Tinted Windows','Eating or Drinking', 'Other Electronic Device', 'Cell Phone (hands-free)',
                   'Listening/Using Headphones']
    driver_mistake = ['Following Too Closely','Failure to Yield Right-of-Way','Backing Unsafely',
                      'Passing or Lane Usage Improper','Passing Too Closely','Unsafe Lane Changing',
                      'Turning Improperly','Driver Inexperience', 'Failure to Keep Right',
                      'Driverless/Runaway Vehicle', 'Oversized Vehicle']
    illegal_action = ['Unsafe Speed','Alcohol Involvement','Traffic Control Disregarded','Aggressive Driving/Road Rage',
                      'Drugs (illegal)']
    other_involvement =  ['Other Vehicular','Reaction to Uninvolved Vehicle', 'Animals Action',
                          'View Obstructed/Limited','Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
                           'Glare','Vehicle Vandalism']
    bad_road_infrastructure = ['Pavement Slippery', 'Obstruction/Debris','Pavement Defective',
                               'Other Lighting Defects']
    car_failures = ['Brakes Defective', 'Steering Failure', 'Tire Failure/Inadequate',
                    'Traffic Control Device Improper/Non-Working', 'Lane Marking Improper/Inadequate',
                    'Tow Hitch Defective','Headlights Defective', 'Shoulders Defective/Improper',
                    'Windshield Inadequate']
    medical = ['Lost Consciousness', 'Illnes','Accelerator Defective','Physical Disability','Prescription Medication']
    if factor in medical:
        return "medical"
    elif factor in car_failures:
        return "car_failure"
    elif factor in bad_road_infrastructure:
        return "bad_road_infrastructure"
    elif factor in other_involvement:
        return "other_involvement"
    elif factor in illegal_action:
        return "illegal_action"
    elif factor in driver_mistake:
        return "driver_mistake"
    elif factor in distraction:
        return "distraction"

nyc_crashes["factor1"] = nyc_crashes["contributing_factor_vehicle_1"].apply(lambda x: narrowing_down_factor(x))
nyc_crashes["factor2"] = nyc_crashes["contributing_factor_vehicle_2"].apply(lambda x: narrowing_down_factor(x))
nyc_crashes["factor3"] = nyc_crashes["contributing_factor_vehicle_3"].apply(lambda x: narrowing_down_factor(x))
nyc_crashes["factor4"] = nyc_crashes["contributing_factor_vehicle_4"].apply(lambda x: narrowing_down_factor(x))
nyc_crashes["factor5"] = nyc_crashes["contributing_factor_vehicle_5"].apply(lambda x: narrowing_down_factor(x))

Double checking for errors

In [358]:
print(nyc_crashes["factor1"].isna().sum())
print(nyc_crashes["factor2"].isna().sum())
print(nyc_crashes["factor3"].isna().sum())
print(nyc_crashes["factor4"].isna().sum())
print(nyc_crashes["factor5"].isna().sum())

23464
79644
90959
91328
91405


"Hot 'n coding" the factor columns for each car.

In [359]:
nyc_crashes["factor1"] = pd.get_dummies(nyc_crashes["factor1"], prefix="1_")
nyc_crashes["factor2"] = pd.get_dummies(nyc_crashes["factor2"], prefix="2_")
nyc_crashes["factor3"] = pd.get_dummies(nyc_crashes["factor3"], prefix="3_")
nyc_crashes["factor4"] = pd.get_dummies(nyc_crashes["factor4"], prefix="4_")
nyc_crashes["factor5"] = pd.get_dummies(nyc_crashes["factor5"], prefix="5_")

Flattening the factors of every car together.

In [360]:
nyc_crashes = nyc_crashes.drop(columns=['contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
       'contributing_factor_vehicle_3', 'contributing_factor_vehicle_4',
       'contributing_factor_vehicle_5'])
nyc_crashes_lst = nyc_crashes.columns

sum_column = df["col1"] + df["col2"]

In [361]:
def adding_columns(common_column_name: str):
    name = []
    for column in nyc_crashes_lst:

        if column.endswith(common_column_name):
            name.append(column)
    return name



In [362]:
nyc_crashes["bad_road_infrastructure"] = nyc_crashes[adding_columns("bad_road_infrastructure")].sum(axis=1)
nyc_crashes["car_failure"] = nyc_crashes[adding_columns("car_failure")].sum(axis=1)
nyc_crashes["distraction"] = nyc_crashes[adding_columns("distraction")].sum(axis=1)
nyc_crashes["driver_mistake"] = nyc_crashes[adding_columns("driver_mistake")].sum(axis=1)
nyc_crashes["illegal_action"] = nyc_crashes[adding_columns("illegal_action")].sum(axis=1)
nyc_crashes["medical"] = nyc_crashes[adding_columns("medical")].sum(axis=1)
nyc_crashes["other_involvement"] = nyc_crashes[adding_columns("other_involvement")].sum(axis=1)

In [367]:
print(nyc_crashes["medical"].value_counts())

0.0    91442
Name: medical, dtype: int64
