In [21]:
import pandas as pd
import numpy as np

In [22]:
# Load the workbook 'pedestrian_incidents.xlsx' (path relative to the working
# directory) into a DataFrame.
df = pd.read_excel('pedestrian_incidents.xlsx')

In [23]:
# (rows, columns) of the frame as loaded
df.shape

(22848, 73)

In [24]:
# Remove unnecessary columns: target leakage, bad data, a high level of
# missing data, or redundancy with other columns. Each column is listed once.
remove_cols = [
    'Unnamed: 0', 'crash_record_id', 'vehicle_type', 'crash_date_y',
    'date_police_notified', 'prim_contributory_cause',
    'sec_contributory_cause', 'street_no', 'street_name',
    'beat_of_occurrence', 'most_severe_injury',
    'injuries_total', 'injuries_fatal', 'injuries_incapacitating',
    'injuries_non_incapacitating', 'injuries_reported_not_evident',
    'injuries_no_indication', 'injuries_unknown', 'latitude', 'longitude',
    'location', 'pedpedal_action', 'make', 'model',
    'photos_taken_i', 'statements_taken_i', 'crash_date_est_i',
    'private_property_i', 'work_zone_i', 'work_zone_type', 'crash_date_x',
    'dooring_i', 'workers_present_i', 'lane_cnt', 'beat_num_x',
    'district_name_x', 'beat', 'ward', 'community_area',
    'district_x', 'beat_num_y', 'district', 'district_name_y', 'area_numbe',
    'community', 'person_id', 'device_condition',
]

# Rebind instead of mutating in place; a missing column still raises KeyError,
# so a misspelled name in the list is caught immediately.
df = df.drop(columns=remove_cols)

In [25]:
# (rows, columns) after the column drop
df.shape

(22848, 26)

In [26]:
# Collapse the injury classes into a numeric severity code:
#   0 = no indication of injury / reported but not evident
#   1 = non-incapacitating injury
#   2 = incapacitating injury
# Any label that is not a key here maps to NaN and is dropped below.
# NOTE(review): 'INCAPACITATING INJURY' used to be listed twice with the same
# code; the second entry may have been meant for another class (e.g. 'FATAL').
# Confirm against the raw labels before adding one.
mapping = {
    'NO INDICATION OF INJURY': 0,
    'REPORTED, NOT EVIDENT': 0,
    'NONINCAPACITATING INJURY': 1,
    'INCAPACITATING INJURY': 2,
}

# Convert injury_classification to numerical values
df['injury_classification'] = df['injury_classification'].map(mapping)

# Rename injury_classification to target
df = df.rename(columns={'injury_classification': 'target'})

# Drop records without a usable target (missing or unmapped label), then cast
# the column to int in the same chain so nothing is assigned back into the
# frame returned by dropna.
df = df.dropna(subset=['target']).astype({'target': int})

In [27]:
# Tag every feature column with an '_ind' suffix so the encoding step can pick
# the features out by name. Columns not listed here keep their names.
# Only columns that survive the drop step are listed: rename() silently ignores
# unknown keys, so a stale or misspelled name here would go unnoticed.
feature_cols = [
    'person_type', 'age', 'sex', 'intersection_related_i',
    'pedpedal_visibility', 'pedpedal_location', 'posted_speed_limit',
    'traffic_control_device', 'lighting_condition', 'roadway_surface_cond',
    'first_crash_type', 'trafficway_type', 'crash_month',
    'weather_condition', 'alignment', 'road_defect', 'report_type',
    'crash_type', 'damage', 'street_direction', 'num_units',
    'crash_hour', 'hit_and_run_i', 'crash_day_of_week',
]
col_name_mapping = {col: col + '_ind' for col in feature_cols}

# Rename columns using the mapping
df = df.rename(columns=col_name_mapping)

#### Encode features

In [28]:
# Replace every remaining NaN with 0. In text columns this inserts the integer
# 0 as a placeholder value, which later appears as its own dummy level
# (e.g. sex_ind_0) after one-hot encoding.
df = df.fillna(0)

In [29]:
# Split the frame into the '_ind' feature columns and everything else.
suffix = '_ind'

# Match the suffix at the END of the label only. DataFrame.filter(like=...) is
# a substring test and would also select a column that merely contains '_ind'.
ind_columns = [col for col in df.columns if str(col).endswith(suffix)]
ind_filtered_df = df[ind_columns]

# Columns not selected above (Index.difference returns them sorted)
non_ind_columns = df.columns.difference(ind_filtered_df.columns)

# Dataframe of the columns that are not '_ind' features
non_ind_df = df[non_ind_columns]

In [30]:
# Expand the non-numeric feature columns into dummy indicators (numeric columns
# pass through get_dummies unchanged), then cast the whole frame to int.
ind_filtered_df_encoded = (
    pd.get_dummies(ind_filtered_df)
    .astype(int)
)

In [31]:
# Preview the first two rows of the encoded feature frame
ind_filtered_df_encoded.head(2)

Unnamed: 0,age_ind,posted_speed_limit_ind,num_units_ind,crash_hour_ind,crash_day_of_week_ind,crash_month_ind,person_type_ind_BICYCLE,person_type_ind_PEDESTRIAN,sex_ind_0,sex_ind_F,...,street_direction_ind_E,street_direction_ind_N,street_direction_ind_S,street_direction_ind_W,intersection_related_i_ind_0,intersection_related_i_ind_N,intersection_related_i_ind_Y,hit_and_run_i_ind_0,hit_and_run_i_ind_N,hit_and_run_i_ind_Y
0,43,30,2,18,3,2,0,1,0,1,...,1,0,0,0,1,0,0,1,0,0
1,37,30,2,17,3,2,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0


In [32]:
# Stitch the encoded feature columns and the untouched non-'_ind' columns back
# together side by side (aligned on the row index) into a single frame.
frames_to_join = [ind_filtered_df_encoded, non_ind_df]
df = pd.concat(frames_to_join, axis='columns')

In [33]:
# Preview the combined frame (encoded features plus the non-'_ind' columns)
df.head(2)

Unnamed: 0,age_ind,posted_speed_limit_ind,num_units_ind,crash_hour_ind,crash_day_of_week_ind,crash_month_ind,person_type_ind_BICYCLE,person_type_ind_PEDESTRIAN,sex_ind_0,sex_ind_F,...,street_direction_ind_S,street_direction_ind_W,intersection_related_i_ind_0,intersection_related_i_ind_N,intersection_related_i_ind_Y,hit_and_run_i_ind_0,hit_and_run_i_ind_N,hit_and_run_i_ind_Y,district_y,target
0,43,30,2,18,3,2,0,1,0,1,...,0,0,1,0,0,1,0,0,0.0,1
1,37,30,2,17,3,2,0,1,0,1,...,0,0,1,0,0,1,0,0,20.0,1


In [34]:
# (rows, columns) of the fully processed frame
df.shape

(22613, 139)

In [35]:
# Write the processed dataset to 'processed_data.parquet' in the working directory.
# NOTE(review): rows were dropped earlier without resetting the index, so the
# original row labels are presumably written alongside the data — confirm
# whether index=False (or a reset_index) is wanted here.
df.to_parquet('processed_data.parquet')