# Data Cleaning

## Crashes Dataset

In [3]:
import pandas as pd
import numpy as np

crashes = pd.read_csv('../data/crashes_crashes.csv')

40

In [4]:
crashes['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

PRIM_CONTRIBUTORY_CAUSE
UNABLE TO DETERMINE                                                                 310493
FAILING TO YIELD RIGHT-OF-WAY                                                        87668
FOLLOWING TOO CLOSELY                                                                77957
NOT APPLICABLE                                                                       42294
IMPROPER OVERTAKING/PASSING                                                          39303
FAILING TO REDUCE SPEED TO AVOID CRASH                                               33776
IMPROPER BACKING                                                                     31565
IMPROPER LANE USAGE                                                                  28699
DRIVING SKILLS/KNOWLEDGE/EXPERIENCE                                                  26639
IMPROPER TURNING/NO SIGNAL                                                           26520
DISREGARDING TRAFFIC SIGNALS                                      

### Getting rid of rows without target variable

In [104]:
crashes_prime_cause = crashes[(crashes['PRIM_CONTRIBUTORY_CAUSE'] != 'UNABLE TO DETERMINE') & (crashes['PRIM_CONTRIBUTORY_CAUSE'] != 'NOT APPLICABLE')]

### Binning Target Variable

In [106]:
breaking_laws_list = ['DISREGARDING TRAFFIC SIGNALS', 'DISREGARDING STOP SIGN', 'DISREGARDING ROAD MARKINGS', 
                  'DISREGARDING OTHER TRAFFIC SIGNS', 'DISREGARDING YIELD SIGN', 'FAILING TO YIELD RIGHT-OF-WAY'] 

bad_driving_list = ['DRIVING ON WRONG SIDE/WRONG WAY', 'FOLLOWING TOO CLOSELY', 'IMPROPER OVERTAKING/PASSING', 
                    'FAILING TO REDUCE SPEED TO AVOID CRASH', 'TURNING RIGHT ON RED','EXCEEDING SAFE SPEED FOR CONDITIONS',
                    'EXCEEDING AUTHORIZED SPEED LIMIT', 'IMPROPER LANE USAGE', 'PHYSICAL CONDITION OF DRIVER', 
                  'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE','IMPROPER BACKING', 'IMPROPER TURNING/NO SIGNAL']

distraction_list = ['TEXTING', 'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)', 
                    'DISTRACTION - FROM INSIDE VEHICLE','CELL PHONE USE OTHER THAN TEXTING']

drinking_list = ['OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER', 
                 'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)', 'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)']

road_list = ['DISTRACTION - FROM OUTSIDE VEHICLE', 'ROAD ENGINEERING/SURFACE/MARKING DEFECTS', 'ROAD CONSTRUCTION/MAINTENANCE', 'EQUIPMENT - VEHICLE CONDITION', 
             'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)', 'WEATHER']

other_list= ['PASSING STOPPED SCHOOL BUS', 'OBSTRUCTED CROSSWALKS', 'BICYCLE ADVANCING LEGALLY ON RED LIGHT', 
             'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT', 'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST', 'ANIMAL', 'TURNING RIGHT ON RED', 
             'RELATED TO BUS STOP'] 

binning_list = [breaking_laws_list, bad_driving_list, distraction_list, drinking_list, road_list, other_list]
value_list = ['BREAKING LAW', 'BAD DRIVING', 'DISTRACTION INSIDE VEHICLE', 'DRINKING/DRUGS', 'OUTSIDE FACTORS', 'OTHER']



for group, value in zip(binning_list, value_list):
    crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'] = crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].replace(to_replace = group, value = value)

crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'] = crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].replace(to_replace = group, value = value)


PRIM_CONTRIBUTORY_CAUSE
BAD DRIVING                   277735
BREAKING LAW                  114860
OUTSIDE FACTORS                28657
DRINKING/DRUGS                 14797
DISTRACTION INSIDE VEHICLE      7261
OTHER                           2809
Name: count, dtype: int64

### Cleaning NANs

In [107]:
crashes_prime_cause_filled = crashes_prime_cause.fillna({'INTERSECTION_RELATED_I': 'N', 'NOT_RIGHT_OF_WAY_I': 
                                                         'N', 'HIT_AND_RUN_I':'N'})

crashes_prime_cause_filled = crashes_prime_cause_filled.dropna(subset=['LATITUDE', 'LONGITUDE', 'INJURIES_TOTAL', 'INJURIES_FATAL', 
                                                                           'MOST_SEVERE_INJURY'])


### Creating 30 'neighborhoods' by KNN

In [108]:
from sklearn.cluster import KMeans

n_clusters = 30  # Number of clusters to create
X = crashes_prime_cause_filled[['LONGITUDE', 'LATITUDE']]

# Create a K-Means clustering model
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X)

# Add cluster labels to your data
cluster_labels = kmeans.labels_
crashes_prime_cause_filled['GEO_KMEANS_Cluster'] = cluster_labels


  super()._check_params_vs_input(X, default_n_init=10)


### Dropping Useless Columns

In [109]:
crashes_prime_cause_filled = crashes_prime_cause_filled.drop(columns = ['CRASH_DATE_EST_I', 'DEVICE_CONDITION', 'REPORT_TYPE', 'DATE_POLICE_NOTIFIED',
                                                          'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME','BEAT_OF_OCCURRENCE', 'PHOTOS_TAKEN_I', 
                                                          'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 
                                                          'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_NO_INDICATION', 
                                                          'INJURIES_UNKNOWN', 'LOCATION', 'LANE_CNT', 'CRASH_DATE', 
                                                          'INJURIES_REPORTED_NOT_EVIDENT', 'TRAFFIC_CONTROL_DEVICE', 'INJURIES_TOTAL', 
                                                          'INJURIES_FATAL'])

### Transforming Categorical Data to Numerical Data

In [None]:
crashes_cleaned = pd.get_dummies(crashes_prime_cause_filleddrop, columns = ['WEATHER_CONDITION', 'LIGHTING_CONDITION','FIRST_CRASH_TYPE',
                                                                            'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 
                                                                            'SEC_CONTRIBUTORY_CAUSE', 'ROAD_DEFECT','MOST_SEVERE_INJURY'],
                                                                             drop_first=True, dtype=int)
                                                                                                 




#binary
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in ['CRASH_TYPE','INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE']:
    crashes_cleaned[col] = le.fit_transform(crashes_cleaned[col])


## People Dataset

In [None]:
people = pd.read_csv('../data/crashes_people.csv', low_memory = False)

### Limiting down people to just drivers and keeping relevant columns

In [None]:
people.drop(people[people['PERSON_TYPE'] != 'DRIVER'].index, inplace=True)

people = people[['CRASH_RECORD_ID', 'SEX', 'AGE', 'PHYSICAL_CONDITION','DRIVER_ACTION', 'DRIVER_VISION']]

### Dealing with weird values/NANS

In [None]:
people.drop(people[people['AGE'] < 0].index, inplace=True)

people.SEX.fillna('X', inplace=True)

### Categorical -> Numeric

In [1]:
# Get numerical values for categorical columns using OneHotEncoder
cat = ['SEX', 'PHYSICAL_CONDITION', 'DRIVER_ACTION', 'DRIVER_VISION']

# Import OneHotEncoder from Scikit Learn Preprocessing
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop = 'first', sparse = False)

encoded_df = pd.DataFrame(ohe.fit_transform(people[cat]), columns=ohe.get_feature_names(cat))

# Reset the index of both DataFrames
people.reset_index(drop=True, inplace=True)
encoded_df.reset_index(drop=True, inplace=True)

# Concatenate the encoded DataFrame with the original DataFrame
people_cleaned = pd.concat([people, encoded_df], axis=1)
people_cleaned.drop(cat, axis = 1, inplace = True)
people_cleaned

NameError: name 'pd' is not defined

# Vehicle Dataset

In [None]:
vehicles = pd.read_csv('../data/crashes_vehicles.csv')

### Dealing with NANs

In [None]:
nan_count = vehicles.isna().sum()

useful_columns = nan_count[nan_count < 1_000_000].index

vehicles = vehicles[useful_columns]

vehicles['VEHICLE_DEFECT'].replace({np.nan: 'UNKNOWN', }, inplace = True)

vehicles['VEHICLE_TYPE'].replace({np.nan: 'UNKNOWN/NA'}, inplace = True)

vehicles['VEHICLE_USE'].replace({np.nan: 'UNKNOWN/NA'}, inplace = True)

vehicles['MANEUVER'].replace({np.nan: 'UNKNOWN/NA'}, inplace = True)

vehicles['OCCUPANT_CNT'].replace({np.nan: 999}, inplace = True)

vehicles['FIRST_CONTACT_POINT'].replace({np.nan: 'UNKOWN' }, inplace = True)



### Converting to datetime

In [None]:
vehicles['CRASH_DATE'] = pd.to_datetime(vehicles['CRASH_DATE'])

### Categorical -> Numeric

In [None]:
cat_columns = ['UNIT_TYPE', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE', 'MANEUVER',\
              'FIRST_CONTACT_POINT']

vehicles_cleaned = pd.get_dummies(vehicles, columns = cat_columns, drop_first = True, dtype = int)