# Exploratory Data Analysis and Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
%matplotlib inline
sns.set()

In [2]:
crashes = pd.read_csv('Traffic_Crashes_-_Crashes.csv')

In [3]:
# Low_memory=False, because it has mixed types of data in the same column/too many rows
people = pd.read_csv('Traffic_Crashes_-_People.csv', low_memory=False)

In [4]:
vehicles = pd.read_csv('Traffic_Crashes_-_Vehicles.csv', low_memory=False)

In [5]:
def howmanyunique(data):
    tempo = []
    nombre = []
    for x in data.columns:
        tempo.append(len(data[x].value_counts().unique()))
        nombre.append(x)
    print(tempo)
    print('')
    print(nombre)

In [6]:
def howmanynan(data):
    print(data.isna().sum())   

In [50]:
def rows_w_nan(data):
    for x in data.columns:
        if data[x].isna().sum() > 0:
            print("Row: {} : {} NaN Values.".format(x, data[x].isna().sum()))
            

In [7]:
def howmanyduplicates(data):
    print(data.duplicated().sum())

# Crashes

In [8]:
crashes.shape

(482866, 49)

In [9]:
howmanyunique(crashes)

[1, 1, 2, 21, 28, 19, 8, 12, 6, 18, 20, 22, 6, 7, 7, 3, 2, 2, 2, 2, 3, 12, 40, 40, 424, 4, 488, 254, 2, 2, 2, 2, 4, 2, 13, 5, 14, 5, 8, 14, 12, 24, 1, 24, 7, 12, 138, 138, 138]

['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS', 'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',

In [10]:
howmanyduplicates(crashes)

0


In [None]:
rows_w_nan(crashes)

In [12]:
# We are going to drop all the columns with a lot of of NaN values, except for location that we can say right ahead that is
# a multicolinearity case, since it has the Latiutude and Longitude together
crashes = crashes.drop(columns=['CRASH_DATE_EST_I', 'LANE_CNT', 'INTERSECTION_RELATED_I',
                                'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'PHOTOS_TAKEN_I', 
                                'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE',
                                'WORKERS_PRESENT_I', 'LOCATION'])

### Data Manipulation with Missing Values (NaN)

> **RD_NO**

So we can see here that RD_NO and REPORT_TYPE still have a lot of missing data and that can be explained by the website, where they say and I quote:


> " For privacy reasons, this column is blank for recent crashes."

So since the RD_NO is another way to connect the cases between datasets, we can easily discard the whole column, because we still have CRASH_RECORD_ID, that has no missing values

> **REPORT_TYPE**

In [53]:
crashes.REPORT_TYPE.value_counts()

NOT ON SCENE (DESK REPORT)    277308
ON SCENE                      192629
AMENDED                          240
Name: REPORT_TYPE, dtype: int64

By checking the REPORT_TYPE column we realize that it doesn't offer any predictive value to our model, but still might use it for the business understanding part.

> So we decide to drop the RD_NO columns because we have a good substitute and drop the rows where the remaining NaN values are, so we have a cleaner dataset

In [54]:
crashes = crashes.drop(columns=['RD_NO'])

In [58]:
crashes.dropna(inplace=True)

### Since "Injuries_Fatal" is our target, let's give some more attention to it

We want to transform the column into a Yes or No kind of answer instead of numbers of deads per accident.

In [15]:
crashes.INJURIES_FATAL.value_counts(dropna=False)

0.0    481428
1.0       437
2.0        28
3.0         5
4.0         1
Name: INJURIES_FATAL, dtype: int64

In [16]:
crashes['INJURIES_FATAL'] = np.where(crashes['INJURIES_FATAL']>0, 1, 0)

In [17]:
crashes.INJURIES_FATAL.value_counts(dropna=False)

0    481428
1       471
Name: INJURIES_FATAL, dtype: int64

# People

In [18]:
people.shape

(1067653, 30)

In [19]:
howmanyunique(people)

[1, 6, 45, 45, 43, 51, 11, 341, 49, 362, 3, 107, 72, 61, 18, 7, 5, 5, 184, 176, 96, 20, 14, 12, 23, 4, 8, 4, 29, 2]

['PERSON_ID', 'PERSON_TYPE', 'CRASH_RECORD_ID', 'RD_NO', 'VEHICLE_ID', 'CRASH_DATE', 'SEAT_NO', 'CITY', 'STATE', 'ZIPCODE', 'SEX', 'AGE', 'DRIVERS_LICENSE_STATE', 'DRIVERS_LICENSE_CLASS', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'HOSPITAL', 'EMS_AGENCY', 'EMS_RUN_NO', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION', 'PEDPEDAL_ACTION', 'PEDPEDAL_VISIBILITY', 'PEDPEDAL_LOCATION', 'BAC_RESULT', 'BAC_RESULT VALUE', 'CELL_PHONE_USE']


In [20]:
howmanyduplicates(people)

0


In [60]:
rows_w_nan(people)

Row: RD_NO : 7837 NaN Values.
Row: VEHICLE_ID : 21126 NaN Values.
Row: SEAT_NO : 849503 NaN Values.
Row: CITY : 278316 NaN Values.
Row: STATE : 269523 NaN Values.
Row: ZIPCODE : 346783 NaN Values.
Row: SEX : 15808 NaN Values.
Row: AGE : 304206 NaN Values.
Row: DRIVERS_LICENSE_STATE : 434424 NaN Values.
Row: DRIVERS_LICENSE_CLASS : 517814 NaN Values.
Row: SAFETY_EQUIPMENT : 3169 NaN Values.
Row: AIRBAG_DEPLOYED : 20222 NaN Values.
Row: EJECTION : 13178 NaN Values.
Row: INJURY_CLASSIFICATION : 565 NaN Values.
Row: HOSPITAL : 873244 NaN Values.
Row: EMS_AGENCY : 944181 NaN Values.
Row: EMS_RUN_NO : 1047577 NaN Values.
Row: DRIVER_ACTION : 220203 NaN Values.
Row: DRIVER_VISION : 220482 NaN Values.
Row: PHYSICAL_CONDITION : 219592 NaN Values.
Row: PEDPEDAL_ACTION : 1047794 NaN Values.
Row: PEDPEDAL_VISIBILITY : 1047837 NaN Values.
Row: PEDPEDAL_LOCATION : 1047793 NaN Values.
Row: BAC_RESULT : 219095 NaN Values.
Row: BAC_RESULT VALUE : 1066278 NaN Values.
Row: CELL_PHONE_USE : 1066496 NaN Va

# Vehicles

In [22]:
vehicles.shape

(986181, 72)

In [23]:
howmanyunique(vehicles)

[1, 17, 17, 40, 16, 9, 22, 1, 2, 150, 522, 51, 68, 17, 21, 25, 8, 28, 2, 2, 25, 2, 133, 121, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 1, 34, 11, 11, 4, 40, 37, 39, 58, 2, 3, 8, 3, 3, 1, 3, 1, 3, 3, 9, 2, 3, 3, 26, 4, 40, 12, 8, 9, 6, 2, 2, 7]

['CRASH_UNIT_ID', 'CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE', 'UNIT_NO', 'UNIT_TYPE', 'NUM_PASSENGERS', 'VEHICLE_ID', 'CMRC_VEH_I', 'MAKE', 'MODEL', 'LIC_PLATE_STATE', 'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE', 'TRAVEL_DIRECTION', 'MANEUVER', 'TOWED_I', 'FIRE_I', 'OCCUPANT_CNT', 'EXCEED_SPEED_LIMIT_I', 'TOWED_BY', 'TOWED_TO', 'AREA_00_I', 'AREA_01_I', 'AREA_02_I', 'AREA_03_I', 'AREA_04_I', 'AREA_05_I', 'AREA_06_I', 'AREA_07_I', 'AREA_08_I', 'AREA_09_I', 'AREA_10_I', 'AREA_11_I', 'AREA_12_I', 'AREA_99_I', 'FIRST_CONTACT_POINT', 'CMV_ID', 'USDOT_NO', 'CCMC_NO', 'ILCC_NO', 'COMMERCIAL_SRC', 'GVWR', 'CARRIER_NAME', 'CARRIER_STATE', 'CARRIER_CITY', 'HAZMAT_PLACARDS_I', 'HAZMAT_NAME', 'UN_NO', 'HAZMAT_PRESENT_I', 'HAZMAT_REPORT

In [24]:
howmanyduplicates(vehicles)

0


In [61]:
rows_w_nan(vehicles)

Row: RD_NO : 7858 NaN Values.
Row: UNIT_TYPE : 1479 NaN Values.
Row: NUM_PASSENGERS : 838583 NaN Values.
Row: VEHICLE_ID : 22726 NaN Values.
Row: CMRC_VEH_I : 967789 NaN Values.
Row: MAKE : 22731 NaN Values.
Row: MODEL : 22873 NaN Values.
Row: LIC_PLATE_STATE : 104563 NaN Values.
Row: VEHICLE_YEAR : 178719 NaN Values.
Row: VEHICLE_DEFECT : 22726 NaN Values.
Row: VEHICLE_TYPE : 22726 NaN Values.
Row: VEHICLE_USE : 22726 NaN Values.
Row: TRAVEL_DIRECTION : 22726 NaN Values.
Row: MANEUVER : 22726 NaN Values.
Row: TOWED_I : 875221 NaN Values.
Row: FIRE_I : 985468 NaN Values.
Row: OCCUPANT_CNT : 22726 NaN Values.
Row: EXCEED_SPEED_LIMIT_I : 983793 NaN Values.
Row: TOWED_BY : 905157 NaN Values.
Row: TOWED_TO : 935405 NaN Values.
Row: AREA_00_I : 947679 NaN Values.
Row: AREA_01_I : 730721 NaN Values.
Row: AREA_02_I : 817483 NaN Values.
Row: AREA_03_I : 892829 NaN Values.
Row: AREA_04_I : 887725 NaN Values.
Row: AREA_05_I : 836437 NaN Values.
Row: AREA_06_I : 838181 NaN Values.
Row: AREA_07_I 

### Beep boop