# NYPD Motor Vehicle Collision Exploratory Analysis
____
Micaela Flores (mrf444), Laureano Nisenbaum (lvn218), Jason Li (yl2813), Trevor Mitchell (tim225)

### Cleaning the Data: Cleaning from Raw Data and Regrouping Accident Causes (Micaela)

In [21]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.preprocessing import LabelEncoder

def cleanData(filename,startdate,enddate):
    """Read a collision CSV and return the cleaned rows inside a date range.

    Steps, in order: parse DATE, keep rows with start <= DATE <= end, drop rows
    missing BOROUGH or ZIP CODE, convert TIME to ``datetime.time`` values and
    ZIP CODE to int64, drop the location / vehicle-type / extra-factor columns,
    drop rows whose first contributing factor is "Unspecified", and add a
    REASON column that groups the first contributing factor into broad
    categories.

    Parameters
    ----------
    filename : anything ``pd.read_csv`` accepts as its first argument.
    startdate, enddate : str
        Inclusive bounds written in ``%d%b%Y`` form, e.g. ``"1NOV2017"``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. REASON sits immediately after
        "CONTRIBUTING FACTOR VEHICLE 1"; codes that have no entry in the
        category mapping end up as NaN in REASON.
    """
    df = pd.read_csv(filename)

    #convert date column to datetime objects
    df["DATE"] = pd.to_datetime(df["DATE"]) #takes a bit to run

    start, end = pd.to_datetime([startdate,enddate],format='%d%b%Y')

    #keep only the rows dated from start date through end date (both inclusive)
    in_range = (df["DATE"] >= start) & (df["DATE"] <= end)
    df = df[in_range]

    #drop all rows that contain NaN in the Borough or Zip Code column
    df = df.dropna(subset=["BOROUGH","ZIP CODE"])

    #change Time column to datetime.time objects (the date part is discarded)
    df['TIME'] = pd.to_datetime(df['TIME'],format= '%H:%M' ).dt.time

    #change Zip Code column to integers
    df['ZIP CODE'] = df['ZIP CODE'].astype('int64')

    #only keep relevant columns
    df = df.drop(columns=[
        "LOCATION","LATITUDE","LONGITUDE","ON STREET NAME","CROSS STREET NAME",
        "OFF STREET NAME","CONTRIBUTING FACTOR VEHICLE 3","CONTRIBUTING FACTOR VEHICLE 4",
        "CONTRIBUTING FACTOR VEHICLE 5","UNIQUE KEY","VEHICLE TYPE CODE 1",
        "VEHICLE TYPE CODE 2","VEHICLE TYPE CODE 3","VEHICLE TYPE CODE 4","VEHICLE TYPE CODE 5"])
    df = df.loc[df["CONTRIBUTING FACTOR VEHICLE 1"] != "Unspecified"]

    #encode the string accident causes into categorical numbers for later grouping
    le = LabelEncoder()
    factor_codes = le.fit_transform(df['CONTRIBUTING FACTOR VEHICLE 1'].astype(str))

    #place REASON directly after the factor column it is derived from; looking the
    #position up (instead of hard-coding 13) keeps the layout right if the CSV
    #gains or loses a column
    reason_pos = df.columns.get_loc('CONTRIBUTING FACTOR VEHICLE 1') + 1
    df.insert(reason_pos,'REASON',factor_codes)

    #NOTE(review): the encoder is fit on the rows that survive the filters above,
    #so the integer keys below only match the intended factors for the data this
    #map was written against -- a different file or date range may shift the
    #codes. To see the current pairing, inspect
    #dict(zip(le.classes_, le.transform(le.classes_))).
    reason_map = {9:'Distracted Driving',33:'Distracted Driving',14:'Distracted Driving',6:'Distracted Driving',
              7:'Distracted Driving',8:'Distracted Driving',28:'Distracted Driving',25:'Distracted Driving',
              54:'Distracted Driving',45:'Distracted Driving',18:'Impaired Driving',2:'Impaired Driving',
              22:'Impaired Driving',23:'Impaired Driving',17:'Impaired Driving',26:'Impaired Driving',
              39:'Impaired Driving',12:'Impaired Driving',13:'Impaired Driving',40:'Impaired Driving',
              4:'Bad Driving',52:'Bad Driving',34:'Bad Driving',16:'Bad Driving', 19:'Bad Driving',
              35:'Bad Driving',53:'Bad Driving',51:'Bad Driving',10:'Bad Driving',1:'Bad Driving',
              50:'Bad Driving',15:'Bad Driving', 56:'Poor Driving Conditions',20:'Poor Driving Conditions',
              27:'Poor Driving Conditions',37:'Poor Driving Conditions',24:'Unsafe Infrastructure',
              36:'Unsafe Infrastructure',49:'Unsafe Infrastructure',29:'Unsafe Infrastructure',
              43:'Unsafe Infrastructure', 30:'External Factor', 38:'External Factor',3:'External Factor',
              31:'External Factor',41:'External Factor',42:'External Factor',47:'Mechanical Failure',
              5:'Mechanical Failure',44:'Mechanical Failure',0:'Mechanical Failure',21:'Mechanical Failure',
              48:'Mechanical Failure',57:'Mechanical Failure',32:'Other',55:'Other',46:'Other',11:'Other',58:'Other'}

    #group into our designated categories
    df['REASON'] = df['REASON'].map(reason_map)

    return df

In [22]:
# Clean one year of collisions (1 Nov 2017 through 31 Oct 2018), report the
# resulting shape, and preview the first 20 rows.
df = cleanData(
    filename="~/Downloads/NYPD_Motor_Vehicle_Collisions.csv",
    startdate="1NOV2017",
    enddate="31OCT2018",
)
print("Size of data after cleaning: ",df.shape)
df.head(20)

  if self.run_code(code, result):


Size of data after cleaning:  (105641, 15)


Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,REASON,CONTRIBUTING FACTOR VEHICLE 2
9163,2018-10-31,00:40:00,MANHATTAN,10022,0,0,0,0,0,0,0,0,Failure to Yield Right-of-Way,Bad Driving,Unspecified
9178,2018-10-31,00:00:00,BRONX,10467,0,0,0,0,0,0,0,0,Reaction to Uninvolved Vehicle,External Factor,Unspecified
9179,2018-10-31,00:00:00,BRONX,10473,0,0,0,0,0,0,0,0,Driver Inattention/Distraction,Distracted Driving,Unspecified
9180,2018-10-31,00:00:00,BROOKLYN,11229,0,0,0,0,0,0,0,0,Reaction to Uninvolved Vehicle,External Factor,
9181,2018-10-31,00:00:00,MANHATTAN,10035,0,0,0,0,0,0,0,0,Backing Unsafely,Bad Driving,Unspecified
