# _<font color='red'>Accidents</font> in Montreal: Data Cleaning_

- See the documentation [> HERE <](https://saaq.gouv.qc.ca/donnees-ouvertes/rapports-accident/rapports-accident-documentation.pdf)
- Get the data [> HERE <](http://donnees.ville.montreal.qc.ca/dataset/collisions-routieres)

### <font color='orange'>#TODO</font>
Build a lookup dictionary from the documentation so that the coded variables can be mapped to human-readable labels.

#### Procedure
- Delete useless columns (see documentation)
- Create dummy variables for categorical/discrete variables

In [1]:
# imports
import pandas as pd
import numpy as np
import sys
import os 
# Work from the data directory. The location can be overridden with the
# ACCIDENTS_DATA_DIR environment variable so the notebook is not tied to a
# single machine; the original hard-coded path remains the default.
os.chdir(os.environ.get('ACCIDENTS_DATA_DIR',
                        'c:/users/nicolas/documents/data/accidents-mtl'))

In [2]:
# Load the accident records from CSV; the first CSV column is read as the
# frame's index (index_col=0).
df = pd.read_csv('accidents_2012_2018.csv', index_col=0)

In [3]:
# Replace the index with a fresh default one; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,JR_SEMN_ACCDN,DT_ACCDN,CD_MUNCP,NO_CIVIQ_ACCDN,SFX_NO_CIVQ_ACCDN,BORNE_KM_ACCDN,RUE_ACCDN,TP_REPRR_ACCDN,ACCDN_PRES_DE,NB_METRE_DIST_ACCD,...,NB_VICTIMES_VELO,VITESSE_AUTOR,LOC_X,LOC_Y,LOC_COTE_Q,LOC_COTE_P,LOC_DETACHEE,LOC_IMPRECISION,LOC_LONG,LOC_LAT
0,ME,2012/02/01,66102.0,3501.0,,,ST CHARLES,2.0,STAT,,...,0,,276517.3795,5035127.0,A,3,O,N,-73.861616,45.455505
1,SA,2012/06/02,66023.0,,,,,,COTE VERTU ET AUT 40,,...,0,,287913.26,5038666.0,A,3,N,O,-73.716033,45.487715
2,JE,2012/06/28,66023.0,,,,COTE VERTU,1.0,DECARIE,,...,0,50.0,290518.82501,5041617.0,A,1,N,N,-73.682786,45.514324
3,ME,2012/07/11,66023.0,,,,ST MATHIEU,1.0,RENE LEVESQUE,50.0,...,0,50.0,298822.886,5039146.0,A,3,N,N,-73.576472,45.492212
4,LU,2012/01/02,66023.0,4849.0,,,ST JEAN,,,,...,0,,277433.35738,5038881.0,A,1,O,N,-73.850114,45.489319


#### Deleting Useless Columns

In [5]:
# Dimensions of the frame before any column is removed.
rows = len(df.index)
columns = len(df.columns)
print(f'We have {rows:,} rows and {columns} columns.')

We have 171,271 rows and 67 columns.


In [6]:
# Remove the columns judged useless after inspecting the documentation.
# drop() returns a new frame that is rebound to `df`, rather than mutating
# the existing object with inplace=True.
df = df.drop(columns=[
    'NO_CIVIQ_ACCDN', 'RUE_ACCDN', 'ACCDN_PRES_DE', 'CD_PNT_CDRNL_ROUTE',
    'BORNE_KM_ACCDN', 'NB_METRE_DIST_ACCD', 'CD_PNT_CDRNL_REPRR',
    'CD_SIT_PRTCE_ACCDN', 'nb_taxi', 'nb_urgence', 'nb_motoneige', 'nb_VHR',
    'nb_autres_types', 'nb_veh_non_precise', 'CD_MUNCP', 'CD_ASPCT_ROUTE',
    'REG_ADM', 'MRC', 'LOC_DETACHEE', 'LOC_IMPRECISION',
])

In [7]:
# Dimensions of the frame after the documentation-based column removal.
rows, columns = df.shape[0], df.shape[1]
print(f'We have {rows:,} rows and {columns} columns.')

We have 171,271 rows and 52 columns.


In [8]:
# Missing-value counts for the columns that will be KEPT, i.e. those with
# fewer than 50,000 missing values (columns at or above that threshold are
# the ones removed by the filtering step that follows).
missing_per_column = df.isnull().sum(axis=0)
null_num = missing_per_column[missing_per_column < 5e4]
null_col = null_num.index
# Two-column table (column name, number of missing values) with a default
# integer index; the counts keep their integer dtype.
null_count = null_num.rename_axis('Name').reset_index(name='Missing Values')
# Show the ten kept columns with the most missing values.
null_count.sort_values(by='Missing Values', ascending=False).head(10)

Unnamed: 0,Name,Missing Values
9,CD_CONFG_ROUTE,18738
8,CD_LOCLN_ACCDN,15009
10,CD_COND_METEO,11922
4,CD_ECLRM,11403
3,CD_ETAT_SURFC,11273
2,CD_GENRE_ACCDN,9045
7,CD_ASPCT_ROUTE,8596
5,CD_ENVRN_ACCDN,5957
6,CD_CATEG_ROUTE,5115
38,LOC_Y,11


In [9]:
# Keep only the COLUMNS that have fewer than 50,000 missing values
# (rows are not filtered here).
missing_per_column = df.isnull().sum(axis=0)
df = df.loc[:, missing_per_column < 5e4]

In [10]:
# Dimensions after removing the columns with too many missing values.
rows, columns = map(len, df.axes)
print(f'We now have {rows:,} rows and {columns} columns.')

We now have 171,271 rows and 45 columns.


In [11]:
# Share of missing cells in the whole frame, expressed as a percentage and
# rounded to two decimals for display.
missing_cells = df.isnull().sum().sum()
missing_values = missing_cells / df.size * 100
print('We have {}% missing values.'.format(np.round(missing_values, 2)))

We have 1.26% missing values.


In [12]:
# Size of the dataframe as reported by sys.getsizeof, converted from bytes
# to megabytes (divided by 1e6 and truncated to an integer).
size_mb = int(sys.getsizeof(df) / 1e6)
print(f'Our dataframe is {size_mb} MB.')

Our dataframe is 167 MB.


In [13]:
# Store the dtype of every column in `column_types` and preview the first
# few entries.
column_types = df.dtypes
column_types.head()

JR_SEMN_ACCDN      object
DT_ACCDN           object
CD_GENRE_ACCDN    float64
CD_ETAT_SURFC     float64
CD_ECLRM          float64
dtype: object

In [14]:
# Drop the rows where the target column GRAVITE is missing.
# `df` was produced by a .loc column selection in an earlier cell, so the
# result is rebound instead of mutating that selection with inplace=True
# (which can raise SettingWithCopyWarning).
df = df.dropna(subset=['GRAVITE'])

In [15]:
# Names of the int64 columns, read from the frame's current dtypes rather
# than from a dtype snapshot taken in an earlier cell.
integers = df.select_dtypes(include='int64').columns
integers

Index(['NB_MORTS', 'NB_BLESSES_GRAVES', 'NB_BLESS_LEGERS', 'AN',
       'NB_VICTIMES_TOTAL', 'NB_DECES_PIETON', 'NB_BLESSES_PIETON',
       'NB_VICTIMES_PIETON', 'NB_DECES_MOTO', 'NB_BLESSES_MOTO',
       'NB_VICTIMES_MOTO', 'NB_DECES_VELO', 'NB_BLESSES_VELO',
       'NB_VICTIMES_VELO', 'LOC_COTE_P'],
      dtype='object')

In [16]:
# Names of the float64 columns, read from the frame's current dtypes rather
# than from a dtype snapshot taken in an earlier cell.
floats = df.select_dtypes(include='float64').columns
floats

Index(['CD_GENRE_ACCDN', 'CD_ETAT_SURFC', 'CD_ECLRM', 'CD_ENVRN_ACCDN',
       'CD_CATEG_ROUTE', 'CD_ASPCT_ROUTE', 'CD_LOCLN_ACCDN', 'CD_CONFG_ROUTE',
       'CD_COND_METEO', 'NB_VEH_IMPLIQUES_ACCDN', 'nb_automobile_camion_leger',
       'nb_camionLourd_tractRoutier', 'nb_outil_equipement',
       'nb_tous_autobus_minibus', 'nb_bicyclette', 'nb_cyclomoteur',
       'nb_motocyclette', 'LOC_X', 'LOC_Y', 'LOC_LONG', 'LOC_LAT'],
      dtype='object')

In [17]:
# Names of the object-typed columns, read from the frame's current dtypes
# rather than from a dtype snapshot taken in an earlier cell.
objects = df.select_dtypes(include='object').columns
objects

Index(['JR_SEMN_ACCDN', 'DT_ACCDN', 'HR_ACCDN', 'GRAVITE', 'REG_ADM', 'MRC',
       'LOC_COTE_Q', 'LOC_DETACHEE', 'LOC_IMPRECISION'],
      dtype='object')

In [18]:
# Preview the frame again after the column-removal cells above.
df.head()

Unnamed: 0,JR_SEMN_ACCDN,DT_ACCDN,CD_GENRE_ACCDN,CD_ETAT_SURFC,CD_ECLRM,CD_ENVRN_ACCDN,CD_CATEG_ROUTE,CD_ASPCT_ROUTE,CD_LOCLN_ACCDN,CD_CONFG_ROUTE,...,NB_BLESSES_VELO,NB_VICTIMES_VELO,LOC_X,LOC_Y,LOC_COTE_Q,LOC_COTE_P,LOC_DETACHEE,LOC_IMPRECISION,LOC_LONG,LOC_LAT
0,ME,2012/02/01,31.0,16.0,1.0,1.0,21.0,11.0,33.0,4.0,...,0,0,276517.3795,5035127.0,A,3,O,N,-73.861616,45.455505
1,SA,2012/06/02,59.0,11.0,1.0,3.0,11.0,22.0,37.0,1.0,...,0,0,287913.26,5038666.0,A,3,N,O,-73.716033,45.487715
2,JE,2012/06/28,31.0,11.0,1.0,3.0,13.0,11.0,32.0,3.0,...,0,0,290518.82501,5041617.0,A,1,N,N,-73.682786,45.514324
3,ME,2012/07/11,31.0,11.0,3.0,3.0,13.0,11.0,33.0,2.0,...,0,0,298822.886,5039146.0,A,3,N,N,-73.576472,45.492212
4,LU,2012/01/02,31.0,12.0,1.0,3.0,21.0,,,,...,0,0,277433.35738,5038881.0,A,1,O,N,-73.850114,45.489319


In [19]:
# one-hot encoding helper
def make_dummies(col):
    """Replace column ``col`` of the global ``df`` with its dummy columns.

    One indicator column is created per distinct value of ``df[col]``, named
    ``'<col>: <value>'``. The indicator columns are appended to the right of
    the remaining columns and the source column is removed. The module-level
    ``df`` is rebound to the resulting frame; nothing is returned.
    """
    global df
    encoded = pd.get_dummies(df[col], prefix=col, prefix_sep=': ')
    # Same column order as concatenating first and dropping afterwards:
    # every other column in its original position, then the indicators.
    df = pd.concat([df.drop(columns=col), encoded], axis=1, sort=False)

In [37]:
# helper: frequency table of a column
def vc(col):
    """Return the value counts of ``df[col]`` for the global ``df``."""
    counts = df[col].value_counts()
    return counts

In [20]:
# day of the week
# NOTE(review): the value_counts() result below is neither assigned nor the
# last expression of the cell, so it is computed but not displayed.
df['JR_SEMN_ACCDN'].value_counts()
# Replace the column with one indicator column per distinct value.
make_dummies('JR_SEMN_ACCDN')

In [21]:
# date: keep only the second '/'-separated field of DT_ACCDN (the month, for
# values formatted like 2012/02/01 as shown by head()).
# NOTE(review): the next cell drops DT_ACCDN entirely, so this extracted value
# is never used -- confirm whether month dummies were intended.
df['DT_ACCDN'] = df['DT_ACCDN'].str.split('/').str.get(1)

In [29]:
# Remove the date column; the result is rebound to `df` instead of mutating
# the frame with inplace=True.
df = df.drop(columns='DT_ACCDN')

In [33]:
# accident type: replace CD_GENRE_ACCDN with one indicator column per code
make_dummies('CD_GENRE_ACCDN')

In [40]:
# surface condition: replace CD_ETAT_SURFC with one indicator column per code
make_dummies('CD_ETAT_SURFC')

In [43]:
# lighting: replace CD_ECLRM with one indicator column per code
make_dummies('CD_ECLRM')

In [46]:
# environment: replace CD_ENVRN_ACCDN with one indicator column per code
make_dummies('CD_ENVRN_ACCDN')

In [49]:
# road type: replace CD_CATEG_ROUTE with one indicator column per code
make_dummies('CD_CATEG_ROUTE')

In [59]:
# location: replace CD_LOCLN_ACCDN with one indicator column per code
make_dummies('CD_LOCLN_ACCDN')

In [65]:
# configuration: replace CD_CONFG_ROUTE with one indicator column per code
make_dummies('CD_CONFG_ROUTE')

In [68]:
# weather: replace CD_COND_METEO with one indicator column per code
make_dummies('CD_COND_METEO')

In [77]:
# hour: replace HR_ACCDN with one indicator column per distinct value
make_dummies('HR_ACCDN')

In [85]:
# severity: replace GRAVITE with one indicator column per distinct value
# NOTE(review): GRAVITE is the column the earlier dropna cell calls "our
# target"; it is one-hot encoded here like the features -- confirm intended.
make_dummies('GRAVITE')

In [108]:
# final columns (only the first 55 are shown)
df.columns[:55]

Index(['NB_VEH_IMPLIQUES_ACCDN', 'NB_MORTS', 'NB_BLESSES_GRAVES',
       'NB_BLESS_LEGERS', 'AN', 'NB_VICTIMES_TOTAL',
       'nb_automobile_camion_leger', 'nb_camionLourd_tractRoutier',
       'nb_outil_equipement', 'nb_tous_autobus_minibus', 'nb_bicyclette',
       'nb_cyclomoteur', 'nb_motocyclette', 'NB_DECES_PIETON',
       'NB_BLESSES_PIETON', 'NB_VICTIMES_PIETON', 'NB_DECES_MOTO',
       'NB_BLESSES_MOTO', 'NB_VICTIMES_MOTO', 'NB_DECES_VELO',
       'NB_BLESSES_VELO', 'NB_VICTIMES_VELO', 'LOC_X', 'LOC_Y', 'LOC_COTE_Q',
       'LOC_COTE_P', 'LOC_LONG', 'LOC_LAT', 'JR_SEMN_ACCDN: DI',
       'JR_SEMN_ACCDN: JE', 'JR_SEMN_ACCDN: LU', 'JR_SEMN_ACCDN: MA',
       'JR_SEMN_ACCDN: ME', 'JR_SEMN_ACCDN: SA', 'JR_SEMN_ACCDN: VE',
       'CD_GENRE_ACCDN: 31.0', 'CD_GENRE_ACCDN: 32.0', 'CD_GENRE_ACCDN: 33.0',
       'CD_GENRE_ACCDN: 34.0', 'CD_GENRE_ACCDN: 35.0', 'CD_GENRE_ACCDN: 36.0',
       'CD_GENRE_ACCDN: 37.0', 'CD_GENRE_ACCDN: 38.0', 'CD_GENRE_ACCDN: 39.0',
       'CD_GENRE_ACCDN: 40

In [111]:
# Dimensions of the fully processed frame.
rows = df.shape[0]
columns = df.shape[1]
print(f'We have {rows:,} rows and {columns} columns.')

We have 171,271 rows and 160 columns.


In [109]:
# Export the processed data to CSV with a header row and without the index.
# `index` is documented as a bool, so pass False explicitly instead of relying
# on None being falsy; header=True is kept to make the header row explicit.
df.to_csv('a_dummies.csv', header=True, index=False)