In [1]:
# import libraries
import pandas as pd
import numpy as np
import csv

In [2]:
# Data set listing the interventions carried out by the Montreal Fire Department (MIS)
# including the location of interventions and units deployed - 2015 onwards.
# The path is relative to the directory the notebook is run from.
df1 = pd.read_csv('data/incident_data/donneesouvertes-interventions-sim.csv')
# preview the first rows to check the columns that were loaded
df1.head()

Unnamed: 0,INCIDENT_NBR,CREATION_DATE_TIME,INCIDENT_TYPE_DESC,DESCRIPTION_GROUPE,CASERNE,NOM_VILLE,NOM_ARROND,DIVISION,NOMBRE_UNITES,CIV,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE
0,1168,2015-01-03T20:56:02,Premier répondant,1-REPOND,33,Montréal,Sud-Ouest,8,1.0,1.0,297283.0,5035433.1,-73.596117,45.458786
1,1171,2015-01-03T21:03:52,Alarme privé ou locale,Alarmes-incendies,22,Montréal,Saint-Léonard,4,4.0,1.0,299180.0,5049284.2,-73.57202,45.583442
2,1172,2015-01-03T21:07:00,Premier répondant,1-REPOND,9,Montréal,Villeray / St-Michel / Parc Extension,4,1.0,1.0,297259.6,5046768.2,-73.596589,45.560784
3,1177,2015-01-03T21:41:46,Premier répondant,1-REPOND,37,Montréal,Villeray / St-Michel / Parc Extension,4,1.0,1.0,294738.3,5044817.2,-73.628847,45.543197
4,1178,2015-01-03T21:43:32,Aliments surchauffés,SANS FEU,16,Montréal,Plateau Mont-Royal,11,5.0,1.0,298505.7,5042556.1,-73.580573,45.522895


In [3]:
# Data set listing the interventions carried out by the Montreal Fire Department (MIS)
# including the location of interventions and units deployed - 2005 to 2014
# The path is relative to the directory the notebook is run from.
df2 = pd.read_csv('data/incident_data/donneesouvertes-interventions-sim-2005-2014.csv')
# preview the first rows to check the columns that were loaded
df2.head()

Unnamed: 0,INCIDENT_NBR,CREATION_DATE_TIME,INCIDENT_TYPE_DESC,DESCRIPTION_GROUPE,CASERNE,NOM_VILLE,NOM_ARROND,DIVISION,LATITUDE,LONGITUDE,NOMBRE_UNITES
0,1,2005-01-01 00:03:47,Inondation,Sans incendie,18,Montréal-Nord,Montréal-Nord,21,45.620274,-73.619956,1.0
1,2,2005-01-01 00:03:56,Alarme privé ou locale,Alarmes-incendies,10,Montréal,Ville-Marie / Parc Jean-Drapeau / Centre-Sud,18,45.494087,-73.582587,7.0
2,3,2005-01-01 00:03:57,Alarme privé ou locale,Alarmes-incendies,72,Saint-Laurent,St-Laurent,13,45.484406,-73.693038,6.0
3,4,2005-01-01 00:05:01,Inondation,Sans incendie,13,Montréal,Mercier / Hochelaga-Maisonneuve,23,45.541383,-73.545944,1.0
4,5,2005-01-01 00:06:20,Inondation,Sans incendie,18,Montréal-Nord,Montréal-Nord,21,45.611304,-73.63244,1.0


In [4]:
# Remove the columns that are not used later on. The 2015+ file carries
# extra address/projection columns (CIV, MTM8_X, MTM8_Y) that the
# 2005-2014 file does not have.
df1.drop(columns=['INCIDENT_NBR', 'CIV', 'MTM8_X', 'MTM8_Y'], inplace=True)
df2.drop(columns=['INCIDENT_NBR'], inplace=True)

In [5]:
# merge the two datasets by stacking their rows
# ignore_index=True gives the combined frame a fresh 0..n-1 row index;
# sort=True orders the columns alphabetically in the result.
df = pd.concat([df1, df2], ignore_index=True, sort=True)

In [6]:
# convert to datetime
# The two source files store the timestamp as ISO-8601-style text (one with a
# 'T' separator, the other with a space). `infer_datetime_format` is no longer
# passed: it has been deprecated since pandas 2.0, where it only triggers a
# warning, and omitting it yields the same parsed values on older versions.
df['CREATION_DATE_TIME'] = pd.to_datetime(df['CREATION_DATE_TIME'])

In [7]:
# use Date/Time to generate Month, DOW, Hour, and Date column
# MONTH is 1-12, DAY_OF_WEEK follows the pandas convention (Monday=0 ... Sunday=6),
# HOUR_OF_DAY is 0-23 and Date is the calendar date without the time part.
df['MONTH'] =  df['CREATION_DATE_TIME'].dt.month
df['DAY_OF_WEEK'] =  df['CREATION_DATE_TIME'].dt.dayofweek
df['HOUR_OF_DAY'] =  df['CREATION_DATE_TIME'].dt.hour
df['Date'] =  df['CREATION_DATE_TIME'].dt.date

In [8]:
# add a column to indicate 'Time of Day' - bracket into 6 hr brackets
# With right=False every bin includes its left edge and excludes its right edge:
# [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening,
# so every hour from 0 to 23 falls into exactly one bracket.
tod_bins = [0, 6, 12, 18, 24]
tod_labels = ['Night', 'Morning','Afternoon','Evening']
hours = df['CREATION_DATE_TIME'].dt.hour
df['TIME_OF_DAY'] = pd.cut(hours, bins=tod_bins, labels=tod_labels, right=False)

In [9]:
# count the incidents per city to spot unexpected or placeholder values
df.NOM_VILLE.value_counts()

Montréal                   1252567
Dollard-des-Ormeaux          27034
Pointe-Claire                26388
Westmount                    18049
Dorval                       17749
Mont-Royal                   14731
Kirkland                      9952
Beaconsfield                  9588
Côte St-Luc                   8900
Saint-Laurent                 6355
Montréal-Est                  5819
Montréal-Nord                 5033
Lasalle                       4562
Ste-Anne-de-Bellevue          4466
Hampstead                     4176
Saint-Léonard                 3816
Verdun / Ïle-des-Soeurs       3721
Pierrefonds                   3466
Baie d'Urfé                   3246
Lachine                       3245
Montréal-Ouest                2789
Outremont                     2615
Anjou                         2343
Senneville                    1022
Ile-Bizard                     678
Roxboro                        330
Ste-Geneviève                  319
Ile Dorval                      10
Indéterminé         

In [10]:
# show the rows whose city is the placeholder 'Indéterminé' (undetermined)
df.loc[df['NOM_VILLE'] == 'Indéterminé']

Unnamed: 0,CASERNE,CREATION_DATE_TIME,DESCRIPTION_GROUPE,DIVISION,INCIDENT_TYPE_DESC,LATITUDE,LONGITUDE,NOMBRE_UNITES,NOM_ARROND,NOM_VILLE,MONTH,DAY_OF_WEEK,HOUR_OF_DAY,Date,TIME_OF_DAY
357803,57,2018-07-19 05:15:35,1-REPOND,0,Premier répondant,45.511787,-73.764875,2.0,Indéterminé,Indéterminé,7,3,5,2018-07-19,Night
359556,35,2018-12-12 09:32:13,SANS FEU,0,Assistance serv. muni.,45.557494,-73.673288,5.0,Indéterminé,Indéterminé,12,2,9,2018-12-12,Morning
686258,54,2007-12-19 08:15:33,Fausses alertes/annulations,13,Fausse alerte 10-19,45.401301,-73.957345,2.0,Indéterminé,Indéterminé,12,2,8,2007-12-19,Morning
738131,74,2008-08-30 21:29:12,Premier répondant,13,Ac.véh./1R/s.v./V.R./29B/D,45.530144,-73.646591,5.0,Indéterminé,Indéterminé,8,5,21,2008-08-30,Evening


In [11]:
# When the city is undetermined the arrondissement is undetermined as well.
# Location is used as a predictor, so these four rows carry no usable
# information and are removed.
df.drop(df.index[df['NOM_VILLE'] == 'Indéterminé'], inplace=True)

In [12]:
# count the incidents per arrondissement to spot unexpected or placeholder values
df.NOM_ARROND.value_counts()

Ville-Marie                                     146026
Indéterminé                                     128162
Mercier / Hochelaga-Maisonneuve                 107885
Côte-des-Neiges / Notre-Dame-de-Grâce           107026
Villeray / St-Michel / Parc Extension            96114
Rosemont / Petite-Patrie                         94044
Ahuntsic / Cartierville                          90608
Plateau Mont-Royal                               84959
Montréal-Nord                                    73058
Rivière-des-Prairies / Pointe-aux-Trembles       64639
Sud-Ouest                                        64006
Saint-Laurent                                    61890
Lasalle                                          50370
Saint-Léonard                                    46148
Verdun                                           38974
Lachine                                          36323
Anjou                                            30871
Pierrefonds / Roxboro                            30235
Ville-Mari

In [13]:
# show the rows whose arrondissement is the placeholder 'Indéterminé' (undetermined)
df.loc[df['NOM_ARROND'] == 'Indéterminé']

Unnamed: 0,CASERNE,CREATION_DATE_TIME,DESCRIPTION_GROUPE,DIVISION,INCIDENT_TYPE_DESC,LATITUDE,LONGITUDE,NOMBRE_UNITES,NOM_ARROND,NOM_VILLE,MONTH,DAY_OF_WEEK,HOUR_OF_DAY,Date,TIME_OF_DAY
14,53,2015-01-04 15:35:19,AUTREFEU,2,Feu de champ *,45.425863,-73.865855,1.0,Indéterminé,Beaconsfield,1,6,15,2015-01-04,Afternoon
16,63,2015-01-04 15:37:42,SANS FEU,3,Problèmes électriques,45.449983,-73.772930,1.0,Indéterminé,Dorval,1,6,15,2015-01-04,Afternoon
23,54,2015-01-04 15:49:23,SANS FEU,2,Problèmes électriques,45.444707,-73.862867,1.0,Indéterminé,Kirkland,1,6,15,2015-01-04,Afternoon
24,55,2015-01-04 15:50:24,SANS FEU,2,Problèmes électriques,45.441050,-73.805702,1.0,Indéterminé,Pointe-Claire,1,6,15,2015-01-04,Afternoon
33,76,2015-01-05 07:51:03,1-REPOND,9,Premier répondant,45.484781,-73.592012,1.0,Indéterminé,Westmount,1,0,7,2015-01-05,Morning
37,53,2015-01-05 08:21:41,1-REPOND,2,Premier répondant,45.423985,-73.891119,1.0,Indéterminé,Beaconsfield,1,0,8,2015-01-05,Morning
50,78,2015-01-06 05:20:47,SANS FEU,4,Inondation,45.474510,-73.667879,1.0,Indéterminé,Côte St-Luc,1,1,5,2015-01-06,Night
54,63,2015-01-06 06:20:43,Alarmes-incendies,2,Appel de Cie de détection,45.443158,-73.724438,5.0,Indéterminé,Dorval,1,1,6,2015-01-06,Morning
56,61,2015-01-06 06:25:45,1-REPOND,1,Premier répondant,45.472582,-73.843193,1.0,Indéterminé,Dollard-des-Ormeaux,1,1,6,2015-01-06,Morning
78,63,2015-01-07 21:28:13,SANS FEU,2,Problèmes électriques,45.439392,-73.725381,4.0,Indéterminé,Dorval,1,2,21,2015-01-07,Evening


In [14]:
# The arrondissement is missing for cities other than Montreal.
# Wherever it is undetermined, fall back to the city name; every other row
# keeps its existing arrondissement.
# NOTE(review): the two source files appear to label some boroughs differently
# (e.g. 'St-Laurent' vs 'Saint-Laurent'), so one borough may end up with more
# than one label after this step - confirm whether they should be harmonised.
df['NOM_ARROND'] = df['NOM_ARROND'].where(df['NOM_ARROND'] != 'Indéterminé', df['NOM_VILLE'])

In [15]:
# preview the frame after the location clean-up
df.head()

Unnamed: 0,CASERNE,CREATION_DATE_TIME,DESCRIPTION_GROUPE,DIVISION,INCIDENT_TYPE_DESC,LATITUDE,LONGITUDE,NOMBRE_UNITES,NOM_ARROND,NOM_VILLE,MONTH,DAY_OF_WEEK,HOUR_OF_DAY,Date,TIME_OF_DAY
0,33,2015-01-03 20:56:02,1-REPOND,8,Premier répondant,45.458786,-73.596117,1.0,Sud-Ouest,Montréal,1,5,20,2015-01-03,Evening
1,22,2015-01-03 21:03:52,Alarmes-incendies,4,Alarme privé ou locale,45.583442,-73.57202,4.0,Saint-Léonard,Montréal,1,5,21,2015-01-03,Evening
2,9,2015-01-03 21:07:00,1-REPOND,4,Premier répondant,45.560784,-73.596589,1.0,Villeray / St-Michel / Parc Extension,Montréal,1,5,21,2015-01-03,Evening
3,37,2015-01-03 21:41:46,1-REPOND,4,Premier répondant,45.543197,-73.628847,1.0,Villeray / St-Michel / Parc Extension,Montréal,1,5,21,2015-01-03,Evening
4,16,2015-01-03 21:43:32,SANS FEU,11,Aliments surchauffés,45.522895,-73.580573,5.0,Plateau Mont-Royal,Montréal,1,5,21,2015-01-03,Evening


In [16]:
# rename columns
# NOTE(review): `index=str` additionally applies `str` to every row label, so
# the integer index becomes a string index from here on - confirm this is intended.
df.rename(index=str, columns={"NOMBRE_UNITES": "NumUnits", "NOM_ARROND": "Arrond",
                              "MONTH": "Month", "DAY_OF_WEEK": "DayOfWeek",
                              "TIME_OF_DAY": "TimeOfDay"}, inplace=True)

In [17]:
# columns whose values are replaced by integer codes further down
columns_to_encode = ['Arrond', 'TimeOfDay']

In [18]:
def create_encoder_decoder(df, cols):
    """Build value->code and code->value lookup tables for the given columns.

    Each column named in ``cols`` is converted with ``pd.Categorical`` and
    every resulting category is numbered by its position in ``.categories``.

    Returns a tuple ``(encoder, decoder)``:
      * ``encoder[col]`` maps each category of ``col`` to its integer code;
      * ``decoder[col]`` is the inverse mapping, integer code to category.
    """
    encoder = {
        col: {
            category: code
            for code, category in enumerate(pd.Categorical(df[col]).categories)
        }
        for col in cols
    }

    # Invert every per-column mapping to obtain the decoder.
    decoder = {
        col: {code: category for category, code in mapping.items()}
        for col, mapping in encoder.items()
    }

    return encoder, decoder

def write_dict_to_file(d, filename='temp.csv'):
    """Write ``d`` to ``filename`` as a two-column CSV, one ``key,value`` row per item.

    Values that are not strings (for example the nested per-column mappings of
    an encoder/decoder) are stored as their ``str()`` text by the csv writer.
    The file is overwritten if it already exists.
    """
    # newline='' is what the csv module asks for: without it the writer's own
    # '\r\n' terminator is translated again on Windows and every other row is
    # blank. UTF-8 is fixed so accented names do not depend on the platform's
    # default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in d.items():
            writer.writerow([key, value])

def read_dict_from_file(filename='temp.csv'):
    """Read back a dictionary written by ``write_dict_to_file``.

    Keys are returned as the strings found in the first column. Each value is
    parsed as a Python literal when possible, so nested mappings and numbers
    come back as dicts/ints rather than as their text; anything that is not a
    valid literal is returned unchanged as a string. Blank rows are skipped.

    Raises ``ValueError`` if a non-blank row does not have exactly two fields.
    """
    import ast  # local import: only this function needs it

    d = {}
    with open(filename, newline='', encoding='utf-8') as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # tolerate blank lines, e.g. in files written without newline=''
                continue
            key, value = row
            try:
                # literal_eval only accepts literals, so no code is executed
                d[key] = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                d[key] = value
    return d

def encode_columns(df, encoder):
    """Swap the raw values in ``df`` for their codes, modifying ``df`` in place.

    ``encoder`` is handed unchanged to ``DataFrame.replace``; the notebook
    passes the nested ``{column: {value: code}}`` mapping built by
    ``create_encoder_decoder``. Nothing is returned.
    """
    df.replace(to_replace=encoder, inplace=True)

def decode_columns(df, decoder):
    """Swap the codes in ``df`` back for their original values, modifying ``df`` in place.

    ``decoder`` is handed unchanged to ``DataFrame.replace``; the notebook
    passes the nested ``{column: {code: value}}`` mapping built by
    ``create_encoder_decoder``. Nothing is returned.
    """
    df.replace(to_replace=decoder, inplace=True)

In [19]:
# Build the lookup tables from the current values of the columns to encode,
# save both tables to CSV (one row per column: its name and the text of its
# mapping), then replace the values in df with their integer codes in place.
encoder, decoder = create_encoder_decoder(df, columns_to_encode)
write_dict_to_file(encoder, filename='encoder_arrond_tod.csv')
write_dict_to_file(decoder, filename='decoder_arrond_tod.csv')
encode_columns(df, encoder)

In [20]:
# to decode arrondissement and time of day, use the following code
#read_dict_from_file(filename='decoder_arrond_tod.csv')
#decode_columns(df, decoder)
#df.head()

In [21]:
# Keep only the columns needed downstream; the timestamp, raw location and
# incident-description columns are removed.
df.drop(
    columns=[
        'CREATION_DATE_TIME', 'INCIDENT_TYPE_DESC', 'LATITUDE', 'LONGITUDE',
        'CASERNE', 'DESCRIPTION_GROUPE', 'DIVISION', 'NOM_VILLE', 'HOUR_OF_DAY',
    ],
    inplace=True,
)

In [22]:
# count the missing values in each remaining column
df.isna().sum()

NumUnits     563
Arrond         0
Month          0
DayOfWeek      0
Date           0
TimeOfDay      0
dtype: int64

In [23]:
# Discard the rows that have no value for the number of units.
df.dropna(subset=['NumUnits'], inplace=True)

In [24]:
# preview the final frame: only the renamed and encoded columns remain
df.head()

Unnamed: 0,NumUnits,Arrond,Month,DayOfWeek,Date,TimeOfDay
0,1.0,38,1,5,2015-01-03,3
1,4.0,33,1,5,2015-01-03,3
2,1.0,43,1,5,2015-01-03,3
3,1.0,43,1,5,2015-01-03,3
4,5.0,27,1,5,2015-01-03,3


In [25]:
# write the cleaned frame to a pickle file in the current working directory
df.to_pickle('incident_data.pkl')