In [2]:
import pandas as pd

### Purpose
The purpose of categorical preprocessing is to encode categorical features of certain columns.

Although the City of Toronto has already assigned a code to each category, most of these codes should not be treated as ordinal. Therefore, non-ordinal features will be one-hot encoded, and ordinal features will be encoded with an ordinal encoder.

For the purpose of this hypothesis, however, we will only convert all categorical values into numerical values.

In [32]:
# Load the raw fire-incident export.
raw_csv_path = '../../data/raw/Fire_Incidents_Data.csv'
df = pd.read_csv(raw_csv_path)

# Every categorical column handled by this notebook (both the ordinal and the
# non-ordinal ones; they are split into the two groups further down).
categorical_columns = [
    'Area_of_Origin',
    'Building_Status',
    'Business_Impact',
    'Extent_Of_Fire',
    'Final_Incident_Type',
    'Fire_Alarm_System_Impact_on_Evacuation',
    'Fire_Alarm_System_Operation',
    'Fire_Alarm_System_Presence',
    'Ignition_Source',
    'Initial_CAD_Event_Type',
    'Material_First_Ignited',
    'Method_Of_Fire_Control',
    'Possible_Cause',
    'Property_Use',
    'Smoke_Alarm_at_Fire_Origin',
    'Smoke_Alarm_at_Fire_Origin_Alarm_Failure',
    'Smoke_Alarm_at_Fire_Origin_Alarm_Type',
    'Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation',
    'Smoke_Spread',
    'Sprinkler_System_Operation',
    'Sprinkler_System_Presence',
    'Status_of_Fire_On_Arrival',
]

# Export only the categorical columns; no `index` argument is passed, so
# to_csv's default applies.
categorical_csv_path = '../../data/processed/categorical_Fire_Incidents_Data.csv'
df.loc[:, categorical_columns].to_csv(categorical_csv_path)

  # Re-reads the same raw CSV path as the load cell above and rebinds `df`.
  # NOTE(review): looks like a leftover duplicate of that load — confirm whether it is still needed.
  df = pd.read_csv('../../data/raw/Fire_Incidents_Data.csv')


#### Step 1. Cleansing columns.
Removing non-numeric Characters.

In [30]:
# Remove non-numeric characters from every categorical column.
#
# Start from an independent copy of the raw frame: `df[:]` is only a slice of
# `df`, so assigning columns to it raises pandas' SettingWithCopyWarning and
# leaves it unclear whether `df` itself is affected.
df_cleansed = df.copy()

# The regex strips every character that is not a digit 0-9 from string values,
# leaving only the digits. One DataFrame-level call replaces the per-column loop.
df_cleansed[categorical_columns] = df_cleansed[categorical_columns].replace(
    '[^0-9]', '', regex=True
)

df_cleansed

Unnamed: 0,_id,Area_of_Origin,Building_Status,Business_Impact,Civilian_Casualties,Count_of_Persons_Rescued,Estimated_Dollar_Loss,Estimated_Number_Of_Persons_Displaced,Exposures,Ext_agent_app_or_defer_time,...,Smoke_Alarm_at_Fire_Origin_Alarm_Failure,Smoke_Alarm_at_Fire_Origin_Alarm_Type,Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation,Smoke_Spread,Sprinkler_System_Operation,Sprinkler_System_Presence,Status_of_Fire_On_Arrival,TFS_Alarm_Time,TFS_Arrival_Time,TFS_Firefighter_Casualties
0,2475341,81,,,0.0,0.0,15000.0,,,2018-02-24T21:12:00,...,,,,,,,7,2018-02-24T21:04:29,2018-02-24T21:10:11,0.0
1,2475342,75,,,0.0,0.0,50.0,,,2018-02-24T21:29:42,...,,,,,,,2,2018-02-24T21:24:43,2018-02-24T21:29:31,0.0
2,2475343,,,,0.0,0.0,,,,,...,,,,,,,,2018-02-25T13:29:59,2018-02-25T13:36:49,0.0
3,2475344,75,01,1,0.0,0.0,0.0,0.0,,2018-02-25T14:19:25,...,98,9,8,99,8,9,3,2018-02-25T14:13:39,2018-02-25T14:18:07,0.0
4,2475345,,,,0.0,0.0,,,,,...,,,,,,,,2018-02-25T18:20:43,2018-02-25T18:26:19,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29420,2504761,42,01,1,,0.0,0.0,0.0,0.0,2022-11-26T06:20:00,...,98,4,3,3,3,1,2,2022-11-26T06:06:11,2022-11-26T06:12:09,0.0
29421,2504762,81,,,,0.0,5000.0,,0.0,2022-11-26T06:33:56,...,,,,,,,3,2022-11-26T06:27:58,2022-11-26T06:32:01,0.0
29422,2504763,44,01,8,,0.0,2000.0,0.0,0.0,2022-11-26T08:32:05,...,98,4,8,4,2,1,2,2022-11-26T08:22:16,2022-11-26T08:26:04,0.0
29423,2504764,,,,,0.0,,,0.0,,...,,,,,,,,2022-11-26T09:13:13,2022-11-26T09:14:09,0.0


### One-hot encoding or Ordinal encoding
#### These variables are not used for the proposal, but may be useful later

Ordinal (label) encoding must be used for columns whose values have a natural order, so that the ordering is preserved.
One-hot encoding will be used when there is no ordering between the possible values in a column.

We will split the categorical columns into ordinal or non-ordinal.

In [38]:
# Columns treated as ordinal, picked by position in `categorical_columns`:
# positions 2-3 and 20-21 (with the list defined above these are
# Business_Impact, Extent_Of_Fire, Sprinkler_System_Presence and
# Status_of_Fire_On_Arrival).
ordinal_cols = categorical_columns[2:4] + categorical_columns[20:22]

# Every remaining categorical column is one-hot encoded. A comprehension is used
# instead of `list(set(...) - set(...))` so the order follows
# `categorical_columns` and is identical on every run; set iteration order for
# strings can change between interpreter sessions.
_ordinal_lookup = set(ordinal_cols)
one_hotcols = [col for col in categorical_columns if col not in _ordinal_lookup]
print(one_hotcols)

['Ignition_Source', 'Possible_Cause', 'Property_Use', 'Smoke_Spread', 'Smoke_Alarm_at_Fire_Origin_Alarm_Failure', 'Sprinkler_System_Operation', 'Area_of_Origin', 'Fire_Alarm_System_Impact_on_Evacuation', 'Smoke_Alarm_at_Fire_Origin', 'Material_First_Ignited', 'Fire_Alarm_System_Presence', 'Final_Incident_Type', 'Method_Of_Fire_Control', 'Building_Status', 'Fire_Alarm_System_Operation', 'Smoke_Alarm_at_Fire_Origin_Alarm_Type', 'Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation', 'Initial_CAD_Event_Type']


### Perform encoding.
First, we will convert all categorical features into numerical features using an ordinal encoder.
We do not use a one-hot encoder at this stage because of the number of extra dimensions it adds.

In [68]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Ordinal encoder
# Work on an independent copy of the raw frame: `df[:]` is only a slice of `df`,
# and assigning columns to a slice raises pandas' SettingWithCopyWarning.
df_ordinal_encoded = df.copy()

ordinal_encoder = OrdinalEncoder()  # instantiate the encoder used on the next statement
# NOTE(review): the encoder is fitted on the raw `df`, not on `df_cleansed`, so the
# cleansing step's output is not used here — confirm this is intended.
df_encoded = pd.DataFrame(
    ordinal_encoder.fit_transform(df[categorical_columns].values),
    columns=categorical_columns,
    index=df.index,  # reuse df's row labels so the assignment below aligns row-for-row
)
df_ordinal_encoded[categorical_columns] = df_encoded

df_ordinal_encoded



Unnamed: 0,_id,Area_of_Origin,Building_Status,Business_Impact,Civilian_Casualties,Count_of_Persons_Rescued,Estimated_Dollar_Loss,Estimated_Number_Of_Persons_Displaced,Exposures,Ext_agent_app_or_defer_time,...,Smoke_Alarm_at_Fire_Origin_Alarm_Failure,Smoke_Alarm_at_Fire_Origin_Alarm_Type,Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation,Smoke_Spread,Sprinkler_System_Operation,Sprinkler_System_Presence,Status_of_Fire_On_Arrival,TFS_Alarm_Time,TFS_Arrival_Time,TFS_Firefighter_Casualties
0,2475341,60.0,,,0.0,0.0,15000.0,,,2018-02-24T21:12:00,...,,,,,,,5.0,2018-02-24T21:04:29,2018-02-24T21:10:11,0.0
1,2475342,56.0,,,0.0,0.0,50.0,,,2018-02-24T21:29:42,...,,,,,,,1.0,2018-02-24T21:24:43,2018-02-24T21:29:31,0.0
2,2475343,,,,0.0,0.0,,,,,...,,,,,,,,2018-02-25T13:29:59,2018-02-25T13:36:49,0.0
3,2475344,56.0,0.0,0.0,0.0,0.0,0.0,0.0,,2018-02-25T14:19:25,...,9.0,5.0,5.0,9.0,5.0,3.0,2.0,2018-02-25T14:13:39,2018-02-25T14:18:07,0.0
4,2475345,,,,0.0,0.0,,,,,...,,,,,,,,2018-02-25T18:20:43,2018-02-25T18:26:19,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29420,2504761,23.0,0.0,0.0,,0.0,0.0,0.0,0.0,2022-11-26T06:20:00,...,9.0,3.0,2.0,2.0,2.0,0.0,1.0,2022-11-26T06:06:11,2022-11-26T06:12:09,0.0
29421,2504762,60.0,,,,0.0,5000.0,,0.0,2022-11-26T06:33:56,...,,,,,,,2.0,2022-11-26T06:27:58,2022-11-26T06:32:01,0.0
29422,2504763,25.0,0.0,5.0,,0.0,2000.0,0.0,0.0,2022-11-26T08:32:05,...,9.0,3.0,5.0,3.0,1.0,0.0,1.0,2022-11-26T08:22:16,2022-11-26T08:26:04,0.0
29423,2504764,,,,,0.0,,,0.0,,...,,,,,,,,2022-11-26T09:13:13,2022-11-26T09:14:09,0.0


### (Not used - ignore) Impute Missing Values
Fill in missing values using K-nearest neighbors (KNN) imputation.

In [69]:
# NOTE(review): every line of this cell is commented out, so nothing here runs.
# If re-enabled, the helper below would fit a KNNImputer on the numeric columns
# only (selected with select_dtypes(include=[np.number])), write the imputed
# values back into `df`, and return it.
# from sklearn.impute import KNNImputer
# import numpy as np

# def impute_all_missing_values(df):  
#     knn_imputer = KNNImputer() #Instantiate KNN imputer object to be used in next few lines
#     numeric_columns = df.select_dtypes(include=[np.number]).columns
#     df[numeric_columns] = pd.DataFrame(knn_imputer.fit_transform(df[numeric_columns]), columns=numeric_columns)
    
#     return df #Return DF after we impute it.

# df_imputed = impute_all_missing_values(df_ordinal_encoded)

# df_imputed

Unnamed: 0,_id,Area_of_Origin,Building_Status,Business_Impact,Civilian_Casualties,Count_of_Persons_Rescued,Estimated_Dollar_Loss,Estimated_Number_Of_Persons_Displaced,Exposures,Ext_agent_app_or_defer_time,...,Smoke_Alarm_at_Fire_Origin_Alarm_Failure,Smoke_Alarm_at_Fire_Origin_Alarm_Type,Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation,Smoke_Spread,Sprinkler_System_Operation,Sprinkler_System_Presence,Status_of_Fire_On_Arrival,TFS_Alarm_Time,TFS_Arrival_Time,TFS_Firefighter_Casualties
0,2475341.0,60.0,1.4,3.0,0.0,0.0,15000.0,2.4,0.2,2018-02-24T21:12:00,...,9.0,3.8,2.2,1.4,3.2,1.4,5.0,2018-02-24T21:04:29,2018-02-24T21:10:11,0.0
1,2475342.0,56.0,0.0,1.0,0.0,0.0,50.0,2.0,0.0,2018-02-24T21:29:42,...,7.8,5.0,4.4,5.6,4.0,1.6,1.0,2018-02-24T21:24:43,2018-02-24T21:29:31,0.0
2,2475343.0,52.2,1.2,2.2,0.0,0.0,6410.0,2.0,1.4,,...,9.2,4.6,4.6,5.4,4.6,2.0,2.4,2018-02-25T13:29:59,2018-02-25T13:36:49,0.0
3,2475344.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-02-25T14:19:25,...,9.0,5.0,5.0,9.0,5.0,3.0,2.0,2018-02-25T14:13:39,2018-02-25T14:18:07,0.0
4,2475345.0,52.2,0.2,2.2,0.0,0.0,4310.0,2.0,1.4,,...,9.2,4.8,4.6,5.4,4.6,2.0,2.4,2018-02-25T18:20:43,2018-02-25T18:26:19,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29420,2504761.0,23.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-11-26T06:20:00,...,9.0,3.0,2.0,2.0,2.0,0.0,1.0,2022-11-26T06:06:11,2022-11-26T06:12:09,0.0
29421,2504762.0,60.0,1.2,2.0,1.4,0.0,5000.0,1.0,0.0,2022-11-26T06:33:56,...,8.8,1.0,0.4,3.8,4.4,2.2,2.0,2022-11-26T06:27:58,2022-11-26T06:32:01,0.0
29422,2504763.0,25.0,0.0,5.0,1.0,0.0,2000.0,0.0,0.0,2022-11-26T08:32:05,...,9.0,3.0,5.0,3.0,1.0,0.0,1.0,2022-11-26T08:22:16,2022-11-26T08:26:04,0.0
29423,2504764.0,40.6,2.0,2.2,1.2,0.0,2305.0,30.0,0.0,,...,9.0,2.8,4.2,4.6,4.6,1.8,2.2,2022-11-26T09:13:13,2022-11-26T09:14:09,0.0


### Drop all rows containing NaN values

In [71]:
# Discard every row that has a missing value in any column. The frame is
# modified in place, so `df_ordinal_encoded` itself shrinks.
df_ordinal_encoded.dropna(axis=0, how='any', inplace=True)

# Persist the result; no `index` argument is passed, so to_csv's default applies.
encoded_csv_path = '../../data/processed/numerical_encoded_Fire_Incidents_Data.csv'
df_ordinal_encoded.to_csv(encoded_csv_path)

In [61]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# # Apply ordinal encoding to the df
# ordinal_encoder = OrdinalEncoder()
# df[ordinal_cols] = ordinal_encoder.fit_transform(df[ordinal_cols])

# Single-step pipelines, one per encoding strategy, consumed by applyTransformer.
# Ordinal columns go through an OrdinalEncoder wrapped in a Pipeline.
ordinal_categorical_transformer = Pipeline([('ordinal', OrdinalEncoder())])

# Non-ordinal columns go through a OneHotEncoder created with handle_unknown='ignore'.
onehot_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

def applyTransformer(df, one_hotcols, ordinal_cols):
    """Build a ColumnTransformer that routes columns to the two encoder pipelines.

    The transformer is only constructed here; this function does not call
    ``fit`` or ``transform`` on it.

    Args:
        df: DataFrame whose ``columns`` index is used to look up column positions.
        one_hotcols: Names of the columns sent to ``onehot_categorical_transformer``.
        ordinal_cols: Names of the columns sent to ``ordinal_categorical_transformer``.

    Returns:
        A ColumnTransformer with an ``'ordinal'`` entry followed by a
        ``'non-ordinal'`` entry, each addressed by the columns' positions in
        ``df``. No ``remainder`` argument is passed, so ColumnTransformer's
        default applies to all other columns.

    Raises:
        KeyError: Propagated from ``df.columns.get_loc`` when a name is not a
            column of ``df``.
    """
    position_of = df.columns.get_loc

    # Resolve names to positions; one-hot names are looked up first, then ordinal.
    onehot_positions = list(map(position_of, one_hotcols))
    ordinal_positions = list(map(position_of, ordinal_cols))

    routing = [
        ('ordinal', ordinal_categorical_transformer, ordinal_positions),
        ('non-ordinal', onehot_categorical_transformer, onehot_positions),
    ]
    return ColumnTransformer(transformers=routing)
