In [1]:
import sys
import os

# Parent of the current working directory; presumably the project root that
# holds the `utils` package imported in the next cell.
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not add duplicate entries.
if root_path not in sys.path:
    sys.path.append(root_path)

In [2]:
import pandas as pd
import seaborn as sns

import tubular
from tubular.nominal import GroupRareLevelsTransformer
from tubular.capping import CappingTransformer

from utils.helpers import (read_data_from_github_zip,
                           preprocess_date_columns,
                           preprocess_time_columns,
                           calculate_attendance_time)

In [3]:
import warnings
# Suppresses all warnings for the rest of the kernel session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that may point at real problems further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [4]:
# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

In [5]:
# Zipped incident CSV hosted in the project's GitHub repository.
url = 'https://github.com/MLOpsGDA/mlops_fire_fighter/raw/main/data/LFB_Incident.csv.zip'
# Project helper from utils.helpers; presumably downloads the archive and returns
# its CSV as a DataFrame (the next cell reports 340174 rows x 39 columns).
df = read_data_from_github_zip(url)

In [6]:
df.shape

(340174, 39)

### Create DateOfCall_Month feature

In [7]:
# Project helper applied to 'DateOfCall'; the 'DateOfCall_Month' column inspected
# in the next cell is expected to be created here.
# NOTE(review): df is rebound to the helper's result, so this cell relies on the
# df loaded above — confirm the helper tolerates being run twice.
df = preprocess_date_columns(df, 'DateOfCall')

In [8]:
df['DateOfCall_Month']

0         5
1         5
2         5
3         5
4         5
         ..
340169    4
340170    4
340171    4
340172    4
340173    4
Name: DateOfCall_Month, Length: 340174, dtype: int64

### Create PartOfDay feature

In [9]:
# Project helper applied to 'TimeOfCall'; the 'PartOfDay' column inspected in the
# next cell (Morning / Afternoon / Evening / Night) is expected to be created here.
# NOTE(review): df is rebound in place of the previous frame — confirm re-runs are safe.
df = preprocess_time_columns(df, 'TimeOfCall')

In [10]:
df['PartOfDay'].value_counts(dropna=False)

Afternoon    92405
Night        87617
Morning      80424
Evening      79728
Name: PartOfDay, dtype: int64

### Grouping the StopCodeDescription feature
All levels whose share of the rows is below 5% are grouped into a single level labelled **rare**.

In [11]:
df['StopCodeDescription'].value_counts()/df.shape[0]

AFA                                      0.378377
Special Service                          0.357238
False alarm - Good intent                0.099728
Primary Fire                             0.076249
Secondary Fire                           0.073257
False alarm - Malicious                  0.008931
Flood call attended - Batch mobilised    0.005785
Use of Special Operations Room           0.000244
Chimney Fire                             0.000135
Late Call                                0.000056
Name: StopCodeDescription, dtype: float64

In [12]:
# Configure the grouping of 'StopCodeDescription': levels whose share of rows is
# under the 5% cut-off are meant to end up in a single 'rare' level.
StopCodeDescription_transformer = GroupRareLevelsTransformer(
    columns='StopCodeDescription',
    cut_off_percent=0.05,
    verbose=False,
)

In [13]:
# Fit on the full frame. NOTE(review): this runs before the CalYear train/test
# split further down, so the 2023 rows also contribute to the fit.
StopCodeDescription_transformer.fit(df)

In [14]:
# The grouped result is stored in X_transformed for inspection; df itself is not
# reassigned here.
X_transformed = StopCodeDescription_transformer.transform(df)

In [15]:
X_transformed['StopCodeDescription'].unique()

array(['False alarm - Good intent', 'Special Service', 'AFA',
       'Primary Fire', 'Secondary Fire', 'rare'], dtype=object)

In [16]:
X_transformed['StopCodeDescription'].value_counts()/X_transformed.shape[0]

AFA                          0.378377
Special Service              0.357238
False alarm - Good intent    0.099728
Primary Fire                 0.076249
Secondary Fire               0.073257
rare                         0.015151
Name: StopCodeDescription, dtype: float64

### Grouping the PropertyCategory feature

In [17]:
df['PropertyCategory'].value_counts()/df.shape[0]

Dwelling             0.564011
Non Residential      0.198925
Outdoor              0.079871
Road Vehicle         0.061245
Other Residential    0.051291
Outdoor Structure    0.042878
Rail Vehicle         0.000814
Boat                 0.000547
Aircraft             0.000417
Name: PropertyCategory, dtype: float64

In [18]:
# Configure the grouping of 'PropertyCategory' with a 5% cut-off; levels under
# that share are meant to be collapsed into 'rare'.
PropertyCategory_transformer = GroupRareLevelsTransformer(
    columns='PropertyCategory',
    cut_off_percent=0.05,
    verbose=False,
)

In [19]:
# Fitted on the whole of df (every CalYear), not only on the training years.
PropertyCategory_transformer.fit(df)

In [20]:
# Rebinds X_transformed (replacing the result of the previous section); df is
# not reassigned.
X_transformed = PropertyCategory_transformer.transform(df)

In [21]:
X_transformed['PropertyCategory'].unique()

array(['Dwelling', 'Non Residential', 'Road Vehicle', 'Other Residential',
       'Outdoor', 'rare'], dtype=object)

In [22]:
X_transformed['PropertyCategory'].value_counts()/X_transformed.shape[0]

Dwelling             0.564011
Non Residential      0.198925
Outdoor              0.079871
Road Vehicle         0.061245
Other Residential    0.051291
rare                 0.044657
Name: PropertyCategory, dtype: float64

### Grouping the PropertyType feature

In [23]:
df['PropertyType'].value_counts()/df.shape[0]

Purpose Built Flats/Maisonettes - 4 to 9 storeys      0.152055
House - single occupancy                              0.121573
Purpose Built Flats/Maisonettes - Up to 3 storeys     0.113189
Self contained Sheltered Housing                      0.047549
Converted Flat/Maisonettes - 3 or more storeys        0.041311
                                                        ...   
Mine or quarry (not above ground building)            0.000003
Other merchant vessel                                 0.000003
Sea                                                   0.000003
Helicopter                                            0.000003
Racecourse                                            0.000003
Name: PropertyType, Length: 285, dtype: float64

In [24]:
# Configure the grouping of 'PropertyType' (285 distinct levels) with a lower,
# 3% cut-off.
# NOTE(review): the level labels shown below mostly carry a trailing space
# (e.g. 'Car ') — consider stripping whitespace before fitting.
PropertyType_transformer = GroupRareLevelsTransformer(
    columns='PropertyType',
    cut_off_percent=0.03,
    verbose=False,
)

In [25]:
# Fitted on the whole of df, i.e. before the train/test split by CalYear.
PropertyType_transformer.fit(df)

In [26]:
# Result kept in X_transformed only; per the output below roughly 42% of the rows
# end up in the 'rare' level.
X_transformed = PropertyType_transformer.transform(df)

In [27]:
X_transformed['PropertyType'].unique()

array(['Self contained Sheltered Housing ', 'House - single occupancy ',
       'Purpose Built Flats/Maisonettes - 10 or more storeys ', 'rare',
       'Purpose Built Flats/Maisonettes - Up to 3 storeys ',
       'Converted Flat/Maisonette - Up to 2 storeys ',
       'Purpose Built Flats/Maisonettes - 4 to 9 storeys ', 'Car ',
       'Converted Flat/Maisonettes - 3 or more storeys'], dtype=object)

In [28]:
X_transformed['PropertyType'].value_counts()/X_transformed.shape[0]

rare                                                     0.416490
Purpose Built Flats/Maisonettes - 4 to 9 storeys         0.152055
House - single occupancy                                 0.121573
Purpose Built Flats/Maisonettes - Up to 3 storeys        0.113189
Self contained Sheltered Housing                         0.047549
Converted Flat/Maisonettes - 3 or more storeys           0.041311
Purpose Built Flats/Maisonettes - 10 or more storeys     0.039271
Car                                                      0.035767
Converted Flat/Maisonette - Up to 2 storeys              0.032795
Name: PropertyType, dtype: float64

### Grouping the AddressQualifier feature

In [29]:
df['AddressQualifier'].value_counts()/df.shape[0]

Correct incident location                          0.586294
Within same building                               0.227795
On land associated with building                   0.061469
In street outside gazetteer location               0.060607
In street close to gazetteer location              0.026266
Open land/water - nearest gazetteer location       0.024091
Nearby address - street not listed in gazetteer    0.003581
Nearby address - no building in street             0.003163
On motorway / elevated road                        0.002996
In street remote from gazetteer location           0.002208
Railway land or rolling stock                      0.001532
Name: AddressQualifier, dtype: float64

In [30]:
# Configure the grouping of 'AddressQualifier' with a 5% cut-off; levels under
# that share are meant to be collapsed into 'rare'.
AddressQualifier_transformer = GroupRareLevelsTransformer(
    columns='AddressQualifier',
    cut_off_percent=0.05,
    verbose=False,
)

In [31]:
# Fitted on the whole of df, i.e. before the train/test split by CalYear.
AddressQualifier_transformer.fit(df)

In [32]:
# Rebinds X_transformed with the grouped 'AddressQualifier' column; df is not
# reassigned.
X_transformed = AddressQualifier_transformer.transform(df)

In [33]:
X_transformed['AddressQualifier'].unique()

array(['Within same building', 'Correct incident location',
       'In street outside gazetteer location', 'rare',
       'On land associated with building'], dtype=object)

In [34]:
X_transformed['AddressQualifier'].value_counts()/X_transformed.shape[0]

Correct incident location               0.586294
Within same building                    0.227795
rare                                    0.063835
On land associated with building        0.061469
In street outside gazetteer location    0.060607
Name: AddressQualifier, dtype: float64

### Grouping the IncGeo_WardName feature

In [35]:
df['IncGeo_WardName'].value_counts()/df.shape[0]

West End                       0.009713
St. James's                    0.007361
WEST END                       0.005671
FAIRFIELD                      0.004686
ST. JAMES'S                    0.004636
                                 ...   
Aldborough                     0.000003
Coleman Street                 0.000003
Selsdon & Addington Village    0.000003
Bishopsgate                    0.000003
De Beauvoir                    0.000003
Name: IncGeo_WardName, Length: 1293, dtype: float64

In [36]:
# Configure the grouping of 'IncGeo_WardName' (1293 distinct levels) with a
# 0.3% cut-off.
# NOTE(review): the same ward appears in several spellings ('West End' and
# 'WEST END', "St. James's" and "ST. JAMES'S") and each spelling is counted as
# its own level — consider normalising the case before fitting.
IncGeo_WardName_transformer = GroupRareLevelsTransformer(
    columns='IncGeo_WardName',
    cut_off_percent=0.003,
    verbose=False,
)

In [37]:
# Fitted on the whole of df, i.e. before the train/test split by CalYear.
IncGeo_WardName_transformer.fit(df)

In [38]:
# Result kept in X_transformed only; per the output below about 94% of the rows
# fall into the 'rare' level at this cut-off.
X_transformed = IncGeo_WardName_transformer.transform(df)

In [39]:
X_transformed['IncGeo_WardName'].unique()

array(['rare', 'WADDON', "Regent's Park", 'Bloomsbury',
       'Holborn & Covent Garden', 'West End', 'HOXTON EAST & SHOREDITCH',
       "St. James's", 'Kilburn', 'Heathrow Villages', 'Marylebone',
       'FAIRFIELD', 'LONDON BRIDGE & WEST BERMONDSEY', "ST. JAMES'S",
       'WEST END'], dtype=object)

In [40]:
X_transformed['IncGeo_WardName'].value_counts()/X_transformed.shape[0]

rare                               0.937514
West End                           0.009713
St. James's                        0.007361
WEST END                           0.005671
FAIRFIELD                          0.004686
ST. JAMES'S                        0.004636
Heathrow Villages                  0.003922
Bloomsbury                         0.003669
Holborn & Covent Garden            0.003592
Regent's Park                      0.003404
Kilburn                            0.003242
HOXTON EAST & SHOREDITCH           0.003181
LONDON BRIDGE & WEST BERMONDSEY    0.003163
WADDON                             0.003148
Marylebone                         0.003098
Name: IncGeo_WardName, dtype: float64

### Calculate the target feature FirstPumpArriving_AttendanceTime in minutes

In [41]:
# Project helper; presumably adds the 'FirstPumpArriving_AttendanceTime_min'
# column that is used as the target when rows are dropped further down.
# NOTE(review): df is rebound to the helper's result — confirm re-runs are safe.
df = calculate_attendance_time(df)

### Cap the Notional Cost (£) feature

In [42]:
df['Notional Cost (£)'].describe()

count    327967.000000
mean        485.104678
std        1546.549090
min         346.000000
25%         352.000000
50%         352.000000
75%         364.000000
max      247520.000000
Name: Notional Cost (£), dtype: float64

In [43]:
# Configure capping of 'Notional Cost (£)' with the bounds [0, 400]; in the
# output further down the maximum drops from 247520 to 400 while the minimum
# (346) is unchanged.
cap_national_cost = CappingTransformer(
    capping_values={'Notional Cost (£)': [0, 400]},
)

In [44]:
# Fit step of the capping transformer, run on the full frame (before the split).
cap_national_cost.fit(df)

In [45]:
# NOTE(review): the capped column is stored in X_transformed; df is not
# reassigned, so the frame that is later split and pickled presumably still
# holds the uncapped values — confirm that is intended.
X_transformed = cap_national_cost.transform(df)

In [46]:
X_transformed['Notional Cost (£)'].describe()

count    327967.000000
mean        361.149786
std          16.990619
min         346.000000
25%         352.000000
50%         352.000000
75%         364.000000
max         400.000000
Name: Notional Cost (£), dtype: float64

In [47]:
df.shape

(340174, 42)

In [48]:
# Rename 'Notional Cost (£)' to 'NationalCost' and keep it as the last column,
# which is the layout the previous add-then-drop approach produced.
# This avoids in-place mutation, so re-running the cell is a no-op instead of
# raising KeyError on the already-removed source column.
# NOTE(review): df is not reassigned from the capped X_transformed above, so the
# values copied here are presumably uncapped — confirm that is intended.
df = df.rename(columns={'Notional Cost (£)': 'NationalCost'})
df = df[[col for col in df.columns if col != 'NationalCost'] + ['NationalCost']]

### Divide the data into train and test sets based on CalYear

In [49]:
df['CalYear'].value_counts()

2022    125390
2021    109577
2020     67519
2023     37688
Name: CalYear, dtype: int64

In [50]:
# Split by calendar year: 2020-2022 incidents for training, 2023 held out for testing.
# .copy() makes train/test explicitly independent frames, so later in-place
# operations on them do not trigger pandas' SettingWithCopyWarning.
train = df[df['CalYear'].isin([2020, 2021, 2022])].copy()
test = df[df['CalYear'] == 2023].copy()

In [51]:
train.shape, test.shape

((302486, 42), (37688, 42))

### Drop the incidents where the target feature has missing (NaN) values

In [52]:
# Remove the rows that have no value in the target column.
# Reassign instead of using inplace=True: train and test were obtained by
# filtering df, and in-place edits on such frames raise SettingWithCopyWarning
# (currently hidden by the global warnings filter). Reassignment also keeps the
# cell safe to re-run.
train = train.dropna(subset=['FirstPumpArriving_AttendanceTime_min'])
test = test.dropna(subset=['FirstPumpArriving_AttendanceTime_min'])

In [53]:
train.shape, test.shape

((284309, 42), (35723, 42))

### Store the train and test data as pickle files

In [54]:
# Persist the training split under <root_path>/data; the path is built with
# os.path.join rather than string concatenation so separators are handled by
# the standard library.
train.to_pickle(os.path.join(root_path, 'data', 'train.pkl'))

In [55]:
# Persist the held-out 2023 split under <root_path>/data; the path is built with
# os.path.join rather than string concatenation so separators are handled by
# the standard library.
test.to_pickle(os.path.join(root_path, 'data', 'test.pkl'))