In [7]:
import pandas as pd
from datetime import datetime

In [8]:
def print_all(arr):
    """Print each element of ``arr`` on its own line, in iteration order."""
    for element in arr:
        print(element)

In [9]:
# Read only the incident columns this notebook works with.
usedcols = [
    'DateOfCall',
    'CalYear',
    'HourOfCall',
    'IncidentGroup',
    'PropertyType',
    'PumpHoursRoundUp',
    'NumPumpsAttending',
    'Notional Cost (£)',
    'PropertyCategory',
]
ds = pd.read_csv('data/lfb_incident.csv', usecols=usedcols)
# Row count straight after loading, before any cleaning.
print("Count:", len(ds))

Count: 1465060


In [10]:
# filter: discard every row that has a NaN in at least one of the loaded columns
ds = ds.dropna(axis=0, how='any')
# Row count after the filter.
print("Count:", len(ds))

Count: 1453312


In [11]:
# Work on an independent copy of the cleaned frame. pd.DataFrame(ds) does not
# copy DataFrame input by default, so the new frame would share its data with
# `ds`; .copy() guarantees that the column edits made to ds2 in the following
# cells cannot leak back into `ds`.
ds2 = ds.copy()
# preview the first three rows
ds2.head(3)

Unnamed: 0,DateOfCall,CalYear,HourOfCall,IncidentGroup,PropertyCategory,PropertyType,NumPumpsAttending,PumpHoursRoundUp,Notional Cost (£)
0,01 Jan 2009,2009,0,Special Service,Road Vehicle,Car,2.0,1.0,255.0
2,01 Jan 2009,2009,0,Fire,Outdoor,Road surface/pavement,1.0,1.0,255.0
3,01 Jan 2009,2009,0,Fire,Outdoor,Domestic garden (vegetation not equipment),1.0,1.0,255.0


In [12]:
# Build the 'Date' join key directly from DateOfCall: parse the call date and
# render it as an MM/DD/YYYY string.
ds2['Date'] = pd.to_datetime(ds2['DateOfCall']).dt.strftime('%m/%d/%Y')

In [13]:
# convert: replace DateOfCall (a full date) with just its month number.
# This overwrites the original date values, so the 'Date' column has to be
# derived from DateOfCall before this cell runs.
parsed_call_dates = pd.to_datetime(ds2['DateOfCall'])
ds2['DateOfCall'] = parsed_call_dates.dt.month

In [14]:
# convert: make sure all of the columns below are stored as integers
for int_col in ('NumPumpsAttending', 'Notional Cost (£)', 'PumpHoursRoundUp', 'HourOfCall'):
    ds2[int_col] = ds2[int_col].astype('int64')

# convert: IncidentGroup to categorical (integers).
# pd.factorize returns (codes, uniques); the uniques are kept as well so the
# integer codes can be mapped back: incident_group_labels[code] is the
# original IncidentGroup value.
ds2['IncidentGroup'], incident_group_labels = pd.factorize(ds2['IncidentGroup'])

In [15]:
# convert: PropertyType to categorical (integers).
# The uniques returned by pd.factorize are kept so the codes stay decodable:
# property_type_labels[code] is the original PropertyType value.
ds2['PropertyType'], property_type_labels = pd.factorize(ds2['PropertyType'])

# convert: PropertyCategory to categorical (integers), keeping its lookup too:
# property_category_labels[code] is the original PropertyCategory value.
ds2['PropertyCategory'], property_category_labels = pd.factorize(ds2['PropertyCategory'])

In [16]:
# Persist the cleaned incident frame (no weather columns yet), leaving the
# index out of the file.
# NOTE(review): the last cell of this notebook writes the merged frame to this
# same path, so this snapshot gets replaced — confirm that is intended.
ds2.to_csv(path_or_buf='data/london_clean.csv', index=False)
# number of rows written
ds2.shape[0]

1453312

In [17]:
# Read the weather table and give its date column the same name ('Date') as
# the join key on the incident frame.
weather_data = pd.read_csv('data/london_weather.csv').rename(columns={'date': 'Date'})

# Re-express the YYYYMMDD dates as MM/DD/YYYY strings so they match the
# incident frame's 'Date' values when merging.
weather_data['Date'] = pd.to_datetime(weather_data['Date'], format='%Y%m%d').dt.strftime('%m/%d/%Y')
weather_data.head(5)

Unnamed: 0,Date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,01/01/1979,2.0,7.0,52.0,2.3,-4.1,-7.5,0.4,101900.0,9.0
1,01/02/1979,6.0,1.7,27.0,1.6,-2.6,-7.5,0.0,102530.0,8.0
2,01/03/1979,5.0,0.0,13.0,1.3,-2.8,-7.2,0.0,102050.0,4.0
3,01/04/1979,8.0,0.0,13.0,-0.3,-2.6,-6.5,0.0,100840.0,2.0
4,01/05/1979,6.0,2.0,29.0,5.6,-0.8,-1.4,0.0,102250.0,1.0


In [18]:
# Attach the daily weather readings to every incident through the shared
# 'Date' key.
# how='inner' is the default, written out here: only incidents whose date has
# a weather row are kept.
# validate='many_to_one' makes pandas raise a MergeError if a date occurs more
# than once in weather_data, which would otherwise silently duplicate incidents.
ds_merged = pd.merge(ds2, weather_data, on='Date', how='inner', validate='many_to_one')
# NOTE(review): ds4 is not used in any cell visible here — looks like a
# leftover; confirm before removing.
ds4 = None

# drop rows without mean temp; .copy() makes the filtered result an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning
ds_merged = ds_merged[ds_merged['mean_temp'].notna()].copy()
ds_merged.head(5)

Unnamed: 0,DateOfCall,CalYear,HourOfCall,IncidentGroup,PropertyCategory,PropertyType,NumPumpsAttending,PumpHoursRoundUp,Notional Cost (£),Date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,1,2009,0,0,0,0,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
1,1,2009,0,1,1,1,1,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
2,1,2009,0,1,1,2,1,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
3,1,2009,0,1,1,3,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
4,1,2009,0,2,2,4,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0


In [19]:
# Convert the notional cost into ordinal categories.
# attention! it must be done after join to guarantee all categories will be represented in the dataset
#
# Left-closed bins, same thresholds as before:
#   0: cost < 300
#   1: 300 <= cost < 500
#   2: 500 <= cost < 700
#   3: 700 <= cost < 900
#   4: 900 <= cost < 1100
#   5: cost >= 1100
cost_bin_edges = [float('-inf'), 300, 500, 700, 900, 1100, float('inf')]

# Vectorised binning instead of a Python-level loop over every row:
# right=False closes each bin on the left, labels=False returns the integer
# bin index (0..5) rather than Interval labels.
cost_cat = pd.cut(
    ds_merged['Notional Cost (£)'],
    bins=cost_bin_edges,
    right=False,
    labels=False,
)

# add cost categories to dataset; assign() builds a new frame, so this works
# cleanly even when ds_merged was produced by filtering another frame
ds_merged = ds_merged.assign(CostCat=cost_cat)

# check whether the categories exist and how many corresponding rows they have
print(ds_merged['CostCat'].value_counts())
print(len(ds_merged))

0    701094
1    425134
2    104288
5     24706
3     16375
4     15020
Name: CostCat, dtype: int64
1286617


In [20]:
# Write the final frame — cleaned incidents merged with the weather
# information — to disk, omitting the index.
ds_merged.to_csv(path_or_buf='data/london_clean.csv', index=False)