In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import numpy as np

In [4]:
def concatenate():
    # Create Super Dataset
    # From files downloaded from https://github.com/cityofaustin/hack-austin/tree/master/Austin%20Fire%20Department%20Data

    # 2012
    fire_2012 = pd.read_csv(os.path.join('..', 'AFD_CY12 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2012.dropna(inplace=True)
    fire_2012.index = pd.to_datetime(fire_2012.index)

    # 2013
    fire_2013 = pd.read_csv(os.path.join('..', 'AFD_CY13 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2013.dropna(inplace=True)
    fire_2013.index = pd.to_datetime(fire_2013.index)

    # 2014
    fire_2014 = pd.read_csv(os.path.join('..', 'AFD_CY14 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2014.dropna(inplace=True)
    fire_2014.index = pd.to_datetime(fire_2014.index)

    # 2015
    fire_2015 = pd.read_csv(os.path.join('..', 'AFD_CY15 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2015.dropna(inplace=True)
    fire_2015.index = pd.to_datetime(fire_2015.index)

    # 2016
    fire_2016 = pd.read_csv(os.path.join('..', 'AFD_CY16 - SOC Fire Data Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2016.dropna(inplace=True)
    fire_2016.index = pd.to_datetime(fire_2016.index)

    # 2017
    fire_2017 = pd.read_csv(os.path.join('..', 'AFD_CY17 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2017.dropna(inplace=True)
    fire_2017.index = pd.to_datetime(fire_2017.index)

    # Combine all years into single dataframe
    final_df = pd.concat([fire_2012, fire_2013, fire_2014, fire_2015, fire_2016, fire_2017])

    # Calculate time between AFD phone pickup and first unit arrival in seconds and in minutes
    final_df['First Unit Arrived'] = pd.to_datetime(final_df['First Unit Arrived'])
    final_df['Response Time (s)'] = (final_df['First Unit Arrived'] - final_df.index).astype('timedelta64[s]')
    final_df['Response Time (m)'] = ((final_df['First Unit Arrived'] - final_df.index).astype('timedelta64[s]')) / 60
    final_df.reset_index(inplace=True)

    # Read in incident detail reports for all years available
    # Downloaded from https://data.austintexas.gov/browse?q=AFD&sortBy=relevance&anonymous=true
    AFD_13 = pd.read_csv(os.path.join('..', 'AFD_Fire_Incidents_2013_January_Thru_December.csv'))
    AFD_14 = pd.read_csv(os.path.join('..', 'AFD_Fire_Incidents_2014_January_Thru_December.csv'))
    AFD_15 = pd.read_csv(os.path.join('..', 'AFD_Fire_Incidents_2015_January_Thru_December.csv'))
    AFD_16 = pd.read_csv(os.path.join('..', 'AFD_Fire_Incidents_2016_January_Thru_December.csv'))

    # Concatenate all years incident detail reports into one dataframe
    frames = [AFD_13, AFD_14, AFD_15, AFD_16]
    result = pd.concat(frames)
    result = result.rename(index=str, columns={"MasterIncidentNumber": "Master Incident Number"})

    # Join dataframe with response time information with problem detail dataframe
    detail_final_df = pd.merge(final_df, result, how='inner', on='Master Incident Number')
    detail_final_df['day_of_week'] = detail_final_df['First Unit Arrived'].dt.dayofweek
    detail_final_df['hour'] = detail_final_df['First Unit Arrived'].dt.hour
    detail_final_df['late_response'] = np.where(detail_final_df['Response Time (s)'] > (60 * 8), 1, 0)

    del detail_final_df['CalendarYear_y']
    del detail_final_df['PriorityDescription_y']
    del detail_final_df['Response Status_y']
    detail_final_df.rename(columns={'CalendarYear_x': 'CalendarYear', 'Response Status_x': 'Response Status',
                                    'PriorityDescription_x': 'PriorityDescription'}, inplace=True)

    detail_final_df.to_csv(os.path.join('..', 'All Years with Response Times and Problem Types.csv'))
    return detail_final_df

In [5]:
df = concatenate()

In [6]:
df.head()

Unnamed: 0,AFD Time Phone Pickup,Master Incident Number,CalendarYear,Cancellation Status,General Dispatched Problem,Calltaker Agency (AFD or EMS),Order of AFD Arrival,Response Area (COA Only),First Arriving Unit Name,Response Status,...,Call_Type,Problem,Battalion,Jurisdiction,ResponseArea,Districts,Location 1,day_of_week,hour,late_response
0,2013-01-01 00:08:18,13000006,2013,Not Cancelled,Fire,AFD,1st or Only AFD,00-3001,ENG23,Code 3,...,Fire,TRASH - Trash Fire,B3,AFD,00-3001,-,"(30368606, 97697237)",1,0,1
1,2013-01-01 00:37:29,13000016,2013,Not Cancelled,Fire,AFD,1st or Only AFD,00-2205,ENG22,Code 3,...,Fire,TRASH - Trash Fire,B5,AFD,00-2205,-,"(30222548, 97695303)",1,0,0
2,2013-01-01 17:00:25,13000263,2013,Not Cancelled,Fire,AFD,1st or Only AFD,00-2402,ENG24,Code 3,...,Fire,BOXL- Structure Fire,B5,AFD,00-2402,-,"(30204730, 97756601)",1,17,0
3,2013-01-01 18:42:02,13000281,2013,Not Cancelled,Fire,AFD,1st or Only AFD,00-0701,ENG07,Code 3,...,Fire,TRASH - Trash Fire,B5,AFD,00-0701,-,"(30251905, 97727779)",1,18,0
4,2013-01-01 19:21:55,13000291,2013,Not Cancelled,Fire,AFD,1st or Only AFD,00-2201,ENG22,Code 3,...,Fire,AUTO - Auto Fire,B5,AFD,00-2201,-,"(30230773, 97721155)",1,19,0


In [None]:
## Use Random Forest to find important features in 2016

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
my_columns = list(df_dummies.columns)

In [15]:
my_columns

['AFD Time Phone Pickup',
 'Master Incident Number',
 'CalendarYear',
 'Cancellation Status',
 'General Dispatched Problem ',
 'Calltaker Agency (AFD or EMS)',
 'Order of AFD Arrival',
 'Response Area (COA Only)',
 'First Arriving Unit Name',
 'Response Status',
 'PriorityDescription',
 'EMS Time Phone Pickup',
 'Earliest Time Phone Pickup',
 'First Unit Assigned',
 'Did not Reduce to Code 1',
 'Onscene Time Was Correct',
 'Latitude',
 'Longitude',
 'First Unit Enroute',
 'First Unit Arrived',
 'Response Time (s)',
 'Response Time (m)',
 'Month',
 'DayOfMonth',
 'Call_Type',
 'Battalion',
 'Jurisdiction',
 'Districts',
 'Location 1',
 'late_response',
 'Problem_AUTO - Auto Fire',
 'Problem_BBQ - Unsafe Cooking',
 'Problem_BOX -Structure Fire',
 'Problem_BOXL- Structure Fire',
 'Problem_BRSHL - Brush Alarm / Light',
 'Problem_DUMP - Dumpster Fire',
 'Problem_ELEC - Electrical Fire',
 'Problem_GRASS - Small Grass Fire',
 'Problem_TRASH - Trash Fire',
 'ResponseArea_00-0040',
 'ResponseAr

In [21]:
df_2016 = df[df['CalendarYear'] >= 2016]

In [22]:
df_dummies_2016 = pd.get_dummies(df_2016, columns=['Problem', 'ResponseArea', 'day_of_week', 'hour'])

In [23]:
df_dummies_2016.head()

Unnamed: 0,AFD Time Phone Pickup,Master Incident Number,CalendarYear,Cancellation Status,General Dispatched Problem,Calltaker Agency (AFD or EMS),Order of AFD Arrival,Response Area (COA Only),First Arriving Unit Name,Response Status,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
5744,2016-01-01 00:34:56,16000015,2016,Cancelled On Arrival,Fire,AFD,1st or Only AFD,00-2004,ENG20,Code 3,...,0,0,0,0,0,0,0,0,0,0
5745,2016-01-01 02:04:57,16000045,2016,Not Cancelled,Fire,AFD,1st or Only AFD,00-1801,ENG18,Code 3,...,0,0,0,0,0,0,0,0,0,0
5746,2016-01-01 10:59:44,16000165,2016,Not Cancelled,Fire,AFD,1st or Only AFD,00-4001,ENG40,Code 3,...,0,0,0,0,0,0,0,0,0,0
5747,2016-01-01 12:25:08,16000184,2016,Not Cancelled,Fire,AFD,1st or Only AFD,00-3001,ENG30,Code 3,...,0,0,0,0,0,0,0,0,0,0
5748,2016-01-01 12:36:12,16000188,2016,Not Cancelled,Fire,AFD,1st or Only AFD,00-0804,ENG08,Code 3,...,0,0,0,0,0,0,0,0,0,0


In [24]:
my_columns = list(df_dummies_2016.columns)

In [25]:
my_columns


['AFD Time Phone Pickup',
 'Master Incident Number',
 'CalendarYear',
 'Cancellation Status',
 'General Dispatched Problem ',
 'Calltaker Agency (AFD or EMS)',
 'Order of AFD Arrival',
 'Response Area (COA Only)',
 'First Arriving Unit Name',
 'Response Status',
 'PriorityDescription',
 'EMS Time Phone Pickup',
 'Earliest Time Phone Pickup',
 'First Unit Assigned',
 'Did not Reduce to Code 1',
 'Onscene Time Was Correct',
 'Latitude',
 'Longitude',
 'First Unit Enroute',
 'First Unit Arrived',
 'Response Time (s)',
 'Response Time (m)',
 'Month',
 'DayOfMonth',
 'Call_Type',
 'Battalion',
 'Jurisdiction',
 'Districts',
 'Location 1',
 'late_response',
 'Problem_AUTO - Auto Fire',
 'Problem_BBQ - Unsafe Cooking',
 'Problem_BOXL- Structure Fire',
 'Problem_BRSHL - Brush Alarm / Light',
 'Problem_DUMP - Dumpster Fire',
 'Problem_ELEC - Electrical Fire',
 'Problem_GRASS - Small Grass Fire',
 'Problem_TRASH - Trash Fire',
 'ResponseArea_00-0101',
 'ResponseArea_00-0102',
 'ResponseArea_00-0

In [26]:
features = [
 'Problem_AUTO - Auto Fire',
 'Problem_BBQ - Unsafe Cooking',
 'Problem_BOXL- Structure Fire',
 'Problem_BRSHL - Brush Alarm / Light',
 'Problem_DUMP - Dumpster Fire',
 'Problem_ELEC - Electrical Fire',
 'Problem_GRASS - Small Grass Fire',
 'Problem_TRASH - Trash Fire',
 'ResponseArea_00-0101',
 'ResponseArea_00-0102',
 'ResponseArea_00-0201',
 'ResponseArea_00-0202',
 'ResponseArea_00-0203',
 'ResponseArea_00-0301',
 'ResponseArea_00-0303',
 'ResponseArea_00-0401',
 'ResponseArea_00-0501',
 'ResponseArea_00-0502',
 'ResponseArea_00-0503',
 'ResponseArea_00-0504',
 'ResponseArea_00-0505',
 'ResponseArea_00-0601',
 'ResponseArea_00-0602',
 'ResponseArea_00-0603',
 'ResponseArea_00-0604',
 'ResponseArea_00-0701',
 'ResponseArea_00-0702',
 'ResponseArea_00-0801',
 'ResponseArea_00-0802',
 'ResponseArea_00-0803',
 'ResponseArea_00-0804',
 'ResponseArea_00-0805',
 'ResponseArea_00-0806',
 'ResponseArea_00-0807',
 'ResponseArea_00-0902',
 'ResponseArea_00-0903',
 'ResponseArea_00-1001',
 'ResponseArea_00-1002',
 'ResponseArea_00-1003',
 'ResponseArea_00-1101',
 'ResponseArea_00-1102',
 'ResponseArea_00-1103',
 'ResponseArea_00-1104',
 'ResponseArea_00-1201',
 'ResponseArea_00-1202',
 'ResponseArea_00-1204',
 'ResponseArea_00-1205',
 'ResponseArea_00-1206',
 'ResponseArea_00-1401',
 'ResponseArea_00-1403',
 'ResponseArea_00-1404',
 'ResponseArea_00-1405',
 'ResponseArea_00-1501',
 'ResponseArea_00-1504',
 'ResponseArea_00-1505',
 'ResponseArea_00-1601',
 'ResponseArea_00-1602',
 'ResponseArea_00-1603',
 'ResponseArea_00-1604',
 'ResponseArea_00-1605',
 'ResponseArea_00-1606',
 'ResponseArea_00-1701',
 'ResponseArea_00-1703',
 'ResponseArea_00-1705',
 'ResponseArea_00-1706',
 'ResponseArea_00-1707',
 'ResponseArea_00-1708',
 'ResponseArea_00-1709',
 'ResponseArea_00-1710',
 'ResponseArea_00-1712',
 'ResponseArea_00-1801',
 'ResponseArea_00-1802',
 'ResponseArea_00-1803',
 'ResponseArea_00-1804',
 'ResponseArea_00-1805',
 'ResponseArea_00-1901',
 'ResponseArea_00-1903',
 'ResponseArea_00-2001',
 'ResponseArea_00-2002',
 'ResponseArea_00-2004',
 'ResponseArea_00-2005',
 'ResponseArea_00-2006',
 'ResponseArea_00-2101',
 'ResponseArea_00-2102',
 'ResponseArea_00-2103',
 'ResponseArea_00-2104',
 'ResponseArea_00-2201',
 'ResponseArea_00-2202',
 'ResponseArea_00-2203',
 'ResponseArea_00-2204',
 'ResponseArea_00-2205',
 'ResponseArea_00-2206',
 'ResponseArea_00-2301',
 'ResponseArea_00-2303',
 'ResponseArea_00-2304',
 'ResponseArea_00-2305',
 'ResponseArea_00-2306',
 'ResponseArea_00-2401',
 'ResponseArea_00-2402',
 'ResponseArea_00-2403',
 'ResponseArea_00-2404',
 'ResponseArea_00-2501',
 'ResponseArea_00-2502',
 'ResponseArea_00-2504',
 'ResponseArea_00-2601',
 'ResponseArea_00-2606',
 'ResponseArea_00-2607',
 'ResponseArea_00-2608',
 'ResponseArea_00-2701',
 'ResponseArea_00-2704',
 'ResponseArea_00-2705',
 'ResponseArea_00-2706',
 'ResponseArea_00-27SV',
 'ResponseArea_00-2801',
 'ResponseArea_00-2803',
 'ResponseArea_00-2804',
 'ResponseArea_00-2805',
 'ResponseArea_00-2806',
 'ResponseArea_00-2808',
 'ResponseArea_00-2901',
 'ResponseArea_00-2902',
 'ResponseArea_00-2903',
 'ResponseArea_00-2904',
 'ResponseArea_00-3001',
 'ResponseArea_00-3002',
 'ResponseArea_00-3003',
 'ResponseArea_00-3004',
 'ResponseArea_00-3005',
 'ResponseArea_00-3102',
 'ResponseArea_00-3105',
 'ResponseArea_00-3106',
 'ResponseArea_00-3201',
 'ResponseArea_00-3203',
 'ResponseArea_00-3209',
 'ResponseArea_00-3303',
 'ResponseArea_00-3304',
 'ResponseArea_00-3401',
 'ResponseArea_00-3403',
 'ResponseArea_00-3404',
 'ResponseArea_00-3501',
 'ResponseArea_00-3502',
 'ResponseArea_00-3503',
 'ResponseArea_00-3504',
 'ResponseArea_00-3601',
 'ResponseArea_00-3602',
 'ResponseArea_00-3603',
 'ResponseArea_00-3604',
 'ResponseArea_00-3606',
 'ResponseArea_00-3607',
 'ResponseArea_00-3608',
 'ResponseArea_00-3611',
 'ResponseArea_00-3802',
 'ResponseArea_00-3808',
 'ResponseArea_00-3809',
 'ResponseArea_00-3810',
 'ResponseArea_00-3901',
 'ResponseArea_00-3906',
 'ResponseArea_00-3907',
 'ResponseArea_00-3908',
 'ResponseArea_00-4001',
 'ResponseArea_00-4002',
 'ResponseArea_00-4003',
 'ResponseArea_00-4004',
 'ResponseArea_00-4005',
 'ResponseArea_00-4101',
 'ResponseArea_00-4103',
 'ResponseArea_00-4105',
 'ResponseArea_00-4201',
 'ResponseArea_00-4209',
 'ResponseArea_00-4302',
 'ResponseArea_00-4401',
 'ResponseArea_00-4502',
 'ResponseArea_00-4504',
 'day_of_week_0',
 'day_of_week_1',
 'day_of_week_2',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'hour_0',
 'hour_1',
 'hour_2',
 'hour_3',
 'hour_4',
 'hour_5',
 'hour_6',
 'hour_7',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_11',
 'hour_12',
 'hour_13',
 'hour_14',
 'hour_15',
 'hour_16',
 'hour_17',
 'hour_18',
 'hour_19',
 'hour_20',
 'hour_21',
 'hour_22',
 'hour_23'
]

In [27]:
rf = RandomForestClassifier()
rf.fit(df_dummies_2016[features], df_dummies_2016['late_response'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
zipped = zip(features, rf.feature_importances_)
zipped_sorted = sorted(zipped, key=lambda x: x[1], reverse=True)
for feat, importance in zipped_sorted:
    print('feature: {f}, importance: {i}'.format(f=feat, i=importance))

feature: ResponseArea_00-2404, importance: 0.05433250954518354
feature: ResponseArea_00-3602, importance: 0.031840875751433975
feature: ResponseArea_00-2001, importance: 0.031504890636066415
feature: ResponseArea_00-3209, importance: 0.0314533349566982
feature: ResponseArea_00-3608, importance: 0.028693426552989433
feature: Problem_DUMP - Dumpster Fire, importance: 0.024955308182608486
feature: Problem_TRASH - Trash Fire, importance: 0.02390222868824215
feature: day_of_week_6, importance: 0.023299993103289243
feature: ResponseArea_00-2704, importance: 0.023014184827794524
feature: ResponseArea_00-2607, importance: 0.02258141858737828
feature: ResponseArea_00-3504, importance: 0.02148484318241485
feature: Problem_AUTO - Auto Fire, importance: 0.02113666617721444
feature: day_of_week_0, importance: 0.02110220235393532
feature: ResponseArea_00-1712, importance: 0.020226249923031108
feature: hour_15, importance: 0.020189277103751433
feature: ResponseArea_00-4401, importance: 0.019890170298