In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import settings
import pandas as pd
import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt

import graphviz
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression


  from pandas.core import datetools


## Assemble the master dataset from smaller datasets

In [2]:
def concatenate():
    # Create Super Dataset
    # From files downloaded from https://github.com/cityofaustin/hack-austin/tree/master/Austin%20Fire%20Department%20Data

    # 2012
    fire_2012 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_CY12 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2012.dropna(inplace=True)
    fire_2012.index = pd.to_datetime(fire_2012.index)

    # 2013
    fire_2013 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_CY13 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2013.dropna(inplace=True)
    fire_2013.index = pd.to_datetime(fire_2013.index)

    # 2014
    fire_2014 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_CY14 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2014.dropna(inplace=True)
    fire_2014.index = pd.to_datetime(fire_2014.index)

    # 2015
    fire_2015 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_CY15 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2015.dropna(inplace=True)
    fire_2015.index = pd.to_datetime(fire_2015.index)

    # 2016
    fire_2016 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_CY16 - SOC Fire Data Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2016.dropna(inplace=True)
    fire_2016.index = pd.to_datetime(fire_2016.index)

    # 2017
    fire_2017 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_CY17 - SOC Filtered Data_Generalized.csv'), index_col='AFD Time Phone Pickup')
    fire_2017.dropna(inplace=True)
    fire_2017.index = pd.to_datetime(fire_2017.index)

    # Combine all years into single dataframe
    final_df = pd.concat([fire_2012, fire_2013, fire_2014, fire_2015, fire_2016, fire_2017])

    # Calculate time between AFD phone pickup and first unit arrival in seconds and in minutes
    final_df['First Unit Arrived'] = pd.to_datetime(final_df['First Unit Arrived'])
    final_df['Response Time (s)'] = (final_df['First Unit Arrived'] - final_df.index).astype('timedelta64[s]')
    final_df['Response Time (m)'] = ((final_df['First Unit Arrived'] - final_df.index).astype('timedelta64[s]')) / 60
    final_df.reset_index(inplace=True)

    # Read in incident detail reports for all years available
    # Downloaded from https://data.austintexas.gov/browse?q=AFD&sortBy=relevance&anonymous=true
    AFD_13 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_Fire_Incidents_2013_January_Thru_December.csv'))
    AFD_14 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_Fire_Incidents_2014_January_Thru_December.csv'))
    AFD_15 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_Fire_Incidents_2015_January_Thru_December.csv'))
    AFD_16 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_Fire_Incidents_2016_January_Thru_December.csv'))
    AFD_17 = pd.read_csv(os.path.join('..', settings.DATA_DIR, 'AFD_Fire_Incidents_2017_January_Thru_December.csv'))

    # Concatenate all years incident detail reports into one dataframe
    frames = [AFD_13, AFD_14, AFD_15, AFD_16, AFD_17]
    result = pd.concat(frames)
    result = result.rename(index=str, columns={"MasterIncidentNumber": "Master Incident Number"})

    # Join dataframe with response time information with problem detail dataframe
    detail_final_df = pd.merge(final_df, result, how='inner', on='Master Incident Number')
    detail_final_df['day_of_week'] = detail_final_df['First Unit Arrived'].dt.dayofweek
    detail_final_df['hour'] = detail_final_df['First Unit Arrived'].dt.hour
    detail_final_df['late_response'] = np.where(detail_final_df['Response Time (s)'] > (60 * 8), 1, 0)

    del detail_final_df['CalendarYear_y']
    del detail_final_df['PriorityDescription_y']
    del detail_final_df['Response Status_y']
    detail_final_df.rename(columns={'CalendarYear_x': 'CalendarYear', 'Response Status_x': 'Response Status',
                                    'PriorityDescription_x': 'PriorityDescription'}, inplace=True)

    detail_final_df.to_csv(os.path.join('..', settings.PROCESSED_DIR, 'All Years with Response Times and Problem Types.csv'))
    df = detail_final_df
    return df

def create_dummies(df):
    df = pd.get_dummies(df, columns=['Problem', 'ResponseArea', 'day_of_week', 'hour'])
    return df

def write_data():
    df.to_csv(os.path.join('..', settings.PROCESSED_DIR, "all_years_with_RT-and-PT-dummies.csv"), index_label='index')
    pass

In [3]:
df = concatenate()
df = create_dummies(df)
write_data()

## Fit decision tree classifier to see features with highest splits in classifying a late response

In [4]:
def read_data():
    df = pd.read_csv(os.path.join('..', settings.PROCESSED_DIR, "all_years_with_RT-and-PT.csv"))
    return df

def limit_time_period(df):
    df = df[df['CalendarYear'] >= settings.YEARS_OF_ANALYSES]
    return df

# Fit decision tree regressor using features and response variable set in settings.py
# Prints feature importances sorted by descending by magnitude
def sort_important_features(df):
    dt = DecisionTreeClassifier()
    predictors = df.columns.tolist()
    predictors = [p for p in predictors if p not in settings.NON_PREDICTORS]
    model = dt.fit(df[predictors], df[settings.TARGET])
    results = {name: score for name, score in zip(predictors, dt.feature_importances_)}
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print("Feature Importances for the Response Variable: {}".format(settings.TARGET))
    for feat, importance in sorted_results:
        print("feature: {f}, importance: {i}".format(f=feat, i=importance))
    accuracy = dt.score(df[predictors], df[settings.TARGET])
    print("Accuracy: {}".format(accuracy))
    return model

def print_tree(model):
    # Saves the figure of the decision tree in the 'images' folder
    PROJECT_ROOT_DIR = ".."
    IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
    path = os.path.join(IMAGES_PATH, "Decision Tree Predicting {}".format(settings.TARGET))
    predictors = df.columns.tolist()
    features = [p for p in predictors if p not in settings.NON_PREDICTORS]
    data = export_graphviz(model, out_file=None, feature_names=features)
    graph = graphviz.Source(data)
    graph.render(path, view=True)
    pass

In [5]:
df = read_data()
df = limit_time_period(df)
model = sort_important_features(df)
print_tree(model)

Feature Importances for the Response Variable: late_response
feature: ResponseArea_00-2404, importance: 0.052518401296182474
feature: hour_15, importance: 0.050388846073004964
feature: Problem_DUMP - Dumpster Fire, importance: 0.04561059342815105
feature: hour_23, importance: 0.04256988719960766
feature: day_of_week_2, importance: 0.040542749713912044
feature: day_of_week_4, importance: 0.030407062285434033
feature: ResponseArea_00-1505, importance: 0.02752311816469794
feature: ResponseArea_00-4005, importance: 0.027409620770204788
feature: ResponseArea_00-4209, importance: 0.027296823976911723
feature: ResponseArea_00-2001, importance: 0.027184722030394706
feature: ResponseArea_00-3808, importance: 0.027073309235187797
feature: ResponseArea_00-3209, importance: 0.026962579954062198
feature: ResponseArea_00-3102, importance: 0.026852528607311407
feature: ResponseArea_00-3106, importance: 0.026743149672046912
feature: ResponseArea_00-3608, importance: 0.026634437681511112
feature: Respo

  if self.run_code(code, result):


## Run random forest classifier to find important features in late responses

In [7]:
# Read in the data
def read_data():
    df = pd.read_csv(os.path.join('..', settings.PROCESSED_DIR, "all_years_with_RT-and-PT.csv"))
    return df

def limit_time_period(df):
    df = df[df['CalendarYear'] >= settings.YEARS_OF_ANALYSES]
    return df

# Fit random forest regressor and store feature importances and column names in a dictionary
# Print sorted feature importances descending by magnitude
def sort_important_features(df):
    rf = RandomForestClassifier()
    predictors = df.columns.tolist()
    predictors = [p for p in predictors if p not in settings.NON_PREDICTORS]
    model = rf.fit(df[predictors], df[settings.TARGET])
    importances = rf.feature_importances_

    results = {name: score for name, score in zip(predictors, importances)}
    print("Feature Importances for the Response Variable: {}".format(settings.TARGET))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    for feat, importance in sorted_results:
        print('feature: {f}, importance: {i}'.format(f=feat, i=importance))
    accuracy = rf.score(df[predictors], df[settings.TARGET])
    print("Accuracy: {}".format(accuracy))
    names = [k[0] for k in sorted_results]
    return model, importances, names

# Create horizontal bar chart showing feature importances descending by magnitude
def plot_feature_importances(df, importances, names):
    reversed_names = names[::-1]
    predictors = df.columns.tolist()
    predictors = [p for p in predictors if p not in settings.NON_PREDICTORS]
    features = predictors
    indices = np.argsort(importances)

    plt.figure(figsize=(30, 90))
    matplotlib.rcParams.update({'font.size': 22})
    plt.title('Feature Importances Predicting {}'.format(settings.TARGET))
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), reversed_names)
    plt.xlabel('Relative Importance')
    save_fig('Feature Importances Predicting {}'.format(settings.TARGET))
    pass

# Save the figure of feature importances in the 'images' folder
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    # Where to save the figures
    PROJECT_ROOT_DIR = ".."
    IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    pass

In [8]:
df = read_data()
rf, importances, names = sort_important_features(df)
plot_feature_importances(df, importances, names)

  if self.run_code(code, result):


Feature Importances for the Response Variable: late_response
feature: day_of_week_0, importance: 0.027817449764420698
feature: day_of_week_4, importance: 0.027101820889883006
feature: day_of_week_6, importance: 0.025825292403702748
feature: day_of_week_1, importance: 0.024154610195773238
feature: day_of_week_2, importance: 0.023954975673587446
feature: day_of_week_3, importance: 0.023615438610396437
feature: day_of_week_5, importance: 0.022949185255625733
feature: Problem_ELEC - Electrical Fire, importance: 0.020945482005449562
feature: ResponseArea_00-3603, importance: 0.019710320610650277
feature: hour_17, importance: 0.018634038039287704
feature: hour_18, importance: 0.017998259500823288
feature: Problem_GRASS - Small Grass Fire, importance: 0.01771994346660672
feature: hour_15, importance: 0.01729695658208947
feature: Problem_TRASH - Trash Fire, importance: 0.014450292406309795
feature: hour_21, importance: 0.014256084678473373
feature: hour_14, importance: 0.014007801607837055
fea

Saving figure Feature Importances Predicting late_response


## Run linear regression and logistic regression on days of week and problem types

In [9]:
# Read in the data
def read_data():
    df = pd.read_csv(os.path.join('..', settings.PROCESSED_DIR, "all_years_with_RT-and-PT.csv"))
    return df

def limit_time_period(df):
    df = df[df['CalendarYear'] >= settings.YEARS_OF_ANALYSES]
    return df

# Fits a linear model using features and 'Response Time (m)' ratings specified here (not set in the settings.py script)
# Prints the summary page to show coefficients, p-values, and R squared
def create_summary_linear(df):
    lr = LinearRegression()
    predictors = ['day_of_week_0', 'day_of_week_1', 'day_of_week_2','day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'Problem_AUTO - Auto Fire', 'Problem_BBQ - Unsafe Cooking', 'Problem_BOX -Structure Fire', 'Problem_BOXL- Structure Fire',  'Problem_DUMP - Dumpster Fire', 'Problem_ELEC - Electrical Fire', 'Problem_GRASS - Small Grass Fire','Problem_TRASH - Trash Fire']
    lr.fit(df[predictors], df['Response Time (m)'])

    X = df[predictors]
    X2 = sm.add_constant(X)
    est = sm.OLS(df['Response Time (m)'], X2)
    est2 = est.fit()
    print(est2.summary())
    pass

# Fits a linear model using features and 'late_response' specified here (not set in the settings.py script)
# Prints the summary page to show coefficients, p-values, and R squared
def create_summary_logistic(df):
    # import chisqprob to prevent error as noted in https://github.com/statsmodels/statsmodels/issues/3931
    from scipy import stats
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
    lr = LogisticRegression()
    # omits to put 'day_of_week_6' or 'Problem_BRSHL - Brush Alarm / Light' in the model as these will be the constants
    predictors = ['day_of_week_0', 'day_of_week_1', 'day_of_week_2','day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'Problem_AUTO - Auto Fire', 'Problem_BBQ - Unsafe Cooking', 'Problem_BOX -Structure Fire', 'Problem_BOXL- Structure Fire',  'Problem_DUMP - Dumpster Fire', 'Problem_ELEC - Electrical Fire', 'Problem_GRASS - Small Grass Fire','Problem_TRASH - Trash Fire']
    lr.fit(df[predictors], df['late_response'])

    X = df[predictors]
    X2 = sm.add_constant(X)
    est = sm.Logit(df['late_response'], X2)
    est2 = est.fit()
    print(est2.summary())

In [10]:
df = read_data()
create_summary_linear(df)
create_summary_logistic(df)

                            OLS Regression Results                            
Dep. Variable:      Response Time (m)   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     4.065
Date:                Fri, 09 Feb 2018   Prob (F-statistic):           4.40e-07
Time:                        15:03:58   Log-Likelihood:                -15841.
No. Observations:                6240   AIC:                         3.171e+04
Df Residuals:                    6225   BIC:                         3.181e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

  if self.run_code(code, result):
