# Predicting Fire Brigade Call Outs at a Local Level.

In [1]:
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [2]:
df = pd.read_csv('FireBrigadeCallOuts_StationArea.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
df["EVENT"] = df["EVENT"].fillna("No Event")
df['MONTH'] = df['MONTH'].astype(object)
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype(object)
df['DAY'] = df['DAY'].astype(object)
df['HOUR'] = df['HOUR'].astype(object)
df = df.loc[(df['STATION_AREA'] == "Tallaght") | (df['STATION_AREA'] == "Dolphins Barn")]
df = df.set_index('STATION_AREA')

## Baseline - Poisson With Time Features

In [3]:
mask = np.random.rand(len(df)) < 0.7
df_train = df[mask]
df_test = df[~mask]
print('Train Set ='+str(len(df_train)))
print('Test Set ='+str(len(df_test)))
expr = """COUNT ~ DESCRIPTION + HOUR + DAY + MONTH + DAY_OF_WEEK"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
while X_train.shape[1] != X_test.shape[1]:
    mask = np.random.rand(len(df)) < 0.7
    df_train = df[mask]
    df_test = df[~mask]
    y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
    y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
print(X_train.shape)
print(X_test.shape)

Train Set =7415
Test Set =3128
(7415, 117)
(3128, 117)


In [4]:
expr = """COUNT ~ DESCRIPTION + HOUR + DAY + MONTH + DAY_OF_WEEK"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [5]:
poisson_training_results1 = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                 7415
Model:                            GLM   Df Residuals:                     7298
Model Family:                 Poisson   Df Model:                          116
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.0035e+06
Date:                Mon, 11 May 2020   Deviance:                   1.9295e+06
Time:                        16:16:27   Pearson chi2:                 1.86e+06
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [6]:
poisson_predictions1 = poisson_training_results1.get_prediction(X_test)
predictions_summary_frame1 = poisson_predictions1.summary_frame()

In [7]:
poisson1mean = predictions_summary_frame1["mean"]

## Negative Binomial Model #1 - With Time Features

In [8]:
expr = """COUNT ~ DESCRIPTION + HOUR"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [9]:
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.mu)
df_train['LAMBDA'] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['COUNT'] - x['LAMBDA'])**2 - x['COUNT']) / x['LAMBDA'], axis=1)

[6028.31873568 4923.82542616 6040.21243087 ... 5579.83942114 5351.96638823
 5987.54430973]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [10]:
ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()
print(aux_olsr_results.params)
aux_olsr_results.tvalues

LAMBDA    0.044511
dtype: float64


LAMBDA    133.214691
dtype: float64

In [11]:
nb2_training_results0 = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
print(nb2_training_results0.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                 7415
Model:                            GLM   Df Residuals:                     7345
Model Family:        NegativeBinomial   Df Model:                           69
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -63228.
Date:                Mon, 11 May 2020   Deviance:                       8292.9
Time:                        16:16:33   Pearson chi2:                 7.61e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [12]:
nb2_predictions0 = nb2_training_results0.get_prediction(X_test)
nb2_predictions_summary_frame0 = nb2_predictions0.summary_frame()

In [13]:
nb0mean = nb2_predictions_summary_frame0["mean"]

## Negative Binomial Model #2 - With Weather Features

In [14]:
expr = """COUNT ~ DESCRIPTION + HOUR + WIND_SPEED"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [15]:
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.mu)
df_train['LAMBDA'] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['COUNT'] - x['LAMBDA'])**2 - x['COUNT']) / x['LAMBDA'], axis=1)

[6025.62250929 4921.9223432  6038.60251043 ... 5572.36554585 5344.33709043
 5978.99917736]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [16]:
ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()

In [17]:
nb2_training_results1 = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
print(nb2_training_results1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                 7415
Model:                            GLM   Df Residuals:                     7344
Model Family:        NegativeBinomial   Df Model:                           70
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -63228.
Date:                Mon, 11 May 2020   Deviance:                       8292.8
Time:                        16:16:37   Pearson chi2:                 7.61e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [18]:
nb2_predictions1 = nb2_training_results1.get_prediction(X_test)
nb2_predictions_summary_frame1 = nb2_predictions1.summary_frame()

In [19]:
nb1mean = nb2_predictions_summary_frame1["mean"]

## Negative Binomial Model #3 - With Event Features

In [20]:
expr = """COUNT ~ DESCRIPTION + HOUR"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [21]:
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.mu)
df_train['LAMBDA'] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['COUNT'] - x['LAMBDA'])**2 - x['COUNT']) / x['LAMBDA'], axis=1)

[6028.31873568 4923.82542616 6040.21243087 ... 5579.83942114 5351.96638823
 5987.54430973]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [22]:
ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()

In [23]:
nb2_training_results2 = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
print(nb2_training_results2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                 7415
Model:                            GLM   Df Residuals:                     7345
Model Family:        NegativeBinomial   Df Model:                           69
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -63228.
Date:                Mon, 11 May 2020   Deviance:                       8292.9
Time:                        16:16:40   Pearson chi2:                 7.61e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [24]:
nb2_predictions2 = nb2_training_results2.get_prediction(X_test)
nb2_predictions_summary_frame2 = nb2_predictions2.summary_frame()

In [25]:
nb2mean = nb2_predictions_summary_frame2["mean"]

## Evaluation

In [26]:
X_test["ACTUAL"] = y_test["COUNT"]
actual = X_test["ACTUAL"]
X_test["POISSON1_PREDICTIONS"] = poisson1mean
X_test["NB0_PREDICTIONS"] = nb0mean
X_test["NB1_PREDICTIONS"] = nb1mean
X_test["NB2_PREDICTIONS"] = nb2mean

In [27]:
XT1 = X_test.loc[['Tallaght']]
XT2 = X_test.loc[['Dolphins Barn']]

xt1actual = XT1["ACTUAL"]
xt1pois1 = XT1["POISSON1_PREDICTIONS"]
xt1nb0 = XT1["NB0_PREDICTIONS"]
xt1nb1 = XT1["NB1_PREDICTIONS"]
xt1nb2 = XT1["NB2_PREDICTIONS"]

xt2actual = XT2["ACTUAL"]
xt2pois1 = XT2["POISSON1_PREDICTIONS"]
xt2nb0 = XT2["NB0_PREDICTIONS"]
xt2nb1 = XT2["NB1_PREDICTIONS"]
xt2nb2 = XT2["NB2_PREDICTIONS"]

In [28]:
poisson1MAE = mean_absolute_percentage_error(xt1actual, xt1pois1)
nb0MAE = mean_absolute_percentage_error(xt1actual, xt1nb0)
nb1MAE = mean_absolute_percentage_error(xt1actual, xt1nb1)
nb2MAE = mean_absolute_percentage_error(xt1actual, xt1nb2)

print("Mean Absolute Percentage Error per Model for Call Outs from Tallaght are as follows:")
print("Poisson Model with Time Features              = {:.4f}".format(poisson1MAE))
print("Negative Binomial Model with Time Features    = {:.4f}".format(nb0MAE))
print("Negative Binomial Model with Weather Features = {:.4f}".format(nb1MAE))
print("Negative Binomial Model with Event Features   = {:.4f}".format(nb2MAE))

Mean Absolute Percentage Error per Model for Call Outs from Tallaght are as follows:
Poisson Model with Time Features              = 9.1742
Negative Binomial Model with Time Features    = 8.9957
Negative Binomial Model with Weather Features = 8.9995
Negative Binomial Model with Event Features   = 8.9957


In [29]:
poisson1MAE = mean_absolute_percentage_error(xt2actual, xt2pois1)
nb0MAE = mean_absolute_percentage_error(xt2actual, xt2nb0)
nb1MAE = mean_absolute_percentage_error(xt2actual, xt2nb1)
nb2MAE = mean_absolute_percentage_error(xt2actual, xt2nb2)

print("Mean Absolute Percentage Error per Model for Call Outs from Dolphins Barn are as follows:")
print("Poisson Model with Time Features              = {:.4f}".format(poisson1MAE))
print("Negative Binomial Model with Time Features    = {:.4f}".format(nb0MAE))
print("Negative Binomial Model with Weather Features = {:.4f}".format(nb1MAE))
print("Negative Binomial Model with Event Features   = {:.4f}".format(nb2MAE))

Mean Absolute Percentage Error per Model for Call Outs from Dolphins Barn are as follows:
Poisson Model with Time Features              = 46.0356
Negative Binomial Model with Time Features    = 45.9221
Negative Binomial Model with Weather Features = 45.9162
Negative Binomial Model with Event Features   = 45.9221
