# Predicting Fire Brigade Call Outs on a Specific Date.

In [3]:
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [4]:
df = pd.read_csv('FireBrigadeCallOuts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
df["EVENT"] = df["EVENT"].fillna("No Event")
df['MONTH'] = df['MONTH'].astype(object)
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype(object)
df['DAY'] = df['DAY'].astype(object)
df['HOUR'] = df['HOUR'].astype(object)

## Baseline - Poisson With Time Features

In [5]:
mask = np.random.rand(len(df)) < 0.7
df_train = df[mask]
df_test = df[~mask]
print('Train Set ='+str(len(df_train)))
print('Test Set ='+str(len(df_test)))
expr = """COUNT ~ MONTH + DAY + DAY_OF_WEEK + DESCRIPTION + HOUR + STATION_AREA"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
while X_train.shape[1] != X_test.shape[1]:
    mask = np.random.rand(len(df)) < 0.7
    df_train = df[mask]
    df_test = df[~mask]
    y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
    y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
print(X_train.shape)
print(X_test.shape)

Train Set =27028
Test Set =11527
(27003, 140)
(11552, 140)


In [6]:
poisson_training_results1 = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                27003
Model:                            GLM   Df Residuals:                    26863
Model Family:                 Poisson   Df Model:                          139
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.1804e+05
Date:                Mon, 11 May 2020   Deviance:                       88497.
Time:                        16:12:07   Pearson chi2:                 8.91e+04
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [7]:
poisson_predictions1 = poisson_training_results1.get_prediction(X_test)
predictions_summary_frame1 = poisson_predictions1.summary_frame()

In [8]:
poisson1mean = predictions_summary_frame1["mean"]

## Negative Binomial Model #1 - With Time Features

In [9]:
expr = """COUNT ~ MONTH + DAY + DAY_OF_WEEK + DESCRIPTION + STATION_AREA + HOUR"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [10]:
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.mu)
df_train['LAMBDA'] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['COUNT'] - x['LAMBDA'])**2 - x['COUNT']) / x['LAMBDA'], axis=1)

[31.71761476 32.75888391 31.72030322 ... 58.8174092  56.29342642
 58.01230911]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [11]:
ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()
print(aux_olsr_results.params)
aux_olsr_results.tvalues

LAMBDA    0.068395
dtype: float64


LAMBDA    87.815553
dtype: float64

In [12]:
nb2_training_results0 = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
print(nb2_training_results0.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                27003
Model:                            GLM   Df Residuals:                    26863
Model Family:        NegativeBinomial   Df Model:                          139
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.0213e+05
Date:                Mon, 11 May 2020   Deviance:                       21827.
Time:                        16:12:17   Pearson chi2:                 2.19e+04
No. Iterations:                     8                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [13]:
nb2_predictions0 = nb2_training_results0.get_prediction(X_test)
nb2_predictions_summary_frame0 = nb2_predictions0.summary_frame()

In [14]:
nb0mean = nb2_predictions_summary_frame0["mean"]

## Negative Binomial Model #2 - With Weather Features

In [15]:
expr = """COUNT ~ MONTH + DAY + DAY_OF_WEEK + CLOUD_COVER + DESCRIPTION + WIND_SPEED + TEMPERATURE + PRECIPITATION + STATION_AREA + HOUR"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [16]:
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.mu)
df_train['LAMBDA'] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['COUNT'] - x['LAMBDA'])**2 - x['COUNT']) / x['LAMBDA'], axis=1)

[32.74735774 33.72343609 32.77734369 ... 55.26246316 53.15382259
 54.44863548]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [17]:
ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()

In [18]:
nb2_training_results1 = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
print(nb2_training_results1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                27003
Model:                            GLM   Df Residuals:                    26859
Model Family:        NegativeBinomial   Df Model:                          143
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.0142e+05
Date:                Mon, 11 May 2020   Deviance:                       21668.
Time:                        16:12:25   Pearson chi2:                 2.18e+04
No. Iterations:                     8                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [19]:
nb2_predictions1 = nb2_training_results1.get_prediction(X_test)
nb2_predictions_summary_frame1 = nb2_predictions1.summary_frame()

In [20]:
nb1mean = nb2_predictions_summary_frame1["mean"]

## Negative Binomial Model #3 - With Event Features

In [21]:
expr = """COUNT ~ MONTH + DAY + DAY_OF_WEEK + DESCRIPTION + EVENT + STATION_AREA + HOUR"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

In [22]:
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.mu)
df_train['LAMBDA'] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['COUNT'] - x['LAMBDA'])**2 - x['COUNT']) / x['LAMBDA'], axis=1)

[31.74369303 32.79852354 31.77352137 ... 58.77253305 56.20367914
 57.96601061]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [24]:
ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()

In [25]:
nb2_training_results2 = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
print(nb2_training_results2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  COUNT   No. Observations:                27003
Model:                            GLM   Df Residuals:                    26859
Model Family:        NegativeBinomial   Df Model:                          143
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.0212e+05
Date:                Mon, 11 May 2020   Deviance:                       21832.
Time:                        16:12:41   Pearson chi2:                 2.19e+04
No. Iterations:                     8                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [26]:
nb2_predictions2 = nb2_training_results2.get_prediction(X_test)
nb2_predictions_summary_frame2 = nb2_predictions2.summary_frame()

In [27]:
nb2mean = nb2_predictions_summary_frame2["mean"]

## Evaluation

In [28]:
X_test["ACTUAL"] = y_test["COUNT"]
actual = X_test["ACTUAL"]
X_test["POISSON1_PREDICTIONS"] = poisson1mean
X_test["NB0_PREDICTIONS"] = nb0mean
X_test["NB1_PREDICTIONS"] = nb1mean
X_test["NB2_PREDICTIONS"] = nb2mean

In [29]:
poisson1R2 = r2_score(actual, poisson1mean)
nb0R2 = r2_score(actual, nb0mean)
nb1R2 = r2_score(actual, nb1mean)
nb2R2 = r2_score(actual, nb2mean)

print("R2 Score for Poisson Model with Time Features              =  {:.4f}".format(poisson1R2))
print("R2 Score for Negative Binomial Model with Time Features    =  {:.4f}".format(nb0R2))
print("R2 Score for Negative Binomial Model with Weather Features =  {:.4f}".format(nb1R2))
print("R2 Score for Negative Binomial Model with Event Features   =  {:.4f}".format(nb2R2))

R2 Score for Poisson Model with Time Features              =  0.3025
R2 Score for Negative Binomial Model with Time Features    =  0.2922
R2 Score for Negative Binomial Model with Weather Features =  0.3125
R2 Score for Negative Binomial Model with Event Features   =  0.2779


In [30]:
poisson1MAE = mean_absolute_percentage_error(actual, poisson1mean)
nb0MAE = mean_absolute_percentage_error(actual, nb0mean)
nb1MAE = mean_absolute_percentage_error(actual, nb1mean)
nb2MAE = mean_absolute_percentage_error(actual, nb2mean)

print("Mean Absolute Percentage Error for Poisson Model with Time Features              = {:.4f}".format(poisson1MAE))
print("Mean Absolute Percentage Error for Negative Binomial Model with Time Features    = {:.4f}".format(nb0MAE))
print("Mean Absolute Percentage Error for Negative Binomial Model with Weather Features = {:.4f}".format(nb1MAE))
print("Mean Absolute Percentage Error for Negative Binomial Model with Event Features   = {:.4f}".format(nb2MAE))

Mean Absolute Percentage Error for Poisson Model with Time Features              = 29.5767
Mean Absolute Percentage Error for Negative Binomial Model with Time Features    = 29.0974
Mean Absolute Percentage Error for Negative Binomial Model with Weather Features = 28.9231
Mean Absolute Percentage Error for Negative Binomial Model with Event Features   = 27.6388
