In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the NYC 311 calls dataset from a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source — confirm the provenance of the shared pickle.
df = pd.read_pickle('shared/Project-3_NYC_311_Calls.pkl')

In [3]:
# Index the frame by call creation time: build a DatetimeIndex from the
# 'Created Date' column, then drop the now-redundant column.
df = df.set_index(pd.DatetimeIndex(df['Created Date'])).drop(columns='Created Date')

In [4]:
# Display the frame; pandas elides the middle rows of a long frame in the rendered output.
df

Unnamed: 0_level_0,Unique Key,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,City,Resolution Description,Borough,Open Data Channel Type
Created Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-04-06 00:00:00,20184537,HPD,Department of Housing Preservation and Develop...,HEATING,HEAT,RESIDENTIAL BUILDING,10002.0,NEW YORK,More than one complaint was received for this ...,MANHATTAN,UNKNOWN
2011-04-06 00:00:00,20184538,HPD,Department of Housing Preservation and Develop...,GENERAL CONSTRUCTION,WINDOWS,RESIDENTIAL BUILDING,11236.0,BROOKLYN,The Department of Housing Preservation and Dev...,BROOKLYN,UNKNOWN
2011-04-06 00:00:00,20184539,HPD,Department of Housing Preservation and Develop...,PAINT - PLASTER,WALLS,RESIDENTIAL BUILDING,10460.0,BRONX,The Department of Housing Preservation and Dev...,BRONX,UNKNOWN
2022-07-08 11:14:43,54732265,DSNY,Department of Sanitation,Dirty Condition,Trash,Sidewalk,10467.0,BRONX,The Department of Sanitation investigated this...,BRONX,PHONE
2011-04-06 00:00:00,20184540,HPD,Department of Housing Preservation and Develop...,NONCONST,VERMIN,RESIDENTIAL BUILDING,10460.0,BRONX,The Department of Housing Preservation and Dev...,BRONX,UNKNOWN
...,...,...,...,...,...,...,...,...,...,...,...
2011-04-06 00:00:00,20184532,HPD,Department of Housing Preservation and Develop...,HEATING,HEAT,RESIDENTIAL BUILDING,10468,BRONX,The Department of Housing Preservation and Dev...,BRONX,UNKNOWN
2011-04-06 00:00:00,20184533,HPD,Department of Housing Preservation and Develop...,HEATING,HEAT,RESIDENTIAL BUILDING,10018,NEW YORK,More than one complaint was received for this ...,MANHATTAN,UNKNOWN
2011-04-06 00:00:00,20184534,HPD,Department of Housing Preservation and Develop...,GENERAL CONSTRUCTION,STAIRS,RESIDENTIAL BUILDING,10460,BRONX,The Department of Housing Preservation and Dev...,BRONX,UNKNOWN
2011-04-06 00:00:00,20184535,HPD,Department of Housing Preservation and Develop...,GENERAL CONSTRUCTION,GAS,RESIDENTIAL BUILDING,11236,BROOKLYN,The Department of Housing Preservation and Dev...,BROOKLYN,UNKNOWN


In [5]:
# Keep only calls created in 2022, count them per calendar day,
# then average those daily counts.
df2022 = df.loc['2022']
daily_complaints_2022 = df2022.resample('D')['Unique Key'].count()
average_daily_complaints_2022 = daily_complaints_2022.mean()
print("Average number of daily complaints in 2022 is ",average_daily_complaints_2022)

Average number of daily complaints in 2022 is  8684.320547945206


In [6]:
# Count calls per calendar day across the whole dataset and list the days
# from busiest to quietest. The busiest day is 2020-08-04.
daily_complaints = df.resample('D')['Unique Key'].count()
daily_complaints.sort_values(ascending=False)

Created Date
2020-08-04    24415
2020-08-05    19560
2020-07-05    16916
2020-06-21    15883
2020-06-20    15825
              ...  
2011-04-24     2264
2012-10-28     2237
2011-07-03     2123
2011-08-27     1730
2023-08-04      384
Name: Unique Key, Length: 4964, dtype: int64

In [7]:
# Which complaint type dominated the busiest day?
# The date is read from the daily counts instead of being typed in by hand
# (it is 2020-08-04 for this data), so the cell stays correct if the data changes.
busiest_day = daily_complaints.idxmax().strftime('%Y-%m-%d')
# A day-resolution string selects every row time-stamped on that date.
maxdf = df.loc[busiest_day]
# Complaint types on that day, most frequent first ("Damaged Tree" leads by a wide margin).
maxdf['Complaint Type'].value_counts()

Complaint Type
Damaged Tree                           14863
Noise - Residential                      982
Request Large Bulky Item Collection      909
Street Light Condition                   617
Overgrown Tree/Branches                  609
                                       ...  
Bus Stop Shelter Placement                 1
Unsanitary Pigeon Condition                1
Public Payphone Complaint                  1
For Hire Vehicle Report                    1
Bridge Condition                           1
Name: count, Length: 125, dtype: int64

In [None]:
# Total calls per calendar month (1-12), pooled over all years, smallest first.
# Step 1: calls per month-end bin; step 2: add up bins sharing a calendar month.
# Original finding: December has the fewest calls.
# NOTE(review): the data ends partway through 2023 (the last daily bin in the
# earlier sorted output is 2023-08-04), so calendar months may not be covered
# by the same number of years — confirm raw totals are the intended comparison.
monthly_complaints = df['Unique Key'].resample('ME').size()
calendar_month = monthly_complaints.index.month
monthly_summary = monthly_complaints.groupby(calendar_month).sum()
monthly_summary.sort_values()

In [None]:
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Build the daily call-count series and run an additive ETS decomposition on it,
# then read off the seasonal term for 2020-12-25.
daily_call = df['Unique Key'].resample('D').size()
result = seasonal_decompose(daily_call, model='additive')
seasonal = result.seasonal
seasonal_value_on_christmas = seasonal.loc['2020-12-25']
print(
    "the value of the seasonal component on 2020-12-25 is",
    round(seasonal_value_on_christmas),
)

In [None]:
# Lag-1 autocorrelation: correlation between each day's call count and the previous day's.
autocorrelation_lag_1 = daily_call.autocorr(lag=1)
print(
    "Autocorrelation of the number of daily calls with the number of calls the day prior is",
    autocorrelation_lag_1,
)

In [None]:
from prophet import Prophet

In [None]:
# Turn the daily-count series into a one-column frame named 'Daily Calls'.
daily_call_df = daily_call.rename('Daily Calls').to_frame()

In [None]:
# Hold out the final 90 days as the test set for the Prophet forecast;
# everything before that is training data.
# NOTE(review): the last daily bin (2023-08-04) shows only 384 calls in the
# earlier sorted output, which suggests a partial day inside this hold-out
# window — confirm whether it should be scored.
train_set, test_set = daily_call_df.iloc[:-90], daily_call_df.iloc[-90:]

print("Training set: ", len(train_set))
print("Test set: ", len(test_set))

In [None]:
# Prophet expects a frame with a date column 'ds' and a value column 'y':
# move the date index into a column, keep the two fields, and relabel them.
train_set_prophet = (
    train_set
    .reset_index()
    .loc[:, ['Created Date', 'Daily Calls']]
    .set_axis(['ds', 'y'], axis=1)
)
train_set_prophet.head()

In [None]:
# Create a Prophet model with its default settings (no arguments are passed)
# and fit it on the training frame prepared above ('ds' / 'y' columns).
model = Prophet()
model.fit(train_set_prophet)

In [None]:
# Extend the fitted model's dates by 90 daily steps — the same length as the
# held-out test window — so the forecast covers the test period.
# Uses the uppercase 'D' (calendar day) alias, consistent with the
# resample('D') calls elsewhere in this notebook; newer pandas releases
# deprecate the lowercase 'd' spelling.
future = model.make_future_dataframe(periods=90, freq='D')

In [None]:
# Run the fitted model over every date in `future`; later cells read the
# 'ds' (date) and 'yhat' (point forecast) columns of the result.
forecast = model.predict(future)

In [None]:
# Keep the last 90 forecast rows (the hold-out window) as a one-column frame
# of predictions, indexed by forecast date under the name 'Date'.
preds = pd.DataFrame(
    {'Prediction': forecast['yhat'].iloc[-90:].to_numpy()},
    index=pd.DatetimeIndex(pd.to_datetime(forecast['ds'].iloc[-90:]), name='Date'),
)
preds

In [None]:
# Line up actual and predicted daily calls by date and show the error per day.
y_test = test_set['Daily Calls']
y_pred = preds['Prediction']
pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}).assign(diff=y_test - y_pred)

In [None]:
# Score the forecast on the hold-out days: MSE, its square root (RMSE), and MAE.
from sklearn.metrics import mean_absolute_error, mean_squared_error
mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print('MSE = ', mse)
print('RMSE = ', np.sqrt(mse))
print('MAE = ', mean_absolute_error(y_test, y_pred))