In [1]:
# Import necessary libraries
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Read the pickle file into a DataFrame
df = pd.read_pickle('shared/Project-3_NYC_311_Calls.pkl')

# Extract relevant columns and convert 'Created Date' to datetime
df['Created Date'] = pd.to_datetime(df['Created Date'])
df = df.set_index('Created Date')[['Unique Key', 'Complaint Type']]

# Resample the data to a daily series
daily_series = df.resample('D').count()['Unique Key']

In [2]:
# Question 1
average_daily_complaints_2022 = daily_series['2022'].mean()
print("Average number of daily complaints received in 2022:", round(average_daily_complaints_2022))

Average number of daily complaints received in 2022: 8684


In [3]:
# Question 2
max_calls_date = daily_series.idxmax()
print("Date with the maximum number of calls:", max_calls_date)

Date with the maximum number of calls: 2020-08-04 00:00:00


In [4]:
# Question 3
max_calls_complaint_type = df[df.index.date == max_calls_date.date()]['Complaint Type'].value_counts().idxmax()
print("Most important complaint type on the date with the maximum number of calls:", max_calls_complaint_type)

Most important complaint type on the date with the maximum number of calls: Damaged Tree


In [5]:
# Question 4
quietest_month = daily_series.resample('M').sum().idxmin().strftime('%b')
print("Quietest month:", quietest_month)

Quietest month: Aug


In [6]:
# Question 5
result = seasonal_decompose(daily_series, model='additive')
seasonal_component_20201225 = result.seasonal['2020-12-25']
print("Value of the seasonal component on 2020-12-25:", round(seasonal_component_20201225))


Value of the seasonal component on 2020-12-25: 183


In [7]:
# Question 6
autocorrelation = daily_series.autocorr(1)
print("Autocorrelation of the number of daily calls with the number of calls the day prior:", round(autocorrelation, 2))

Autocorrelation of the number of daily calls with the number of calls the day prior: 0.75


In [8]:
# Question 7 - Using ARIMA for forecasting
train_size = len(daily_series) - 90
train, test = daily_series.iloc[:train_size], daily_series.iloc[train_size:]

# Fit the ARIMA model
order = (5, 1, 0)  # Example order, you may need to fine-tune
model = ARIMA(train, order=order)
model_fit = model.fit()

# Make predictions for the test set
forecast = model_fit.predict(start=len(train), end=len(train) + len(test) - 1, typ='levels')

# Calculate RMSE
rmse = mean_squared_error(test, forecast, squared=False)
print("RMSE on the test set:", round(rmse))

RMSE on the test set: 1279


