# Calculating the RMSE on top performing zipcodes

### Objective

Calculate the RMSE for the zipcodes selected by the SARIMA model in order to evaluate the model's predictions. The RMSE is computed between test_df, generated after the data retrieval, and the predictions dataframe generated from the SARIMA model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
%matplotlib inline
import datetime
import seaborn as sns
import statsmodels.api as sm
import warnings
import pickle
import functions as fn

# Use the 'fivethirtyeight' matplotlib style for every plot in this notebook.
plt.style.use('fivethirtyeight')
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/statsmodels.
warnings.filterwarnings('ignore')

In [None]:
# Load the test dataframe and the pickled SARIMA predictions from disk.
# NOTE(review): the predictions file is named 'pred_20zipcodes.pickle' while the
# variable is called pred_29zipcodes - confirm which zipcode count is correct.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.

with open('test.pickle', mode='rb') as test_file:
    test_df = pickle.load(test_file)

with open('pred_20zipcodes.pickle', mode='rb') as pred_file:
    pred_29zipcodes = pickle.load(pred_file)

In [None]:
# Flatten the SARIMA predictions (one entry per zipcode) into a single long
# dataframe with the columns date, RegionName and pred_mean.
# The per-zipcode frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies every previously accumulated row on each
# iteration. ignore_index=True gives the result a unique 0..n-1 index instead of
# repeating each frame's own row labels.

forecast_frames = []
for zipcode in pred_29zipcodes.keys():
    forecast = pred_29zipcodes[zipcode]
    forecast_frames.append(
        pd.DataFrame({
            # The confidence-interval table carries the forecast dates as its index.
            'date': forecast.conf_int().index,
            'RegionName': zipcode,
            'pred_mean': np.array(forecast.predicted_mean),
        })
    )

# pd.concat raises on an empty list, so fall back to an empty dataframe.
if forecast_frames:
    preds = pd.concat(forecast_frames, axis=0, ignore_index=True)
else:
    preds = pd.DataFrame()

In [None]:
# Collect the zipcodes present in the SARIMA predictions and in test_df, and keep
# only the ones that appear in both.
# NOTE(review): `intersection` is built from a set, so the order of its zipcodes
# is arbitrary; later steps should not rely on it matching the order in `preds`.

pred_zips = list(preds['RegionName'].unique())

test_zips = list(test_df['RegionName'].unique())

intersection = list(set(pred_zips).intersection(test_zips))

In [None]:
# Call fn.retrieving_zipcode_info on test_df for the zipcodes shared with the
# predictions. Per this notebook's description the result is a dictionary keyed by
# zipcode holding the corresponding monthly mean value.
# NOTE(review): the helper lives in the project's `functions` module and is not
# shown here - confirm the exact structure it returns.

test_dict = fn.retrieving_zipcode_info(test_df, intersection)

In [None]:
# Display the keys of test_dict.
test_dict.keys()

In [None]:
# Convert the per-zipcode dictionary into one long dataframe with the columns
# date, RegionName and actual_returns.
# The frames are gathered in a list and concatenated once (pd.concat inside the
# loop re-copies all accumulated rows on every iteration). ignore_index=True
# yields a clean 0..n-1 index directly, which makes a separate
# reset_index / drop('index') step unnecessary.

actual_frames = []
for zipcode in test_dict.keys():
    zip_values = test_dict[zipcode]
    actual_frames.append(
        pd.DataFrame({
            'date': zip_values.index,
            'RegionName': zipcode,
            'actual_returns': np.array(zip_values['value']),
        })
    )

# pd.concat raises on an empty list, so fall back to an empty dataframe.
if actual_frames:
    test_df_merged = pd.concat(actual_frames, axis=0, ignore_index=True)
else:
    test_df_merged = pd.DataFrame()

In [None]:
# Preview the first rows of the combined test dataframe.
test_df_merged.head()

The predictions dataframe contains data for the next 3 years; however, test_df only has data until July 2019. Therefore, we need to align both dataframes before merging.


In [None]:
# Drop the predictions dated after July 2019, the last month available in test_df.
# A boolean mask is used instead of dropping by index label, so only the rows whose
# own date is past the cutoff are removed even when index labels are repeated.
# The cutoff is a pd.Timestamp because recent pandas versions refuse to compare a
# datetime64 column with a plain datetime.date.

cutoff = pd.Timestamp('2019-07-01')
# `~(... > cutoff)` keeps exactly the rows the comparison does not flag.
preds = preds[~(pd.to_datetime(preds['date']) > cutoff)].reset_index(drop=True)

In [None]:
# Combine the actual test values and the SARIMA predictions in one dataframe.
# Rows are matched on the (date, RegionName) keys rather than on row position, so
# each prediction is paired with the actual value of the same zipcode and month
# even if the two dataframes list the zipcodes in a different order.
# The key columns are renamed first so the result keeps the column layout
# date_x, RegionName_x, actual_returns, date_y, RegionName_y, pred_mean.
# NOTE(review): assumes both dataframes store 'date' with the same dtype and the
# same monthly timestamps - confirm against fn.retrieving_zipcode_info.

divergence = pd.merge(
    test_df_merged.rename(columns={'date': 'date_x', 'RegionName': 'RegionName_x'}),
    preds.rename(columns={'date': 'date_y', 'RegionName': 'RegionName_y'}),
    left_on=['date_x', 'RegionName_x'],
    right_on=['date_y', 'RegionName_y'],
)

In [None]:
# Preview the first rows of the merged dataframe.
divergence.head()

In [None]:
# Remove the duplicated date and zipcode columns that came from the predictions
# side of the merge.

divergence.drop(columns=['date_y', 'RegionName_y'], inplace=True)

Calculating the root_mean_squared_error between the predicted data and the testing data.

In [None]:
# Compute the RMSE from the merged dataframe with the project helper fn.rmse.
# NOTE(review): fn.rmse is defined in the `functions` module (not shown here);
# confirm which columns it reads and the shape of its result.
rmse_sarima_zipcodes = fn.rmse(divergence)

In [None]:
# Display the RMSE results.
rmse_sarima_zipcodes

In [None]:
# Persist the RMSE results to a pickle file.

with open('rmse_sarima_zipcodes.pickle', mode='wb') as out_file:
    pickle.dump(rmse_sarima_zipcodes, out_file)