In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
import numpy as np

# Ground-truth file for the forecast months. It is loaded here because the
# evaluation code further down merges on a frame named `actual`, which was
# otherwise never defined on a fresh kernel.
# NOTE(review): assumes this CSV provides the `cfips`, `first_day_of_month`
# and `microbusiness_density` columns that the merge/SMAPE code reads — confirm.
actual_test = 'revealed_test.csv'
actual = pd.read_csv(actual_test)

# Previously trained regressor, restored from disk.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# model files that come from a trusted source.
model_filename = 'hist_gradient_model.joblib'
loaded_model = joblib.load(model_filename)

# Lag-feature table read from disk; it is kept as a DataFrame because the
# next cell selects and drops columns by name.
mbd = pd.read_csv("MBD_CFIP_20monthlag.csv")

# Dropping rows from CFIPS #28055 which contains values of 0, in order to calculate SMAPE
mbd = mbd.drop(mbd[mbd["CFIP"]==28055].index)

In [None]:

# Keep only the rows whose 'FirstDateOrder' is 16 (the last segment of months
# for each county). Work on an explicit copy under a new name so the source
# frame `mbd` is not mutated and this cell can be re-run safely.
mbd_latest = mbd.loc[mbd['FirstDateOrder'] == 16].copy()
# The frame is laid out as 20 regressors for 1 target. We want the 20 most
# recent values as regressors, so drop 'MBD_0' and shift 'FirstDateOrder' by 1.
# NOTE(review): assumes 'MBD_0' is the column that falls out of the window and
# that the lag columns are the trailing columns of the frame — confirm.
mbd_latest['FirstDateOrder'] += 1
mbd_latest = mbd_latest.drop(columns=['MBD_0'])

mbd_np = mbd_latest.to_numpy()
# Predict the first unknown month from the last 20 columns.
predicted0 = loaded_model.predict(mbd_np[:, -20:])
# Append the first prediction so it becomes the newest value in the window.
mbd_np = np.hstack((mbd_np, predicted0.reshape(-1, 1)))
# Predict the second unknown month from the shifted window
# (19 observed values + the first prediction).
predicted1 = loaded_model.predict(mbd_np[:, -20:])
mbd_np = np.hstack((mbd_np, predicted1.reshape(-1, 1)))

new_colnames = list(mbd_latest.columns) + ['P_MBD_21', 'P_MBD_22']
mbd_with_preds = pd.DataFrame(mbd_np, columns=new_colnames)

# Reshape to long form: one row per (county, forecast month).
mbd_original_form = mbd_with_preds.loc[:, ['CFIP', 'P_MBD_21', 'P_MBD_22']]
mbd_original_form = mbd_original_form.melt(
    id_vars=['CFIP'],
    value_vars=['P_MBD_21', 'P_MBD_22'],
    var_name='Month',  # must be a scalar for non-MultiIndex columns
    value_name='MBD',
)
# Translate the prediction column labels into the month strings used as the
# merge key below.
mbd_original_form['Month'] = mbd_original_form['Month'].map(
    {'P_MBD_21': '2022-11-01', 'P_MBD_22': '2022-12-01'}
)
# The round-trip through a single NumPy array can change column dtypes
# (float or object); restore an integer key and float predictions.
# NOTE(review): assumes `actual['cfips']` is an integer column — confirm.
mbd_original_form['CFIP'] = mbd_original_form['CFIP'].astype('int64')
mbd_original_form['MBD'] = mbd_original_form['MBD'].astype(float)

# Join the predictions onto the actual values by county and month
# (inner join: only county/month pairs present on both sides are kept).
mbd_to_compare = pd.merge(
    actual,
    mbd_original_form,
    left_on=['cfips', 'first_day_of_month'],
    right_on=['CFIP', 'Month'],
)
def smape(y_test, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``mean(|y_test - y_pred| / ((|y_test| + |y_pred|) / 2)) * 100``.

    Parameters
    ----------
    y_test : array-like
        Actual values.
    y_pred : array-like
        Predicted values, same length as ``y_test``. Values are paired by
        position.

    Returns
    -------
    numpy.float64
        SMAPE between 0 and 200. A pair where both values are 0 contributes
        0 error instead of NaN.
    """
    y_test = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_test - y_pred)
    # Use magnitudes of both terms so negative actuals cannot shrink or
    # zero out the denominator; dividing by 200 folds in the "/2" and "*100".
    denominator = (np.abs(y_test) + np.abs(y_pred)) / 200
    # Where the denominator is 0 both values are 0, i.e. a perfect prediction:
    # leave the pre-filled 0 there rather than producing NaN.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio)

# Score the merged frame as a whole, then separately for each forecast month.
month_of_row = mbd_to_compare['first_day_of_month']

smape_overall = smape(
    mbd_to_compare['microbusiness_density'],
    mbd_to_compare['MBD'],
)

first_month_to_compare = mbd_to_compare.loc[month_of_row == '2022-11-01']
smape_first = smape(
    first_month_to_compare['microbusiness_density'],
    first_month_to_compare['MBD'],
)

second_month_to_compare = mbd_to_compare.loc[month_of_row == '2022-12-01']
smape_second = smape(
    second_month_to_compare['microbusiness_density'],
    second_month_to_compare['MBD'],
)

# Report the three scores in the same order: overall, first month, second month.
for report_template, score in (
    ('Overall Model Test SMAPE: {}', smape_overall),
    ('First Month Model Test SMAPE: {}', smape_first),
    ('Second Month Model Test SMAPE: {}', smape_second),
):
    print(report_template.format(score))
