# Predict Housing Prices
- pass your own list of zip codes to predict future housing prices
- data from: https://www.zillow.com/research/data/ (download new data for more recent results)

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm as log_progress
from ipywidgets import widgets
from IPython.display import display, clear_output
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scalecast import GridGenerator
from scalecast.Forecaster import Forecaster
from scalecast.Forecaster import _determine_best_by_

In [2]:
# Estimators run for every series; add, remove or swap entries in this tuple.
models = (
    'knn',
    'svr',
    'elasticnet',
    'mlp',
    'gbt',
    'prophet',
)
# default figure size for the plots drawn in this notebook
sns.set(rc={'figure.figsize': (12, 8)})

In [3]:
# Generate scalecast's example grids (a Grids file, per the note on the call below).
# NOTE(review): presumably these are the hyperparameter grids that f.tune() searches later — confirm.
GridGenerator.get_example_grids() # use overwrite=True to overwrite an existing Grids file

In [4]:
def results_vis(f_dict,plot_type='forecast'):
    """Show an interactive Jupyter widget panel for browsing forecast results.

    Displays four dropdowns (time series, number of top models, the ``level``
    flag, and the metric used to order models) plus a button. Every button
    click clears the output area and redraws the plot for the current
    selections.

    Relies on the module-level ``models`` sequence to size the
    "No. Models" dropdown (two extra slots cover the 'weighted' and 'avg'
    combination models).

    Args:
        f_dict: dict mapping a series name to its Forecaster object; the keys
            populate the "Time Series" dropdown.
        plot_type: 'forecast' to call the selected object's ``plot`` method,
            'test' to call its ``plot_test_set`` method.

    Raises:
        ValueError: if ``plot_type`` is neither 'forecast' nor 'test'.
    """
    # IPython is already a dependency of this notebook (IPython.display)
    from IPython import get_ipython

    valid_plot_types = ('forecast','test')
    # fail at call time rather than silently drawing nothing on every click
    if plot_type not in valid_plot_types:
        raise ValueError(f'plot_type must be one of {valid_plot_types}, got {plot_type!r}')

    def display_user_selections(ts_selection,mo_selection,lv_selection,me_selection):
        """Draw the requested plot for one combination of dropdown values."""
        matplotlib.use('nbAgg')
        # explicit form of the `%matplotlib inline` line magic, which is only
        # valid syntax inside an IPython cell
        shell = get_ipython()
        if shell is not None:
            shell.run_line_magic('matplotlib','inline')
        sns.set(rc={'figure.figsize':(16,8)})
        selected_data = f_dict[ts_selection]
        print(ts_selection)
        if plot_type == 'forecast':
            selected_data.plot(models=f'top_{mo_selection}',order_by=me_selection,level=lv_selection,
                               print_attr=['TestSetRMSE','TestSetR2','LevelTestSetRMSE','TestSetMAPE','LevelTestSetR2',
                                           'LevelTestSetMAPE','Scaler','HyperParams','Xvars','models','Integration'])
        else:  # 'test' -- validated above
            selected_data.plot_test_set(models=f'top_{mo_selection}',order_by=me_selection,include_train=52,level=lv_selection)

    def on_button_clicked(b):
        """Button callback: redraw inside the output widget with the current selections."""
        with output:
            clear_output()
            display_user_selections(ts_dd.value,mo_dd.value,lv_dd.value,me_dd.value)

    # tuple() keeps this working if `models` is redefined as a list
    all_models = tuple(models) + ('weighted','avg')
    ts_dd = widgets.Dropdown(options=f_dict.keys(), description = 'Time Series:')
    mo_dd = widgets.Dropdown(options=range(1,len(all_models)+1), description = 'No. Models')
    lv_dd = widgets.Dropdown(options=[True,False],description='View Level')
    me_dd = widgets.Dropdown(options=sorted(e for e in _determine_best_by_ if e is not None),description='Order By')

    # never changes
    button = widgets.Button(description="Select Time Series")
    output = widgets.Output()

    display(ts_dd,mo_dd,lv_dd,me_dd)
    display(button, output)

    button.on_click(on_button_clicked)

In [None]:
# Load the Zillow CSV; RegionName is read as text and becomes the row index,
# so rows can be looked up by zip-code string.
df = (
    pd.read_csv(
        'Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv',
        dtype={'RegionName': str},
    )
    .set_index('RegionName')
)

In [None]:
# preview the first rows of the loaded frame (rendered as the cell's output)
df.head()

## Select Zip Codes

In [None]:
# Zip codes to forecast -- replace with your own list.
# Entries must be strings, matching the string-typed RegionName index.
zips = [
    '84009',
    '84095',
    '34746',
    '32303',
    '30328',
]
zips

In [None]:
# Build one Forecaster per zip code from that zip's row of the Zillow frame
# and collect them in a dict keyed by zip.
preds = {}
for z in zips:
    zip_row = df.loc[z].transpose()
    # the first 8 entries of the row are skipped
    # NOTE(review): presumably non-price metadata columns -- verify against the CSV
    observed, dates = zip_row.values[8:], zip_row.index[8:]
    f = Forecaster(y=observed,current_dates=dates,name=z)
    f.set_test_length(6)
    preds[str(z)] = f

## EDA
- run on the last series only (the final zip code loaded above)
- uses only the training set to avoid leakage from the test set

In [None]:
# ACF plot for `f`, which at this point is the last Forecaster created in the loading loop.
# diffy=2 / train_only=True follow the section notes: second differencing, training data only.
f.plot_acf(diffy=2,train_only=True)
plt.show()

In [None]:
# PACF plot for the same series, with the same differencing and training-only settings as the ACF plot
f.plot_pacf(diffy=2,train_only=True)
plt.show()

In [None]:
# Seasonal decomposition of the same series (same diffy/train_only settings);
# the object returned by seasonal_decompose is plotted directly.
f.seasonal_decompose(diffy=2,train_only=True).plot()
plt.show()

## Predict future values
- uses second-differenced data for stationarity

In [None]:
# full forecast process
# For every Forecaster in `preds`: set up the horizon and regressors, tune and
# forecast each estimator in `models`, then add two combination forecasts.
# The calls mutate each Forecaster in place, so their order matters.
for k, f in log_progress(preds.items()):
    print(k)
    # 12 future dates to forecast (NOTE(review): presumably months, given the monthly Zillow file)
    f.generate_future_dates(12)
    # hold out 6 observations for validation
    f.set_validation_length(6)
    # lag regressors: 3 plain AR terms, plus two tuple-specified sets
    # NOTE(review): scalecast documents the tuple as (number of terms, spacing in periods) -- confirm
    f.add_ar_terms(3)
    f.add_AR_terms((4,3))
    f.add_AR_terms((2,12))
    # NOTE(review): presumably differences the series for stationarity (see the section note) -- confirm
    f.integrate()
    # calendar regressors: month and quarter as dummies (raw=False), year with default settings
    f.add_seasonal_regressors('month','quarter',raw=False,dummy=True)
    f.add_seasonal_regressors('year')
    # regressor named '2008_recession' spanning 2007-10-01 through 2009-06-30
    f.add_other_regressor(called='2008_recession',start='2007-10-01',end='2009-06-30')
    # time trend 't' and a polynomial term of it with pwr=3
    f.add_time_trend(called='t')
    f.add_poly_term('t',pwr=3)
    
    # tune, then forecast, each estimator listed in `models`
    for m in log_progress(models):
        f.set_estimator(m)
        f.tune()
        f.auto_forecast()
        
    # combination forecasts; the names 'avg' and 'weighted' are the ones results_vis counts as extra models
    f.set_estimator('combo')
    # top based on performance of each model in tuning process
    f.manual_forecast(how='simple',models='top_3',call_me='avg')
    f.manual_forecast(how='weighted',models='top_5',call_me='weighted')

## Write out model stats

In [None]:
# Writes model summaries to a csv file.
# Each Forecaster's summary table (best model determined by LevelTestSetMAPE)
# is tagged with its series name in a 'Name' column; all tables are then
# stacked into one frame and saved as model_summaries.csv.
#
# The per-series tables are deliberately NOT bound to the name `df`: that name
# holds the Zillow frame, and overwriting it would break re-running the cell
# that builds `preds`.
summary_frames = [
    f.export(dfs='model_summaries',determine_best_by='LevelTestSetMAPE').assign(Name=k)
    for k, f in preds.items()
]

# concatenate once instead of growing a frame inside the loop;
# pd.concat raises on an empty list, so fall back to an empty frame
if summary_frames:
    model_summaries = pd.concat(summary_frames,ignore_index=True)
else:
    model_summaries = pd.DataFrame()

model_summaries.to_csv('model_summaries.csv',index=False)

## Visualize results

In [None]:
# interactive viewer for the forecasts of every series in `preds`
results_vis(preds, plot_type='forecast')

In [None]:
# interactive viewer for test-set performance of every series in `preds`
results_vis(preds, plot_type='test')