## Model Iterations 

### Goal
- To automate model iterations for shortlisted zipcodes
- Build models
- use models to predict on test data
- Filter out ill performing models
- Retrain models on the entire time series?
- Make future Forecasts
- Caluculate ROI for one year and select top 5


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from src import cleaning_functions as cfs
from matplotlib.pylab import rcParams
plt.style.use('fivethirtyeight')
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
import itertools
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.easter import easter
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error

Firstly let's make a function that subsets the dataframe as per Chicago Metro Area, calculates 5year ROI, 2 year ROI and creates a list of top regions, and converts the wide format to long and returns a time series dataframe with zipcodes as columns 

In [3]:
df = cfs.load_df()
melted_df = cfs.melt_data(df)
chicago_df = df[(df['Metro'] == 'Chicago') & (df['State'] == 'IL')]
chicago_df['ROI_5yrs'] = ((chicago_df['2018-04'] - chicago_df['2013-04'])/chicago_df['2013-04'])*100
chicago_df['ROI_2yrs'] = ((chicago_df['2018-04'] - chicago_df['2016-04'])/chicago_df['2016-04'])*100
top_15_ROI5 = chicago_df.sort_values('ROI_5yrs', ascending=False)[:15].RegionName
top_15_ROI2 = chicago_df.sort_values('ROI_2yrs', ascending=False)[:15].RegionName

In [4]:
top_15_ROI5.values

array([60804, 60085, 60153, 60110, 60104, 60505, 60651, 60073, 60436,
       60639, 60120, 60165, 60160, 60641, 60042])

In [5]:
top_15_ROI2.values

array([60085, 60432, 60436, 60163, 60456, 60034, 60180, 60633, 60099,
       60505, 60162, 60165, 60408, 60639, 60160])

These are teh repeats in our top 15 zipcodes based on calculated ROIs
[60085, 60505, 60436]

In [20]:
regions = top_15_ROI5.values + top_15_ROI2.values

array([120889, 120517, 120589, 120273, 120560, 120539, 120831, 120706,
       120535, 121144, 120282, 120330, 120568, 121280, 120202])

In [None]:
chicago_top_27 = pd.DataFrame()
chicago_top_30['time'] = pd.date_range(start='1996-04-01', end='2018-04-01', freq='MS')
chicago_top_30.set_index('time', inplace=True)
for region in top_30.values:
    chicago_top_30[region] = melted_df[melted_df['RegionName'] == region].value.values

In [None]:
def chicago_df():
    """This function returns the dataframe with
    chicago metropolitan zipcodes and the median 
    home saleprices in the long format
    """
    #helper funstion to load csv as df
    #path = '../../data/zillow_data.csv'
    df = load_df()
    
    #shortlisting chicago metro area
    chicago_df = df[(df['Metro'] == 'Chicago') & (df['State'] == 'IL')]
    
    #caluculating ROI 5yrs & 2yrs
    chicago_df['ROI_5yrs'] = ((chicago_df['2018-04'] -chicago_df['2013-04'])/
                              chicago_df['2013-04'])*100
    chicago_df['ROI_2yrs'] = ((chicago_df['2018-04'] - chicago_df['2016-04'])/
                              chicago_df['2016-04'])*100
    
    #sorting dataframe based on calculated ROIs
    top_30 = chicago_df.sort_values(
        'ROI_5yrs', ascending=False)[:100].sort_values(
        'ROI_2yrs',ascending=False)[:30]
    
    #wide to long format
    melted = pd.melt(top_30, id_vars=['RegionName', 'City','State',
                                      'Metro', 'CountyName','RegionID',
                                      'SizeRank', 'ROI_5yrs', 'ROI_2yrs' ],
                     var_name='time')
    melted['time'] = pd.to_datetime(melted['time'], infer_datetime_format=True)
    melted = melted.dropna(subset=['value'])
    
    #top 30 zipcodes
    regions = melted.sort_values('ROI_2yrs', ascending=False).RegionName.unique()
    
    #setting up timeseries dataframe
    new_df = pd.DataFrame()
    new_df['time'] = pd.date_range(start='1996-04-01', end='2018-04-01', freq='MS')
    new_df.set_index('time', inplace=True)
    for region in regions:
        new_df[region] = melted[melted['RegionName'] == region].value.values
    
    return new_df, regions


In [None]:
sorted_list = sorted(dict_name, key= lambda x: x[1], reverse=True)