In [1]:
#required packages
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from pymc3 import Model
import pymc3 as pm
import theano.tensor as tt
import datetime
from IPython.display import display, Markdown

In [2]:
# Read the cleaned Denver property listings into a DataFrame.
df = pd.read_csv('../data/denver_prop_listing_clean.csv')

# Parse both date columns into pandas datetimes so they can be compared
# against datetime windows later on.
for date_column in ("list_date", "sale_date"):
    df[date_column] = pd.to_datetime(df[date_column])

In [3]:
# Give every distinct property type an integer code, numbered in the order
# the types are returned by `unique()`.
property_types = df["property_type"].unique()
mapping = dict(zip(property_types, range(len(property_types))))
mapping

{'SINGLE': 0,
 'CONDO': 1,
 'OTHER': 2,
 'MULTI_FAMILY': 3,
 'TOWNHOUSE': 4,
 'LAND': 5}

In [4]:
# Encode each listing's property type with its integer code from `mapping`.
property_type_codes = df['property_type'].map(mapping)
df['property_type_code'] = property_type_codes

In [5]:
# Preview the first rows, including the newly added `property_type_code` column.
df.head()

Unnamed: 0,property_id,zipcode,fips,latitude,longitude,sqft,property_type,has_central_air,has_jacuzzi,has_pool,has_solar,has_garage,list_date,sale_date,sale_price,bedrooms,full_baths,ct_key,sale_price_per_sqft,property_type_code
0,138574001,80002,8059.0,39.7927,-105.13026,3047.0,SINGLE,True,False,True,False,True,2020-09-24,2020-11-02,630000.0,5.0,3.0,8059010000.0,206.760748,0
1,138573806,80002,8059.0,39.79424,-105.12171,4382.0,SINGLE,True,False,False,False,True,2020-09-24,2020-10-29,805000.0,4.0,2.0,8059010000.0,183.70607,0
2,138572945,80002,8059.0,39.79565,-105.11674,2654.0,SINGLE,False,False,False,False,True,2020-09-24,2020-10-19,665000.0,4.0,2.0,8059010000.0,250.565185,0
3,138568013,80002,8059.0,39.79655,-105.09923,2265.0,SINGLE,False,False,False,False,False,2020-09-24,2020-10-28,607000.0,4.0,2.0,8059010000.0,267.99117,0
4,138564524,80002,8059.0,39.79399,-105.06972,1431.0,SINGLE,False,False,False,False,False,2020-09-24,2020-10-16,425000.0,4.0,1.0,8059010000.0,296.995108,0


In [6]:
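# Listing counts by status (presumably value counts of a `current_status` column in the raw listings data — TODO confirm source):
# Sale           331537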
# Withdrawn       48375
# Active          33218
# Expired         22163
# Pending         15960
# Other             711
# Foreclosure       316
# For Rent          123
# Pending,           95
# Cancelled          86
# LOT                70
# Coming Soon         6

The current approach trains the model on a particular discretized timeframe (say a quarter / 3 months). One potential exploration is to train the hierarchical models on separate discretized quarters and generate a time series of how the sub-market classifications change across the quarters. However, one concern is that when a house is sold before a particular quarter, the model might wrongly assume that its corresponding sub-market has low supply.

In [7]:
def gen_y(t_disc, data, t0=None):
    '''
    Build binary "listed" / "sold" indicators for one discretized time window.

    The window is the half-open interval [t0, t0 + t_disc).

    t_disc: datetime.timedelta(days = XX), length of the window
    data: DataFrame with datetime columns 'list_date' and 'sale_date'
    t0: datetime.datetime(YYYY,MM,DD), start of the window

    Returns an int8 array of shape (len(data), 2):
        column 0 -- 1 if the home was listed inside the window, or was listed
                    before t0 and sold on/after t0 (i.e. still on the market
                    when the window opened); 0 otherwise
        column 1 -- 1 if the home was sold inside the window; 0 otherwise

    Raises NotImplementedError if t0 is None: the "no explicit start" case has
    never been implemented, and previously the function silently returned None.
    '''
    if t0 is None:
        # TODO: decide how the window start should be derived when the caller
        # does not supply one. Fail loudly until then instead of returning None.
        raise NotImplementedError("gen_y requires an explicit t0; t0=None is not implemented yet")

    t1 = t0 + t_disc  # exclusive end of the window
    list_date = data['list_date']
    sale_date = data['sale_date']

    listed_in_window = (list_date >= t0) & (list_date < t1)
    # Listed earlier but not yet sold when the window opens.
    carried_over = (list_date < t0) & (sale_date >= t0)
    sold_in_window = (sale_date >= t0) & (sale_date < t1)

    listed = np.array(listed_in_window | carried_over, dtype=np.int8)
    sale = np.array(sold_in_window, dtype=np.int8)
    return np.vstack((listed, sale)).T


In [8]:
# Indicators for a 90-day window starting on 2019-04-01 (2019 Q2).
q2_start = datetime.datetime(2019, 4, 1)
q2_length = datetime.timedelta(days=90)
y_2019Q2 = gen_y(q2_length, df, q2_start)

In [9]:
# Number of homes whose "listed" indicator (column 0) exceeds their "sold"
# indicator (column 1), i.e. on the market during the window but not sold in it.
# Vectorized comparison instead of a Python-level loop over every row.
int((y_2019Q2[:, 0] > y_2019Q2[:, 1]).sum())

5373

In [10]:
# Columns removed before modelling: identifiers, dates, the raw property-type
# string (its integer code `property_type_code` is kept), two amenity flags,
# and the price-related columns.
non_feature_columns = [
    'property_id', 'zipcode', 'list_date', 'sale_date', 'property_type',
    'has_jacuzzi', 'has_garage', 'sale_price', 'ct_key', 'sale_price_per_sqft',
]
X = df.drop(columns=non_feature_columns)

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86787 entries, 0 to 86786
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fips                86787 non-null  float64
 1   latitude            86787 non-null  float64
 2   longitude           86787 non-null  float64
 3   sqft                86787 non-null  float64
 4   has_central_air     86787 non-null  bool   
 5   has_pool            86787 non-null  bool   
 6   has_solar           86787 non-null  bool   
 7   bedrooms            86787 non-null  float64
 8   full_baths          86787 non-null  float64
 9   property_type_code  86787 non-null  int64  
dtypes: bool(3), float64(6), int64(1)
memory usage: 4.9 MB


In [11]:
# Dense float64 feature matrix (boolean and integer columns are cast to float).
X_array = X.astype(np.float64).to_numpy()

In [12]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with NumPy."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

In [13]:
# (number of homes, number of features) of the design matrix.
X_array.shape

(86787, 10)

In [14]:
# First attempt at the baseline submarket model: K latent submarkets with
# fixed (constant) prior means and covariances.
# NOTE(review): this cell does not run. Its recorded output is
#   ValueError: cannot reshape array of size 8678700 into shape (10,10)
# 8678700 = 86787 * 10 * 10 (n_homes * n_features**2), which matches indexing
# `sigma_k` with the length-n_homes `submarket` vector and then reshaping to
# (n_features, n_features) in the `homes` line — presumably the failing statement.
# NOTE(review): `baseline_mod` is bound again by later cells in this notebook.
K = 5 # Number of submarkets
n_homes, n_features = X_array.shape
h_prior_var = 10 # Prior variance of home features
b_prior_var = 1  # Prior variance of "hedonic" supply / demand regression parameters
y_prior_var = 1  # Prior variance of y

with Model() as baseline_mod:

    # One categorical submarket label per home, with equal weights on the K submarkets.
    submarket = pm.Categorical('submarket', p=np.ones(K), shape=n_homes)

    # Constant zero mean vector per submarket, shape (K, n_features).
    mu_k = pm.Deterministic('mu_k', tt.as_tensor_variable(np.array([np.zeros(n_features) for _ in range(K)])))

    # Constant covariance h_prior_var * I per submarket, shape (K, n_features, n_features).
    sigma_k = pm.Deterministic('sigma_k', tt.as_tensor_variable(np.array([h_prior_var * np.eye(n_features) for _ in range(K)])))

    # Observed home features; mean and covariance are looked up by each home's submarket label.
    homes = pm.MvNormal('homes', mu_k[submarket], sigma_k[submarket].reshape((n_features,n_features)), observed=X_array)

    # Generate betas for "hedonic" regression
    # Constant zero mean of length 2 * n_features per submarket.
    mu_b = pm.Deterministic('mu_b', tt.as_tensor_variable(np.array([np.zeros(n_features * 2) for _ in range(K)])))

    # Constant covariance b_prior_var * I of size (2 * n_features) x (2 * n_features) per submarket.
    sigma_b = pm.Deterministic('sigma_b', tt.as_tensor_variable(np.array([b_prior_var * np.eye(n_features * 2) for _ in range(K)])))

    # NOTE(review): mu_b / sigma_b are built with 2 * n_features dimensions, but the
    # reshape and `shape` below use n_features — the sizes look inconsistent; confirm intent.
    beta = pm.MvNormal('betas', mu_b[submarket], sigma_b[submarket].reshape((n_features,n_features)), shape=n_features)
    
    # Constant 2 x 2 covariance y_prior_var * I for the two outputs.
    sigma_y = pm.Deterministic('sigma_y', tt.as_tensor_variable(y_prior_var * np.eye(2)))

    # Final layer
    # NOTE(review): np.dot is applied to model variables here; a Theano op is probably required — confirm.
    y = pm.MvNormal('y', mu = np.dot(homes,beta.reshape((n_features,2))), sigma=sigma_y.reshape((2,2)))

    # Logistic transform of y, tied to the listed/sold indicators.
    # NOTE(review): pm.Deterministic is given an `observed` argument — confirm that this is supported.
    pred = pm.Deterministic('pred', 1/(1+np.exp(-y)), observed = y_2019Q2)

ValueError: cannot reshape array of size 8678700 into shape (10,10)

In [None]:
# Second attempt at the baseline submarket model: adds a Dirichlet prior on the
# submarket weights and Normal priors on the per-submarket means / covariance entries.
# NOTE(review): this cell has no recorded execution, and `baseline_mod` is bound
# again by a later cell — it looks like a superseded draft.
K = 5 # Number of submarkets
n_homes, n_features = X_array.shape
h_prior_var = 1 # Prior variance of home features
b_prior_var = 1  # Prior variance of "hedonic" supply / demand regression parameters
y_prior_var = 1  # Prior variance of y
mu_k_prior_var = 10  # not referenced again in this cell (the priors below use the literal sigma=10)

with Model() as baseline_mod:

    # Submarket weights and one categorical submarket label per home.
    p = pm.Dirichlet('p', a=np.ones(K), shape=K)
    submarket = pm.Categorical('submarket', p=p, shape=n_homes)

    # Python lists holding one Normal variable per submarket: a mean vector of length
    # n_features and a flattened n_features**2 vector intended as the covariance.
    mu_k = [pm.Normal('mu_k_%d' % i, mu=np.zeros(n_features), sigma=10, shape=n_features) for i in range(K)]
    sigma_k = [pm.Normal('sigma_%d' % i, mu=h_prior_var*np.ones(n_features**2), sigma=10, shape=n_features**2) for i in range(K)]

    # NOTE(review): mu_k and sigma_k are plain Python lists at this point, so indexing
    # them with the `submarket` variable presumably fails — verify before relying on this cell.
    homes = pm.MvNormal('homes', mu_k[submarket], sigma_k[submarket].reshape((n_features,n_features)), observed=X_array)

    # Generate betas for "hedonic" regression
    # mu_k / sigma_k are rebound here to flat Normal vectors; neither name is used again below.
    mu_k = pm.Normal('mu_k', mu=np.zeros(K*n_features), sigma=10, shape=K*n_features)
    sigma_k = pm.Normal('sigma_k', mu=np.zeros(K*n_features**2), sigma=10, shape=K*n_features**2)
    # Constant zero mean of length 2 * n_features per submarket.
    mu_b = pm.Deterministic('mu_b', tt.as_tensor_variable(np.array([np.zeros(n_features * 2) for _ in range(K)])))

    # Constant covariance b_prior_var * I of size (2 * n_features) x (2 * n_features) per submarket.
    sigma_b = pm.Deterministic('sigma_b', tt.as_tensor_variable(np.array([b_prior_var * np.eye(n_features * 2) for _ in range(K)])))

    # NOTE(review): mu_b / sigma_b use 2 * n_features dimensions, but the reshape and
    # `shape` below use n_features — the sizes look inconsistent; confirm intent.
    beta = pm.MvNormal('betas', mu_b[submarket], sigma_b[submarket].reshape((n_features,n_features)), shape=n_features)
    
    # Constant 2 x 2 covariance y_prior_var * I for the two outputs.
    sigma_y = pm.Deterministic('sigma_y', tt.as_tensor_variable(y_prior_var * np.eye(2)))

    # Final layer
    # NOTE(review): np.dot is applied to model variables here; a Theano op is probably required — confirm.
    y = pm.MvNormal('y', mu = np.dot(homes,beta.reshape((n_features,2))), sigma=sigma_y.reshape((2,2)))

    # Logistic transform of y, tied to the listed/sold indicators.
    # NOTE(review): pm.Deterministic is given an `observed` argument — confirm that this is supported.
    pred = pm.Deterministic('pred', 1/(1+np.exp(-y)), observed = y_2019Q2)

In [None]:
# Baseline hierarchical submarket model.
#
# Each home gets a latent submarket label. Home features are multivariate normal
# around a submarket-specific mean, and two sets of per-home "hedonic"
# coefficients (beta1 for the listed indicator, beta2 for the sold indicator)
# are drawn around submarket-specific means. The two indicators are then modelled
# around the logistic transform of the per-home linear predictor.
#
# Fixes relative to the previous version of this cell:
#   * Covariances were matrices of independent Normal draws, which are not
#     symmetric positive-definite and so are not valid MvNormal covariances.
#     They are now built from LKJ Cholesky factors.
#   * The linear predictor used tt.tensordot with its default axes=2 on a
#     (n_homes, n_features) and a (n_features, n_homes) operand, whose shapes do
#     not line up. It is now the row-wise dot product of features and coefficients.
#   * The link was 1 / (1 + exp(z)), the opposite sign convention to the
#     notebook's `sigmoid` helper. It is now the standard logistic sigmoid.
K = 5  # Number of submarkets
n_homes, n_features = X_array.shape

# Prior scales. NOTE: despite the `_var` suffix these are used as standard
# deviations (`sigma=`), not variances; the names are kept for compatibility.
mu_k_prior_var = 10
sigma_k_prior_var = 10
mu_b1_prior_var = 10
sigma_b1_prior_var = 10
mu_b2_prior_var = 10
sigma_b2_prior_var = 10
sigma_y_prior_var = 10  # currently unused: sigma_y1 / sigma_y2 use Uniform(0, 20) priors


def _lkj_cholesky(name, n, prior_scale, eta=2.0):
    """Return the lower-triangular Cholesky factor of an LKJ-distributed covariance.

    Must be called inside a model context. `prior_scale` is the HalfNormal
    scale of the per-dimension standard deviations.
    """
    packed = pm.LKJCholeskyCov(
        name + '_chol_packed',
        n=n,
        eta=eta,
        sd_dist=pm.HalfNormal.dist(sigma=prior_scale),
    )
    return pm.expand_packed_triangular(n, packed, lower=True)


with Model() as baseline_mod:

    # One submarket label per home, equal weight on each of the K submarkets.
    submarket = pm.Categorical('submarket', p=np.ones(K) / K, shape=n_homes)

    # Home features: submarket-specific mean, shared covariance.
    mu_k = pm.Normal('mu_k', mu=0, sigma=mu_k_prior_var, shape=(K, n_features))
    chol_k = _lkj_cholesky('sigma_k', n_features, sigma_k_prior_var)
    sigma_k = pm.Deterministic('sigma_k', tt.dot(chol_k, chol_k.T))
    homes = pm.MvNormal('homes', mu=mu_k[submarket], chol=chol_k, observed=X_array)

    # Generate betas for "hedonic" regression (listed indicator)
    mu_b1 = pm.Normal('mu_b1', mu=0, sigma=mu_b1_prior_var, shape=(K, n_features))
    chol_b1 = _lkj_cholesky('sigma_b1', n_features, sigma_b1_prior_var)
    sigma_b1 = pm.Deterministic('sigma_b1', tt.dot(chol_b1, chol_b1.T))
    beta1 = pm.MvNormal('beta1', mu=mu_b1[submarket], chol=chol_b1, shape=(n_homes, n_features))

    # Generate betas for "hedonic" regression (sold indicator)
    mu_b2 = pm.Normal('mu_b2', mu=0, sigma=mu_b2_prior_var, shape=(K, n_features))
    chol_b2 = _lkj_cholesky('sigma_b2', n_features, sigma_b2_prior_var)
    sigma_b2 = pm.Deterministic('sigma_b2', tt.dot(chol_b2, chol_b2.T))
    beta2 = pm.MvNormal('beta2', mu=mu_b2[submarket], chol=chol_b2, shape=(n_homes, n_features))

    sigma_y1 = pm.Uniform("sigma_y1", lower=0, upper=20)
    sigma_y2 = pm.Uniform("sigma_y2", lower=0, upper=20)

    # Final layer
    # `homes` is fully observed, so the regression uses the data matrix directly.
    features = tt.as_tensor_variable(X_array)
    # Row-wise dot product: one linear predictor per home, shape (n_homes,).
    linear1 = tt.sum(features * beta1, axis=1)
    linear2 = tt.sum(features * beta2, axis=1)

    # NOTE(review): the targets are 0/1 indicators; a Bernoulli likelihood on the
    # sigmoid would be the conventional choice. The Normal likelihood is kept
    # because sigma_y1 / sigma_y2 are part of the current model design.
    # (A pm.Deterministic cannot take `observed`, hence the likelihood nodes.)
    y1 = pm.Normal('y1', mu=pm.math.sigmoid(linear1), sigma=sigma_y1, observed=y_2019Q2[:, 0])
    y2 = pm.Normal('y2', mu=pm.math.sigmoid(linear2), sigma=sigma_y2, observed=y_2019Q2[:, 1])

In [None]:
with baseline_mod:

    # `y1` is declared with `observed=`, so it is data and not something a step
    # method can update. Apply Metropolis to the model's continuous free
    # variables instead; pm.sample assigns a step method automatically to any
    # variable not covered here (the discrete `submarket` labels).
    step = pm.Metropolis(vars=baseline_mod.cont_vars)
    tr = pm.sample(1000, step=step)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1c850c61-d934-4c85-b16d-3cb283df0c84' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>