# Submarket baseline

Steps include:

1. kmeans to find submarkets
2. Fit logistic regressions on the submarkets
3. Compare accuracy scores against other submarket & non-submarket approaches

In [34]:
#required packages
import sys
sys.path.insert(0, '../scripts/')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import chardet
import datetime

import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

from models import AE, BaselineNet

In [4]:
# Load dataset
df = pd.read_csv('../data/denver_prop_with_ct_clean_small.csv')

#convert to datetime format
df["list_date"] = pd.to_datetime(df["list_date"])
df["sale_date"] = pd.to_datetime(df["sale_date"])



In [5]:
#function to define the y variables
def gen_y(t_disc, data, t0=None):
    ''' 
    t_disc: datetime.timedelta(days = XX)
    t0: datetime.datetime(YYYY,MM,DD)
    '''

    if t0 is not None:
        listed = np.array(((data['list_date'] >= t0) & (data['list_date'] < t0 + t_disc)) | ((data['list_date'] < t0) & (data['sale_date'] >= t0)), dtype=np.int8)
        sale = np.array((data['sale_date'] >= t0) & (data['sale_date'] < t0 + t_disc), dtype = np.int8)
        return np.vstack((listed, sale)).T

In [6]:
y_2019Q2 = gen_y(datetime.timedelta(days = 90), df, datetime.datetime(2019,4,1))

# Remove all rows that aren't listed in that period
listed_index = np.where(y_2019Q2[:,0] == 1)
df_2019Q2 = df.iloc[listed_index]


In [8]:
# Build X matrix and Y target vector

X = df_2019Q2.drop(columns=['list_date','sale_date','rex_property_id'])
X = X.reset_index(drop=True)

Y = y_2019Q2[np.where(y_2019Q2[:,0]==1)]

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42336 entries, 0 to 42335
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   18-59                               42336 non-null  float64
 1   mean_household_income               42336 non-null  int64  
 2   built 1995 or later                 42336 non-null  float64
 3   OTHER                               42336 non-null  bool   
 4   mobile_home_pct                     42336 non-null  float64
 5   annual_births_per_resident          42336 non-null  float64
 6   farm_score                          42336 non-null  int64  
 7   luxury_communities_score            42336 non-null  float64
 8   CONDO                               42336 non-null  bool   
 9   property_crime_rate                 42336 non-null  float64
 10  bathfull                            42336 non-null  int64  
 11  small_apt_buildings_pct             42336

In [9]:
#attempt kmeans clustering
#unsupervised approach
K = 5
init_clustering = KMeans(n_clusters=K, random_state=0).fit(X)

In [10]:
cluster_labels = init_clustering.labels_

In [11]:
for k in range(K):
    print('Submarket {}: {} units'.format(k, sum(cluster_labels == k)))

Submarket 0: 13563 units
Submarket 1: 11009 units
Submarket 2: 488 units
Submarket 3: 15300 units
Submarket 4: 1976 units


In [12]:
#create a new df with has the clustering labels on it
X_withlabels = X.copy()
X_withlabels['labels'] = cluster_labels 

## Fitting logistic regression to identified clusters from Kmeans

In [60]:
def print_results(model,thresh=0.5):
  print('Train set accuracy: {}'.format(accuracy_score(y_train[:,1], np.where(pred_k_train[:,1]>thresh,1,0))))
  print('Test set accuracy: {}'.format(accuracy_score(y_test[:,1], np.where(pred_k[:,1]>thresh,1,0))))

  return accuracy_score(y_train[:,1], np.where(pred_k_train[:,1]>thresh,1,0)), accuracy_score(y_test[:,1], np.where(pred_k[:,1]>thresh,1,0))

In [14]:
train_test_dict = {}

#for loop which extracts the X & y df for each cluster
#these will then be used to run the logistic regression 
for i in range(5):
    X2 = X_withlabels.loc[X_withlabels['labels'] == i]
    X2 = X2.drop(columns = 'labels')
    y2 = Y[X_withlabels['labels'] == i]
    X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33, random_state =297)

    train_test_dict[f'cluster {i}'] = (X_train, X_test, y_train, y_test)

In [61]:
#now to fit for each cluster and get final results

results_dict = {}

train_acc_agg = 0
train_size = 0
test_acc_agg = 0
test_expsales_agg = 0
test_sales_agg = 0
test_size = 0

test = np.array([])
pred = np.array([])

for i in range(5):
    logit_reg = LogisticRegression()
    X_train = train_test_dict[f'cluster {i}'][0]
    X_test = train_test_dict[f'cluster {i}'][1]
    y_train = train_test_dict[f'cluster {i}'][2]
    y_test = train_test_dict[f'cluster {i}'][3]

    #fit the logistic regression to each cluster
    logit_reg.fit(X_train, y_train[:,1])
    
    #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
    print(f'For cluster {i}')
    pred_k_train = logit_reg.predict_proba(X_train)
    pred_k = logit_reg.predict_proba(X_test)


    train_accuracy, test_accuracy = print_results(logit_reg)
    exp_sales = sum(logit_reg.predict_proba(X_test)[:,1])
    actual_sales = sum(y_test[:,1])
    
    print("Number of Homes: {}".format(len(y_test)))
    print("Expected #Sales: {}".format(round(exp_sales)))
    print("Actual #Sales: {}\n".format(actual_sales))
    print('=======')

 
    AUC_score = roc_auc_score(y_test[:,1], pred_k[:,1])

    results_dict[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score, round(exp_sales),actual_sales]
    
    #the below code is used to find the weighted average
    test = np.append(test,y_test[:,1])
    pred = np.append(pred,pred_k[:,1])
    train_size += len(y_train)
    train_acc_agg += train_accuracy * len(y_train)
    test_size += len(y_test)
    test_acc_agg += test_accuracy * len(y_test)
    test_expsales_agg += round(exp_sales)
    test_sales_agg += actual_sales

results_df = pd.DataFrame.from_dict(results_dict, orient='index')

For cluster 0
Train set accuracy: 0.60195884230219
Test set accuracy: 0.6016532618409294
Number of Homes: 4476
Expected #Sales: 1778
Actual #Sales: 1783

For cluster 1
Train set accuracy: 0.6591648590021691
Test set accuracy: 0.6732727773190201
Number of Homes: 3633
Expected #Sales: 1241
Actual #Sales: 1187

For cluster 2
Train set accuracy: 0.7055214723926381
Test set accuracy: 0.7962962962962963
Number of Homes: 162
Expected #Sales: 47
Actual #Sales: 33

For cluster 3
Train set accuracy: 0.6166227685103892
Test set accuracy: 0.6219053277876807
Number of Homes: 5049
Expected #Sales: 1932
Actual #Sales: 1911

For cluster 4
Train set accuracy: 0.7135298563869993
Test set accuracy: 0.7029096477794793
Number of Homes: 653
Expected #Sales: 187
Actual #Sales: 194



In [59]:
results_df = results_df.rename(columns = {0:'train_accuracy', 1:'test_accuracy', 2:'Test_AUC', 3:'expec sales', 4:'actual sales'})
results_df

Unnamed: 0,train_accuracy,test_accuracy,Test_AUC,expec sales,actual sales
cluster 0,0.544734,0.536193,0.545864,1778,1783
cluster 1,0.659165,0.673273,0.526305,1241,1187
cluster 2,0.705521,0.796296,0.620625,47,33
cluster 3,0.56414,0.560309,0.547272,1932,1911
cluster 4,0.71353,0.70291,0.528345,187,194


In [30]:
auc_agg = roc_auc_score(test, pred)

print('Marketwide Demand Prediction')
print("Number of Homes: {}".format(len(Y[:,1])))
print("Training Accuracy: {:.4f}%".format(train_acc_agg*100/train_size))
print("Testing Accuracy: {:.4f}%".format(test_acc_agg*100/test_size))
print("AUC: {:.6f}".format(auc_agg))
print(f'number of homes predicted sold {test_expsales_agg}')
print(f'number of homes actually sold {test_sales_agg}')

Marketwide Demand Prediction
Number of Homes: 42336
Training Accuracy: 62.8530%
Testing Accuracy: 63.4581%
AUC: 0.559084
number of homes predicted sold 5185
number of homes actually sold 5108


# Neural network on kmeans

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 


results_dict_nn = {}

train_acc_agg_nn = 0
train_size_nn = 0
test_acc_agg_nn = 0
test_size_nn = 0

test_nn = np.array([])
pred_nn = np.array([])

for i in range(5):
    print(f'For cluster {i}')
    X_train = train_test_dict[f'cluster {i}'][0]
    X_test = train_test_dict[f'cluster {i}'][1]
    y_train = train_test_dict[f'cluster {i}'][2]
    y_test = train_test_dict[f'cluster {i}'][3]
    
    X_train = torch.tensor(X_train.astype(float).values, dtype=torch.float64).to(device)
    y_train = torch.tensor(y_train[:,1], dtype=torch.int64).to(device)
    y_train = torch.nn.functional.one_hot(y_train)
    #fit the logistic regression to each cluster
    net = BaselineNet(X_train.shape[1], y_train.shape[1], epochs=5000, debug=False).to(device)
    net.feed(X_train, y_train)
    
    #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
    with torch.no_grad():
      net.eval()
      X_test = torch.tensor(X_test.astype(float).values, dtype=torch.float64).to(device)
      y_test = torch.tensor(y_test[:,1], dtype=torch.int64).to(device)
      y_test = torch.nn.functional.one_hot(y_test)

      y_pred = net.forward(X_test.float()) 
      labels = torch.max(y_test, 1)[1]
      correct = 0
      total = y_pred.shape[0]
      _, predicted = torch.max(y_pred.data, 1)
      correct += (predicted == labels).sum()
      accuracy = 100 * correct/total
      print('Accuracy: {}'.format(accuracy))
    print('=======')
    
    y_test = y_test.cpu()
    AUC_score = roc_auc_score(y_test.cpu(), predicted.cpu().reshape(-1,1))

    results_dict_nn[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score]
    
    #the below code is used to find the weighted average
    test_nn = np.append(test_nn,y_test[:,1])
    pred_nn = np.append(pred_nn, predicted.cpu())
    train_size_nn += len(y_train)
    train_acc_agg_nn += train_accuracy * len(y_train)
    test_size_nn += len(y_test)
    test_acc_agg_nn += test_accuracy * len(y_test)

results_df_nn = pd.DataFrame.from_dict(results_dict_nn, orient='index')

For cluster 0
Accuracy: 66.51531982421875
For cluster 1
Accuracy: 81.39534759521484
For cluster 2
Accuracy: 66.7741928100586
For cluster 3
Accuracy: 75.59632873535156
For cluster 4
Accuracy: 73.64865112304688


In [None]:
results_df_nn = results_df.rename(columns = {0:'train_accuracy', 1:'test_accuracy', 2:'Test_AUC'})
results_df_nn

Unnamed: 0,train_accuracy,test_accuracy,Test_AUC
cluster 0,0.665733,0.677639,0.523043
cluster 1,0.836207,0.831395,0.595089
cluster 2,0.726115,0.658065,0.504673
cluster 3,0.774457,0.776147,0.582498
cluster 4,0.734388,0.733108,0.57729


In [None]:
auc_agg_nn = roc_auc_score(test_nn, pred_nn)

print('Marketwide Demand Prediction')
print("Number of Homes: {}".format(len(Y[:,1])))
print("Training Accuracy: {:.4f}%".format(train_acc_agg_nn*100/train_size_nn))
print("Testing Accuracy: {:.4f}%".format(test_acc_agg_nn*100/test_size_nn))
print("AUC: {:.6f}".format(auc_agg_nn))

Marketwide Demand Prediction
Number of Homes: 7567
Training Accuracy: 73.4388%
Testing Accuracy: 73.3108%
AUC: 0.579349


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1c850c61-d934-4c85-b16d-3cb283df0c84' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>