# Submarket baseline

Steps include:

1. kmeans to find submarkets
2. Fit logistic regressions on the submarkets
3. Compare accuracy scores against other submarket & non-submarket approaches

In [1]:
#required packages
import sys
sys.path.insert(0, '../scripts/')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import chardet
import datetime

import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from models import AE, BaselineNet

from utils import *



In [2]:
# Load dataset
df = pd.read_csv('../data/denver_dataset_milestone3.csv', index_col=0)

#convert to datetime format
df["list_date"] = pd.to_datetime(df["list_date"])
df["sale_date"] = pd.to_datetime(df["sale_date"])

df = df.drop(columns=['rex_property_id'])
df = df.dropna()
res = gen_dataset(df, '2019-04-01', 90)

{'bool': ['OTHER', 'CONDO', 'MULTI_FAMILY'], 'int64': ['mean_household_income', 'farm_score', 'bathfull', 'bedrooms'], 'float64': ['18-59', 'built 1995 or later', 'mobile_home_pct', 'annual_births_per_resident', 'luxury_communities_score', 'property_crime_rate', 'small_apt_buildings_pct', 'standardized_test_score_percentile', 'latitude', 'longitude'], 'datetime64[ns]': ['list_date', 'sale_date']}


In [3]:
X = res['X']
Y = res['y']

X_lat = X.copy()
X = X.drop(columns=['latitude', 'longitude'])

In [4]:
def print_results(model,thresh=0.5):
  print('Train set accuracy: {}'.format(accuracy_score(y_train, np.where(pred_k_train[:,1]>thresh,1,0))))
  print('Test set accuracy: {}'.format(accuracy_score(y_test, np.where(pred_k[:,1]>thresh,1,0))))

  return accuracy_score(y_train, np.where(pred_k_train[:,1]>thresh,1,0)), accuracy_score(y_test, np.where(pred_k[:,1]>thresh,1,0))

In [5]:
#attempt kmeans clustering
#unsupervised approach
#finding the optimum value of k

mse_list = []

for clus_num in range(3,20,1):
    K = clus_num
    init_clustering = KMeans(n_clusters=K, random_state=0).fit(X_lat)

    cluster_labels = init_clustering.labels_

    for k in range(K):
        print('Submarket {}: {} units'.format(k, sum(cluster_labels == k)))

    #create a new df with has the clustering labels on it
    X_withlabels = X.copy()
    X_withlabels['labels'] = cluster_labels 

    # Fitting logistic regression to identified clusters from Kmeans
    train_test_dict = {}

    #for loop which extracts the X & y df for each cluster
    #these will then be used to run the logistic regression 
    for i in range(K):
        X2 = X_withlabels.loc[X_withlabels['labels'] == i]
        X2 = X2.drop(columns = 'labels')
        y2 = Y[X_withlabels['labels'] == i]
        X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33, random_state =297)

        train_test_dict[f'cluster {i}'] = (X_train, X_test, y_train, y_test)

    #now to fit for each cluster and get final results

    

    train_acc_agg = 0
    train_size = 0
    test_acc_agg = 0
    test_expsales_agg = 0
    test_sales_agg = 0
    test_size = 0

    test = np.array([])
    pred = np.array([])

    #fit k number of logistic regressions and get results
    for i in range(K):
        logit_reg = LogisticRegression()
        X_train = train_test_dict[f'cluster {i}'][0]
        X_test = train_test_dict[f'cluster {i}'][1]
        y_train = train_test_dict[f'cluster {i}'][2]
        y_test = train_test_dict[f'cluster {i}'][3]

        #fit the logistic regression to each cluster
        logit_reg.fit(X_train, y_train)

            #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
        print(f'For cluster {i}')
        pred_k_train = logit_reg.predict_proba(X_train)
        pred_k = logit_reg.predict_proba(X_test)

        train_accuracy, test_accuracy = print_results(logit_reg)
        exp_sales = sum(logit_reg.predict_proba(X_test)[:,1])
        actual_sales = sum(y_test)
    
        print("Number of Homes: {}".format(len(y_test)))
        print("Expected #Sales: {}".format(round(exp_sales)))
        print("Actual #Sales: {}\n".format(actual_sales))
        print('=======')

        
        #the below code is used to find the weighted average
        test = np.append(test,y_test)
        pred = np.append(pred,pred_k[:,1])
        train_size += len(y_train)
        train_acc_agg += train_accuracy * len(y_train)
        test_size += len(y_test)
        test_acc_agg += test_accuracy * len(y_test)
        test_expsales_agg += round(exp_sales)
        test_sales_agg += actual_sales

    #find MSE for this k value and append to list
    #we will choose the submarket with the lowest MSE
    mse_list.append(mean_squared_error([test_expsales_agg],[test_sales_agg]))
    


Expected #Sales: 665
Actual #Sales: 645

For cluster 2
Train set accuracy: 0.7692307692307693
Test set accuracy: 0.7464788732394366
Number of Homes: 71
Expected #Sales: 16
Actual #Sales: 18

For cluster 3
Train set accuracy: 0.6455696202531646
Test set accuracy: 0.6741071428571429
Number of Homes: 896
Expected #Sales: 317
Actual #Sales: 292

For cluster 4
Train set accuracy: 0.5845661735305878
Test set accuracy: 0.5644768856447688
Number of Homes: 1233
Expected #Sales: 512
Actual #Sales: 537

For cluster 5
Train set accuracy: 0.6974358974358974
Test set accuracy: 0.7508650519031141
Number of Homes: 289
Expected #Sales: 87
Actual #Sales: 72

For cluster 6
Train set accuracy: 0.64037558685446
Test set accuracy: 0.625158831003812
Number of Homes: 1574
Expected #Sales: 564
Actual #Sales: 591

For cluster 7
Train set accuracy: 0.6409618573797679
Test set accuracy: 0.6156433978132885
Number of Homes: 1189
Expected #Sales: 425
Actual #Sales: 457

For cluster 8
Train set accuracy: 0.6328125
Te

In [6]:
print(f'the mininum MSE score is where there are {[m for m in range(3,20,1)][np.argmin(mse_list)]} submarkets')

the mininum MSE score is where there are 19 submarkets


### Run on eleven submarkets

In [7]:
#attempt kmeans clustering
#unsupervised approach
#finding the optimum value of k


K = 11
init_clustering = KMeans(n_clusters=K, random_state=0).fit(X)

cluster_labels = init_clustering.labels_

for k in range(K):
    print('Submarket {}: {} units'.format(k, sum(cluster_labels == k)))

#create a new df with has the clustering labels on it
X_withlabels = X.copy()
X_withlabels['labels'] = cluster_labels 

    # Fitting logistic regression to identified clusters from Kmeans
train_test_dict = {}

    #for loop which extracts the X & y df for each cluster
#these will then be used to run the logistic regression 
for i in range(K):
    X2 = X_withlabels.loc[X_withlabels['labels'] == i]
    X2 = X2.drop(columns = 'labels')
    y2 = Y[X_withlabels['labels'] == i]
    X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33, random_state =297)

    train_test_dict[f'cluster {i}'] = (X_train, X_test, y_train, y_test)

#now to fit for each cluster and get final results

results_dict = {}

train_acc_agg = 0
train_size = 0
test_acc_agg = 0
test_expsales_agg = 0
test_sales_agg = 0
test_size = 0

test = np.array([])
pred = np.array([])

#fit k number of logistic regressions and get results
for i in range(K):
    logit_reg = LogisticRegression()
    X_train = train_test_dict[f'cluster {i}'][0]
    X_test = train_test_dict[f'cluster {i}'][1]
    y_train = train_test_dict[f'cluster {i}'][2]
    y_test = train_test_dict[f'cluster {i}'][3]

    #fit the logistic regression to each cluster
    logit_reg.fit(X_train, y_train)

            #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
    print(f'For cluster {i}')
    pred_k_train = logit_reg.predict_proba(X_train)
    pred_k = logit_reg.predict_proba(X_test)

    train_accuracy, test_accuracy = print_results(logit_reg)
    exp_sales = sum(logit_reg.predict_proba(X_test)[:,1])
    actual_sales = sum(y_test)
    
    print("Number of test Homes: {}".format(len(y_test)))
    print("Expected test #Sales: {}".format(round(exp_sales)))
    print("Actual test #Sales: {}\n".format(actual_sales))
    print('=======')

    AUC_score = roc_auc_score(y_test, pred_k[:,1])

    results_dict[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score, round(exp_sales),actual_sales]
    
    #the below code is used to find the weighted average
    test = np.append(test,y_test)
    pred = np.append(pred,pred_k[:,1])
    train_size += len(y_train)
    train_acc_agg += train_accuracy * len(y_train)
    test_size += len(y_test)
    test_acc_agg += test_accuracy * len(y_test)
    test_expsales_agg += round(exp_sales)
    test_sales_agg += actual_sales

#find MSE for this k value and append to list
#we will choose the submarket with the lowest MSE
results_df = pd.DataFrame.from_dict(results_dict, orient='index')


Submarket 0: 2826 units
Submarket 1: 5992 units
Submarket 2: 214 units
Submarket 3: 4857 units
Submarket 4: 192 units
Submarket 5: 3803 units
Submarket 6: 4034 units
Submarket 7: 874 units
Submarket 8: 5977 units
Submarket 9: 4098 units
Submarket 10: 1240 units
For cluster 0
Train set accuracy: 0.6772319070258849
Test set accuracy: 0.6763129689174705
Number of test Homes: 933
Expected test #Sales: 301
Actual test #Sales: 302

For cluster 1
Train set accuracy: 0.5817140009965122
Test set accuracy: 0.5783619817997978
Number of test Homes: 1978
Expected test #Sales: 827
Actual test #Sales: 834

For cluster 2
Train set accuracy: 0.7692307692307693
Test set accuracy: 0.7464788732394366
Number of test Homes: 71
Expected test #Sales: 16
Actual test #Sales: 18

For cluster 3
Train set accuracy: 0.6444376152427781
Test set accuracy: 0.6188396756082346
Number of test Homes: 1603
Expected test #Sales: 567
Actual test #Sales: 613

For cluster 4
Train set accuracy: 0.6328125
Test set accuracy: 0.68

In [8]:
results_df = results_df.rename(columns = {0:'train_accuracy', 1:'test_accuracy', 2:'Test_AUC', 3:'expec sales', 4:'actual sales'})
results_df

Unnamed: 0,train_accuracy,test_accuracy,Test_AUC,expec sales,actual sales
cluster 0,0.677232,0.676313,0.516693,301,302
cluster 1,0.581714,0.578362,0.478264,827,834
cluster 2,0.769231,0.746479,0.546122,16,18
cluster 3,0.644438,0.61884,0.554638,567,613
cluster 4,0.632812,0.6875,0.585795,24,20
cluster 5,0.658556,0.650996,0.545809,431,438
cluster 6,0.629164,0.633634,0.471704,496,488
cluster 7,0.697436,0.750865,0.53437,87,72
cluster 8,0.607393,0.599088,0.497564,774,791
cluster 9,0.624044,0.636364,0.570326,520,475


In [9]:
auc_agg = roc_auc_score(test, pred)

print('Marketwide Demand Prediction')
print("Number of Homes: {}".format(len(Y)))
print("Training Accuracy: {:.4f}%".format(train_acc_agg*100/train_size))
print("Testing Accuracy: {:.4f}%".format(test_acc_agg*100/test_size))
print("AUC: {:.6f}".format(auc_agg))
print(f'number of homes predicted sold {test_expsales_agg}')
print(f'number of homes actually sold {test_sales_agg}')

Marketwide Demand Prediction
Number of Homes: 34107
Training Accuracy: 63.0395%
Testing Accuracy: 62.6676%
AUC: 0.550720
number of homes predicted sold 4174
number of homes actually sold 4189


# Neural network on kmeans

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 

mse_list = []

for clus_num in range(3,20,1):
    K = clus_num
    results_dict_nn = {}

    train_acc_agg_nn = 0
    train_size_nn = 0
    test_acc_agg_nn = 0
    test_size_nn = 0

    test_nn = np.array([])
    pred_nn = np.array([])

    for i in range(K):
        print(f'For cluster {i}')
        X_train = train_test_dict[f'cluster {i}'][0]
        X_test = train_test_dict[f'cluster {i}'][1]
        y_train = train_test_dict[f'cluster {i}'][2]
        y_test = train_test_dict[f'cluster {i}'][3]

        X_train = torch.tensor(X_train.astype(float).values, dtype=torch.float64).to(device)
        y_train = torch.tensor(y_train, dtype=torch.int64).to(device)
        y_train = torch.nn.functional.one_hot(y_train)
        #fit the logistic regression to each cluster
        net = BaselineNet(X_train.shape[1], y_train.shape[1], epochs=5000, debug=False).to(device)
        net.feed(X_train, y_train)

        #the below code is used in order to create a df containing a breakdown of metrics
        #on a per cluster basis
        with torch.no_grad():
          net.eval()
          X_test = torch.tensor(X_test.astype(float).values, dtype=torch.float64).to(device)
          y_test = torch.tensor(y_test, dtype=torch.int64).to(device)
          y_test = torch.nn.functional.one_hot(y_test)

          y_pred = net.forward(X_test.float()) 
          labels = torch.max(y_test, 1)[1]
          correct = 0
          total = y_pred.shape[0]
          _, predicted = torch.max(y_pred.data, 1)
          correct += (predicted == labels).sum()
          test_accuracy = 100 * correct/total
          print('Test accuracy: {}'.format(test_accuracy))

          y_pred_train = net.forward(X_train.float()) 
          labels_train = torch.max(y_train, 1)[1]
          correct_train = 0
          total_train = y_pred_train.shape[0]
          _, predicted_train = torch.max(y_pred_train.data, 1)
          correct_train += (predicted_train == labels_train).sum()
          train_accuracy = 100 * correct_train/total_train
          print('Train accuracy: {}'.format(train_accuracy))
        print('=======')

        y_test = y_test.cpu()
        AUC_score = roc_auc_score(y_test.cpu(), predicted.cpu().reshape(-1,1))

        results_dict_nn[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score]

        #the below code is used to find the weighted average
        test_nn = np.append(test_nn,y_test[:,1])
        pred_nn = np.append(pred_nn, predicted.cpu())
        train_size_nn += len(y_train)
        train_acc_agg_nn += train_accuracy * len(y_train)
        test_size_nn += len(y_test)
        test_acc_agg_nn += test_accuracy * len(y_test)
    mse_list.append(mean_squared_error([test_expsales_agg],[test_sales_agg]))

For cluster 0
Test accuracy: 32.36870193481445
Train accuracy: 32.27680969238281
For cluster 1
Test accuracy: 57.83619689941406
Train accuracy: 58.17140197753906
For cluster 2


KeyboardInterrupt: 

In [13]:
print(f'the mininum MSE score is where there are {[m for m in range(3,20,1)][np.argmin(mse_list)]} submarkets')

ValueError: attempt to get argmin of an empty sequence

In [14]:
auc_agg_nn = roc_auc_score(test_nn, pred_nn)

print('Marketwide Demand Prediction')
print("Number of Homes: {}".format(len(Y)))
print("Training Accuracy: {:.4f}%".format(train_acc_agg_nn*100/train_size_nn))
print("Testing Accuracy: {:.4f}%".format(test_acc_agg_nn*100/test_size_nn))
print("AUC: {:.6f}".format(auc_agg_nn))

Marketwide Demand Prediction
Number of Homes: 34107
Training Accuracy: 4987.3032%
Testing Accuracy: 4967.3652%
AUC: 0.455176


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1c850c61-d934-4c85-b16d-3cb283df0c84' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>