# Submarket baseline

Steps include:

1. kmeans to find submarkets
2. Fit logistic regressions on the submarkets
3. Compare accuracy scores against other submarket & non-submarket approaches

In [None]:
#required packages
import sys
sys.path.insert(0, '../scripts/')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import chardet
import datetime

import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

from models import AE, BaselineNet



In [None]:
# Load dataset
df = pd.read_csv('../data/denver_prop_listing_clean.csv')

#convert to datetime format
df["list_date"] = pd.to_datetime(df["list_date"])
df["sale_date"] = pd.to_datetime(df["sale_date"])

mapping = {k: v for v, k in enumerate(df.property_type.unique())}

df['property_type_code'] = df['property_type'].map(mapping)


In [None]:
#function to define the y variables
def gen_y(t_disc, data, t0=None):
    ''' 
    t_disc: datetime.timedelta(days = XX)
    t0: datetime.datetime(YYYY,MM,DD)
    '''

    if t0 is not None:
        listed = np.array(((data['list_date'] >= t0) & (data['list_date'] < t0 + t_disc)) | ((data['list_date'] < t0) & (data['sale_date'] >= t0)), dtype=np.int8)
        sale = np.array((data['sale_date'] >= t0) & (data['sale_date'] < t0 + t_disc), dtype = np.int8)
        return np.vstack((listed, sale)).T

In [None]:
y_2019Q2 = gen_y(datetime.timedelta(days = 90), df, datetime.datetime(2019,4,1))

# Remove all rows that aren't listed in that period
listed_index = np.where(y_2019Q2[:,0] == 1)
df_2019Q2 = df.iloc[listed_index]


In [None]:
# Build X matrix and Y target vector

# X = df.drop(columns=['property_id', 'zipcode', 'list_date', 'sale_date', 'current_status', 'property_type', 'has_jacuzzi', 'has_garage'])
X = df_2019Q2.drop(columns=['fips','property_id', 'zipcode', 'list_date', 'sale_date', 'property_type', 'has_jacuzzi', 'has_garage', 'sale_price', 'ct_key', 'sale_price_per_sqft', 'property_type_code'])
X = X.reset_index(drop=True)
columns_to_scale = ['latitude', 'longitude', 'sqft']
scaler = StandardScaler()
X[columns_to_scale] =  scaler.fit_transform(X[columns_to_scale])


Y = y_2019Q2[np.where(y_2019Q2[:,0]==1)]



X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7567 entries, 0 to 7566
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   latitude         7567 non-null   float64
 1   longitude        7567 non-null   float64
 2   sqft             7567 non-null   float64
 3   has_central_air  7567 non-null   bool   
 4   has_pool         7567 non-null   bool   
 5   has_solar        7567 non-null   bool   
 6   bedrooms         7567 non-null   float64
 7   full_baths       7567 non-null   float64
dtypes: bool(3), float64(5)
memory usage: 317.9 KB


In [None]:
#attempt kmeans clustering
#unsupervised approach
K = 5
init_clustering = KMeans(n_clusters=K, random_state=0).fit(X)

In [None]:
cluster_labels = init_clustering.labels_

In [None]:
for k in range(K):
    print('Submarket {}: {} units'.format(k, sum(cluster_labels == k)))

Submarket 0: 2667 units
Submarket 1: 520 units
Submarket 2: 938 units
Submarket 3: 1649 units
Submarket 4: 1793 units


In [None]:
#create a new df with has the clustering labels on it
X_withlabels = X.copy()
X_withlabels['labels'] = cluster_labels 

## Fitting logistic regression to identified clusters from Kmeans

In [None]:
def print_results(model):
  print('Train set accuracy: {}'.format(model.score(X_train, y_train[:,1])))
  print('Test set accuracy: {}'.format(model.score(X_test, y_test[:,1])))

  return model.score(X_train, y_train[:,1]), model.score(X_test, y_test[:,1])

In [None]:
train_test_dict = {}

#for loop which extracts the X & y df for each cluster
#these will then be used to run the logistic regression 
for i in range(5):
    X2 = X_withlabels.loc[X_withlabels['labels'] == i]
    X2 = X2.drop(columns = 'labels')
    y2 = Y[X_withlabels['labels'] == i]
    X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33, random_state =297)

    train_test_dict[f'cluster {i}'] = (X_train, X_test, y_train, y_test)

In [None]:
#now to fit for each cluster and get final results

results_dict = {}

train_acc_agg = 0
train_size = 0
test_acc_agg = 0
test_size = 0

test = np.array([])
pred = np.array([])

for i in range(5):
    logit_reg = LogisticRegression()
    X_train = train_test_dict[f'cluster {i}'][0]
    X_test = train_test_dict[f'cluster {i}'][1]
    y_train = train_test_dict[f'cluster {i}'][2]
    y_test = train_test_dict[f'cluster {i}'][3]

    #fit the logistic regression to each cluster
    logit_reg.fit(X_train, y_train[:,1])
    
    #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
    print(f'For cluster {i}')
    train_accuracy, test_accuracy = print_results(logit_reg)
    print('=======')

    pred_k = logit_reg.predict(X_test)
    AUC_score = roc_auc_score(y_test[:,1], pred_k)

    results_dict[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score]
    
    #the below code is used to find the weighted average
    test = np.append(test,y_test[:,1])
    pred = np.append(pred,pred_k)
    train_size += len(y_train)
    train_acc_agg += train_accuracy * len(y_train)
    test_size += len(y_test)
    test_acc_agg += test_accuracy * len(y_test)

results_df = pd.DataFrame.from_dict(results_dict, orient='index')

For cluster 0
Train set accuracy: 0.6657334826427772
Test set accuracy: 0.677639046538025
For cluster 1
Train set accuracy: 0.8362068965517241
Test set accuracy: 0.8313953488372093
For cluster 2
Train set accuracy: 0.7261146496815286
Test set accuracy: 0.6580645161290323
For cluster 3
Train set accuracy: 0.7744565217391305
Test set accuracy: 0.7761467889908257
For cluster 4
Train set accuracy: 0.7343880099916736
Test set accuracy: 0.7331081081081081


In [None]:
results_df = results_df.rename(columns = {0:'train_accuracy', 1:'test_accuracy', 2:'Test_AUC'})
results_df

Unnamed: 0,train_accuracy,test_accuracy,Test_AUC
cluster 0,0.665733,0.677639,0.523043
cluster 1,0.836207,0.831395,0.595089
cluster 2,0.726115,0.658065,0.504673
cluster 3,0.774457,0.776147,0.582498
cluster 4,0.734388,0.733108,0.57729


In [None]:
auc_agg = roc_auc_score(test, pred)

print('Marketwide Demand Prediction')
print("Number of Homes: {}".format(len(Y[:,1])))
print("Training Accuracy: {:.4f}%".format(train_acc_agg*100/train_size))
print("Testing Accuracy: {:.4f}%".format(test_acc_agg*100/test_size))
print("AUC: {:.6f}".format(auc_agg))

Marketwide Demand Prediction
Number of Homes: 7567
Training Accuracy: 72.4887%
Testing Accuracy: 72.0400%
AUC: 0.547254


# Neural network on kmeans

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 


results_dict_nn = {}

train_acc_agg_nn = 0
train_size_nn = 0
test_acc_agg_nn = 0
test_size_nn = 0

test_nn = np.array([])
pred_nn = np.array([])

for i in range(5):
    print(f'For cluster {i}')
    X_train = train_test_dict[f'cluster {i}'][0]
    X_test = train_test_dict[f'cluster {i}'][1]
    y_train = train_test_dict[f'cluster {i}'][2]
    y_test = train_test_dict[f'cluster {i}'][3]
    
    X_train = torch.tensor(X_train.astype(float).values, dtype=torch.float64).to(device)
    y_train = torch.tensor(y_train[:,1], dtype=torch.int64).to(device)
    y_train = torch.nn.functional.one_hot(y_train)
    #fit the logistic regression to each cluster
    net = BaselineNet(X_train.shape[1], y_train.shape[1], epochs=5000, debug=False).to(device)
    net.feed(X_train, y_train)
    
    #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
    with torch.no_grad():
      net.eval()
      X_test = torch.tensor(X_test.astype(float).values, dtype=torch.float64).to(device)
      y_test = torch.tensor(y_test[:,1], dtype=torch.int64).to(device)
      y_test = torch.nn.functional.one_hot(y_test)

      y_pred = net.forward(X_test.float()) 
      labels = torch.max(y_test, 1)[1]
      correct = 0
      total = y_pred.shape[0]
      _, predicted = torch.max(y_pred.data, 1)
      correct += (predicted == labels).sum()
      accuracy = 100 * correct/total
      print('Accuracy: {}'.format(accuracy))
    print('=======')
    
    y_test = y_test.cpu()
    AUC_score = roc_auc_score(y_test.cpu(), predicted.cpu().reshape(-1,1))

    results_dict_nn[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score]
    
    #the below code is used to find the weighted average
    test_nn = np.append(test_nn,y_test[:,1])
    pred_nn = np.append(pred_nn, predicted.cpu())
    train_size_nn += len(y_train)
    train_acc_agg_nn += train_accuracy * len(y_train)
    test_size_nn += len(y_test)
    test_acc_agg_nn += test_accuracy * len(y_test)

results_df_nn = pd.DataFrame.from_dict(results_dict_nn, orient='index')

For cluster 0
Accuracy: 66.51531982421875
For cluster 1
Accuracy: 81.39534759521484
For cluster 2
Accuracy: 66.7741928100586
For cluster 3
Accuracy: 75.59632873535156
For cluster 4
Accuracy: 73.64865112304688


In [None]:
results_df_nn = results_df.rename(columns = {0:'train_accuracy', 1:'test_accuracy', 2:'Test_AUC'})
results_df_nn

Unnamed: 0,train_accuracy,test_accuracy,Test_AUC
cluster 0,0.665733,0.677639,0.523043
cluster 1,0.836207,0.831395,0.595089
cluster 2,0.726115,0.658065,0.504673
cluster 3,0.774457,0.776147,0.582498
cluster 4,0.734388,0.733108,0.57729


In [None]:
auc_agg_nn = roc_auc_score(test_nn, pred_nn)

print('Marketwide Demand Prediction')
print("Number of Homes: {}".format(len(Y[:,1])))
print("Training Accuracy: {:.4f}%".format(train_acc_agg_nn*100/train_size_nn))
print("Testing Accuracy: {:.4f}%".format(test_acc_agg_nn*100/test_size_nn))
print("AUC: {:.6f}".format(auc_agg_nn))

Marketwide Demand Prediction
Number of Homes: 7567
Training Accuracy: 73.4388%
Testing Accuracy: 73.3108%
AUC: 0.579349


# PCA then Kmeans

In [None]:
#get top two PCA components
pca = PCA(n_components=3)
x_pca = pca.fit_transform (X)
print(f'The variance contained in the top three components is {np.sum(pca.explained_variance_ratio_):.3f}')

The variance contained in the top three components is 0.832


In [None]:
#attempt kmeans clustering
#unsupervised approach
K = 5
init_clustering_pca = KMeans(n_clusters=K, random_state=0).fit(x_pca)

pca_cluster_labels = init_clustering_pca.labels_

for k in range(K):
    print('Submarket {}: {} units'.format(k, sum(pca_cluster_labels == k)))

Submarket 0: 2633 units
Submarket 1: 1828 units
Submarket 2: 1580 units
Submarket 3: 545 units
Submarket 4: 981 units


In [None]:
pca_df = pd.DataFrame(x_pca).rename(columns={0:'pca1', 1:'pca2',2:'pca3'})

#add the clustering labels on to df
pca_df['labels'] = pca_cluster_labels 

## Fitting logistic regression to identified clusters from PCA + Kmeans

In [None]:
train_test_dict_pca = {}

#for loop which extracts the X & y df for each cluster
#these will then be used to run the logistic regression 
for i in range(5):
    X2 = pca_df.loc[pca_df['labels'] == i]
    
    X2 = X2.drop(columns = 'labels')
    
    y2 = Y[pca_df['labels'] == i]
    X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33, random_state =297)

    train_test_dict_pca[f'cluster {i}'] = (X_train, X_test, y_train, y_test)

In [None]:
#now to fit for each cluster and get final results

results_dict_pca = {}



for i in range(5):
    logit_reg = LogisticRegression()
    X_train = train_test_dict_pca[f'cluster {i}'][0]
    X_test = train_test_dict_pca[f'cluster {i}'][1]
    y_train = train_test_dict_pca[f'cluster {i}'][2]
    y_test = train_test_dict_pca[f'cluster {i}'][3]

    #fit the logistic regression to each cluster
    logit_reg.fit(X_train, y_train[:,1])
    
    #the below code is used in order to create a df containing a breakdown of metrics
    #on a per cluster basis
    print(f'For cluster {i}')
    train_accuracy, test_accuracy = print_results(logit_reg)
    print('=======')

    pred_k = logit_reg.predict(X_test)
    AUC_score = roc_auc_score(y_test[:,1], pred_k)

    results_dict_pca[f'cluster {i}'] = [train_accuracy, test_accuracy, AUC_score]
    

results_df_pca = pd.DataFrame.from_dict(results_dict_pca, orient='index')

For cluster 0
Train set accuracy: 0.6655328798185941
Test set accuracy: 0.6593785960874569
For cluster 1
Train set accuracy: 0.7205882352941176
Test set accuracy: 0.7019867549668874
For cluster 2
Train set accuracy: 0.7372400756143668
Test set accuracy: 0.7720306513409961
For cluster 3
Train set accuracy: 0.810958904109589
Test set accuracy: 0.7944444444444444
For cluster 4
Train set accuracy: 0.7077625570776256
Test set accuracy: 0.6820987654320988


In [None]:
results_df_pca = results_df.rename(columns = {0:'train_accuracy', 1:'test_accuracy', 2:'Test_AUC'})
results_df_pca

Unnamed: 0,train_accuracy,test_accuracy,Test_AUC
cluster 0,0.665733,0.677639,0.523043
cluster 1,0.836207,0.831395,0.595089
cluster 2,0.726115,0.658065,0.504673
cluster 3,0.774457,0.776147,0.582498
cluster 4,0.734388,0.733108,0.57729


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1c850c61-d934-4c85-b16d-3cb283df0c84' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>