In [None]:
import sys
sys.path.insert(0, '../scripts/')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from itertools import count
import random
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from models import AE, BaselineNet
from utils import *

import seaborn as sns
from tqdm import tqdm

import scipy as sp

# Current seaborn color palette, stored for use by plotting code.
color = sns.color_palette()

In [None]:
# Load the property table from CSV into a DataFrame.
df = pd.read_csv('../data/denver_prop_with_ct_clean.csv')

In [None]:
# Parse the listing and sale dates into datetime64 columns.
df = df.astype({
  'list_date': 'datetime64[ns]',
  'sale_date': 'datetime64[ns]'
})
# Drop the property identifier column. errors='ignore' keeps this cell
# re-runnable: on a second execution the column is already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=['rex_property_id'], errors='ignore')

In [None]:
# Build the modelling dataset with the project helper `gen_dataset`.
# NOTE(review): the date string and the 90 presumably define a reference date
# and a horizon in days — confirm against the helper's definition.
# Later cells read the keys 'X', 'y', 'float', 'bool' and 'int' from `res`.
res = gen_dataset(df, '2017-04-02', 90)

In [None]:
# Pull the feature table and the target out of the generated dataset.
X, y = res['X'], res['y']

In [None]:
# Multiply the columns listed under res['bool'] by 1 so that True/False values
# become 1/0 (assumes these columns hold booleans — TODO confirm).
bool_columns = res['bool']
X[bool_columns] = X[bool_columns].mul(1)

In [None]:
# Convert the feature table to a plain NumPy array for the numeric code below.
# The guard makes the cell safe to re-run: once X is already an ndarray it has
# no `to_numpy` method and the unguarded call would raise AttributeError.
if hasattr(X, 'to_numpy'):
    X = X.to_numpy()

In [None]:
K = 5  # Number of submarkets

# Attribute counts per type, taken from the column lists stored in `res`.
n_cont, n_bool, n_int = (len(res[kind]) for kind in ('float', 'bool', 'int'))

n_homes, n_features = X.shape
# The three attribute groups must account for every column of X.
assert n_cont + n_bool + n_int == n_features

# Column offsets of the attribute groups; the slicing relies on X's columns
# being ordered continuous, then boolean, then integer.
bool_start = n_cont
int_start = n_cont + n_bool

X_cont = X[:, :bool_start]
X_bool = X[:, bool_start:int_start]
X_int = X[:, int_start:]

In [None]:
# Kept only so that any later cell referencing it still works: the E-step now
# runs in log space, so no ad-hoc scaling factor is needed against underflow.
underflow_scaling = 1e10


def _responsibilities(X, X_cont, X_bool, X_int, y, mu, sigma, p, lam, pi, f):
    """E-step: posterior probability of each submarket for every home.

    For component k the joint log-likelihood of a home is the sum of
    log pi[k], the Gaussian log-density of its continuous attributes, the
    Bernoulli log-pmf of its boolean attributes, the Poisson log-pmf of its
    integer attributes and the Bernoulli log-pmf of its label under the
    component's classifier f[k].

    Working with sums of logs instead of products of densities avoids the
    underflow to exactly 0 that previously forced many rows to the uniform
    fallback.

    Returns an array of shape (n_homes, len(pi)) whose rows sum to 1.
    """
    n_components = len(pi)
    # log(0) = -inf is the intended log-probability of an impossible event,
    # so the divide-by-zero warning from np.log is silenced on purpose.
    with np.errstate(divide='ignore'):
        log_joint = np.array([
            np.log(pi[k])
            + sp.stats.multivariate_normal(mean=mu[k], cov=sigma[k]).logpdf(X_cont)
            + sp.stats.bernoulli(p[k]).logpmf(X_bool).sum(axis=1)
            + sp.stats.poisson(lam[k]).logpmf(X_int).sum(axis=1)
            + sp.stats.bernoulli(f[k].predict_proba(X)[:, 1]).logpmf(y)
            for k in range(n_components)
        ]).T

    # Subtract the per-row maximum before exponentiating so the largest term
    # becomes exp(0) = 1 and the normalisation is numerically stable.
    row_max = log_joint.max(axis=1, keepdims=True)
    # Homes with zero likelihood under every component get uniform
    # responsibilities, as in the original all-zero-row fallback.
    impossible = np.isneginf(row_max[:, 0])
    row_max[impossible] = 0.0
    weights = np.exp(log_joint - row_max)
    weights[impossible] = 1.0 / n_components
    return weights / weights.sum(axis=1, keepdims=True)


def _update_parameters(X, X_cont, y, r, n_cont, n_bool, reg):
    """M-step: re-estimate all parameters from the responsibilities `r`.

    Returns (mu, sigma, p, lam, pi, f):
      f     - one responsibility-weighted logistic regression per component
      pi    - mixing proportions
      mu    - weighted means of the continuous columns
      p     - weighted means of the boolean columns (Bernoulli parameters)
      lam   - weighted means of the integer columns (Poisson rates)
      sigma - weighted covariance of the continuous columns per component
    """
    n_components = r.shape[1]
    f = [LogisticRegression(C=reg).fit(X, y, sample_weight=r[:, k])
         for k in range(n_components)]

    mass = r.sum(axis=0)  # Total responsibility held by each component
    pi = mass / X.shape[0]

    # Weighted column means for all components in one matrix product
    # (replaces a Python loop over every home).
    means = (r.T @ X) / mass.reshape((-1, 1))
    mu = means[:, :n_cont]
    p = means[:, n_cont:(n_cont + n_bool)]
    lam = means[:, (n_cont + n_bool):]

    # Weighted scatter matrix per component: sum_n r[n,k] * d_n d_n^T,
    # computed as (r_k * D)^T D instead of materialising n outer products.
    sigma = np.array([
        ((X_cont - mu[k]) * r[:, k, None]).T @ (X_cont - mu[k]) / mass[k]
        for k in range(n_components)
    ])
    return mu, sigma, p, lam, pi, f


def _snapshot(mu, sigma, p, lam, pi, f):
    """Bundle the current parameter values into the dict stored in `params`."""
    return {'mu': mu,
            'sigma': sigma,
            'p': p,
            'lambda': lam,
            'pi': pi,
            'f': f}


# Initialization

init_clustering = KMeans(n_clusters=K, random_state=0).fit(X)
prior_var = 30
reg = 1 # Regularization strength (inverse)

# K-means centres seed the per-component means of each attribute group.
mu_init = init_clustering.cluster_centers_[:,:n_cont]
sigma_init = np.array([prior_var*np.eye(n_cont) for _ in range(K)])
p_init = init_clustering.cluster_centers_[:,n_cont:(n_cont+n_bool)]
lam_init = init_clustering.cluster_centers_[:,(n_cont+n_bool):]
pi_init = np.ones(K)/K
f_init = [LogisticRegression(C=reg).fit(X,y) for _ in range(K)]

# EM Implementation

mu = mu_init
sigma = sigma_init
p = p_init
lam = lam_init
pi = pi_init
f = f_init

max_iter = 5000
store_freq = 100

# One slot per stored iteration (0, store_freq, ..., max_iter).
params = {int(i*store_freq): {} for i in range(int(max_iter/store_freq)+1)}

for i in tqdm(range(max_iter),desc='Fitting...'):

    # E-step
    r = _responsibilities(X, X_cont, X_bool, X_int, y, mu, sigma, p, lam, pi, f)

    # Store the parameters that produced this iteration's responsibilities.
    if i % store_freq == 0:
        params[i] = _snapshot(mu, sigma, p, lam, pi, f)

    # M-step
    mu, sigma, p, lam, pi, f = _update_parameters(X, X_cont, y, r, n_cont, n_bool, reg)

params[max_iter] = _snapshot(mu, sigma, p, lam, pi, f)

# Final responsibilities under the fitted parameters; each home is assigned
# to its most probable submarket.
r = _responsibilities(X, X_cont, X_bool, X_int, y, mu, sigma, p, lam, pi, f)

submarket = np.argmax(r,axis=1)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative 

KernelInterrupted: Execution interrupted by the Jupyter kernel.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1c850c61-d934-4c85-b16d-3cb283df0c84' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>