In [50]:
 # ----------- Overhead -------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from scipy.spatial import distance
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import tensorflow
from keras.callbacks import TensorBoard
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense

In [51]:
df = pd.read_csv('NY property csv.csv')

In [52]:
# --------------- Fill in NAs and 0s ----------

# Create new variable BORO
df['BORO'] = ''
df['BORO'] = df.BBLE.str[0]

# ZIP, sort by BBLE and forward fill
df = df.sort_values(by = ['BBLE'])
df['ZIP'].fillna(method='ffill',inplace = True)

# Create ZIP3
df['ZIP3']= df["ZIP"].apply(lambda x: int(str(x)[0:3]))


df['ll_rate'] = df['LTFRONT']/df['LTDEPTH']
mean_llrate = np.mean(df['ll_rate'].dropna())
df['LTFRONT'] = df['LTFRONT'].fillna(value = (df['LTDEPTH'] * mean_llrate))
df['LTFRONT'] = df['LTDEPTH'].fillna(value = (df['LTFRONT'] / mean_llrate))
np.std(df['LTFRONT'])
# std = 73.7 without group
# two methods to set 0 values as missing values
df.loc[df['LTFRONT'] == 0, 'LTFRONT'] = np.nan
# df['LTFRONT'] = df['LTFRONT'].replace(0,np.nan)

# fill NA with average values grouped by BORO (if using TAXCLASS as well, 
# the filled values will be too big and there will be 2 missing values)
df['LTFRONT'] = df.groupby(['BORO','TAXCLASS'])['LTFRONT'].transform(lambda x: x.fillna(x.mean()))
df['LTFRONT'] = df.groupby(['TAXCLASS'])['LTFRONT'].transform(lambda x: x.fillna(x.mean()))


df.loc[df['LTDEPTH'] == 0, 'LTDEPTH'] = np.nan
df['LTDEPTH'] = df.groupby(['BORO','TAXCLASS'])['LTDEPTH'].transform(lambda x: x.fillna(x.mean()))
df['LTDEPTH'] = df.groupby(['TAXCLASS'])['LTDEPTH'].transform(lambda x: x.fillna(x.mean()))

# np.isnan(df['LTDEPTH']).sum()

# Calculate BLDFRONT/BLDDEPTH ratio
df.loc[df['BLDFRONT'] == 0, 'BLDFRONT'] = np.nan
df.loc[df['BLDDEPTH'] == 0, 'BLDDEPTH'] = np.nan

df['bb_rate'] = df['BLDFRONT']/df['BLDDEPTH']
# distribution of the ratio
# sns.distplot(df['BLDFRONT']/df['BLDDEPTH'].dropna(), bins = 1000,kde=False)
# plt.xlim(0,2)

median_bbrate = np.median(df['bb_rate'].dropna())
df['BLDFRONT'] = df['BLDFRONT'].fillna(value = (df['BLDDEPTH'] * median_bbrate))
df['BLDDEPTH'] = df['BLDDEPTH'].fillna(value = (df['BLDFRONT'] / median_bbrate))


# For BLDFRONT and BLDDEPTH, if group by both BORO and TAXCLASS, there will appear NaN as well.
df['BLDFRONT'] = df.groupby(['BORO','TAXCLASS'])['BLDFRONT'].transform(lambda x: x.fillna(x.mean()))
df['BLDFRONT'] = df.groupby(['TAXCLASS'])['BLDFRONT'].transform(lambda x: x.fillna(x.mean()))

# np.isnan(df['BLDFRONT']).sum()
df['BLDDEPTH'] = df.groupby(['BORO','TAXCLASS'])['BLDDEPTH'].transform(lambda x: x.fillna(x.mean()))
df['BLDDEPTH'] = df.groupby(['TAXCLASS'])['BLDDEPTH'].transform(lambda x: x.fillna(x.mean()))

np.isnan(df['BLDFRONT']).sum()
np.isnan(df['BLDDEPTH']).sum()

df['STORIES'] = df.groupby(['ZIP','TAXCLASS'])['STORIES'].transform(lambda x: x.fillna(x.mean()))
df['STORIES'] = df.groupby(['TAXCLASS'])['STORIES'].transform(lambda x: x.fillna(x.mean()))

# np.isnan(df['STORIES']).sum()

# building volume and bins accordingly
df['bldvol'] = df['BLDFRONT'] * df['BLDDEPTH'] * df['STORIES']

df['bldvol_bin'] = pd.qcut(df['bldvol'], 100, labels = False, duplicates = 'drop')

# FULLVAL
df.loc[df['FULLVAL'] == 0, 'FULLVAL'] = np.nan

# df['FULLVAL']
df['FULLVAL'] = df.groupby(['BORO','bldvol_bin'])['FULLVAL'].transform(lambda x: x.fillna(x.mean()))
# np.isnan(df['FULLVAL']).sum()


# AVLAND
df.loc[df['AVLAND'] == 0, 'AVLAND'] = np.nan
df['AVLAND'] = df.groupby(['BORO','bldvol_bin'])['AVLAND'].transform(lambda x: x.fillna(x.mean()))
# np.isnan(df['AVLAND']).sum()


# AVTOT
df.loc[df['AVTOT'] == 0, 'AVTOT'] = np.nan
df['AVTOT'] = df.groupby(['BORO','bldvol_bin'])['AVTOT'].transform(lambda x: x.fillna(x.mean()))
# np.isnan(df['AVTOT']).sum()

In [53]:
## Build 3 sizes
## Build LotArea, Building Area, Building Volume
df['lotarea'] = df['LTDEPTH'] * df['LTFRONT']
df['bldarea'] = df['BLDFRONT'] * df['BLDDEPTH']
df['bldvol'] = df['bldarea'] * df['STORIES']

## Build 9 values
## FV = FULLVAL, AL = AVLAND, AT = AVTOT
## LA = LOTAREA, BA = BLDAREA, BV = BLDVOL
df['fv_la'] = df['FULLVAL'] / df['lotarea']
df['fv_ba'] = df['FULLVAL'] / df['bldarea']
df['fv_bv'] = df['FULLVAL'] / df['bldvol']
df['al_la'] = df['AVLAND'] / df['lotarea']
df['al_ba'] = df['AVLAND'] / df['bldarea']
df['al_bv'] = df['AVLAND'] / df['bldvol']
df['at_la'] = df['AVTOT'] / df['lotarea']
df['at_ba'] = df['AVTOT'] / df['bldarea']
df['at_bv'] = df['AVTOT'] / df['bldvol']

## Build 45 Group by values



In [54]:
ninevars = ['fv_la','fv_ba','fv_bv','al_la','al_ba','al_bv','at_la','at_ba','at_bv']
df = df.join(df.groupby(['ZIP'])[ninevars].mean(), on='ZIP', rsuffix='_zip')
df = df.join(df.groupby(['ZIP3'])[ninevars].mean(), on='ZIP3', rsuffix='_zip3')
df = df.join(df.groupby(['TAXCLASS'])[ninevars].mean(), on='TAXCLASS', rsuffix='_taxclass')
df = df.join(df.groupby(['BORO'])[ninevars].mean(), on='BORO', rsuffix='_boro')
# add stories bins
df['story_bin'] = pd.qcut(df['STORIES'], 10, labels=False, duplicates = 'drop')
df = df.join(df.groupby(['story_bin'])[ninevars].mean(), on='story_bin', rsuffix='_story')
df = df.join(df.groupby(['BORO','TAXCLASS'])[ninevars].mean(), on=['BORO','TAXCLASS'], rsuffix='_BoroTax')


In [55]:
newdf = df[['fv_la',
 'fv_ba',
 'fv_bv',
 'al_la',
 'al_ba',
 'al_bv',
 'at_la',
 'at_ba',
 'at_bv',
 'fv_la_zip',
 'fv_ba_zip',
 'fv_bv_zip',
 'al_la_zip',
 'al_ba_zip',
 'al_bv_zip',
 'at_la_zip',
 'at_ba_zip',
 'at_bv_zip',
 'fv_la_zip3',
 'fv_ba_zip3',
 'fv_bv_zip3',
 'al_la_zip3',
 'al_ba_zip3',
 'al_bv_zip3',
 'at_la_zip3',
 'at_ba_zip3',
 'at_bv_zip3',
 'fv_la_taxclass',
 'fv_ba_taxclass',
 'fv_bv_taxclass',
 'al_la_taxclass',
 'al_ba_taxclass',
 'al_bv_taxclass',
 'at_la_taxclass',
 'at_ba_taxclass',
 'at_bv_taxclass',
 'fv_la_boro',
 'fv_ba_boro',
 'fv_bv_boro',
 'al_la_boro',
 'al_ba_boro',
 'al_bv_boro',
 'at_la_boro',
 'at_ba_boro',
 'at_bv_boro',
 'fv_la_story',
 'fv_ba_story',
 'fv_bv_story',
 'al_la_story',
 'al_ba_story',
 'al_bv_story',
 'at_la_story',
 'at_ba_story',
 'at_bv_story',
 'fv_la_BoroTax',
 'fv_ba_BoroTax',
 'fv_bv_BoroTax',
 'al_la_BoroTax',
 'al_ba_BoroTax',
 'al_bv_BoroTax',
 'at_la_BoroTax',
 'at_ba_BoroTax',
 'at_bv_BoroTax']]

# newdf.head()

In [56]:
# ---- Z scale to prepare for dimensionality reduction ----

# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
scaled_df = scaler.fit_transform(newdf)
scaled_df = pd.DataFrame(scaled_df, columns=list(newdf))
# scaled_df.head()


In [57]:
# ---- PCA: reduce dimensions ----
pca = PCA(n_components = .85, svd_solver = 'full')
pca.fit(scaled_df)
pca.n_components_

10

In [58]:
# Percentage of variance explained by each of the selected components.
print(pca.explained_variance_ratio_)

[0.25435587 0.15936895 0.09257827 0.07293785 0.05871896 0.04988091
 0.04670502 0.04424925 0.04028856 0.03322871]


In [59]:
# transformed dataframe
pca_df = pd.DataFrame(data = pca.transform(scaled_df), columns =  ["PC" + str(i) for i in range(1, pca.n_components_+1)])

In [60]:
# ---- Z scale to PCA dataframe ----

# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
scaled_pca = scaler.fit_transform(pca_df)
scaled_pca = pd.DataFrame(scaled_pca, columns=list(pca_df))
# scaled_pca.head()

In [61]:
## Variables to test
input_dim = pca.n_components_
encoder_dim = int(input_dim/2)
nb_epoch = 2
batch_size = 10

## Autoencoder
input_l = Input(shape=(input_dim, ))
encoder_l = Dense(encoder_dim, input_shape=(input_dim,), activation='relu')(input_l)
decoder_l = Dense(input_dim, activation='sigmoid')(encoder_l)
autoencoder = Model(input_l, decoder_l)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

tensorboard = TensorBoard(log_dir='/tmp/logs', histogram_freq=0, write_graph=True, write_images=True)
history = autoencoder.fit(scaled_pca, scaled_pca, epochs=nb_epoch, batch_size=batch_size, verbose=1, callbacks=[tensorboard]).history
ae_preds = autoencoder.predict(scaled_pca)



Epoch 1/2
Epoch 2/2


In [66]:
ae_preds = autoencoder.predict(scaled_pca)

In [88]:
# scaled_pca
scaled_pca["MAHAL"]=(scaled_pca["PC1"]**2+scaled_pca["PC2"]**2+scaled_pca["PC3"]**2+scaled_pca["PC4"]**2+scaled_pca["PC5"]**2+scaled_pca["PC6"]**2+scaled_pca["PC7"]**2+scaled_pca["PC8"]**2+scaled_pca["PC9"]**2+scaled_pca["PC10"]**2)**0.5
# scaled_pca.append(scaled_pca.iloc[].sum(),ignore_index=True)

In [89]:
scaled_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,MAHAL
0,3.886418,2.402793,0.405834,3.232402,-1.348003,1.646906,0.890411,2.060934,1.821408,1.092090,6.750605
1,3.855244,2.327662,0.183243,0.029360,-1.279632,1.419915,-0.022196,-0.536169,0.452531,1.209069,5.091520
2,3.869382,2.362044,0.286734,1.458590,-1.318131,1.530169,0.413565,0.683081,1.101118,1.172634,5.483168
3,3.478222,2.555112,0.741473,-0.191367,-2.204669,1.344911,0.139961,-0.631358,0.599313,0.609636,5.199175
4,3.456595,2.573077,0.800366,-0.096169,-2.280425,1.242367,0.173362,-0.708613,0.640328,0.457545,5.207108
5,3.603068,3.100882,1.576572,13.480360,-0.677898,-1.213253,0.930603,-1.598210,0.389955,0.828811,14.594316
6,3.856503,2.327967,0.177697,-0.032732,-1.288885,1.443792,0.098109,-0.257655,0.610084,1.203345,5.095754
7,3.581651,2.968948,1.111569,6.946734,-0.737924,-1.076143,1.274030,-1.432798,0.640536,0.955433,8.822550
8,3.480418,2.563898,0.771455,0.232185,-2.204081,1.347061,0.151602,-0.555246,0.631146,0.599446,5.205521
9,3.854545,2.324602,0.170970,-0.138324,-1.281261,1.422017,-0.001849,-0.509655,0.471894,1.211296,5.091472
