In [1]:
import tensorflow as tf

In [2]:
def my_dnn(x):
    """Return a deep neural network
    """
    
    dense1 = tf.layers.dense(
        x, 1024, activation = tf.tanh, kernel_initializer = tf.random_normal_initializer()
    )
    batch1 = tf.layers.batch_normalization(dense1)
    dropout1 = tf.layers.dropout(batch1, rate = 0.1)
    
    dense2 = tf.layers.dense(
        dropout1, 512, activation = tf.nn.relu, kernel_initializer = tf.random_normal_initializer()
    )
    batch2 = tf.layers.batch_normalization(dense2)
    dropout2 = tf.layers.dropout(batch2, rate = 0.3)
    
    
    dense3 = tf.layers.dense(
        dropout2, 256, activation = tf.nn.relu, kernel_initializer = tf.random_normal_initializer()
    )
    batch3 = tf.layers.batch_normalization(dense3)
    dropout3 = tf.layers.dropout(batch3, rate = 0.2)
    
    dense4 = tf.layers.dense(
        dropout3, 256, activation = tf.nn.relu, kernel_initializer = tf.random_normal_initializer()
    )
    batch4 = tf.layers.batch_normalization(dense4)
    dropout4 = tf.layers.dropout(batch4, rate = 0.1)    
    
    y_pred = tf.layers.dense(dropout4, 1)

    return y_pred

In [3]:
import pandas as pd
trainFile = "train_2016_v2.csv"
df_train = pd.read_csv(trainFile, header = 0, skipinitialspace=True, engine="python", 
                       parse_dates=["transactiondate"])

propertiesFile = "properties_2016.csv"
# use float dtype to handle na in pandas
columnDtypes = {'parcelid':int, 'airconditioningtypeid':str, 'architecturalstyletypeid':str, 'basementsqft':float,
                'bathroomcnt':float, 'bedroomcnt':float, 'buildingclasstypeid':str, 'buildingqualitytypeid':str,
                'calculatedbathnbr':float, 'decktypeid':str, 'finishedfloor1squarefeet':float, 'calculatedfinishedsquarefeet':float,
                'finishedsquarefeet12':float, 'finishedsquarefeet13':float, 'finishedsquarefeet15':float, 'finishedsquarefeet50':float,
                'finishedsquarefeet6':float, 'fips':str, 'fireplacecnt':float, 'fullbathcnt':float, 'garagecarcnt':float, 'garagetotalsqft':float,
                'hashottuborspa':str, 'heatingorsystemtypeid':str, 'latitude':float, 'longitude':float, 'lotsizesquarefeet':float, 'poolcnt':float,
                'poolsizesum':float, 'pooltypeid10':str, 'pooltypeid2':str, 'pooltypeid7':str, 'propertycountylandusecode':str,
                'propertylandusetypeid':str, 'propertyzoningdesc':str, 'rawcensustractandblock':float, 'regionidcity':str,
                'regionidcounty':str, 'regionidneighborhood':str, 'regionidzip':str, 'roomcnt':float, 'storytypeid':float,
                'threequarterbathnbr': float, 'typeconstructiontypeid':float, 'unitcnt':float, 'yardbuildingsqft17':float,
                'yardbuildingsqft26':float, 'yearbuilt':float, 'numberofstories': float, 'fireplaceflag':str, 'structuretaxvaluedollarcnt':float,
                'taxvaluedollarcnt': float, 'assessmentyear':float, 'landtaxvaluedollarcnt': float, 'taxamount':float, 'taxdelinquencyflag':str,
                'taxdelinquencyyear': float, 'censustractandblock':float}
df_properties = pd.read_csv(propertiesFile, header = 0, skipinitialspace=True, dtype=columnDtypes, 
                            engine="c")

In [4]:
# apply label encoder on properties
from sklearn.preprocessing import LabelEncoder
for column in df_properties.columns:
    df_properties[column] = df_properties[column].fillna(-1)
    if 'object' == df_properties[column].dtype:
        labelEncoder = LabelEncoder()
        target = list(df_properties[column].values)
        labelEncoder.fit(target)
        df_properties[column] = labelEncoder.transform(target)
        
inter = pd.merge(df_properties, df_train, how="inner", on=["parcelid"]);

inter['transactiondate'] = pd.to_datetime(df_train["transactiondate"])
inter['transaction_year'] = inter['transactiondate'].dt.year
inter['transaction_month'] = inter['transactiondate'].dt.month
inter['transaction_day'] = inter['transactiondate'].dt.day

import numpy as np
np.random.seed(1)
datasetSize = inter.shape[0]
trainRatio = .8
trainIndex = set(np.random.choice(datasetSize, int(datasetSize * trainRatio), replace=False))
testIndex = set(range(datasetSize)) - trainIndex
# cast to list to indexing dataframe
trainIndex = list(trainIndex)
testIndex = list(testIndex)

columns_to_drop = [
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode','fireplacecnt', 'fireplaceflag'
]

df_trainx = inter.iloc[trainIndex]
trainy = df_trainx['logerror'].astype(float)
df_trainx = df_trainx.drop(columns_to_drop, axis=1)
df_testx = inter.iloc[testIndex]
testy = df_testx['logerror'].astype(float)
df_testx = df_testx.drop(columns_to_drop, axis=1)

from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
df_trainx.iloc[::] = standardScaler.fit_transform(df_trainx.iloc[::])
df_testx.iloc[::] = standardScaler.transform(df_testx.iloc[::])

In [5]:
x_train = df_trainx.values
x_test = df_testx.values

y_train = trainy.values
y_train = y_train.reshape(-1, 1)
y_test =testy.values
y_test = y_test.reshape(-1, 1)

In [6]:
x = tf.placeholder(tf.float32, [None, 56])
y = tf.placeholder(tf.float32, [None, 1])
y_pred = my_dnn(x)


dataset = tf.contrib.data.Dataset.from_tensor_slices((x, y))
batch_size = 50
dataset = dataset.repeat()
dataset = dataset.batch(batch_size)
iterator = dataset.make_initializable_iterator()


with tf.name_scope('loss'):
    mse = tf.losses.mean_squared_error(labels=y, predictions=y_pred)
    
    
with tf.name_scope('adam_optimizer'):
    train_step = tf.train.AdamOptimizer(0.1).minimize(mse)
    
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(iterator.initializer, feed_dict={x: x_train, y: y_train})
    for i in range(16000):
        batch = iterator.get_next()
        if i % 1000 == 0:
            train_loss = mse.eval(feed_dict={x: batch[0].eval(), y: batch[1].eval()})
            print("batch {}, mean squared error: {}".format(i, train_loss))
        train_step.run(feed_dict={x: batch[0].eval(), y: batch[1].eval()})
    print('test set mean squared error: {}'.format(mse.eval(
        feed_dict={x: x_test, y: y_test})))

batch 0, mean squared error: 9619296.0
batch 1000, mean squared error: 6899.24755859375
batch 2000, mean squared error: 3095.6279296875
batch 3000, mean squared error: 1412.7529296875
batch 4000, mean squared error: 766.9017333984375
batch 5000, mean squared error: 204.11065673828125


KeyboardInterrupt: 