In [1]:
import tensorflow as tf
import pandas as pd
trainFile = "train_2016_v2.csv"
df_train = pd.read_csv(trainFile, header = 0, skipinitialspace=True, engine="python", 
                       parse_dates=["transactiondate"])

In [2]:
propertiesFile = "properties_2016.csv"
# use float dtype to handle na in pandas
columnDtypes = {'parcelid':int, 'airconditioningtypeid':str, 'architecturalstyletypeid':str, 'basementsqft':float,
                'bathroomcnt':float, 'bedroomcnt':float, 'buildingclasstypeid':str, 'buildingqualitytypeid':str,
                'calculatedbathnbr':float, 'decktypeid':str, 'finishedfloor1squarefeet':float, 'calculatedfinishedsquarefeet':float,
                'finishedsquarefeet12':float, 'finishedsquarefeet13':float, 'finishedsquarefeet15':float, 'finishedsquarefeet50':float,
                'finishedsquarefeet6':float, 'fips':str, 'fireplacecnt':float, 'fullbathcnt':float, 'garagecarcnt':float, 'garagetotalsqft':float,
                'hashottuborspa':str, 'heatingorsystemtypeid':str, 'latitude':float, 'longitude':float, 'lotsizesquarefeet':float, 'poolcnt':float,
                'poolsizesum':float, 'pooltypeid10':str, 'pooltypeid2':str, 'pooltypeid7':str, 'propertycountylandusecode':str,
                'propertylandusetypeid':str, 'propertyzoningdesc':str, 'rawcensustractandblock':float, 'regionidcity':str,
                'regionidcounty':str, 'regionidneighborhood':str, 'regionidzip':str, 'roomcnt':float, 'storytypeid':float,
                'threequarterbathnbr': float, 'typeconstructiontypeid':float, 'unitcnt':float, 'yardbuildingsqft17':float,
                'yardbuildingsqft26':float, 'yearbuilt':float, 'numberofstories': float, 'fireplaceflag':str, 'structuretaxvaluedollarcnt':float,
                'taxvaluedollarcnt': float, 'assessmentyear':float, 'landtaxvaluedollarcnt': float, 'taxamount':float, 'taxdelinquencyflag':str,
                'taxdelinquencyyear': float, 'censustractandblock':float}
df_properties = pd.read_csv(propertiesFile, header = 0, skipinitialspace=True, dtype=columnDtypes, 
                            engine="c")

In [3]:
# apply label encoder on properties
from sklearn.preprocessing import LabelEncoder
for column in df_properties.columns:
    df_properties[column] = df_properties[column].fillna(-1)
    if 'object' == df_properties[column].dtype:
        labelEncoder = LabelEncoder()
        target = list(df_properties[column].values)
        labelEncoder.fit(target)
        df_properties[column] = labelEncoder.transform(target)

In [4]:
inter = pd.merge(df_properties, df_train, how="inner", on=["parcelid"]);
inter.shape;

In [5]:
inter['transactiondate'] = pd.to_datetime(df_train["transactiondate"])
inter['transaction_year'] = inter['transactiondate'].dt.year
inter['transaction_month'] = inter['transactiondate'].dt.month
inter['transaction_day'] = inter['transactiondate'].dt.day

In [6]:
import numpy as np
np.random.seed(1)
datasetSize = inter.shape[0]
trainRatio = .8
trainIndex = set(np.random.choice(datasetSize, int(datasetSize * trainRatio), replace=False))
testIndex = set(range(datasetSize)) - trainIndex
# cast to list to indexing dataframe
trainIndex = list(trainIndex)
testIndex = list(testIndex)

In [7]:
columns_to_drop = [
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode','fireplacecnt', 'fireplaceflag'
]

df_trainx = inter.iloc[trainIndex]
trainy = df_trainx['logerror'].astype(float)
df_trainx = df_trainx.drop(columns_to_drop, axis=1)
df_testx = inter.iloc[testIndex]
testy = df_testx['logerror'].astype(float)
df_testx = df_testx.drop(columns_to_drop, axis=1)

In [8]:
from sklearn.preprocessing import MinMaxScaler
standardScaler = MinMaxScaler()
df_trainx.iloc[::] = standardScaler.fit_transform(df_trainx.iloc[::])
df_testx.iloc[::] = standardScaler.transform(df_testx.iloc[::])

In [9]:
def input_fn(df_trainx, trainy, num_epochs, shuffle=True, batch_size=32, num_threads=1):
    return tf.estimator.inputs.pandas_input_fn(
        x=df_trainx,
        y=trainy,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads)

In [10]:
def build_model_columns():

    airconditioningtypeid = tf.feature_column.numeric_column("airconditioningtypeid")
    architecturalstyletypeid = tf.feature_column.numeric_column("architecturalstyletypeid")
    basementsqft = tf.feature_column.numeric_column("basementsqft")
    bathroomcnt = tf.feature_column.numeric_column("bathroomcnt")
    bedroomcnt = tf.feature_column.numeric_column("bedroomcnt")
    buildingclasstypeid = tf.feature_column.numeric_column("buildingclasstypeid")
    buildingqualitytypeid = tf.feature_column.numeric_column("buildingqualitytypeid")
    calculatedbathnbr = tf.feature_column.numeric_column("calculatedbathnbr")
    decktypeid = tf.feature_column.numeric_column("decktypeid")
    finishedfloor1squarefeet = tf.feature_column.numeric_column("finishedfloor1squarefeet")
    calculatedfinishedsquarefeet = tf.feature_column.numeric_column("calculatedfinishedsquarefeet")
    finishedsquarefeet12 = tf.feature_column.numeric_column("finishedsquarefeet12")
    finishedsquarefeet13 = tf.feature_column.numeric_column("finishedsquarefeet13")
    finishedsquarefeet15 = tf.feature_column.numeric_column("finishedsquarefeet15")
    finishedsquarefeet50 = tf.feature_column.numeric_column("finishedsquarefeet50")
    finishedsquarefeet6 = tf.feature_column.numeric_column("finishedsquarefeet6")
    fips = tf.feature_column.numeric_column("fips")
    fullbathcnt = tf.feature_column.numeric_column("fullbathcnt")
    garagecarcnt = tf.feature_column.numeric_column("garagecarcnt")
    garagetotalsqft = tf.feature_column.numeric_column("garagetotalsqft")
    hashottuborspa = tf.feature_column.numeric_column("hashottuborspa")
    heatingorsystemtypeid = tf.feature_column.numeric_column("heatingorsystemtypeid")
    latitude = tf.feature_column.numeric_column("latitude")
    longitude = tf.feature_column.numeric_column("longitude")
    lotsizesquarefeet = tf.feature_column.numeric_column("lotsizesquarefeet")
    poolcnt = tf.feature_column.numeric_column("poolcnt")
    poolsizesum = tf.feature_column.numeric_column("poolsizesum")
    pooltypeid10 = tf.feature_column.numeric_column("pooltypeid10")
    pooltypeid2 = tf.feature_column.numeric_column("pooltypeid2")
    pooltypeid7 = tf.feature_column.numeric_column("pooltypeid7")
    propertylandusetypeid = tf.feature_column.numeric_column("propertylandusetypeid")
    rawcensustractandblock = tf.feature_column.numeric_column("rawcensustractandblock")
    regionidcity = tf.feature_column.numeric_column("regionidcity")
    regionidcounty = tf.feature_column.numeric_column("regionidcounty")
    regionidneighborhood = tf.feature_column.numeric_column("regionidneighborhood")
    regionidzip = tf.feature_column.numeric_column("regionidzip")
    roomcnt = tf.feature_column.numeric_column("roomcnt")
    storytypeid = tf.feature_column.numeric_column("storytypeid")
    threequarterbathnbr = tf.feature_column.numeric_column("threequarterbathnbr")
    typeconstructiontypeid = tf.feature_column.numeric_column("typeconstructiontypeid")
    unitcnt = tf.feature_column.numeric_column("unitcnt")
    yardbuildingsqft17 = tf.feature_column.numeric_column("yardbuildingsqft17")
    yardbuildingsqft26 = tf.feature_column.numeric_column("yardbuildingsqft26")
    yearbuilt = tf.feature_column.numeric_column("yearbuilt")
    numberofstories = tf.feature_column.numeric_column("numberofstories")
    structuretaxvaluedollarcnt = tf.feature_column.numeric_column("structuretaxvaluedollarcnt")
    taxvaluedollarcnt = tf.feature_column.numeric_column("taxvaluedollarcnt")
    assessmentyear = tf.feature_column.numeric_column("assessmentyear")
    landtaxvaluedollarcnt = tf.feature_column.numeric_column("landtaxvaluedollarcnt")
    taxamount = tf.feature_column.numeric_column("taxamount")
    taxdelinquencyflag = tf.feature_column.numeric_column("taxdelinquencyflag")
    taxdelinquencyyear = tf.feature_column.numeric_column("taxdelinquencyyear")
    censustractandblock = tf.feature_column.numeric_column("censustractandblock")
    transaction_year = tf.feature_column.numeric_column("transaction_year")
    transaction_month = tf.feature_column.numeric_column("transaction_month")
    transaction_day = tf.feature_column.numeric_column("transaction_day")

    numeric_features = [
        airconditioningtypeid, architecturalstyletypeid, basementsqft,
        bathroomcnt, bedroomcnt, buildingclasstypeid,
        buildingqualitytypeid, calculatedbathnbr, decktypeid,
        finishedfloor1squarefeet, calculatedfinishedsquarefeet,
        finishedsquarefeet12, finishedsquarefeet13, finishedsquarefeet15,
        finishedsquarefeet50, finishedsquarefeet6, fips, fullbathcnt,
        garagecarcnt, garagetotalsqft, hashottuborspa,
        heatingorsystemtypeid, latitude, longitude, lotsizesquarefeet,
        poolcnt, poolsizesum, pooltypeid10, pooltypeid2, pooltypeid7,
        propertylandusetypeid, rawcensustractandblock, regionidcity,
        regionidcounty, regionidneighborhood, regionidzip, roomcnt,
        storytypeid, threequarterbathnbr, typeconstructiontypeid,
        unitcnt, yardbuildingsqft17, yardbuildingsqft26, yearbuilt,
        numberofstories, structuretaxvaluedollarcnt, taxvaluedollarcnt,
        assessmentyear, landtaxvaluedollarcnt, taxamount,
        taxdelinquencyflag, taxdelinquencyyear, censustractandblock,
        transaction_year, transaction_month, transaction_day
    ]
    
    return numeric_features

In [11]:
def build_estimator(model_dir, model_type):
    numeric_features = build_model_columns()
    hidden_units = [1024, 512, 256, 128]

    if model_type == 'wide':
        return tf.estimator.LinearRegressor(
                model_dir=model_dir,
                feature_columns=numeric_features)
    elif model_type == 'deep':
        return tf.estimator.DNNRegressor(
                model_dir=model_dir,
                feature_columns=numeric_features,
                hidden_units=hidden_units,
                dropout=0.5)
    else:
        return tf.estimator.DNNLinearCombinedRegressor(
                model_dir=model_dir,
                linear_feature_columns=numeric_features,
                dnn_feature_columns=numeric_features,
                dnn_hidden_units=hidden_units,
                dnn_dropout=0.5)

In [12]:
import tempfile
model_dir = tempfile.mkdtemp()
m = build_estimator(model_dir, 'combined')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpx2s6kbiq', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [13]:
m.train(input_fn=input_fn(df_trainx, trainy, num_epochs=5))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpx2s6kbiq/model.ckpt.
INFO:tensorflow:loss = 6.17213, step = 1
INFO:tensorflow:global_step/sec: 99.9272
INFO:tensorflow:loss = 1.13739, step = 101 (1.003 sec)
INFO:tensorflow:global_step/sec: 99.6753
INFO:tensorflow:loss = 0.30496, step = 201 (1.003 sec)
INFO:tensorflow:global_step/sec: 99.0383
INFO:tensorflow:loss = 0.295025, step = 301 (1.009 sec)
INFO:tensorflow:global_step/sec: 105.043
INFO:tensorflow:loss = 0.310695, step = 401 (0.953 sec)
INFO:tensorflow:global_step/sec: 97.4373
INFO:tensorflow:loss = 0.363833, step = 501 (1.026 sec)
INFO:tensorflow:global_step/sec: 101.813
INFO:tensorflow:loss = 0.974346, step = 601 (0.981 sec)
INFO:tensorflow:global_step/sec: 101.426
INFO:tensorflow:loss = 0.160299, step = 701 (0.988 sec)
INFO:tensorflow:global_step/sec: 96.8258
INFO:tensorflow:loss = 0.377431, step = 801 (1.032 sec)
INFO:tensorflow:global_step/sec: 100.087
INFO:tensorflow:loss = 0.

INFO:tensorflow:loss = 0.338881, step = 8301 (1.002 sec)
INFO:tensorflow:global_step/sec: 98.7968
INFO:tensorflow:loss = 0.577625, step = 8401 (1.014 sec)
INFO:tensorflow:global_step/sec: 104.098
INFO:tensorflow:loss = 0.283999, step = 8501 (0.959 sec)
INFO:tensorflow:global_step/sec: 97.79
INFO:tensorflow:loss = 0.62813, step = 8601 (1.022 sec)
INFO:tensorflow:global_step/sec: 100.866
INFO:tensorflow:loss = 1.09484, step = 8701 (0.992 sec)
INFO:tensorflow:global_step/sec: 102.995
INFO:tensorflow:loss = 0.217807, step = 8801 (0.972 sec)
INFO:tensorflow:global_step/sec: 97.7984
INFO:tensorflow:loss = 0.212617, step = 8901 (1.021 sec)
INFO:tensorflow:global_step/sec: 101.507
INFO:tensorflow:loss = 0.229537, step = 9001 (0.985 sec)
INFO:tensorflow:global_step/sec: 96.5538
INFO:tensorflow:loss = 1.984, step = 9101 (1.036 sec)
INFO:tensorflow:global_step/sec: 103.344
INFO:tensorflow:loss = 0.381411, step = 9201 (0.968 sec)
INFO:tensorflow:global_step/sec: 100.402
INFO:tensorflow:loss = 0.36

<tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor at 0x7f731e1a74a8>

In [14]:
results = m.evaluate(input_fn=input_fn(df_testx, testy, shuffle = False, num_threads=1, num_epochs = 1))

INFO:tensorflow:Starting evaluation at 2017-11-27-05:51:52
INFO:tensorflow:Restoring parameters from /tmp/tmpx2s6kbiq/model.ckpt-11285
INFO:tensorflow:Finished evaluation at 2017-11-27-05:51:56
INFO:tensorflow:Saving dict for global step 11285: average_loss = 0.0232907, global_step = 11285, loss = 0.744273


In [15]:
df_trainx.shape

(72220, 56)