In [1]:
import tensorflow as tf
import pandas as pd
# Load the training CSV; df_train is joined to the properties table on
# 'parcelid' in a later cell.
trainFile = "train_2016_v2.csv"
df_train = pd.read_csv(
    trainFile,
    header=0,               # first row holds the column names
    skipinitialspace=True,  # ignore blanks that follow a delimiter
    engine="python",
)

In [2]:
propertiesFile = "properties_2016.csv"

# Explicit dtype for every properties column, grouped by target type.
# Numeric columns are read as float so that missing values can be stored
# as NaN; the remaining id/code/flag columns are read as strings.
_string_columns = [
    'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid',
    'buildingqualitytypeid', 'decktypeid', 'fips', 'hashottuborspa',
    'heatingorsystemtypeid', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
    'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc',
    'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
    'fireplaceflag', 'taxdelinquencyflag',
]
# NOTE: 'storytypeid' and 'typeconstructiontypeid' are read as float here,
# unlike the other *typeid columns, exactly as in the original mapping.
_float_columns = [
    'basementsqft', 'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
    'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
    'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
    'finishedsquarefeet50', 'finishedsquarefeet6', 'fireplacecnt',
    'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 'longitude',
    'lotsizesquarefeet', 'poolcnt', 'poolsizesum', 'rawcensustractandblock',
    'roomcnt', 'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
    'unitcnt', 'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt',
    'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
    'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount',
    'taxdelinquencyyear', 'censustractandblock',
]

columnDtypes = {'parcelid': int}
columnDtypes.update(dict.fromkeys(_string_columns, str))
columnDtypes.update(dict.fromkeys(_float_columns, float))

df_properties = pd.read_csv(
    propertiesFile,
    header=0,
    skipinitialspace=True,
    dtype=columnDtypes,
    engine="c",
)

In [3]:
# Inner join: keep only rows whose 'parcelid' appears in both the properties
# table and the training table.
inter = df_properties.merge(df_train, how="inner", on="parcelid")
inter.shape;

In [4]:
# Per-column count of missing values (the trailing ';' hides the cell output).
inter.isnull().sum();

In [5]:
# Drop, in place, every column that has fewer non-null values than 90% of
# the number of rows.
min_non_null = int(len(inter) * .9)
inter.dropna(axis=1, thresh=min_non_null, inplace=True)
inter.shape

(90275, 26)

In [6]:
# Replace the remaining missing values: 0 in the float columns and the empty
# string in the string columns.
float_columns_with_na = [
    'calculatedbathnbr',
    'calculatedfinishedsquarefeet',
    'finishedsquarefeet12',
    'fullbathcnt',
    'yearbuilt',
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'landtaxvaluedollarcnt',
    'taxamount',
    'censustractandblock',
]
string_columns_with_na = [
    'propertycountylandusecode',
    'regionidcity',
    'regionidzip',
]

numeric_filled = inter[float_columns_with_na].fillna(0)
text_filled = inter[string_columns_with_na].fillna('')
inter[float_columns_with_na] = numeric_filled
inter[string_columns_with_na] = text_filled

In [7]:
# Show the per-column count of missing values that remain after filling.
inter.isnull().sum()

parcelid                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
propertycountylandusecode       0
propertylandusetypeid           0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
logerror                        0
transactiondate                 0
dtype: int64

In [8]:
import numpy as np
# Seeded split of the row positions of `inter` into a training part
# (trainRatio of the rows, sampled without replacement) and a test part
# (all remaining positions).
np.random.seed(1)
datasetSize = len(inter)
trainRatio = .8
trainSize = int(datasetSize * trainRatio)

sampledPositions = np.random.choice(datasetSize, trainSize, replace=False)
trainIndex = set(sampledPositions)
allPositions = set(range(datasetSize))
testIndex = allPositions - trainIndex

# Convert to lists so they can be used for positional DataFrame indexing.
trainIndex, testIndex = list(trainIndex), list(testIndex)

In [9]:
def _split_features_labels(frame, row_positions):
    """Select rows of `frame` by position and split them into features and labels.

    The selected rows are re-indexed by 'parcelid' *before* features and labels
    are separated, so both returned objects share the exact same index.
    (tf.estimator.inputs.pandas_input_fn is documented to raise ValueError when
    the indexes of x and y do not match.)

    Args:
        frame: DataFrame containing 'parcelid', 'logerror' and
            'transactiondate' columns plus the feature columns.
        row_positions: list of integer row positions to select (used with .iloc).

    Returns:
        (features, labels): `features` is the selected rows without the
        'logerror' and 'transactiondate' columns; `labels` is the 'logerror'
        column cast to float.
    """
    # DataFrame.set_index returns a NEW frame; the result must be kept.
    # The original cell called set_index and discarded the result, which
    # left 'parcelid' as an ordinary column.
    rows = frame.iloc[row_positions].set_index('parcelid')
    labels = rows['logerror'].astype(float)
    features = rows.drop(['logerror', 'transactiondate'], axis=1)
    return features, labels


df_trainx, trainy = _split_features_labels(inter, trainIndex)
df_testx, testy = _split_features_labels(inter, testIndex)

# Guard against the features and labels getting out of step.
assert df_trainx.index.equals(trainy.index)
assert df_testx.index.equals(testy.index)

In [10]:
def input_fn(df_trainx, trainy, num_epochs, shuffle=True, batch_size=100, num_threads=1):
    """Create an estimator input function from a pandas feature frame and labels.

    All arguments are forwarded unchanged, by keyword, to
    tf.estimator.inputs.pandas_input_fn.

    Args:
        df_trainx: feature DataFrame, passed as `x`.
        trainy: label Series, passed as `y`.
        num_epochs: passed as `num_epochs`.
        shuffle: passed as `shuffle` (default True).
        batch_size: passed as `batch_size` (default 100).
        num_threads: passed as `num_threads` (default 1).

    Returns:
        The value returned by tf.estimator.inputs.pandas_input_fn.
    """
    feed_options = dict(
        x=df_trainx,
        y=trainy,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads,
    )
    return tf.estimator.inputs.pandas_input_fn(**feed_options)

In [11]:
def build_model_columns():
    """Build the feature columns for the wide (linear) and deep (DNN) model parts.

    Returns:
        (wide_columns, deep_columns):
            wide_columns: the numeric columns followed by the raw categorical
                columns.
            deep_columns: indicator/embedding wrappers of the categorical
                columns followed by the same numeric columns.
    """
    # Continuous features, used as-is (no bucketing).
    # TODO: use buckets to handle the 0s that were used to fill missing values.
    # 'rawcensustractandblock' is deliberately treated as numerical data.
    # 'censustractandblock' is deliberately NOT included: per the original
    # author's note it causes NaN during training.
    numeric_feature_names = [
        "bathroomcnt",
        "bedroomcnt",
        "calculatedbathnbr",
        "calculatedfinishedsquarefeet",
        "finishedsquarefeet12",
        "fullbathcnt",
        "latitude",
        "longitude",
        "rawcensustractandblock",
        "roomcnt",
        "yearbuilt",
        "structuretaxvaluedollarcnt",
        "taxvaluedollarcnt",
        "assessmentyear",
        "landtaxvaluedollarcnt",
        "taxamount",
    ]
    numeric_features = [
        tf.feature_column.numeric_column(name) for name in numeric_feature_names
    ]

    # Categorical features: a fixed vocabulary where the values are listed
    # explicitly, hashing into a fixed number of buckets otherwise.
    fips = tf.feature_column.categorical_column_with_vocabulary_list(
        "fips", [
            "06037", "06059", "06111"
        ])
    propertycountylandusecode = tf.feature_column.categorical_column_with_hash_bucket(
        "propertycountylandusecode", hash_bucket_size=100)
    propertylandusetypeid = tf.feature_column.categorical_column_with_hash_bucket(
        "propertylandusetypeid", hash_bucket_size=20)
    regionidcity = tf.feature_column.categorical_column_with_hash_bucket(
        "regionidcity", hash_bucket_size=300)
    regionidcounty = tf.feature_column.categorical_column_with_vocabulary_list(
        "regionidcounty", [
            "1286", "2061", "3101"
        ])
    regionidzip = tf.feature_column.categorical_column_with_hash_bucket(
        "regionidzip", hash_bucket_size=500)

    categorical_features = [
        fips, propertycountylandusecode, propertylandusetypeid, regionidcity, regionidcounty, regionidzip
    ]

    # Wide part: numeric columns plus the raw categorical columns.
    wide_columns = numeric_features + categorical_features

    # Deep part: vocabulary-list columns are wrapped in indicator columns,
    # hashed columns in embedding columns.
    deep_columns = [
        tf.feature_column.indicator_column(fips),
        tf.feature_column.embedding_column(propertycountylandusecode, dimension=7),
        tf.feature_column.embedding_column(propertylandusetypeid, dimension=6),
        tf.feature_column.embedding_column(regionidcity, dimension=8),
        tf.feature_column.indicator_column(regionidcounty),
        tf.feature_column.embedding_column(regionidzip, dimension=9),
    ]

    deep_columns += numeric_features
    return wide_columns, deep_columns

In [20]:
def build_estimator(model_dir, model_type):
    """Create a regressor of the requested kind.

    Args:
        model_dir: directory handed to the estimator as `model_dir`.
        model_type: 'wide' selects a LinearRegressor on the wide columns,
            'deep' selects a DNNRegressor on the deep columns; any other
            value selects a DNNLinearCombinedRegressor using both.

    Returns:
        The constructed tf.estimator regressor.
    """
    linear_columns, dnn_columns = build_model_columns()
    layer_sizes = [1000, 750, 500, 250]

    if model_type == 'wide':
        return tf.estimator.LinearRegressor(
            model_dir=model_dir,
            feature_columns=linear_columns)

    if model_type == 'deep':
        return tf.estimator.DNNRegressor(
            model_dir=model_dir,
            feature_columns=dnn_columns,
            hidden_units=layer_sizes)

    # Every other model_type value falls through to the combined model.
    return tf.estimator.DNNLinearCombinedRegressor(
        model_dir=model_dir,
        linear_feature_columns=linear_columns,
        dnn_feature_columns=dnn_columns,
        dnn_hidden_units=layer_sizes)

In [21]:
import tempfile
# Use a fresh temporary directory as the estimator's model directory and
# build the combined (wide + deep) regressor.
model_dir = tempfile.mkdtemp()
m = build_estimator(model_dir=model_dir, model_type='combined')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpsba459fz', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [22]:
# Train the estimator for 1000 steps on the training split.
train_input_fn = input_fn(df_trainx, trainy, num_epochs=None)
m.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpsba459fz/model.ckpt.
INFO:tensorflow:loss = 1.34194e+14, step = 1
INFO:tensorflow:global_step/sec: 52.8358
INFO:tensorflow:loss = 4.04384e+09, step = 101 (1.893 sec)
INFO:tensorflow:global_step/sec: 52.0846
INFO:tensorflow:loss = 5.71326e+09, step = 201 (1.920 sec)
INFO:tensorflow:global_step/sec: 47.6231
INFO:tensorflow:loss = 1.24063e+09, step = 301 (2.100 sec)
INFO:tensorflow:global_step/sec: 54.3059
INFO:tensorflow:loss = 1.20395e+09, step = 401 (1.841 sec)
INFO:tensorflow:global_step/sec: 54.8917
INFO:tensorflow:loss = 1.00864e+09, step = 501 (1.822 sec)
INFO:tensorflow:global_step/sec: 53.7153
INFO:tensorflow:loss = 7.84098e+08, step = 601 (1.867 sec)
INFO:tensorflow:global_step/sec: 54.5765
INFO:tensorflow:loss = 7.6294e+08, step = 701 (1.826 sec)
INFO:tensorflow:global_step/sec: 53.9483
INFO:tensorflow:loss = 1.24793e+09, step = 801 (1.854 sec)
INFO:tensorflow:global_step/sec: 54.2

<tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor at 0x7f011ab9a390>

In [23]:
# Evaluate on the held-out split, requesting one epoch from the input function.
eval_input_fn = input_fn(df_testx, testy, num_epochs=1, num_threads=1)
results = m.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-11-26-21:57:28
INFO:tensorflow:Restoring parameters from /tmp/tmpsba459fz/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-11-26-21:57:31
INFO:tensorflow:Saving dict for global step 1000: average_loss = 9.15472e+06, global_step = 1000, loss = 9.13196e+08
