In [1]:
import tensorflow as tf
import pandas as pd
trainFile = "train_2016_v2.csv"
df_train = pd.read_csv(trainFile, header = 0, skipinitialspace=True, engine="python", 
                       parse_dates=["transactiondate"])

In [2]:
propertiesFile = "properties_2016.csv"
# use float dtype to handle na in pandas
columnDtypes = {'parcelid':int, 'airconditioningtypeid':str, 'architecturalstyletypeid':str, 'basementsqft':float,
                'bathroomcnt':float, 'bedroomcnt':float, 'buildingclasstypeid':str, 'buildingqualitytypeid':str,
                'calculatedbathnbr':float, 'decktypeid':str, 'finishedfloor1squarefeet':float, 'calculatedfinishedsquarefeet':float,
                'finishedsquarefeet12':float, 'finishedsquarefeet13':float, 'finishedsquarefeet15':float, 'finishedsquarefeet50':float,
                'finishedsquarefeet6':float, 'fips':str, 'fireplacecnt':float, 'fullbathcnt':float, 'garagecarcnt':float, 'garagetotalsqft':float,
                'hashottuborspa':str, 'heatingorsystemtypeid':str, 'latitude':float, 'longitude':float, 'lotsizesquarefeet':float, 'poolcnt':float,
                'poolsizesum':float, 'pooltypeid10':str, 'pooltypeid2':str, 'pooltypeid7':str, 'propertycountylandusecode':str,
                'propertylandusetypeid':str, 'propertyzoningdesc':str, 'rawcensustractandblock':float, 'regionidcity':str,
                'regionidcounty':str, 'regionidneighborhood':str, 'regionidzip':str, 'roomcnt':float, 'storytypeid':float,
                'threequarterbathnbr': float, 'typeconstructiontypeid':float, 'unitcnt':float, 'yardbuildingsqft17':float,
                'yardbuildingsqft26':float, 'yearbuilt':float, 'numberofstories': float, 'fireplaceflag':str, 'structuretaxvaluedollarcnt':float,
                'taxvaluedollarcnt': float, 'assessmentyear':float, 'landtaxvaluedollarcnt': float, 'taxamount':float, 'taxdelinquencyflag':str,
                'taxdelinquencyyear': float, 'censustractandblock':float}
df_properties = pd.read_csv(propertiesFile, header = 0, skipinitialspace=True, dtype=columnDtypes, 
                            engine="c")

In [3]:
inter = pd.merge(df_properties, df_train, how="inner", on=["parcelid"]);
inter['transactiondate'] = pd.to_datetime(df_train["transactiondate"])
inter['transaction_year'] = inter['transactiondate'].dt.year
inter['transaction_month'] = inter['transactiondate'].dt.month
inter['transaction_day'] = inter['transactiondate'].dt.day

columns_to_drop = [
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode','fireplacecnt', 'fireplaceflag'
]

y = inter['logerror']

inter = inter.drop(columns_to_drop, axis=1)
inter.shape

(90275, 56)

In [4]:
numeric_cols = inter.select_dtypes(exclude=['object'])
numeric_cols = numeric_cols.fillna(-1)

from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
numeric_cols.iloc[::] = standardScaler.fit_transform(numeric_cols.iloc[::])
numeric_cols.iloc[::] = standardScaler.transform(numeric_cols.iloc[::])

inter[numeric_cols.columns] = numeric_cols
numeric_features = [tf.feature_column.numeric_column(col) for col in numeric_cols.columns]

In [5]:
categorical_cols = inter.select_dtypes(include=['object'])
categorical_cols = categorical_cols.fillna('none')
inter[categorical_cols.columns] = categorical_cols
complex_features = ["regionidcity", "regionidneighborhood", "regionidzip"]
simple_categorical_features = [
    tf.feature_column.categorical_column_with_hash_bucket(col, hash_bucket_size=100)
    for col in categorical_cols if col not in complex_features
]
complex_categorical_features = [
    tf.feature_column.categorical_column_with_hash_bucket(col, hash_bucket_size=500)
    for col in complex_features
]

deep_indicator_columns = [
    tf.feature_column.indicator_column(col) for col in simple_categorical_features
]

deep_embedding_columns = [
    tf.feature_column.embedding_column(col, dimension=8) for col in complex_categorical_features
]

deep_columns = numeric_features + deep_indicator_columns + deep_embedding_columns

In [6]:
def input_fn(x, y, num_epochs, shuffle=True, batch_size=32, num_threads=1):
    return tf.estimator.inputs.pandas_input_fn(
        x=x,
        y=y,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads)

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inter, y, test_size=0.2, random_state=42)

In [22]:
def build_estimator(model_dir, model_type):
    hidden_units = [360, 180, 90, 45]

    if model_type == 'wide':
        return tf.estimator.LinearRegressor(
                model_dir=model_dir,
                feature_columns=numeric_features)
    elif model_type == 'deep':
        return tf.estimator.DNNRegressor(
                model_dir=model_dir,
                feature_columns=deep_columns,
                hidden_units=hidden_units,
                dropout=0.3)
    else:
        return tf.estimator.DNNLinearCombinedRegressor(
                model_dir=model_dir,
                linear_feature_columns=numeric_features,
                dnn_feature_columns=deep_columns,
                dnn_hidden_units=hidden_units,
                dnn_dropout=0.3)

In [26]:
import tempfile
model_dir = tempfile.mkdtemp()
m = build_estimator(model_dir, 'deep')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/dx/pz80lsmn42xftdx1wckt0zh00000gn/T/tmpv0fdw_8g', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [27]:
m.train(input_fn=input_fn(x_train, y_train, num_epochs=5))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/dx/pz80lsmn42xftdx1wckt0zh00000gn/T/tmpv0fdw_8g/model.ckpt.
INFO:tensorflow:loss = 524086.0, step = 1
INFO:tensorflow:global_step/sec: 66.216
INFO:tensorflow:loss = 3292.54, step = 101 (1.511 sec)
INFO:tensorflow:global_step/sec: 70.3612
INFO:tensorflow:loss = 366.695, step = 201 (1.421 sec)
INFO:tensorflow:global_step/sec: 67.5037
INFO:tensorflow:loss = 1118.47, step = 301 (1.482 sec)
INFO:tensorflow:global_step/sec: 69.9515
INFO:tensorflow:loss = 10.3473, step = 401 (1.429 sec)
INFO:tensorflow:global_step/sec: 68.6722
INFO:tensorflow:loss = 834.917, step = 501 (1.456 sec)
INFO:tensorflow:global_step/sec: 69.389
INFO:tensorflow:loss = 4.44745, step = 601 (1.441 sec)
INFO:tensorflow:global_step/sec: 67.3022
INFO:tensorflow:loss = 24.6419, step = 701 (1.486 sec)
INFO:tensorflow:global_step/sec: 67.4497
INFO:tensorflow:loss = 47.388, step = 801 (1.483 sec)
INFO:tensorflow:global_step/se

INFO:tensorflow:loss = 1.2615, step = 8301 (1.469 sec)
INFO:tensorflow:global_step/sec: 67.9305
INFO:tensorflow:loss = 0.876951, step = 8401 (1.472 sec)
INFO:tensorflow:global_step/sec: 69.258
INFO:tensorflow:loss = 0.816775, step = 8501 (1.444 sec)
INFO:tensorflow:global_step/sec: 71.9257
INFO:tensorflow:loss = 0.938844, step = 8601 (1.390 sec)
INFO:tensorflow:global_step/sec: 69.7069
INFO:tensorflow:loss = 0.226004, step = 8701 (1.435 sec)
INFO:tensorflow:global_step/sec: 67.6638
INFO:tensorflow:loss = 1.46434, step = 8801 (1.478 sec)
INFO:tensorflow:global_step/sec: 69.5462
INFO:tensorflow:loss = 0.418412, step = 8901 (1.438 sec)
INFO:tensorflow:global_step/sec: 68.6747
INFO:tensorflow:loss = 0.938118, step = 9001 (1.456 sec)
INFO:tensorflow:global_step/sec: 67.8526
INFO:tensorflow:loss = 0.234134, step = 9101 (1.474 sec)
INFO:tensorflow:global_step/sec: 70.2441
INFO:tensorflow:loss = 2.2735, step = 9201 (1.424 sec)
INFO:tensorflow:global_step/sec: 70.5292
INFO:tensorflow:loss = 6.7

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x11c5d9748>

In [28]:
import numpy as np
y_pred = list(m.predict(input_fn=input_fn(x_test, y_test, shuffle = False, num_threads=1, num_epochs = 1)))

y_pred = np.array([x["predictions"][0] for x in y_pred])
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("test set mean absolute error is {}".format(mae))

INFO:tensorflow:Restoring parameters from /var/folders/dx/pz80lsmn42xftdx1wckt0zh00000gn/T/tmpv0fdw_8g/model.ckpt-11285
test set mean absolute error is 0.06867904644577298


In [29]:
with open('xgbresults', 'r') as f:
    xgbresults = f.read().split("\n")[:-1]
xgbresults = np.array(xgbresults).astype(np.float)

In [30]:
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("test set mean absolute error is {}".format(mae))

test set mean absolute error is 0.06867904644577298
