In [1]:
import tensorflow as tf
import pandas as pd
trainFile = "train_2016_v2.csv"
df_train = pd.read_csv(trainFile, header = 0, skipinitialspace=True, engine="python", 
                       parse_dates=["transactiondate"])

In [2]:
propertiesFile = "properties_2016.csv"
# use float dtype to handle na in pandas
columnDtypes = {'parcelid':int, 'airconditioningtypeid':str, 'architecturalstyletypeid':str, 'basementsqft':float,
                'bathroomcnt':float, 'bedroomcnt':float, 'buildingclasstypeid':str, 'buildingqualitytypeid':str,
                'calculatedbathnbr':float, 'decktypeid':str, 'finishedfloor1squarefeet':float, 'calculatedfinishedsquarefeet':float,
                'finishedsquarefeet12':float, 'finishedsquarefeet13':float, 'finishedsquarefeet15':float, 'finishedsquarefeet50':float,
                'finishedsquarefeet6':float, 'fips':str, 'fireplacecnt':float, 'fullbathcnt':float, 'garagecarcnt':float, 'garagetotalsqft':float,
                'hashottuborspa':str, 'heatingorsystemtypeid':str, 'latitude':float, 'longitude':float, 'lotsizesquarefeet':float, 'poolcnt':float,
                'poolsizesum':float, 'pooltypeid10':str, 'pooltypeid2':str, 'pooltypeid7':str, 'propertycountylandusecode':str,
                'propertylandusetypeid':str, 'propertyzoningdesc':str, 'rawcensustractandblock':float, 'regionidcity':str,
                'regionidcounty':str, 'regionidneighborhood':str, 'regionidzip':str, 'roomcnt':float, 'storytypeid':float,
                'threequarterbathnbr': float, 'typeconstructiontypeid':float, 'unitcnt':float, 'yardbuildingsqft17':float,
                'yardbuildingsqft26':float, 'yearbuilt':float, 'numberofstories': float, 'fireplaceflag':str, 'structuretaxvaluedollarcnt':float,
                'taxvaluedollarcnt': float, 'assessmentyear':float, 'landtaxvaluedollarcnt': float, 'taxamount':float, 'taxdelinquencyflag':str,
                'taxdelinquencyyear': float, 'censustractandblock':float}
df_properties = pd.read_csv(propertiesFile, header = 0, skipinitialspace=True, dtype=columnDtypes, 
                            engine="c")

In [3]:
# apply label encoder on properties
from sklearn.preprocessing import LabelEncoder
for column in df_properties.columns:
    df_properties[column] = df_properties[column].fillna(-1)
    if 'object' == df_properties[column].dtype:
        labelEncoder = LabelEncoder()
        target = list(df_properties[column].values)
        labelEncoder.fit(target)
        df_properties[column] = labelEncoder.transform(target)

In [4]:
inter = pd.merge(df_properties, df_train, how="inner", on=["parcelid"]);
inter.shape;

In [5]:
inter['transactiondate'] = pd.to_datetime(df_train["transactiondate"])
inter['transaction_year'] = inter['transactiondate'].dt.year
inter['transaction_month'] = inter['transactiondate'].dt.month
inter['transaction_day'] = inter['transactiondate'].dt.day

In [38]:
import numpy as np
np.random.seed(1)
datasetSize = inter.shape[0]
trainRatio = .8
trainIndex = set(np.random.choice(datasetSize, int(datasetSize * trainRatio), replace=False))
testIndex = set(range(datasetSize)) - trainIndex
# cast to list to indexing dataframe
trainIndex = list(trainIndex)
testIndex = list(testIndex)

In [39]:
columns_to_drop = [
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode','fireplacecnt', 'fireplaceflag'
]

df_trainx = inter.iloc[trainIndex]
trainy = df_trainx['logerror'].astype(float)
df_trainx = df_trainx.drop(columns_to_drop, axis=1)
df_testx = inter.iloc[testIndex]
testy = df_testx['logerror'].astype(float)
df_testx = df_testx.drop(columns_to_drop, axis=1)

In [40]:
from sklearn.preprocessing import MinMaxScaler
standardScaler = MinMaxScaler()
df_trainx.iloc[::] = standardScaler.fit_transform(df_trainx.iloc[::])
df_testx.iloc[::] = standardScaler.transform(df_testx.iloc[::])

In [72]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(max_samples = 1024, random_state = 2)
train = df_trainx.values
clf.fit(train)
y_inoutliners = clf.predict(train) 
y_inoutliners = pd.DataFrame(y_inoutliners, columns=["inliners"])
index = y_inoutliners[y_inoutliners["inliners"] == 1].index.values
df_trainx = df_trainx.iloc[index]
df_trainx.reset_index(drop = True, inplace = True)
df_trainx.shape
trainy = trainy.iloc[index]
trainy.reset_index(drop = True, inplace = True)

In [73]:
def input_fn(df_trainx, trainy, num_epochs, shuffle=True, batch_size=32, num_threads=1):
    return tf.estimator.inputs.pandas_input_fn(
        x=df_trainx,
        y=trainy,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads)

In [74]:
def build_model_columns():

    airconditioningtypeid = tf.feature_column.numeric_column("airconditioningtypeid")
    architecturalstyletypeid = tf.feature_column.numeric_column("architecturalstyletypeid")
    basementsqft = tf.feature_column.numeric_column("basementsqft")
    bathroomcnt = tf.feature_column.numeric_column("bathroomcnt")
    bedroomcnt = tf.feature_column.numeric_column("bedroomcnt")
    buildingclasstypeid = tf.feature_column.numeric_column("buildingclasstypeid")
    buildingqualitytypeid = tf.feature_column.numeric_column("buildingqualitytypeid")
    calculatedbathnbr = tf.feature_column.numeric_column("calculatedbathnbr")
    decktypeid = tf.feature_column.numeric_column("decktypeid")
    finishedfloor1squarefeet = tf.feature_column.numeric_column("finishedfloor1squarefeet")
    calculatedfinishedsquarefeet = tf.feature_column.numeric_column("calculatedfinishedsquarefeet")
    finishedsquarefeet12 = tf.feature_column.numeric_column("finishedsquarefeet12")
    finishedsquarefeet13 = tf.feature_column.numeric_column("finishedsquarefeet13")
    finishedsquarefeet15 = tf.feature_column.numeric_column("finishedsquarefeet15")
    finishedsquarefeet50 = tf.feature_column.numeric_column("finishedsquarefeet50")
    finishedsquarefeet6 = tf.feature_column.numeric_column("finishedsquarefeet6")
    fips = tf.feature_column.numeric_column("fips")
    fullbathcnt = tf.feature_column.numeric_column("fullbathcnt")
    garagecarcnt = tf.feature_column.numeric_column("garagecarcnt")
    garagetotalsqft = tf.feature_column.numeric_column("garagetotalsqft")
    hashottuborspa = tf.feature_column.numeric_column("hashottuborspa")
    heatingorsystemtypeid = tf.feature_column.numeric_column("heatingorsystemtypeid")
    latitude = tf.feature_column.numeric_column("latitude")
    longitude = tf.feature_column.numeric_column("longitude")
    lotsizesquarefeet = tf.feature_column.numeric_column("lotsizesquarefeet")
    poolcnt = tf.feature_column.numeric_column("poolcnt")
    poolsizesum = tf.feature_column.numeric_column("poolsizesum")
    pooltypeid10 = tf.feature_column.numeric_column("pooltypeid10")
    pooltypeid2 = tf.feature_column.numeric_column("pooltypeid2")
    pooltypeid7 = tf.feature_column.numeric_column("pooltypeid7")
    propertylandusetypeid = tf.feature_column.numeric_column("propertylandusetypeid")
    rawcensustractandblock = tf.feature_column.numeric_column("rawcensustractandblock")
    regionidcity = tf.feature_column.numeric_column("regionidcity")
    regionidcounty = tf.feature_column.numeric_column("regionidcounty")
    regionidneighborhood = tf.feature_column.numeric_column("regionidneighborhood")
    regionidzip = tf.feature_column.numeric_column("regionidzip")
    roomcnt = tf.feature_column.numeric_column("roomcnt")
    storytypeid = tf.feature_column.numeric_column("storytypeid")
    threequarterbathnbr = tf.feature_column.numeric_column("threequarterbathnbr")
    typeconstructiontypeid = tf.feature_column.numeric_column("typeconstructiontypeid")
    unitcnt = tf.feature_column.numeric_column("unitcnt")
    yardbuildingsqft17 = tf.feature_column.numeric_column("yardbuildingsqft17")
    yardbuildingsqft26 = tf.feature_column.numeric_column("yardbuildingsqft26")
    yearbuilt = tf.feature_column.numeric_column("yearbuilt")
    numberofstories = tf.feature_column.numeric_column("numberofstories")
    structuretaxvaluedollarcnt = tf.feature_column.numeric_column("structuretaxvaluedollarcnt")
    taxvaluedollarcnt = tf.feature_column.numeric_column("taxvaluedollarcnt")
    assessmentyear = tf.feature_column.numeric_column("assessmentyear")
    landtaxvaluedollarcnt = tf.feature_column.numeric_column("landtaxvaluedollarcnt")
    taxamount = tf.feature_column.numeric_column("taxamount")
    taxdelinquencyflag = tf.feature_column.numeric_column("taxdelinquencyflag")
    taxdelinquencyyear = tf.feature_column.numeric_column("taxdelinquencyyear")
    censustractandblock = tf.feature_column.numeric_column("censustractandblock")
    transaction_year = tf.feature_column.numeric_column("transaction_year")
    transaction_month = tf.feature_column.numeric_column("transaction_month")
    transaction_day = tf.feature_column.numeric_column("transaction_day")

    numeric_features = [
        airconditioningtypeid, architecturalstyletypeid, basementsqft,
        bathroomcnt, bedroomcnt, buildingclasstypeid,
        buildingqualitytypeid, calculatedbathnbr, decktypeid,
        finishedfloor1squarefeet, calculatedfinishedsquarefeet,
        finishedsquarefeet12, finishedsquarefeet13, finishedsquarefeet15,
        finishedsquarefeet50, finishedsquarefeet6, fips, fullbathcnt,
        garagecarcnt, garagetotalsqft, hashottuborspa,
        heatingorsystemtypeid, latitude, longitude, lotsizesquarefeet,
        poolcnt, poolsizesum, pooltypeid10, pooltypeid2, pooltypeid7,
        propertylandusetypeid, rawcensustractandblock, regionidcity,
        regionidcounty, regionidneighborhood, regionidzip, roomcnt,
        storytypeid, threequarterbathnbr, typeconstructiontypeid,
        unitcnt, yardbuildingsqft17, yardbuildingsqft26, yearbuilt,
        numberofstories, structuretaxvaluedollarcnt, taxvaluedollarcnt,
        assessmentyear, landtaxvaluedollarcnt, taxamount,
        taxdelinquencyflag, taxdelinquencyyear, censustractandblock,
        transaction_year, transaction_month, transaction_day
    ]
    
    return numeric_features

In [75]:
def build_estimator(model_dir, model_type):
    numeric_features = build_model_columns()
    hidden_units = [1024, 512, 256, 128]

    if model_type == 'wide':
        return tf.estimator.LinearRegressor(
                model_dir=model_dir,
                feature_columns=numeric_features)
    elif model_type == 'deep':
        return tf.estimator.DNNRegressor(
                model_dir=model_dir,
                feature_columns=numeric_features,
                hidden_units=hidden_units,
                dropout=0.5)
    else:
        return tf.estimator.DNNLinearCombinedRegressor(
                model_dir=model_dir,
                linear_feature_columns=numeric_features,
                dnn_feature_columns=numeric_features,
                dnn_hidden_units=hidden_units,
                dnn_dropout=0.5)

In [76]:
import tempfile
model_dir = tempfile.mkdtemp()
m = build_estimator(model_dir, 'deep')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/dx/pz80lsmn42xftdx1wckt0zh00000gn/T/tmp_hp30caj', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [77]:
m.train(input_fn=input_fn(df_trainx, trainy, num_epochs=5))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/dx/pz80lsmn42xftdx1wckt0zh00000gn/T/tmp_hp30caj/model.ckpt.
INFO:tensorflow:loss = 0.882255, step = 1
INFO:tensorflow:global_step/sec: 76.011
INFO:tensorflow:loss = 0.136304, step = 101 (1.316 sec)
INFO:tensorflow:global_step/sec: 80.523
INFO:tensorflow:loss = 0.117347, step = 201 (1.242 sec)
INFO:tensorflow:global_step/sec: 77.618
INFO:tensorflow:loss = 0.0727718, step = 301 (1.288 sec)
INFO:tensorflow:global_step/sec: 77.5056
INFO:tensorflow:loss = 0.0781385, step = 401 (1.290 sec)
INFO:tensorflow:global_step/sec: 71.1167
INFO:tensorflow:loss = 0.0750263, step = 501 (1.406 sec)
INFO:tensorflow:global_step/sec: 72.9878
INFO:tensorflow:loss = 0.0981763, step = 601 (1.370 sec)
INFO:tensorflow:global_step/sec: 57.1786
INFO:tensorflow:loss = 0.107004, step = 701 (1.751 sec)
INFO:tensorflow:global_step/sec: 65.0956
INFO:tensorflow:loss = 0.0949838, step = 801 (1.534 sec)
INFO:tensorflow:g

INFO:tensorflow:loss = 0.115643, step = 8201 (1.249 sec)
INFO:tensorflow:global_step/sec: 80.2084
INFO:tensorflow:loss = 0.0598548, step = 8301 (1.247 sec)
INFO:tensorflow:global_step/sec: 81.1102
INFO:tensorflow:loss = 0.0675995, step = 8401 (1.233 sec)
INFO:tensorflow:global_step/sec: 78.1714
INFO:tensorflow:loss = 0.0932097, step = 8501 (1.279 sec)
INFO:tensorflow:global_step/sec: 78.9789
INFO:tensorflow:loss = 0.0712742, step = 8601 (1.266 sec)
INFO:tensorflow:global_step/sec: 78.9141
INFO:tensorflow:loss = 0.0507378, step = 8701 (1.267 sec)
INFO:tensorflow:global_step/sec: 79.7961
INFO:tensorflow:loss = 0.0821358, step = 8801 (1.253 sec)
INFO:tensorflow:global_step/sec: 78.131
INFO:tensorflow:loss = 0.0796325, step = 8901 (1.280 sec)
INFO:tensorflow:global_step/sec: 80.5671
INFO:tensorflow:loss = 0.103183, step = 9001 (1.241 sec)
INFO:tensorflow:global_step/sec: 80.2514
INFO:tensorflow:loss = 0.0724166, step = 9101 (1.246 sec)
INFO:tensorflow:Saving checkpoints for 9139 into /var/

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x113617e80>

In [78]:
# results = m.evaluate(input_fn=input_fn(df_testx, testy, shuffle = False, num_threads=1, num_epochs = 1))

In [79]:
y_pred = list(m.predict(input_fn=input_fn(df_testx, testy, shuffle = False, num_threads=1, num_epochs = 1)))

INFO:tensorflow:Restoring parameters from /var/folders/dx/pz80lsmn42xftdx1wckt0zh00000gn/T/tmp_hp30caj/model.ckpt-9139


In [80]:
y_pred = np.array([x["predictions"][0] for x in y_pred])

In [81]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_true=testy, y_pred=y_pred)

In [82]:
print("test set mean absolute error is {}".format(mae))

test set mean absolute error is 0.06708782437684511


In [69]:
with open('xgbresults', 'r') as f:
    xgbresults = f.read().split("\n")[:-1]

In [70]:
import numpy as np

xgbresults = np.array(xgbresults)

In [71]:
xgbresults.astype(np.float)

array([ 0.00397682,  0.0197311 ,  0.01801938, ...,  0.02489141,
        0.02957025, -0.03961992])