In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np



In [2]:
trainFile = "train_2016_v2.csv"
df_train = pd.read_csv(trainFile, header = 0, skipinitialspace=True, engine="python", 
                       parse_dates=["transactiondate"])

propertiesFile = "properties_2016.csv"
# use float dtype to handle na in pandas
columnDtypes = {'parcelid':int, 'airconditioningtypeid':str, 'architecturalstyletypeid':str, 'basementsqft':float,
                'bathroomcnt':float, 'bedroomcnt':float, 'buildingclasstypeid':str, 'buildingqualitytypeid':str,
                'calculatedbathnbr':float, 'decktypeid':str, 'finishedfloor1squarefeet':float, 'calculatedfinishedsquarefeet':float,
                'finishedsquarefeet12':float, 'finishedsquarefeet13':float, 'finishedsquarefeet15':float, 'finishedsquarefeet50':float,
                'finishedsquarefeet6':float, 'fips':str, 'fireplacecnt':float, 'fullbathcnt':float, 'garagecarcnt':float, 'garagetotalsqft':float,
                'hashottuborspa':str, 'heatingorsystemtypeid':str, 'latitude':float, 'longitude':float, 'lotsizesquarefeet':float, 'poolcnt':float,
                'poolsizesum':float, 'pooltypeid10':str, 'pooltypeid2':str, 'pooltypeid7':str, 'propertycountylandusecode':str,
                'propertylandusetypeid':str, 'propertyzoningdesc':str, 'rawcensustractandblock':float, 'regionidcity':str,
                'regionidcounty':str, 'regionidneighborhood':str, 'regionidzip':str, 'roomcnt':float, 'storytypeid':float,
                'threequarterbathnbr': float, 'typeconstructiontypeid':float, 'unitcnt':float, 'yardbuildingsqft17':float,
                'yardbuildingsqft26':float, 'yearbuilt':float, 'numberofstories': float, 'fireplaceflag':str, 'structuretaxvaluedollarcnt':float,
                'taxvaluedollarcnt': float, 'assessmentyear':float, 'landtaxvaluedollarcnt': float, 'taxamount':float, 'taxdelinquencyflag':str,
                'taxdelinquencyyear': float, 'censustractandblock':float}
df_properties = pd.read_csv(propertiesFile, header = 0, skipinitialspace=True, dtype=columnDtypes, 
                            engine="c")

In [3]:
# apply label encoder on properties
from sklearn.preprocessing import LabelEncoder
for column in df_properties.columns:
    df_properties[column] = df_properties[column].fillna(-1)
    if 'object' == df_properties[column].dtype:
        labelEncoder = LabelEncoder()
        target = list(df_properties[column].values)
        labelEncoder.fit(target)
        df_properties[column] = labelEncoder.transform(target)
        
inter = pd.merge(df_properties, df_train, how="inner", on=["parcelid"]);

inter['transactiondate'] = pd.to_datetime(df_train["transactiondate"])
inter['transaction_year'] = inter['transactiondate'].dt.year
inter['transaction_month'] = inter['transactiondate'].dt.month
inter['transaction_day'] = inter['transactiondate'].dt.day

import numpy as np
np.random.seed(1)
datasetSize = inter.shape[0]
trainRatio = .8
trainIndex = set(np.random.choice(datasetSize, int(datasetSize * trainRatio), replace=False))
testIndex = set(range(datasetSize)) - trainIndex
# cast to list to indexing dataframe
trainIndex = list(trainIndex)
testIndex = list(testIndex)

columns_to_drop = [
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode'
]

df_trainx = inter.iloc[trainIndex]
trainy = df_trainx['logerror'].astype(float)
df_trainx = df_trainx.drop(columns_to_drop, axis=1)
df_testx = inter.iloc[testIndex]
testy = df_testx['logerror'].astype(float)
df_testx = df_testx.drop(columns_to_drop, axis=1)

from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
df_trainx.iloc[::] = standardScaler.fit_transform(df_trainx.iloc[::])
df_testx.iloc[::] = standardScaler.transform(df_testx.iloc[::])

In [4]:
train = xgb.DMatrix(df_trainx, label=trainy)
test = xgb.DMatrix(df_testx, label=testy)

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(train, 'train'), (test, 'test')]
clf = xgb.train(params, train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

[0]	train-mae:0.487248	test-mae:0.487455
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:0.401584	test-mae:0.401683
[20]	train-mae:0.332203	test-mae:0.332191
[30]	train-mae:0.276187	test-mae:0.276067
[40]	train-mae:0.231109	test-mae:0.230957
[50]	train-mae:0.194949	test-mae:0.194821
[60]	train-mae:0.166115	test-mae:0.165936
[70]	train-mae:0.143201	test-mae:0.142897
[80]	train-mae:0.125131	test-mae:0.124721
[90]	train-mae:0.111002	test-mae:0.110526
[100]	train-mae:0.100073	test-mae:0.099536
[110]	train-mae:0.091729	test-mae:0.091154
[120]	train-mae:0.085426	test-mae:0.08482
[130]	train-mae:0.080736	test-mae:0.080107
[140]	train-mae:0.077303	test-mae:0.076627
[150]	train-mae:0.074779	test-mae:0.074071
[160]	train-mae:0.072942	test-mae:0.072208
[170]	train-mae:0.071606	test-mae:0.070852
[180]	train-mae:0.070639	test-mae:0.069874
[190]	train-mae:0.06993	test-mae:0.069161
[200]	train

In [5]:
y_pred = clf.predict(test)

In [6]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true = testy.values, y_pred =y_pred)


0.067219126546435054

In [8]:
with open ('xgbresults', 'w') as f:
    for y in y_pred:
        f.write("{}\n".format(y))