In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [4]:
print('Loading data ...')

train = pd.read_csv('data/train_2016_v2.csv')
prop = pd.read_csv('data/properties_2016.csv')

Loading data ...


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

Binding to float32


In [6]:
print('Creating training set ...')

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

Creating training set ...
(90275, 55) (90275,)


In [7]:
print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

Building DMatrix...


In [8]:
print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 0

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

Training ...
[0]	train-mae:0.487235	valid-mae:0.486996
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.401383	valid-mae:0.401468
[20]	train-mae:0.331843	valid-mae:0.332125
[30]	train-mae:0.275689	valid-mae:0.276125
[40]	train-mae:0.230503	valid-mae:0.231019
[50]	train-mae:0.194266	valid-mae:0.194872
[60]	train-mae:0.165369	valid-mae:0.16606
[70]	train-mae:0.142402	valid-mae:0.143187
[80]	train-mae:0.124278	valid-mae:0.12517
[90]	train-mae:0.110099	valid-mae:0.111126
[100]	train-mae:0.099146	valid-mae:0.100324
[110]	train-mae:0.090778	valid-mae:0.092075
[120]	train-mae:0.084456	valid-mae:0.085868
[130]	train-mae:0.079763	valid-mae:0.081295
[140]	train-mae:0.076329	valid-mae:0.07798
[150]	train-mae:0.073832	valid-mae:0.075582
[160]	train-mae:0.072008	valid-mae:0.073843
[170]	train-mae:0.070688	valid-mae:0.07259
[180]	train-mae:0.069742	valid-mae:0.071698
[190]	train-mae:0.0690

In [9]:
print('Predicting on test ...')
d_test = xgb.DMatrix(x_test)

p_test = clf.predict(d_test)
mean_absolute_error(y_test, p_test)

Predicting on test ...


0.068312834545696449