In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

## read data

In [9]:
# Load the three input tables, in order: the training rows, the property
# table, and the sample submission file.
train, prop, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_2016.csv',
        'data/properties_2016.csv',
        'data/sample_submission.csv',
    )
)

## merge data frame

In [10]:
# Downcast every float64 column of `prop` to float32, one column at a time.
# The dtype mask is evaluated once up front, so the loop only visits the
# columns that actually need converting.
float64_columns = prop.dtypes[prop.dtypes == np.float64].index
for column in float64_columns:
    prop[column] = prop[column].astype(np.float32)

# Left join on `parcelid`: every row of `train` is kept and is extended with
# the columns of the matching `prop` row.
df_train = train.merge(prop, how='left', on='parcelid')

# Only `train` is released here. A bare `prop` expression between semicolons
# deletes nothing, so it is not written; `prop` stays bound after this cell.
# NOTE(review): a later cell may still need `prop` - TODO confirm. If nothing
# else uses it, change the next line to `del train, prop`.
del train
# Kept as the last expression so the cell still displays the collector's
# return value.
gc.collect()

49

## clean data:

* drop outliers

In [11]:
# Keep only the rows whose logerror lies strictly between -0.4 and 0.4.
df_train = df_train[(df_train['logerror'] > -0.4) & (df_train['logerror'] < 0.4)]

# Columns removed from the feature matrix; `logerror` among them is the
# value used as the label below.
excluded_columns = [
    'parcelid',
    'logerror',
    'transactiondate',
    'propertyzoningdesc',
    'propertycountylandusecode',
]
x_train = df_train.drop(excluded_columns, axis=1)
y_train = df_train['logerror'].values

## Building DMatrix...

In [12]:
# Keep a handle on the feature column labels before the frame is split.
train_columns = x_train.columns

# Replace each object-dtype column with an element-wise `== True` comparison.
# Values that do not compare equal to True (NaN, strings, ...) become False.
object_columns = x_train.dtypes[x_train.dtypes == object].index.values
for column in object_columns:
    x_train[column] = x_train[column] == True

# The first `split` rows are used for training, the rest for validation.
# The validation slices are taken first, while `x_train` / `y_train` still
# refer to the full data.
split = 80000
x_valid, y_valid = x_train[split:], y_train[split:]
x_train, y_train = x_train[:split], y_train[:split]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# Drop the feature frames now that the DMatrix objects have been built.
# gc.collect() stays the last expression so its return value is displayed.
del x_train, x_valid
gc.collect()

Building DMatrix...


21

## Training ......

In [20]:
# Booster configuration passed to xgb.train.
# NOTE(review): newer xgboost releases rename 'reg:linear' to
# 'reg:squarederror' and replace 'silent' with 'verbosity' - confirm against
# the installed version before changing either key.
params = {
    'eta': 0.01,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'min_child_weight': 20,
    'colsample_bytree': 0.2,
    'max_depth': 12,
    'lambda': 0.3,
    'alpha': 0.6,
    'silent': 1,
}

# Both sets are evaluated during training; the log below reports that
# 'valid-mae' is the metric used for early stopping.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train for at most 10000 rounds, stop once the early-stopping metric has not
# improved for 100 rounds, and print the metrics every 10 rounds.
clf = xgb.train(
    params,
    d_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10,
)

[0]	train-mae:0.488901	valid-mae:0.48149
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.442157	valid-mae:0.434769
[20]	train-mae:0.399881	valid-mae:0.392512
[30]	train-mae:0.36171	valid-mae:0.354366
[40]	train-mae:0.327365	valid-mae:0.32009
[50]	train-mae:0.296491	valid-mae:0.289292
[60]	train-mae:0.268757	valid-mae:0.261687
[70]	train-mae:0.243866	valid-mae:0.236925
[80]	train-mae:0.221555	valid-mae:0.214757
[90]	train-mae:0.201567	valid-mae:0.194952
[100]	train-mae:0.183689	valid-mae:0.177264
[110]	train-mae:0.167722	valid-mae:0.161481
[120]	train-mae:0.15347	valid-mae:0.147386
[130]	train-mae:0.140767	valid-mae:0.134834
[140]	train-mae:0.129462	valid-mae:0.123703
[150]	train-mae:0.119433	valid-mae:0.113841
[160]	train-mae:0.110555	valid-mae:0.105106
[170]	train-mae:0.102707	valid-mae:0.097404
[180]	train-mae:0.095792	valid-mae:0.090649
[190]	train-mae:0.089717	valid-mae: