## XGBoost on train_v2

In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.metrics import mean_absolute_error as mae
#%%

## settings

In [4]:
# Input CSV locations, relative to the notebook's working directory.
path2traindata = '../../data/train_2016_v2.csv'    # read as the training table
path2property = '../../data/properties_2016.csv'   # read as the property table
path2sample = '../../data/sample_submission.csv'   # not read by the cells visible here

# Directory the submission CSV is written into.
path2submission = './output/submissions/'

### loading data

In [5]:
# Read the training table and the property table from disk.
train = pd.read_csv(path2traindata)
properties = pd.read_csv(path2property)

# Replace missing values with -1 in every property column, then label-encode
# any column that is still object-typed into integer codes.
for col in properties.columns:
    properties[col] = properties[col].fillna(-1)
    if properties[col].dtype == 'object':
        col_values = list(properties[col].values)
        lbl = LabelEncoder()
        properties[col] = lbl.fit(col_values).transform(col_values)

# Columns removed from the merged frame before it is used as the feature matrix.
non_feature_cols = ['parcelid', 'logerror', 'transactiondate']

# Attach the property columns to each training row (left join on parcelid);
# the test matrix is the whole property table minus the id column.
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(non_feature_cols, axis=1)
x_test = properties.drop(['parcelid'], axis=1)

# Shapes before filtering.
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

# Drop outliers: keep only rows with -0.4 < logerror < 0.4 (both bounds exclusive).
inlier_mask = (train_df.logerror > -0.4) & (train_df.logerror < 0.4)
train_df = train_df[inlier_mask]
x_train = train_df.drop(non_feature_cols, axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

  interactivity=interactivity, compiler=compiler, result=result)


Shape train: (90275, 57)
Shape test: (2985217, 57)
After removing outliers:
Shape train: (88431, 57)
Shape test: (2985217, 57)


In [21]:
#%%
# XGBoost parameters. base_score is set to the mean of the filtered training
# target computed in the loading cell.
# NOTE(review): newer XGBoost releases appear to prefer 'reg:squarederror' over
# 'reg:linear' and 'verbosity' over 'silent' -- confirm against the installed
# version before changing either.
xgb_params = dict(
    eta=0.07,
    max_depth=5,
    subsample=0.80,
    objective='reg:linear',
    eval_metric='mae',
    base_score=y_mean,
    silent=1,
)

# Wrap the feature frames for XGBoost; only the training matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Cross-validation: 50 folds, at most 200 rounds, early stopping with a
# 50-round window, progress printed every 10 rounds.
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=200,
    nfold=50,
    early_stopping_rounds=50,
    verbose_eval=10,
    show_stdv=False,
)

# The length of the CV history is reused as the round count for the final fit.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# Train the final model on the full training matrix with the same parameters.
final_params = dict(xgb_params)
final_params['silent'] = 1
model = xgb.train(final_params, dtrain, num_boost_round=num_boost_rounds)

[0]	train-mae:0.0528301	test-mae:0.0528421
[10]	train-mae:0.0523636	test-mae:0.0525028
[20]	train-mae:0.0521413	test-mae:0.0524008
[30]	train-mae:0.0519888	test-mae:0.0523475
[40]	train-mae:0.051875	test-mae:0.0523225
[50]	train-mae:0.0517748	test-mae:0.0523078
[60]	train-mae:0.0516861	test-mae:0.0522989
[70]	train-mae:0.0516016	test-mae:0.0522955
[80]	train-mae:0.0515201	test-mae:0.0522864
[90]	train-mae:0.0514431	test-mae:0.0522829
[100]	train-mae:0.0513677	test-mae:0.0522738
[110]	train-mae:0.051295	test-mae:0.0522689
[120]	train-mae:0.0512256	test-mae:0.0522631
[130]	train-mae:0.0511574	test-mae:0.0522607
[140]	train-mae:0.0510898	test-mae:0.052262
[150]	train-mae:0.0510218	test-mae:0.0522572
[160]	train-mae:0.0509556	test-mae:0.0522538
[170]	train-mae:0.0508904	test-mae:0.0522557
[180]	train-mae:0.0508261	test-mae:0.0522573
[190]	train-mae:0.0507634	test-mae:0.0522574
200


In [22]:
#%%
import datetime

# Predict for every row of the test matrix and round to 4 decimals.
# The values stay numeric (the original converted them to strings one by one),
# so the rounding is vectorised and to_csv's float_format actually applies:
# every value is written as fixed-point with exactly 4 decimals.
pred = model.predict(dtest)
y_pred = np.round(pred, 4)

# The same prediction is written into each of the six month columns.
month_cols = ['201610', '201611', '201612', '201710', '201711', '201712']
submission_data = {'ParcelId': properties['parcelid'].astype(np.int32)}
for month in month_cols:
    submission_data[month] = y_pred

# State the column order explicitly. Rotating the last column to the front only
# puts 'ParcelId' first when pandas sorts dict keys; with insertion-ordered
# dicts it would move '201712' to the front instead.
output = pd.DataFrame(submission_data, columns=['ParcelId'] + month_cols)

#%% create submission
now = datetime.datetime.now()
# info already ends with '_', so file names contain 'xgb__'; kept unchanged so
# new files follow the same naming as earlier ones.
info = 'xgb_'
suffix = info + '_' + now.strftime("%Y-%m-%d-%H-%M")
sub_file = os.path.join(path2submission, 'submission_' + suffix + '.csv')

# to_csv does not create missing directories, so make sure the target exists.
if not os.path.isdir(path2submission):
    os.makedirs(path2submission)

output.to_csv(sub_file, index=False, float_format='%.4f')
print(output.head())

   ParcelId   201610   201611   201612   201710   201711   201712
0  10754147  -0.0666  -0.0666  -0.0666  -0.0666  -0.0666  -0.0666
1  10759547  -0.0201  -0.0201  -0.0201  -0.0201  -0.0201  -0.0201
2  10843547  -0.0001  -0.0001  -0.0001  -0.0001  -0.0001  -0.0001
3  10859147   0.1075   0.1075   0.1075   0.1075   0.1075   0.1075
4  10879947   0.0494   0.0494   0.0494   0.0494   0.0494   0.0494
