In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np

In [3]:
# -----------------导入数据，dtrain表示训练数据，dval表示验证数据
# -----------------train_label表示训练标签，val_label表示测试标签
data_train = pd.read_csv('data_train.csv')
data_val = pd.read_csv('data_val.csv')
data_train = data_train[data_train['clickTime'] // 10000 < 20]
data_train['clickTime'] = data_train['clickTime'] // 100 % 100 * 60 \
                          + data_train['clickTime'] % 100
train_label = data_train['label']

data_val = data_val[data_val['clickTime'] // 10000 == 20]
data_val['clickTime'] = data_val['clickTime'] // 100 % 100 * 60 \
                        + data_val['clickTime'] % 100
val_label = data_val['label']

data_train.drop(['instanceID', 'label', 'conversionTime'], axis=1, inplace=True)
data_val.drop(['instanceID', 'label', 'conversionTime'], axis=1, inplace=True)
train_label = train_label.values
dtrain = data_train.values
val_label = val_label.values
dval = data_val.values

In [4]:
# ------------------获得训练用的xgb数据格式-----------------------------
xgb_dtrain = xgb.DMatrix(dtrain, label=train_label, missing=0.0)
xgb_dval = xgb.DMatrix(dval, label=val_label, missing=0.0)

In [63]:
# ------------------ 设置参数 ------------------------------------
param = {'bst:max_depth': 5, 
         'bst:subsample': 0.8,
         'bst:min_child_weight': 1,
         'bst:colsample_bytree': 0.2,
         'bst:eta': 0.2, 
         'bst:gamma': 0.2,
         'bst"min_child_leaf': 1,
         'bst:scale_pos_weight': 0,
         #'booster': 'gbtree',
         'silent': 0,
         'lambda': 10,
         'objective': 'binary:logistic',
         'eval_metric': 'logloss',
         'seed':55 }
plst = list(param.items())

In [64]:
evallist = [(xgb_dtrain, 'train'),(xgb_dval, 'eval')]

In [65]:
num_round =100000
print plst
bst = xgb.train(plst, xgb_dtrain, num_round, evallist, early_stopping_rounds=10)

[('bst:colsample_bytree', 0.2), ('bst:subsample', 0.8), ('eval_metric', 'logloss'), ('bst:scale_pos_weight', 0), ('bst:eta', 0.2), ('bst"min_child_leaf', 1), ('silent', 0), ('bst:gamma', 0.2), ('bst:max_depth', 5), ('seed', 55), ('objective', 'binary:logistic'), ('bst:min_child_weight', 1), ('lambda', 10)]
[0]	train-logloss:0.463055	eval-logloss:0.462051
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 10 rounds.
[1]	train-logloss:0.337166	eval-logloss:0.334813
[2]	train-logloss:0.259538	eval-logloss:0.256319
[3]	train-logloss:0.209136	eval-logloss:0.205202
[4]	train-logloss:0.175575	eval-logloss:0.17109
[5]	train-logloss:0.152999	eval-logloss:0.148126
[6]	train-logloss:0.137702	eval-logloss:0.132453
[7]	train-logloss:0.127353	eval-logloss:0.121927
[8]	train-logloss:0.120316	eval-logloss:0.114679
[9]	train-logloss:0.115656	eval-logloss:0.109967
[10]	train-logloss:0.112504	eval-logloss:0.106785
[11]

In [66]:
# ------保存模型-----------------
bst.dump_model('dump099073.raw.txt')
# bst.dump_model('dump.raw.txt', 'featmap.txt')

In [67]:
# --------------读入数据并调用模型进行预测-----------------
test = pd.read_csv('test_all.csv')
test_ = test.drop(['instanceID', 'label'], axis=1)
xgb_dtest = xgb.DMatrix(test_.values, missing=0.0)
ypred = bst.predict(xgb_dtest, ntree_limit=bst.best_iteration)

In [68]:
# ----------------保存预测结果-------------------------
test['pred'] = ypred
test[['instanceID','pred']].to_csv('099073.csv',index=None)