In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

try:
    # train_test_split lives in model_selection on current scikit-learn releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old scikit-learn installs that only ship cross_validation.
    from sklearn.cross_validation import train_test_split

## load data

In [2]:
# Read the labelled training set and the test set from the local data/ directory.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

## make xgboost DMatrix

In [10]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable between runs.
train, valid = train_test_split(train_df, test_size=0.2, random_state=1)

# Separate the 'label' target column from the remaining feature columns.
x_train, y_train = train.drop('label', axis=1), train['label']
x_valid, y_valid = valid.drop('label', axis=1), valid['label']

# Wrap every set in an xgboost DMatrix; the test set is built without labels.
xgb_train = xgb.DMatrix(data=x_train, label=y_train)
xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
xgb_test = xgb.DMatrix(data=test_df)

In [11]:
# Sanity-check the (rows, columns) shape of the training and validation features.
# The text is formatted before printing so this line is valid on both Python 2
# and Python 3 and prints the two shapes separated by a single space.
print('{} {}'.format(x_train.shape, x_valid.shape))

(33600, 784) (8400, 784)


## xgb params and training

In [12]:
# Booster configuration for the 10-class softmax problem.
# 'eval_metric' is deliberately left unset; the recorded training log reports merror.
params = dict(
    booster='gbtree',
    objective='multi:softmax',  # multi-class classification
    num_class=10,  # number of classes; used together with multi:softmax
    gamma=0.1,  # controls post-pruning; larger is more conservative (typically 0.1, 0.2 or so)
    max_depth=12,  # depth of each tree; deeper trees overfit more easily
    colsample_bytree=0.7,  # column subsampling ratio applied when building each tree
    # min_child_weight defaults to 1. It is the minimum sum of h (the second-order
    # derivative) required in each leaf. For 0-1 classification with imbalanced
    # positive/negative samples, if h is around 0.01 then a value of 1 means a leaf
    # must hold at least 100 samples. This parameter strongly affects the result:
    # the smaller the value, the easier it is to overfit.
    min_child_weight=10,
    silent=0,  # 1 suppresses run-time messages; keeping it at 0 is recommended
    eta=0.037,  # acts like a learning rate
    seed=1000,
    nthread=4,  # number of CPU threads
)

# Sets evaluated during training. The recorded log shows the 'valid' entry
# (valid-merror) being the one used for early stopping.
watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]

# Train for at most 10000 rounds, logging every 10 rounds and stopping once the
# early-stopping metric has not improved for 100 consecutive rounds.
clf = xgb.train(
    params,
    xgb_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10,
)

[0]	train-merror:0.093601	valid-merror:0.120714
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 100 rounds.
[10]	train-merror:0.038304	valid-merror:0.064643
[20]	train-merror:0.03006	valid-merror:0.058452
[30]	train-merror:0.024881	valid-merror:0.054167
[40]	train-merror:0.021012	valid-merror:0.052024
[50]	train-merror:0.018333	valid-merror:0.048929
[60]	train-merror:0.015952	valid-merror:0.046667
[70]	train-merror:0.013601	valid-merror:0.045
[80]	train-merror:0.012202	valid-merror:0.04381
[90]	train-merror:0.010149	valid-merror:0.042857
[100]	train-merror:0.008363	valid-merror:0.04131
[110]	train-merror:0.006994	valid-merror:0.039881
[120]	train-merror:0.005595	valid-merror:0.038929
[130]	train-merror:0.004702	valid-merror:0.037857
[140]	train-merror:0.00369	valid-merror:0.036667
[150]	train-merror:0.002649	valid-merror:0.035714
[160]	train-merror:0.002262	valid-merror:0.035476
[170]	train-merror

In [53]:
# Predict using only the trees up to the best validation round. Training keeps
# going for early_stopping_rounds iterations past that point, and predict()
# would otherwise use every tree in the returned booster.
# NOTE(review): ntree_limit / best_ntree_limit is the API of the older xgboost
# releases this notebook was run with; newer releases are expected to use
# iteration_range=(0, clf.best_iteration + 1) instead - confirm against the
# installed version.
pre_test = clf.predict(xgb_test, ntree_limit=clf.best_ntree_limit)

# multi:softmax returns the predicted class ids as floats; store them as
# integers in the Label column of the sample submission.
submit_df = pd.read_csv('data/sample_submission.csv')
submit_df['Label'] = pre_test.astype(int)

# Write the submission without the DataFrame index, then preview the first rows.
submit_df.to_csv('data/xgboost.csv', index=False, encoding='utf-8')
submit_df.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,7
4,5,3
