In [87]:
import time
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error



data_path = 'data/'

train = pd.read_csv(data_path+'d_train_20180102.csv',encoding='gb2312')
test = pd.read_csv(data_path+'d_test_A_20180102.csv',encoding='gb2312')

def make_feat(train,test):
    train_id = train.id.values.copy()
    test_id = test.id.values.copy()
    data = pd.concat([train,test])

    data['性别'] = data['性别'].map({'男':1,'女':0})
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse('2017-10-09')).dt.days
    data['年龄 * 血红蛋白'] = data['年龄'] + data['血红蛋白'] 
    data['年龄 * 中性粒细胞%'] = data['年龄'] + data['中性粒细胞%']
    data['年龄 * 红细胞平均血红蛋白浓度'] = data['年龄'] + data['红细胞平均血红蛋白浓度'] 
    data['年龄 * *碱性磷酸酶'] = data['年龄'] + data['*碱性磷酸酶']  
    data.fillna(data.median(axis=0),inplace=True)

    train_feat = data[data.id.isin(train_id)]
    test_feat = data[data.id.isin(test_id)]

    return train_feat,test_feat



train_feat,test_feat = make_feat(train,test)

predictors = [f for f in test_feat.columns if f not in ['id','血糖']]


def evalerror(pred, df):
    label = df.get_label().values.copy()
    score = mean_squared_error(label,pred)*0.5
    return ('0.5mse',score,False)

print('开始训练...')
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'sub_feature': 0.7,
    'num_leaves': 60,
    'colsample_bytree': 0.7,
    'feature_fraction': 0.7,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

print('开始CV 5折训练...')
t0 = time.time()
train_preds = np.zeros(train_feat.shape[0])
test_preds = np.zeros((test_feat.shape[0], 5))
kf = KFold(len(train_feat), n_folds = 5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf):
    print('第{}次训练...'.format(i))
    train_feat1 = train_feat.iloc[train_index]
    train_feat2 = train_feat.iloc[test_index]
    lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['血糖'],categorical_feature=['性别'])
    lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['血糖'])
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=100)
    feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    train_preds[test_index] += gbm.predict(train_feat2[predictors])
    test_preds[:,i] = gbm.predict(test_feat[predictors])
print('线下得分：    {}'.format(mean_squared_error(train_feat['血糖'],train_preds)*0.5))
print('CV训练用时{}秒'.format(time.time() - t0))

submission = pd.DataFrame({'pred':test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),header=None,
                  index=False, float_format='%.4f')

开始训练...
开始CV 5折训练...
第0次训练...




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 3.00344	valid_0's 0.5mse: 1.50172
[200]	valid_0's l2: 2.88653	valid_0's 0.5mse: 1.44326
[300]	valid_0's l2: 2.83623	valid_0's 0.5mse: 1.41812
[400]	valid_0's l2: 2.81338	valid_0's 0.5mse: 1.40669
[500]	valid_0's l2: 2.80013	valid_0's 0.5mse: 1.40007
[600]	valid_0's l2: 2.79346	valid_0's 0.5mse: 1.39673
[700]	valid_0's l2: 2.79183	valid_0's 0.5mse: 1.39592
Early stopping, best iteration is:
[623]	valid_0's l2: 2.79043	valid_0's 0.5mse: 1.39521
第1次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 1.99659	valid_0's 0.5mse: 0.998295
[200]	valid_0's l2: 1.9235	valid_0's 0.5mse: 0.96175
[300]	valid_0's l2: 1.89079	valid_0's 0.5mse: 0.945394
[400]	valid_0's l2: 1.87089	valid_0's 0.5mse: 0.935446
[500]	valid_0's l2: 1.86065	valid_0's 0.5mse: 0.930325
[600]	valid_0's l2: 1.85754	valid_0's 0.5mse: 0.928768
[700]	valid_0's l2: 1.85588	valid_0's 0.5mse: 0.927939
[800]	valid_0's

In [96]:
predict_data1 = pd.read_csv("sub20180115_193923.csv", header=None)
predict_data2 = pd.read_csv("sub20180115_172937.csv",header=None)
predict_data = pd.concat([predict_data1, predict_data2], axis=1)
predict_data['finall'] = predict_data.mean(axis=1)

In [98]:
predict_data['finall'].to_csv("result2018_1_15.csv",header=None,index=False)

### 0.9654828715994404

In [88]:
 pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)

年龄 * 中性粒细胞%         593
体检日期                582
年龄 * 血红蛋白           570
甘油三酯                501
年龄 * 红细胞平均血红蛋白浓度    496
红细胞平均体积             452
尿酸                  446
*r-谷氨酰基转换酶          445
年龄                  438
白细胞计数               372
年龄 * *碱性磷酸酶         371
红细胞体积分布宽度           366
尿素                  364
红细胞计数               334
血小板体积分布宽度           324
淋巴细胞%               319
*天门冬氨酸氨基转换酶         303
*丙氨酸氨基转换酶           287
*碱性磷酸酶              279
血红蛋白                276
总胆固醇                275
血小板平均体积             258
红细胞压积               255
红细胞平均血红蛋白浓度         249
血小板计数               226
单核细胞%               218
低密度脂蛋白胆固醇           216
中性粒细胞%              213
*总蛋白                202
白蛋白                 194
肌酐                  193
红细胞平均血红蛋白量          193
高密度脂蛋白胆固醇           193
嗜酸细胞%               188
血小板比积               150
*球蛋白                143
白球比例                110
性别                   93
嗜碱细胞%                69
乙肝核心抗体               43
乙肝表面抗体               34
乙肝e抗体           