In [17]:
import numpy as np
import pandas as pd
from sklearn import *
import xgboost as xgb
import lightgbm as lgb
from multiprocessing import *
from sklearn import grid_search
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error

In [79]:
def transform_df(df, ref_date="2017-10-09"):
    """Add the engineered features used by the model to a copy of ``df``.

    Steps, in order:

    1. ``negative_one_vals``: per-row count of cells equal to ``-1`` across
       every column except ``'id'`` and ``'血糖'``.
    2. ``'体检日期'`` is parsed as a date and replaced by the number of days
       between it and ``ref_date``.
    3. For every column present at that point except ``'id'``, ``'年龄'`` and
       ``'血糖'`` (this includes ``negative_one_vals`` and the converted date),
       a product column named ``'年龄 * <column>'`` is appended.

    Parameters
    ----------
    df : DataFrame-like
        Must contain ``'年龄'`` and ``'体检日期'``; all columns that take part in
        the products must already be numeric.
    ref_date : str or datetime-like, default "2017-10-09"
        Date subtracted from ``'体检日期'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the object passed in is left untouched.
    """
    # Explicit copy: pd.DataFrame(df) alone may share data with the caller's
    # frame, so the column writes below could otherwise leak back into it.
    df = pd.DataFrame(df).copy()

    dcols = [c for c in df.columns if c not in ('id', "血糖")]
    df['negative_one_vals'] = np.sum((df[dcols] == -1).values, axis=1)

    df['体检日期'] = (pd.to_datetime(df['体检日期']) - pd.to_datetime(ref_date)).dt.days

    # Snapshot the column list first so the products created in the loop are
    # not themselves multiplied by age again.
    interaction_sources = [c for c in df.columns if c not in ('id', '年龄', '血糖')]
    age = df['年龄']
    for c in interaction_sources:
        df['年龄 * ' + str(c)] = age * df[c]
    return df

In [4]:
def multi_transform(df):
    """Apply ``transform_df`` to ``df`` in parallel and return the combined result.

    The frame is split row-wise into ``cpu_count()`` chunks, each chunk is
    transformed in a worker process, and the pieces are concatenated in their
    original order. The shape before and after is printed.

    The returned frame has a fresh 0..n-1 index (``ignore_index=True``), so it
    lines up with the input by row position only, not by index label.
    """
    print("Init　Shape: ", df.shape)
    n_workers = cpu_count()
    pool = Pool(n_workers)
    try:
        parts = pool.map(transform_df, np.array_split(df, n_workers))
    finally:
        # Always release the worker processes, even when a chunk fails.
        pool.close()
        pool.join()
    df = pd.concat(parts, axis=0, ignore_index=True)
    print("After Shape: ", df.shape)
    return df

In [80]:
# Load the GBK-encoded training and test tables.
train = pd.read_csv("data/d_train_20180102.csv", encoding='gbk')
test = pd.read_csv("data/d_test_A_20180102.csv", encoding='gbk')
# Feature columns: everything except the row id and the target ('血糖').
cols = [c for c in train.columns if c not in ['id', "血糖"]]
# Encode sex as an integer flag: 1 where the value is "男", 0 for anything else.
train['性别'] = 1 * (train['性别'] == "男")
test['性别'] = 1 * (test['性别'] == "男")
print(len(cols))
### ---------------------
# Column statistics are taken before the -1 fill so the sentinel does not
# distort them; numeric_only skips the string date column.
d_median = train.median(axis=0, numeric_only=True)
d_mean = train.mean(axis=0, numeric_only=True)
# Missing values are encoded as -1. The test set must use the same encoding,
# otherwise features derived from the sentinel (the -1 count and the age
# products built in transform_df) differ between training and prediction.
train = train.fillna(-1)
test = test.fillna(-1)
# Distinct values observed in each training feature column.
one_hot = {c: list(train[c].unique()) for c in train.columns if c not in ['id', "血糖"]}

40


In [81]:
# Booster settings handed to xgb.train.
params = dict(
    eta=0.2,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='reg:gamma',
    eval_metric='rmse',
    seed=99,
    silent=True,
)

In [82]:
# Raw feature columns: every training column except the id and the target.
col = [name for name in train.columns if name not in ('id', '血糖')]

In [83]:
# Hold out 25% of the training rows for validation (fixed seed), then run
# the feature engineering on the train part, the validation part and the
# test set, in that order.
x1, x2, y1, y2 = model_selection.train_test_split(
    train[col],
    train['血糖'],
    test_size=0.25,
    random_state=99,
)
x1, x2, test = [multi_transform(frame) for frame in (x1, x2, test)]

Init　Shape:  (4231, 40)
After Shape:  (4231, 81)
Init　Shape:  (1411, 40)
After Shape:  (1411, 81)
Init　Shape:  (1000, 41)
After Shape:  (1000, 82)


In [84]:
def feval_xgb(pred, y):
    """Custom evaluation metric: negated half mean squared error.

    ``pred`` holds the predictions and ``y`` is an object exposing
    ``get_label()`` for the true values. Returns the pair
    ``("feval", score)`` where ``score = -0.5 * MSE``, so a larger score
    means a smaller error.
    """
    labels = y.get_label()
    half_mse = mean_squared_error(labels, pred) * 0.5
    return "feval", -half_mse

In [85]:
# Model input columns after feature engineering (id and target excluded).
cols = [name for name in x1.columns if name not in ('id', "血糖")]

In [86]:
# Build each DMatrix once; the training matrix is shared between the
# watchlist and xgb.train instead of being constructed twice.
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
# Up to 5000 rounds, scored with feval_xgb (higher is better); stop once the
# last watchlist entry ('valid') has not improved for 200 rounds.
model = xgb.train(params, dtrain, 5000, watchlist, feval=feval_xgb,
                  maximize=True, verbose_eval=100, early_stopping_rounds=200)
# NOTE(review): prediction uses 45 trees more than the early-stopping
# optimum; the reason for the +45 offset is not visible here - confirm it
# is intentional.
test['target'] = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit + 45)

[0]	train-feval:-13.9944	valid-feval:-13.4309
Multiple eval metrics have been passed: 'valid-feval' will be used for early stopping.

Will train until valid-feval hasn't improved in 200 rounds.
[100]	train-feval:-0.334421	valid-feval:-0.801508
[200]	train-feval:-0.150517	valid-feval:-0.805892
Stopping. Best iteration:
[37]	train-feval:-0.637869	valid-feval:-0.776545



In [87]:
# Features ranked by their get_fscore() value, highest first.
sorted(
    model.get_fscore().items(),
    key=lambda name_score: name_score[1],
    reverse=True,
)

[('年龄 * 体检日期', 87),
 ('白细胞计数', 85),
 ('年龄 * 白细胞计数', 77),
 ('体检日期', 74),
 ('*天门冬氨酸氨基转换酶', 74),
 ('红细胞平均血红蛋白浓度', 72),
 ('红细胞计数', 70),
 ('年龄 * 甘油三酯', 70),
 ('*总蛋白', 67),
 ('*r-谷氨酰基转换酶', 66),
 ('红细胞体积分布宽度', 65),
 ('年龄 * 单核细胞%', 63),
 ('尿素', 63),
 ('年龄 * 嗜碱细胞%', 61),
 ('尿酸', 60),
 ('*碱性磷酸酶', 59),
 ('*丙氨酸氨基转换酶', 58),
 ('年龄 * 中性粒细胞%', 58),
 ('红细胞平均体积', 57),
 ('单核细胞%', 57),
 ('年龄 * 嗜酸细胞%', 54),
 ('血小板平均体积', 54),
 ('年龄', 52),
 ('淋巴细胞%', 51),
 ('红细胞压积', 48),
 ('年龄 * 血小板计数', 47),
 ('肌酐', 46),
 ('白蛋白', 46),
 ('血小板体积分布宽度', 45),
 ('血红蛋白', 45),
 ('年龄 * *丙氨酸氨基转换酶', 44),
 ('年龄 * 血小板体积分布宽度', 43),
 ('总胆固醇', 43),
 ('嗜酸细胞%', 41),
 ('年龄 * 尿素', 41),
 ('年龄 * 血小板比积', 41),
 ('年龄 * *r-谷氨酰基转换酶', 40),
 ('年龄 * *天门冬氨酸氨基转换酶', 39),
 ('中性粒细胞%', 38),
 ('年龄 * 红细胞计数', 38),
 ('低密度脂蛋白胆固醇', 38),
 ('年龄 * 血红蛋白', 38),
 ('甘油三酯', 37),
 ('*球蛋白', 36),
 ('白球比例', 35),
 ('红细胞平均血红蛋白量', 35),
 ('年龄 * 性别', 35),
 ('年龄 * *碱性磷酸酶', 35),
 ('年龄 * 血小板平均体积', 34),
 ('高密度脂蛋白胆固醇', 34),
 ('血小板计数', 33),
 ('年龄 * 肌酐', 33),
 ('年龄 * 淋巴细胞%', 32),
 ('年龄 * 高