In [7]:
import numpy as np
import pandas as pd
import xgboost
import sklearn

import matplotlib.pyplot as plt

In [8]:
# Step 1: load the CSV and discard every row that has a missing value.
df = pd.read_csv("2017.csv")
df = df.dropna()

# Step 2: one-hot encode the categorical features (sex, smoking status).
df = pd.get_dummies(df, columns=['sex', 'SMK_STAT_TYPE_RSPS_CD'])

# Step 3: keep only the samples whose creatinine lies in [0.0, 2.5].
# NOTE: the column name really does end with a space.
creatinine_ok = (df["CREATININE "] >= 0.0) & (df["CREATININE "] <= 2.5)
df = df[creatinine_ok]

# Step 4: positional split - the first 900000 rows are the training set,
# rows 900000..990111 are the test set.
df_train = df.iloc[0:900000]
df_test = df.iloc[900000:990112]

# Step 5: build the target vector y and the feature matrix x.
# The target itself and the columns listed here are removed from the features.
non_feature_cols = ["id", "sido", "CREATININE ", "GFR", "stage", "height", "weight"]

y_train = df_train["CREATININE "].values.reshape(-1, 1)
x_train = df_train.drop(columns=non_feature_cols).values

y_test = df_test["CREATININE "].values.reshape(-1, 1)
x_test = df_test.drop(columns=non_feature_cols).values

In [10]:
# Step 6: train the model with a custom loss function and report the R2
# between predictions and ground truth.
creatinine = df["CREATININE "]

# Mean of the target; passed to the model as base_score.
mean_value = creatinine.mean()

# One weight per creatinine bin; custom_loss overwrites this list on each call.
weights = [1 for _ in range(10)]

# 11 equally spaced bin edges running from the minimum to the maximum target value.
creatinine_min = creatinine.min()
creatinine_span = creatinine.max() - creatinine_min
intervals = [0.1 * i * creatinine_span + creatinine_min for i in range(11)]

# custom_loss bins with `edge[j] <= value < edge[j+1]`, so push the top edge up
# slightly to make the maximum value fall inside the last bin.
intervals[-1] += 0.001

def custom_loss(true, pred):
    """Squared-error style objective whose gradient is re-weighted per target bin.

    The target range is cut into half-open bins ``[intervals[j], intervals[j+1])``
    (``intervals`` is a module-level list of ascending bin edges). For every
    bin the mean absolute error of the samples whose *true* value falls in it
    is computed; each sample's gradient ``pred - true`` is then multiplied by
    ``bin_mae / mean(bin_mae over all bins)``, so bins that are currently
    predicted worse than average get a larger gradient. Empty bins contribute
    a weight of 0 to that mean. Samples whose true value lies outside every
    bin keep the plain ``pred - true`` gradient.

    Side effect: the module-level ``weights`` list is replaced by the per-bin
    mean absolute errors computed in this call.

    Parameters
    ----------
    true : array-like
        Ground-truth targets.
    pred : array-like
        Current predictions, same number of elements as ``true``.

    Returns
    -------
    grad : numpy.ndarray
        Re-weighted gradient, shaped like ``pred``.
    hess : numpy.ndarray
        Array of ones shaped like ``pred``.
        NOTE(review): the hessian is a constant 1 and is not scaled with the
        gradient - confirm this is intended.
    """
    global weights

    pred_arr = np.asarray(pred)
    # Work on flat views so that (n,) and (n, 1) inputs behave the same.
    true_flat = np.asarray(true).reshape(-1)
    residual = pred_arr.reshape(-1) - true_flat

    edges = np.asarray(intervals, dtype=float)
    n_bins = len(edges) - 1
    if n_bins < 1:
        # No bins defined: nothing to re-weight.
        return residual.reshape(pred_arr.shape), np.ones_like(pred)

    # Index j such that edges[j] <= true < edges[j+1]; values below the first
    # edge give -1, values at/above the last edge (and NaN) give n_bins.
    bin_idx = np.searchsorted(edges, true_flat, side="right") - 1
    in_range = (bin_idx >= 0) & (bin_idx < n_bins)
    hit_bins = bin_idx[in_range]

    # Per-bin sum of absolute errors and sample count, then mean absolute error
    # (0 for empty bins).
    abs_err_sum = np.bincount(hit_bins, weights=np.abs(residual[in_range]), minlength=n_bins)
    counts = np.bincount(hit_bins, minlength=n_bins)
    bin_mae = np.divide(abs_err_sum, counts, out=np.zeros(n_bins), where=counts > 0)

    weights = bin_mae.tolist()

    # Scale factor per sample; samples outside every bin keep a factor of 1.
    scale = np.ones(residual.shape)
    mean_mae = bin_mae.mean()
    if mean_mae > 0:
        scale[in_range] = bin_mae[hit_bins] / mean_mae
    # else: every binned error is zero, so the ratio would be 0/0. The
    # gradients are already zero there; leave them unscaled instead of NaN.

    grad = residual * scale
    if np.issubdtype(residual.dtype, np.floating):
        # Keep the caller's float precision (e.g. float32 predictions).
        grad = grad.astype(residual.dtype, copy=False)

    return grad.reshape(pred_arr.shape), np.ones_like(pred)

# Hyperparameters for the regressor. Boosting starts from the mean target
# (base_score) and optimises the custom re-weighted objective, while RMSE is
# the metric reported on the evaluation sets.
xgb_params = dict(
    max_depth=10,
    n_estimators=500,
    learning_rate=0.05,
    base_score=mean_value,
    min_child_weight=100,
    nthread=4,
    subsample=1,
    early_stopping_rounds=10,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=0,
    gamma=0,
    eval_metric='rmse',
    objective=custom_loss,
    seed=10,
)
model = xgboost.XGBRegressor(**xgb_params)

# RMSE is logged every round for the training split (validation_0) and the
# test split (validation_1).
# NOTE(review): per the xgboost docs the last eval_set entry drives early
# stopping, which here is the test split - so the R2 printed below is measured
# on the same data that chose the stopping round. Confirm this is acceptable.
eval_sets = [(x_train, y_train), (x_test, y_test)]
model.fit(x_train, y_train, eval_set=eval_sets, verbose=1)

# R2 of the fitted model on the test split.
r2_test = model.score(x_test, y_test)
print("R2指数：", r2_test)

[0]	validation_0-rmse:0.20213	validation_1-rmse:0.20039
[1]	validation_0-rmse:0.20067	validation_1-rmse:0.19896
[2]	validation_0-rmse:0.19926	validation_1-rmse:0.19757
[3]	validation_0-rmse:0.19790	validation_1-rmse:0.19623
[4]	validation_0-rmse:0.19658	validation_1-rmse:0.19493
[5]	validation_0-rmse:0.19530	validation_1-rmse:0.19367
[6]	validation_0-rmse:0.19407	validation_1-rmse:0.19246
[7]	validation_0-rmse:0.19287	validation_1-rmse:0.19129
[8]	validation_0-rmse:0.19171	validation_1-rmse:0.19015
[9]	validation_0-rmse:0.19059	validation_1-rmse:0.18905
[10]	validation_0-rmse:0.18950	validation_1-rmse:0.18799
[11]	validation_0-rmse:0.18844	validation_1-rmse:0.18696
[12]	validation_0-rmse:0.18742	validation_1-rmse:0.18597
[13]	validation_0-rmse:0.18643	validation_1-rmse:0.18501
[14]	validation_0-rmse:0.18547	validation_1-rmse:0.18407
[15]	validation_0-rmse:0.18454	validation_1-rmse:0.18317
[16]	validation_0-rmse:0.18363	validation_1-rmse:0.18230
[17]	validation_0-rmse:0.18276	validation