In [7]:
import numpy as np
import pandas as pd
import xgboost
import sklearn

import matplotlib.pyplot as plt

In [8]:
# step1：去掉数据不完整的行
df = pd.read_csv("2017.csv").dropna()
# step2：性别、是否吸烟等特征转one-hot
df = pd.get_dummies(df, columns=['sex','SMK_STAT_TYPE_RSPS_CD'])
# step3: 去掉creatinine高于2.5的样本
df = df.loc[(df["CREATININE "]<=2.5)&(df["CREATININE "]>=0.0)]
# step4: 切分训练集与测试集
df_train = df[0:900000]; df_test = df[900000:990112] 
# step5: 选取特征x与目标y
y_train = df_train["CREATININE "].values.reshape(-1, 1)
x_train = df_train.drop(columns=["id", "sido", "CREATININE ", "GFR", "stage", "height", "weight"]).values
y_test = df_test["CREATININE "].values.reshape(-1, 1)
x_test = df_test.drop(columns=["id", "sido", "CREATININE ", "GFR", "stage", "height", "weight"]).values

In [12]:
# step6: 使用自定义loss_func与自定义eval_metric训练模型，计算预测值与真实值的R2
mean_value = df["CREATININE "].mean()

weights = [1] * 10

intervals = [0.1 * i * (df['CREATININE '].max() - df['CREATININE '].min()) + df['CREATININE '].min() for i in range(0, 11)]
intervals[10] += 0.001

def custom_loss(true,pred):
    
    error = pred - true
    
    global weights, intervals
    
    for i in range(len(true)):
        
        for j in range(0, len(intervals)-1):
            
            if intervals[j] <= true[i] < intervals[j+1]:
                error[i] = error[i] * (weights[j] / np.mean(weights))
                break
                
    grad = error
    
    return grad, np.ones_like(pred)

def custom_eval(pred, true):

    error_sum, sample_count = [0] * 10, [0] * 10
    
    global weights, intervals

    for i in range(len(true)):
        
        error_abs = abs(true[i] - pred[i])
        
        for j in range(0, len(intervals)-1):
            
            if intervals[j] <= true[i] < intervals[j+1]:
                error_sum[j] += error_abs
                sample_count[j] += 1
                break

    weights = [error_sum[j] / sample_count[j] if sample_count[j] > 0 else 0 for j in range(10)]
    
    return np.mean(abs(pred-true))

model = xgboost.XGBRegressor(
            max_depth=10, n_estimators=500, learning_rate=0.05, base_score = mean_value,
            min_child_weight=100, nthread=4, subsample=1, early_stopping_rounds=10, 
            colsample_bytree=1, reg_alpha=0, reg_lambda=0, gamma=0, 
            eval_metric=custom_eval, objective=custom_loss,seed=10
        )

model.fit(
    x_train, y_train, verbose=1, 
    eval_set=[(x_train, y_train),(x_test, y_test)],  
    
)

print("R2指数：",model.score(x_test, y_test))

[0]	validation_0-rmse:0.19994	validation_0-custom_eval:0.15819	validation_1-rmse:0.19820	validation_1-custom_eval:0.15734
[1]	validation_0-rmse:0.19628	validation_0-custom_eval:0.15425	validation_1-rmse:0.19464	validation_1-custom_eval:0.15349
[2]	validation_0-rmse:0.19464	validation_0-custom_eval:0.15156	validation_1-rmse:0.19310	validation_1-custom_eval:0.15087
[3]	validation_0-rmse:0.19420	validation_0-custom_eval:0.15192	validation_1-rmse:0.19273	validation_1-custom_eval:0.15128
[4]	validation_0-rmse:0.19444	validation_0-custom_eval:0.15314	validation_1-rmse:0.19304	validation_1-custom_eval:0.15255
[5]	validation_0-rmse:0.19505	validation_0-custom_eval:0.15431	validation_1-rmse:0.19372	validation_1-custom_eval:0.15377
[6]	validation_0-rmse:0.19586	validation_0-custom_eval:0.15539	validation_1-rmse:0.19457	validation_1-custom_eval:0.15489
[7]	validation_0-rmse:0.19677	validation_0-custom_eval:0.15641	validation_1-rmse:0.19552	validation_1-custom_eval:0.15595
[8]	validation_0-rmse:0.